From 8697aad6eba4b7a0b8c988d5e0e27d7d176fdd45 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 9 Dec 2013 10:36:25 -0700 Subject: [PATCH 01/61] initial workflow object --- qiime/workflow/core.py | 77 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 77 insertions(+) create mode 100644 qiime/workflow/core.py diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py new file mode 100644 index 0000000000..c4425435ef --- /dev/null +++ b/qiime/workflow/core.py @@ -0,0 +1,77 @@ +#!/usr/bin/env python + +class Workflow(object): + """Arbitrary worflow support structure""" + def __init__(self, ShortCircuit=True, **kwargs): + """Build thy self + + ShortCiruit : if True, enables ignoring function groups when a given + item has failed + + kwargs are stored as self.Options. Support for arbitrary Stats is + implicit + """ + self.Options = kwargs + self.Stats = defaultdict(int) + self.ShortCircuit = ShortCircuit + + def _construct_iterator(self, **kwargs): + """Define the central iterator""" + raise NotImplementedError("Must be implemented") + + def _assign_function_groups(self, **kwargs): + """Determine what function groups will be used + + A function group is simply a function that subsequently calls the + methods of interested. For instance, you may have a _process_seqs + function group, that then calls _check_length, _split_sequence, etc. + """ + raise NotImplementedError("Must be implemented") + + def _initialize_item_state(self, item): + """Initialie the per-item state in self""" + raise NotImplementedError("Must be implemented") + + def __call__(self, success_callback, failed_callback, **kwargs): + """Operate on all the data + + success_callback : method to call on a successful item prior to + yielding + failed_callback : method to call on a failed item prior to yielding + kwargs : these will get passed to the iterator constructor and to the + the method that determines the function groups + """ + gen = self._construct_iterator(**kwargs) + function_groups = self._assign_function_groups(**kwargs) + + for item in gen: + self._initialize_item_state(item) + + for f in function_groups: + f(item) + + if self.Failed: + yield failed_callback(self.FinalState) + else: + yield success_callback(self.FinalState) + + def requires(self, f, IsValid=True, Option=None, Values=None): + """Decorator that executes a function if requirements are met + + f : the decorated function + IsValid : execute the function if self.Failed is False + Option : a required option + Values : required values associated with an option + """ + if not isinstance(values, set): + if isinstance(values, Iterable) + values = set(values) + else: + values = set([values]) + + def decorated(self, item): + if IsValid and self.Failed: + return + + if self.Options[option] in values: + f(item) From 45b2b576377f712c086e6682c2eb2b258744cc02 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 9 Dec 2013 15:08:49 -0700 Subject: [PATCH 02/61] core workflow object --- qiime/workflow/core.py | 61 ++++++-- tests/test_workflow/test_core.py | 251 +++++++++++++++++++++++++++++++ 2 files changed, 296 insertions(+), 16 deletions(-) create mode 100644 tests/test_workflow/test_core.py diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index c4425435ef..4b2b6419d3 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +from collections import Iterable, defaultdict + class Workflow(object): """Arbitrary worflow support structure""" def __init__(self, ShortCircuit=True, **kwargs): @@ -14,7 +16,9 @@ def __init__(self, ShortCircuit=True, **kwargs): self.Options = kwargs self.Stats = defaultdict(int) self.ShortCircuit = ShortCircuit - + self.Failed = False + self.FinalState = None + def _construct_iterator(self, **kwargs): """Define the central iterator""" raise NotImplementedError("Must be implemented") @@ -32,7 +36,7 @@ def _initialize_item_state(self, item): """Initialie the per-item state in self""" raise NotImplementedError("Must be implemented") - def __call__(self, success_callback, failed_callback, **kwargs): + def __call__(self, success_callback=None, failed_callback=None, **kwargs): """Operate on all the data success_callback : method to call on a successful item prior to @@ -41,37 +45,62 @@ def __call__(self, success_callback, failed_callback, **kwargs): kwargs : these will get passed to the iterator constructor and to the the method that determines the function groups """ + if success_callback is None: + success_callback = lambda x: x + gen = self._construct_iterator(**kwargs) function_groups = self._assign_function_groups(**kwargs) for item in gen: + self.Failed = False self._initialize_item_state(item) for f in function_groups: f(item) - if self.Failed: + if self.Failed and failed_callback is not None: yield failed_callback(self.FinalState) else: yield success_callback(self.FinalState) - def requires(self, f, IsValid=True, Option=None, Values=None): - """Decorator that executes a function if requirements are met - +class requires(object): + """Decorator that executes a function if requirements are met""" + def __init__(self, IsValid=True, Option=None, Values=None): + """ f : the decorated function IsValid : execute the function if self.Failed is False Option : a required option Values : required values associated with an option """ - if not isinstance(values, set): - if isinstance(values, Iterable) - values = set(values) + # self here is the requires object + self.IsValid = IsValid + self.Option = Option + self.Values = Values + + if not isinstance(self.Values, set): + if isinstance(self.Values, Iterable): + self.Values = set(self.Values) else: - values = set([values]) - - def decorated(self, item): - if IsValid and self.Failed: + self.Values = set([self.Values]) + + def __call__(outer_self, f): + # outer_self is the requires object + # self is expected to be a Workflow object + def decorated_with_option(self, *args, **kwargs): + if outer_self.IsValid and (self.Failed and self.ShortCircuit): return - - if self.Options[option] in values: - f(item) + + opt = self.Options.get(outer_self.Option, 'MISSING OPTION') + if opt != 'MISSING OPTION' and opt in outer_self.Values: + f(self, *args, **kwargs) + + def decorated_without_option(self, *args, **kwargs): + if outer_self.IsValid and (self.Failed and self.ShortCircuit): + return + + f(self, *args, **kwargs) + + if outer_self.Option is not None: + return decorated_with_option + else: + return decorated_without_option diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py new file mode 100644 index 0000000000..cc36a4b83e --- /dev/null +++ b/tests/test_workflow/test_core.py @@ -0,0 +1,251 @@ +#!/usr/bin/env python + +from itertools import izip +from qiime.workflow.core import Workflow, requires +from cogent.util.unit_test import TestCase, main + +class MockWorkflow(Workflow): + def _construct_iterator(self, **kwargs): + to_gen = [] + for k in sorted(kwargs): + if k.startswith('iter'): + to_gen.append(kwargs[k]) + if len(to_gen) == 1: + return (x for x in to_gen[0]) + else: + return izip(*to_gen) + + def _assign_function_groups(self, **kwargs): + groups = [] + if 'A' in kwargs: + groups.append(self.groupA) + if 'B' in kwargs: + groups.append(self.groupB) + if 'C' in kwargs: + groups.append(self.groupC) + return groups + + def _initialize_item_state(self, item): + self.Foo = None + self.Bar = None + + def groupA(self, item): + self.methodA1(item) + self.methodA2(item) + + def groupB(self, item): + self.methodB1(item) + self.methodB2(item) + + @requires(IsValid=True) + def groupC(self, item): + self.methodC1(item) + self.methodC2(item) + + @requires(IsValid=False) # always execute + def methodA1(self, item): + name = 'A1' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + + def methodA2(self, item): + name = 'A2' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + + @requires(IsValid=False) + def methodB1(self, item): + name = 'B1' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = 'failed' + else: + self.FinalState = (name, item) + + @requires(Option='foo', Values=[1,2,3]) + def methodB2(self, item): + name = 'B2' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = 'failed' + else: + self.FinalState = (name, item) + + @requires(IsValid=True) + def methodC1(self, item): + name = 'C1' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + + @requires(IsValid=True, Option='C2', Values=[1,2,3]) + def methodC2(self, item): + name = 'C2' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + +class WorkflowTests(TestCase): + def setUp(self): + self.obj_short = MockWorkflow(**{'A1':'foo', 'xyz':10,'C2':2}) + self.obj_noshort = MockWorkflow(ShortCircuit=False, **{'A1':'foo', + 'xyz':10,'C2':2}) + + def test_init(self): + self.assertEqual(self.obj_short.Options, {'A1':'foo', 'xyz':10, 'C2':2}) + self.assertEqual(self.obj_short.Stats, {}) + self.assertTrue(self.obj_short.ShortCircuit) + self.assertEqual(self.obj_noshort.Options, {'A1':'foo', 'xyz':10, + 'C2':2}) + self.assertEqual(self.obj_noshort.Stats, {}) + self.assertFalse(self.obj_noshort.ShortCircuit) + + def test_construct_iterator(self): + exp_1gen = [1,2,3,4,5] + exp_2gen = [(1,6),(2,7),(3,8),(4,9),(5,10)] + + single_iter = {'iter_x':[1,2,3,4,5]} + double_iter = {'iter_x':[1,2,3,4,5], 'iter_y':[6,7,8,9,10]} + + obs_1gen = list(self.obj_short._construct_iterator(**single_iter)) + obs_2gen = list(self.obj_short._construct_iterator(**double_iter)) + + self.assertEqual(obs_1gen, exp_1gen) + self.assertEqual(obs_2gen, exp_2gen) + + def test_assign_function_groups(self): + exp_None = [] + exp_AB = [self.obj_short.groupA, self.obj_short.groupB] + + obs_None = self.obj_short._assign_function_groups() + obs_AB = self.obj_short._assign_function_groups(**{'A':None, 'B':None}) + + self.assertEqual(obs_None, exp_None) + self.assertEqual(obs_AB, exp_AB) + + def test_initialize_item_state(self): + self.obj_short._initialize_item_state(None) + self.assertEqual(self.obj_short.Foo, None) + self.assertEqual(self.obj_short.Bar, None) + + def test_call_AC_no_fail(self): + exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} + exp_result = [('C2',1), ('C2',2), ('C2',3), ('C2',4), ('C2', 5)] + + kwargs = {'iter_x':[1,2,3,4,5], 'A':None, 'C':None, 'C2':1} + obs_result = list(self.obj_short(None, None, **kwargs)) + + self.assertEqual(obs_result, exp_result) + self.assertEqual(self.obj_short.Stats, exp_stats) + + def test_call_AC_fail(self): + exp_stats = {'A1':5, 'A2':5, 'C1':4, 'C2':4} + + kwargs = {'iter_x':[1,2,'fail A2',4,5], 'A':None, 'C':None, 'C2':1} + + # pass in a failed callback to capture the result, and pause execution + gen = self.obj_short(None, lambda x: x, **kwargs) + + r1 = gen.next() + self.assertEqual(r1, ('C2', 1)) + self.assertFalse(self.obj_short.Failed) + + r2 = gen.next() + self.assertEqual(r2, ('C2', 2)) + self.assertFalse(self.obj_short.Failed) + + r3 = gen.next() + self.assertEqual(self.obj_short.FinalState, ('A2', 'fail A2')) + self.assertTrue(self.obj_short.Failed) + self.assertEqual(r3, ('A2', 'fail A2')) + + r4 = gen.next() + self.assertEqual(r4, ('C2', 4)) + self.assertFalse(self.obj_short.Failed) + + r5 = gen.next() + self.assertEqual(r5, ('C2', 5)) + self.assertFalse(self.obj_short.Failed) + + self.assertEqual(self.obj_short.Stats, exp_stats) + + def test_call_AC_fail_noshort(self): + exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} + + kwargs = {'iter_x':[1,2,'fail A2',4,5], 'A':None, 'C':None, 'C2':1} + + # pass in a failed callback to capture the result, and pause execution + gen = self.obj_noshort(None, lambda x: x, **kwargs) + + r1 = gen.next() + self.assertEqual(r1, ('C2', 1)) + self.assertFalse(self.obj_noshort.Failed) + + r2 = gen.next() + self.assertEqual(r2, ('C2', 2)) + self.assertFalse(self.obj_noshort.Failed) + + r3 = gen.next() + self.assertEqual(self.obj_noshort.FinalState, ('C2', 'fail A2')) + self.assertTrue(self.obj_noshort.Failed) + + r4 = gen.next() + self.assertEqual(r4, ('C2', 4)) + self.assertFalse(self.obj_noshort.Failed) + + r5 = gen.next() + self.assertEqual(r5, ('C2', 5)) + self.assertFalse(self.obj_noshort.Failed) + + self.assertEqual(self.obj_noshort.Stats, exp_stats) + +class RequiresTests(TestCase): + def test_methodb1(self): + obj = MockWorkflow() + obj.methodB1('test') + self.assertEqual(obj.FinalState, ('B1', 'test')) + self.assertFalse(obj.Failed) + + # methodb1 executes regardless of if self.Failed + obj.Failed = True + obj.methodB1('test 2') + self.assertEqual(obj.FinalState, ('B1', 'test 2')) + + obj.Failed = False + obj.methodB1('fail B1') + self.assertEqual(obj.FinalState, 'failed') + + self.assertEqual(obj.Stats, {'B1':3}) + + def test_methodb2_accept(self): + # methodb2 is setup to be valid when foo is in [1,2,3], make sure we + # can execute + obj = MockWorkflow(**{'foo':1}) + obj.methodB2('test') + self.assertEqual(obj.FinalState, ('B2', 'test')) + self.assertEqual(obj.Stats, {'B2':1}) + + # methodb2 will not execute if self.Failed + obj.Failed = True + obj.methodB2('test 2') + self.assertEqual(obj.FinalState, ('B2', 'test')) + self.assertEqual(obj.Stats, {'B2':1}) + + def test_methodb2_ignore(self): + # methodb2 is setup to be valid when foo is in [1, 2, 3], make sure + # we do not execute + obj = MockWorkflow(**{'foo':'bar'}) + obj.methodB2('test') + self.assertEqual(obj.FinalState, None) + self.assertEqual(obj.Stats, {}) + +if __name__ == '__main__': + main() From 6a7e1306a9cc757dac8a695672134b4d8a659503 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 10:28:58 -0700 Subject: [PATCH 03/61] improved doc, simplified logic --- qiime/workflow/core.py | 80 ++++++++++++++++++++------------ tests/test_workflow/test_core.py | 66 +++++++++++--------------- 2 files changed, 79 insertions(+), 67 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 4b2b6419d3..9ae6a3e595 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -1,7 +1,11 @@ #!/usr/bin/env python +from functools import update_wrapper from collections import Iterable, defaultdict +# thank you Flask project... +_missing = object() + class Workflow(object): """Arbitrary worflow support structure""" def __init__(self, ShortCircuit=True, **kwargs): @@ -19,10 +23,6 @@ def __init__(self, ShortCircuit=True, **kwargs): self.Failed = False self.FinalState = None - def _construct_iterator(self, **kwargs): - """Define the central iterator""" - raise NotImplementedError("Must be implemented") - def _assign_function_groups(self, **kwargs): """Determine what function groups will be used @@ -32,13 +32,10 @@ def _assign_function_groups(self, **kwargs): """ raise NotImplementedError("Must be implemented") - def _initialize_item_state(self, item): - """Initialie the per-item state in self""" - raise NotImplementedError("Must be implemented") - - def __call__(self, success_callback=None, failed_callback=None, **kwargs): + def __call__(self, it, success_callback=None, failed_callback=None, **kwargs): """Operate on all the data + it : an iterator success_callback : method to call on a successful item prior to yielding failed_callback : method to call on a failed item prior to yielding @@ -46,28 +43,32 @@ def __call__(self, success_callback=None, failed_callback=None, **kwargs): the method that determines the function groups """ if success_callback is None: - success_callback = lambda x: x + success_callback = lambda x: x.FinalState - gen = self._construct_iterator(**kwargs) function_groups = self._assign_function_groups(**kwargs) - for item in gen: + # note: can also implement a peek and prune approach where only the + # methods that execute on the first item (w/o short circuiting) are + # subsequently left in the workflow. The functions can then be + # chained as well. this reduces the number of function calls, but + # likely adds a little more complexity into using this object + + for item in it: self.Failed = False - self._initialize_item_state(item) + self.FinalState = None for f in function_groups: f(item) if self.Failed and failed_callback is not None: - yield failed_callback(self.FinalState) + yield failed_callback(self) else: - yield success_callback(self.FinalState) + yield success_callback(self) class requires(object): """Decorator that executes a function if requirements are met""" def __init__(self, IsValid=True, Option=None, Values=None): """ - f : the decorated function IsValid : execute the function if self.Failed is False Option : a required option Values : required values associated with an option @@ -83,24 +84,45 @@ def __init__(self, IsValid=True, Option=None, Values=None): else: self.Values = set([self.Values]) - def __call__(outer_self, f): + if _missing in self.Values: + raise ValueError("_missing cannot be in Values!") + + def doShortCircuit(self, wrapped): + if self.IsValid and (wrapped.Failed and wrapped.ShortCircuit): + return True + else: + return False + + def __call__(self, f): + """Wrap a function + + f : the function to wrap + """ # outer_self is the requires object # self is expected to be a Workflow object - def decorated_with_option(self, *args, **kwargs): - if outer_self.IsValid and (self.Failed and self.ShortCircuit): + def decorated_with_option(dec_self, *args, **kwargs): + """A decorated function that has an option to validate + + dec_self : this is "self" for the decorated function + """ + if self.doShortCircuit(dec_self): return - - opt = self.Options.get(outer_self.Option, 'MISSING OPTION') - if opt != 'MISSING OPTION' and opt in outer_self.Values: - f(self, *args, **kwargs) + + value = dec_self.Options.get(self.Option, _missing) + if value in self.Values: + f(dec_self, *args, **kwargs) - def decorated_without_option(self, *args, **kwargs): - if outer_self.IsValid and (self.Failed and self.ShortCircuit): + def decorated_without_option(dec_self, *args, **kwargs): + """A decorated function that does not have an option to validate + + dec_self : this is "self" for the decorated function + """ + if self.doShortCircuit(dec_self): return - f(self, *args, **kwargs) + f(dec_self, *args, **kwargs) - if outer_self.Option is not None: - return decorated_with_option + if self.Option is None: + return update_wrapper(decorated_without_option, f) else: - return decorated_without_option + return update_wrapper(decorated_with_option, f) diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index cc36a4b83e..edab7c1cf6 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -4,17 +4,18 @@ from qiime.workflow.core import Workflow, requires from cogent.util.unit_test import TestCase, main -class MockWorkflow(Workflow): - def _construct_iterator(self, **kwargs): - to_gen = [] - for k in sorted(kwargs): - if k.startswith('iter'): - to_gen.append(kwargs[k]) - if len(to_gen) == 1: - return (x for x in to_gen[0]) - else: - return izip(*to_gen) +def construct_iterator(**kwargs): + """make an iterator for testing purposes""" + to_gen = [] + for k in sorted(kwargs): + if k.startswith('iter'): + to_gen.append(kwargs[k]) + if len(to_gen) == 1: + return (x for x in to_gen[0]) + else: + return izip(*to_gen) +class MockWorkflow(Workflow): def _assign_function_groups(self, **kwargs): groups = [] if 'A' in kwargs: @@ -25,10 +26,6 @@ def _assign_function_groups(self, **kwargs): groups.append(self.groupC) return groups - def _initialize_item_state(self, item): - self.Foo = None - self.Bar = None - def groupA(self, item): self.methodA1(item) self.methodA2(item) @@ -108,19 +105,6 @@ def test_init(self): self.assertEqual(self.obj_noshort.Stats, {}) self.assertFalse(self.obj_noshort.ShortCircuit) - def test_construct_iterator(self): - exp_1gen = [1,2,3,4,5] - exp_2gen = [(1,6),(2,7),(3,8),(4,9),(5,10)] - - single_iter = {'iter_x':[1,2,3,4,5]} - double_iter = {'iter_x':[1,2,3,4,5], 'iter_y':[6,7,8,9,10]} - - obs_1gen = list(self.obj_short._construct_iterator(**single_iter)) - obs_2gen = list(self.obj_short._construct_iterator(**double_iter)) - - self.assertEqual(obs_1gen, exp_1gen) - self.assertEqual(obs_2gen, exp_2gen) - def test_assign_function_groups(self): exp_None = [] exp_AB = [self.obj_short.groupA, self.obj_short.groupB] @@ -131,28 +115,30 @@ def test_assign_function_groups(self): self.assertEqual(obs_None, exp_None) self.assertEqual(obs_AB, exp_AB) - def test_initialize_item_state(self): - self.obj_short._initialize_item_state(None) - self.assertEqual(self.obj_short.Foo, None) - self.assertEqual(self.obj_short.Bar, None) - def test_call_AC_no_fail(self): + single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + sf = lambda x: x.FinalState # success function + exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} exp_result = [('C2',1), ('C2',2), ('C2',3), ('C2',4), ('C2', 5)] - kwargs = {'iter_x':[1,2,3,4,5], 'A':None, 'C':None, 'C2':1} - obs_result = list(self.obj_short(None, None, **kwargs)) + kwargs = {'A':None, 'C':None} + obs_result = list(self.obj_short(single_iter, sf, None, **kwargs)) self.assertEqual(obs_result, exp_result) self.assertEqual(self.obj_short.Stats, exp_stats) def test_call_AC_fail(self): + single_iter = construct_iterator(**{'iter_x':[1,2,'fail A2',4,5]}) + sf = lambda x: x.FinalState # success function + ff = lambda x: x.FinalState # failed function + exp_stats = {'A1':5, 'A2':5, 'C1':4, 'C2':4} - kwargs = {'iter_x':[1,2,'fail A2',4,5], 'A':None, 'C':None, 'C2':1} + kwargs = {'A':None, 'C':None, 'C2':1} # pass in a failed callback to capture the result, and pause execution - gen = self.obj_short(None, lambda x: x, **kwargs) + gen = self.obj_short(single_iter, sf, ff, **kwargs) r1 = gen.next() self.assertEqual(r1, ('C2', 1)) @@ -178,12 +164,16 @@ def test_call_AC_fail(self): self.assertEqual(self.obj_short.Stats, exp_stats) def test_call_AC_fail_noshort(self): + single_iter = construct_iterator(**{'iter_x':[1,2,'fail A2',4,5]}) + sf = lambda x: x.FinalState # success function + ff = lambda x: x.FinalState # failed function + exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} - kwargs = {'iter_x':[1,2,'fail A2',4,5], 'A':None, 'C':None, 'C2':1} + kwargs = {'A':None, 'C':None} # pass in a failed callback to capture the result, and pause execution - gen = self.obj_noshort(None, lambda x: x, **kwargs) + gen = self.obj_noshort(single_iter, sf, ff, **kwargs) r1 = gen.next() self.assertEqual(r1, ('C2', 1)) From 7d89866b9b6207be6e990a821e647ab00159b893 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 11:34:26 -0700 Subject: [PATCH 04/61] few things: * improved docstrings * removed required methods for subclasses to implement * an iterator is now passed in instead of constructed * workflow is determined up front, so functions dependent on options are now actually completely avoided * function priority can be assigned --- qiime/workflow/core.py | 83 +++++++++++++++++------- tests/test_workflow/test_core.py | 107 +++++++++++++++++-------------- 2 files changed, 119 insertions(+), 71 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 9ae6a3e595..d7f29ba902 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -1,10 +1,21 @@ #!/usr/bin/env python +from itertools import chain from functools import update_wrapper from collections import Iterable, defaultdict +__author__ = "Daniel McDonald" +__copyright__ = "Copyright 2013, The QIIME Project" +__credits__ = ["Daniel McDonald", "Tony Walters"] +__license__ = "BSD" # NOTE, this script does _not_ import GPL code +__version__ = "1.7.0-dev" +__maintainer__ = "Daniel McDonald" +__email__ = "mcdonadt@colorado.edu" +__status__ = "Development" + # thank you Flask project... _missing = object() +_executed = object() class Workflow(object): """Arbitrary worflow support structure""" @@ -23,45 +34,60 @@ def __init__(self, ShortCircuit=True, **kwargs): self.Failed = False self.FinalState = None - def _assign_function_groups(self, **kwargs): - """Determine what function groups will be used - - A function group is simply a function that subsequently calls the - methods of interested. For instance, you may have a _process_seqs - function group, that then calls _check_length, _split_sequence, etc. + def _all_workflow_methods(self, default_priority=0): + """Get all workflow methods + + Methods are sorted by priority """ - raise NotImplementedError("Must be implemented") + methods = [getattr(self, f) for f in dir(self) if f.startswith('wf_')] + key = lambda x: getattr(x, 'Priority', default_priority) + return sorted(methods, key=key, reverse=True) + + def _get_workflow(self, it): + """Get the methods executed, sorted by priority""" + # save state + shortcircuit_state = self.ShortCircuit + self.ShortCircuit = False + stats = self.Stats.copy() + + all_wk_methods = self._all_workflow_methods() + + peek = it.next() + generator_reset = chain([peek], it) + + executed = [] + for f in all_wk_methods: + if f(peek) is _executed: + executed.append(f) + + # restore state + self.ShortCircuit = shortcircuit_state + self.Stats = stats + + return generator_reset, executed - def __call__(self, it, success_callback=None, failed_callback=None, **kwargs): + def __call__(self, it, success_callback=None, fail_callback=None): """Operate on all the data it : an iterator success_callback : method to call on a successful item prior to yielding - failed_callback : method to call on a failed item prior to yielding - kwargs : these will get passed to the iterator constructor and to the - the method that determines the function groups + fail_callback : method to call on a failed item prior to yielding """ if success_callback is None: success_callback = lambda x: x.FinalState - function_groups = self._assign_function_groups(**kwargs) - - # note: can also implement a peek and prune approach where only the - # methods that execute on the first item (w/o short circuiting) are - # subsequently left in the workflow. The functions can then be - # chained as well. this reduces the number of function calls, but - # likely adds a little more complexity into using this object - + it, workflow = self._get_workflow(it) + for item in it: self.Failed = False self.FinalState = None - for f in function_groups: + for f in workflow: f(item) - if self.Failed and failed_callback is not None: - yield failed_callback(self) + if self.Failed and fail_callback is not None: + yield fail_callback(self) else: yield success_callback(self) @@ -111,7 +137,8 @@ def decorated_with_option(dec_self, *args, **kwargs): value = dec_self.Options.get(self.Option, _missing) if value in self.Values: f(dec_self, *args, **kwargs) - + return _executed + def decorated_without_option(dec_self, *args, **kwargs): """A decorated function that does not have an option to validate @@ -121,8 +148,18 @@ def decorated_without_option(dec_self, *args, **kwargs): return f(dec_self, *args, **kwargs) + return _executed if self.Option is None: return update_wrapper(decorated_without_option, f) else: return update_wrapper(decorated_with_option, f) + +class priority(object): + """Sets a function priority""" + def __init__(self, Priority): + self.Priority = Priority + + def __call__(self, f): + f.Priority = self.Priority + return f diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index edab7c1cf6..2f5125bbee 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -1,9 +1,18 @@ #!/usr/bin/env python from itertools import izip -from qiime.workflow.core import Workflow, requires +from qiime.workflow.core import Workflow, requires, priority from cogent.util.unit_test import TestCase, main +__author__ = "Daniel McDonald" +__copyright__ = "Copyright 2013, The QIIME Project" +__credits__ = ["Daniel McDonald", +__license__ = "BSD" # NOTE, this script does _not_ import GPL code +__version__ = "1.7.0-dev" +__maintainer__ = "Daniel McDonald" +__email__ = "mcdonadt@colorado.edu" +__status__ = "Development" + def construct_iterator(**kwargs): """make an iterator for testing purposes""" to_gen = [] @@ -16,26 +25,20 @@ def construct_iterator(**kwargs): return izip(*to_gen) class MockWorkflow(Workflow): - def _assign_function_groups(self, **kwargs): - groups = [] - if 'A' in kwargs: - groups.append(self.groupA) - if 'B' in kwargs: - groups.append(self.groupB) - if 'C' in kwargs: - groups.append(self.groupC) - return groups - - def groupA(self, item): + @priority(90) + @requires(Option='A', Values=True) + def wf_groupA(self, item): self.methodA1(item) self.methodA2(item) - def groupB(self, item): + @requires(Option='B', Values=True) + def wf_groupB(self, item): self.methodB1(item) self.methodB2(item) - @requires(IsValid=True) - def groupC(self, item): + @priority(10) + @requires(Option='C', Values=True) + def wf_groupC(self, item): self.methodC1(item) self.methodC2(item) @@ -92,38 +95,38 @@ def methodC2(self, item): class WorkflowTests(TestCase): def setUp(self): - self.obj_short = MockWorkflow(**{'A1':'foo', 'xyz':10,'C2':2}) - self.obj_noshort = MockWorkflow(ShortCircuit=False, **{'A1':'foo', - 'xyz':10,'C2':2}) + self.obj_short = MockWorkflow(**{'A':True, 'C':True}) + self.obj_noshort = MockWorkflow(ShortCircuit=False, **{'A':True, + 'C':True}) + + def test_get_workflow(self): + gen = single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + exp_wf = [self.obj_short.wf_groupA, self.obj_short.wf_groupC] + obs_gen, obs_wf = self.obj_short._get_workflow(gen) + self.assertEqual(obs_wf, exp_wf) + self.assertEqual(list(obs_gen), [1,2,3,4,5]) + + self.assertEqual(self.obj_short.Stats, {}) + self.assertTrue(self.obj_short.ShortCircuit) + def test_init(self): - self.assertEqual(self.obj_short.Options, {'A1':'foo', 'xyz':10, 'C2':2}) + self.assertEqual(self.obj_short.Options, {'A':True, 'C':True}) self.assertEqual(self.obj_short.Stats, {}) self.assertTrue(self.obj_short.ShortCircuit) - self.assertEqual(self.obj_noshort.Options, {'A1':'foo', 'xyz':10, - 'C2':2}) + self.assertEqual(self.obj_noshort.Options, {'A':True, 'C':True}) self.assertEqual(self.obj_noshort.Stats, {}) self.assertFalse(self.obj_noshort.ShortCircuit) - def test_assign_function_groups(self): - exp_None = [] - exp_AB = [self.obj_short.groupA, self.obj_short.groupB] - - obs_None = self.obj_short._assign_function_groups() - obs_AB = self.obj_short._assign_function_groups(**{'A':None, 'B':None}) - - self.assertEqual(obs_None, exp_None) - self.assertEqual(obs_AB, exp_AB) - def test_call_AC_no_fail(self): single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) sf = lambda x: x.FinalState # success function - exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} - exp_result = [('C2',1), ('C2',2), ('C2',3), ('C2',4), ('C2', 5)] + exp_stats = {'A1':5, 'A2':5, 'C1':5} + # C2 isn't executed as its requirements aren't met in the Options + exp_result = [('C1',1), ('C1',2), ('C1',3), ('C1',4), ('C1', 5)] - kwargs = {'A':None, 'C':None} - obs_result = list(self.obj_short(single_iter, sf, None, **kwargs)) + obs_result = list(self.obj_short(single_iter, sf, None)) self.assertEqual(obs_result, exp_result) self.assertEqual(self.obj_short.Stats, exp_stats) @@ -135,10 +138,9 @@ def test_call_AC_fail(self): exp_stats = {'A1':5, 'A2':5, 'C1':4, 'C2':4} - kwargs = {'A':None, 'C':None, 'C2':1} - + self.obj_short.Options['C2'] = 1 # pass in a failed callback to capture the result, and pause execution - gen = self.obj_short(single_iter, sf, ff, **kwargs) + gen = self.obj_short(single_iter, sf, ff) r1 = gen.next() self.assertEqual(r1, ('C2', 1)) @@ -168,31 +170,29 @@ def test_call_AC_fail_noshort(self): sf = lambda x: x.FinalState # success function ff = lambda x: x.FinalState # failed function - exp_stats = {'A1':5, 'A2':5, 'C1':5, 'C2':5} - - kwargs = {'A':None, 'C':None} + exp_stats = {'A1':5, 'A2':5, 'C1':5} # pass in a failed callback to capture the result, and pause execution - gen = self.obj_noshort(single_iter, sf, ff, **kwargs) + gen = self.obj_noshort(single_iter, sf, ff) r1 = gen.next() - self.assertEqual(r1, ('C2', 1)) + self.assertEqual(r1, ('C1', 1)) self.assertFalse(self.obj_noshort.Failed) r2 = gen.next() - self.assertEqual(r2, ('C2', 2)) + self.assertEqual(r2, ('C1', 2)) self.assertFalse(self.obj_noshort.Failed) r3 = gen.next() - self.assertEqual(self.obj_noshort.FinalState, ('C2', 'fail A2')) + self.assertEqual(self.obj_noshort.FinalState, ('C1', 'fail A2')) self.assertTrue(self.obj_noshort.Failed) r4 = gen.next() - self.assertEqual(r4, ('C2', 4)) + self.assertEqual(r4, ('C1', 4)) self.assertFalse(self.obj_noshort.Failed) r5 = gen.next() - self.assertEqual(r5, ('C2', 5)) + self.assertEqual(r5, ('C1', 5)) self.assertFalse(self.obj_noshort.Failed) self.assertEqual(self.obj_noshort.Stats, exp_stats) @@ -236,6 +236,17 @@ def test_methodb2_ignore(self): obj.methodB2('test') self.assertEqual(obj.FinalState, None) self.assertEqual(obj.Stats, {}) - + +class PriorityTests(TestCase): + def test_dec(self): + @priority(10) + def foo(x,y,z): + """doc check""" + return x+y+z + + self.assertEqual(foo.Priority, 10) + self.assertEqual(foo.__name__, 'foo') + self.assertEqual(foo.__doc__, 'doc check') + if __name__ == '__main__': main() From 8e39195756418f85aec95dc8e1887785cc84ac61 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 12:18:33 -0700 Subject: [PATCH 05/61] few things: * simplified get_workflow * updated license and unittest import * added all_wf_methods test --- qiime/workflow/core.py | 14 +++----------- tests/test_workflow/test_core.py | 13 ++++++++++--- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index d7f29ba902..a52749ab90 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -34,7 +34,7 @@ def __init__(self, ShortCircuit=True, **kwargs): self.Failed = False self.FinalState = None - def _all_workflow_methods(self, default_priority=0): + def _all_wf_methods(self, default_priority=0): """Get all workflow methods Methods are sorted by priority @@ -50,19 +50,13 @@ def _get_workflow(self, it): self.ShortCircuit = False stats = self.Stats.copy() - all_wk_methods = self._all_workflow_methods() - peek = it.next() - generator_reset = chain([peek], it) - - executed = [] - for f in all_wk_methods: - if f(peek) is _executed: - executed.append(f) + executed = [f for f in self._all_wf_methods() if f(peek) is _executed] # restore state self.ShortCircuit = shortcircuit_state self.Stats = stats + generator_reset = chain([peek], it) return generator_reset, executed @@ -124,8 +118,6 @@ def __call__(self, f): f : the function to wrap """ - # outer_self is the requires object - # self is expected to be a Workflow object def decorated_with_option(dec_self, *args, **kwargs): """A decorated function that has an option to validate diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index 2f5125bbee..a4cd266a31 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -2,12 +2,12 @@ from itertools import izip from qiime.workflow.core import Workflow, requires, priority -from cogent.util.unit_test import TestCase, main +from unittest import TestCase, main __author__ = "Daniel McDonald" __copyright__ = "Copyright 2013, The QIIME Project" -__credits__ = ["Daniel McDonald", -__license__ = "BSD" # NOTE, this script does _not_ import GPL code +__credits__ = ["Daniel McDonald"] +__license__ = "BSD" # NOTE, does not import any GPL code __version__ = "1.7.0-dev" __maintainer__ = "Daniel McDonald" __email__ = "mcdonadt@colorado.edu" @@ -118,6 +118,13 @@ def test_init(self): self.assertEqual(self.obj_noshort.Stats, {}) self.assertFalse(self.obj_noshort.ShortCircuit) + def test_all_wf_methods(self): + # note on priority: groupA:90, groupC:10, groupB:0 (default) + exp = [self.obj_short.wf_groupA, self.obj_short.wf_groupC, + self.obj_short.wf_groupB] + obs = self.obj_short._all_wf_methods() + self.assertEqual(obs, exp) + def test_call_AC_no_fail(self): single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) sf = lambda x: x.FinalState # success function From 5001f4c0ca6a86aae62c905fb7dbfc817643018c Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 13:47:14 -0700 Subject: [PATCH 06/61] starting workflow version of quality_filter_fasta --- qiime/quality_filter_fasta.py | 37 ++++++++++ tests/test_quality_filter_fasta.py | 112 +++++++++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 qiime/quality_filter_fasta.py create mode 100644 tests/test_quality_filter_fasta.py diff --git a/qiime/quality_filter_fasta.py b/qiime/quality_filter_fasta.py new file mode 100644 index 0000000000..8f83fca159 --- /dev/null +++ b/qiime/quality_filter_fasta.py @@ -0,0 +1,37 @@ +#!/usr/bin/env python + +from qiime.workflow.core import Workflow, requires, priority, _continuous +from cogent.parse.fasta import MinimalFastaParser +from qiime.parse import MinimalQualParser +from itertools import chain, izip + +def _fasta_qual_strict(fasta_gen, qual_gen): + for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): + if seq_id != qual_id: + raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) + if len(seq) != len(qual): + raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) + + yield (seq_id, seq, qual_id, qual) + +def fasta_qual_iterator(fasta_fps, qual_fps=None): + fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) + + if qual_fps is not None: + qual_gens = chain(*map(MinimalQualParser, qual_fps)) + gen = _fasta_qual_strict(fasta_gens, qual_gens) + else: + qual_gens = None + gen = ((seq_id, seq, None, None) for seq_id, seq in fasta_gens) + + return gen + +class QualFilterFastaWorkflow(Workflow): + @priority(90) + @requires(Option='min_seq_len', Values=_continuous) + def wf_length_check(self, item): + seq_id, seq, qual_id, qual = item + + if len(seq) < self.Options['min_seq_len']: + self.Failed = True + self.Stats['min_seq_len'] += 1 diff --git a/tests/test_quality_filter_fasta.py b/tests/test_quality_filter_fasta.py new file mode 100644 index 0000000000..96e36b7d7b --- /dev/null +++ b/tests/test_quality_filter_fasta.py @@ -0,0 +1,112 @@ +#!/usr/bin/env python + +from cogent.util.unit_test import TestCase, main +from cogent.parse.fasta import MinimalFastaParser +from qiime.parse import MinimalQualParser +from itertools import chain +from numpy import array +from qiime.quality_filter_fasta import (_fasta_qual_strict, + fasta_qual_iterator) + +class IteratorTests(TestCase): + def setUp(self): + fasta1_gen = MinimalFastaParser(fasta1.splitlines()) + qual1_gen = MinimalQualParser(qual1.splitlines()) + fasta2_gen = MinimalFastaParser(fasta2.splitlines()) + qual2_gen = MinimalQualParser(qual2.splitlines()) + qual2_bad_gen = MinimalQualParser(qual2_bad.splitlines()) + + self.fasta_gen = chain(fasta1_gen, fasta2_gen) + self.qual_gen = chain(qual1_gen, qual2_gen) + + self.reversed_fasta_gen = chain(fasta2_gen, fasta1_gen) + self.qual_bad_gen = chain(qual1_gen, qual2_bad_gen) + + def test_fasta_qual_strict_simple(self): + exp = [('a', 'abcde', 'a', array([1, 2, 3, 4, 5])), + ('b', 'asdasdasd', 'b', array([1,1,1,1,1,1,1,1,1])), + ('c', '123123', 'c', array([2, 2, 2, 2, 2, 2])), + ('x', 'abcdefg', 'x', array([1, 2, 3, 4, 5, 6, 7])), + ('y', 'popopo', 'y', array([1, 1, 1, 1, 1, 1]))] + + obs = _fasta_qual_strict(self.fasta_gen, self.qual_gen) + for o,e in zip(obs,exp): + osi, osd, oqi, oqd = o + esi, esd, eqi, eqd = e + self.assertEqual((osi, osd, oqi), (esi, esd, eqi)) + self.assertTrue((oqd == eqd).all()) + + def test_fasta_qual_strict_mismatch_ids(self): + with self.assertRaises(ValueError): + g = _fasta_qual_strict(self.reversed_fasta_gen, self.qual_gen) + _ = list(g) + + def test_fasta_qual_strict_mismatch_length(self): + with self.assertRaises(ValueError): + _ = list(_fasta_qual_strict(self.fasta_gen, self.qual_bad_gen)) + + def test_fasta_qual_iterators_just_fasta(self): + exp = [('a', 'abcde', None, None), + ('b', 'asdasdasd', None, None), + ('c', '123123', None, None), + ('x', 'abcdefg', None, None), + ('y', 'popopo', None, None)] + + open_fps = map(lambda x: x.splitlines(), [fasta1, fasta2]) + obs = list(fasta_qual_iterator(open_fps)) + self.assertEqual(obs, exp) + + def test_fasta_qual_iterators_fasta_qual(self): + exp = [('a', 'abcde', 'a', array([1, 2, 3, 4, 5])), + ('b', 'asdasdasd', 'b', array([1,1,1,1,1,1,1,1,1])), + ('c', '123123', 'c', array([2, 2, 2, 2, 2, 2])), + ('x', 'abcdefg', 'x', array([1, 2, 3, 4, 5, 6, 7])), + ('y', 'popopo', 'y', array([1, 1, 1, 1, 1, 1]))] + + splitter = lambda x: x.splitlines() + fasta_fps = map(splitter, [fasta1, fasta2]) + qual_fps = map(splitter, [qual1, qual2]) + + obs = fasta_qual_iterator(fasta_fps, qual_fps) + for o,e in zip(obs, exp): + osi, osd, oqi, oqd = o + esi, esd, eqi, eqd = e + self.assertEqual((osi, osd, oqi), (esi, esd, eqi)) + self.assertTrue((oqd == eqd).all()) + +fasta1 = """>a +abcde +>b +asdasdasd +>c +123123 +""" + +fasta2 = """>x +abcdefg +>y +popopo +""" + +qual1 = """>a +1 2 3 4 5 +>b +1 1 1 1 1 1 1 1 1 +>c +2 2 2 2 2 2 +""" + +qual2 = """>x +1 2 3 4 5 6 7 +>y +1 1 1 1 1 1 +""" + +qual2_bad = """>x +1 2 3 4 5 6 +>y +1 1 1 1 1 1 +""" + +if __name__ == '__main__': + main() From 62d5b2c12d3ea150e1896d5fcbb9b4d7e8b578fe Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 13:56:45 -0700 Subject: [PATCH 07/61] support for 'that the value exists' type requirements --- qiime/workflow/core.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index a52749ab90..42a3e3c7ce 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -17,6 +17,11 @@ _missing = object() _executed = object() +class Exists(object): + def __contains__(self, item): + return True +option_exists = Exists() + class Workflow(object): """Arbitrary worflow support structure""" def __init__(self, ShortCircuit=True, **kwargs): @@ -87,7 +92,7 @@ def __call__(self, it, success_callback=None, fail_callback=None): class requires(object): """Decorator that executes a function if requirements are met""" - def __init__(self, IsValid=True, Option=None, Values=None): + def __init__(self, IsValid=True, Option=None, Values=_missing): """ IsValid : execute the function if self.Failed is False Option : a required option @@ -96,16 +101,16 @@ def __init__(self, IsValid=True, Option=None, Values=None): # self here is the requires object self.IsValid = IsValid self.Option = Option - self.Values = Values - if not isinstance(self.Values, set): - if isinstance(self.Values, Iterable): - self.Values = set(self.Values) + if Values is _missing: + self.Values = option_exists + elif not isinstance(Values, set): + if isinstance(Values, Iterable): + self.Values = set(Values) else: - self.Values = set([self.Values]) - - if _missing in self.Values: - raise ValueError("_missing cannot be in Values!") + self.Values = set([Values]) + else: + self.Values = Values def doShortCircuit(self, wrapped): if self.IsValid and (wrapped.Failed and wrapped.ShortCircuit): @@ -147,6 +152,7 @@ def decorated_without_option(dec_self, *args, **kwargs): else: return update_wrapper(decorated_with_option, f) + class priority(object): """Sets a function priority""" def __init__(self, Priority): From 75758acfc540109414f22d7f742b39b4fa88bd2f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 14:02:07 -0700 Subject: [PATCH 08/61] lighted a requirement --- qiime/quality_filter_fasta.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/qiime/quality_filter_fasta.py b/qiime/quality_filter_fasta.py index 8f83fca159..80b8a169c8 100644 --- a/qiime/quality_filter_fasta.py +++ b/qiime/quality_filter_fasta.py @@ -28,7 +28,7 @@ def fasta_qual_iterator(fasta_fps, qual_fps=None): class QualFilterFastaWorkflow(Workflow): @priority(90) - @requires(Option='min_seq_len', Values=_continuous) + @requires(Option='min_seq_len') def wf_length_check(self, item): seq_id, seq, qual_id, qual = item From 28a18d6bf057b40960e2f4171f09d5badfc20067 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 18:46:49 -0700 Subject: [PATCH 09/61] added * no_requirements * wf_ functions now need to be tagged --- qiime/workflow/core.py | 25 ++++++++++++++++++++++--- tests/test_workflow/test_core.py | 16 ++++++++++++++-- 2 files changed, 36 insertions(+), 5 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 42a3e3c7ce..699c9c3b17 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -24,6 +24,7 @@ def __contains__(self, item): class Workflow(object): """Arbitrary worflow support structure""" + def __init__(self, ShortCircuit=True, **kwargs): """Build thy self @@ -39,6 +40,10 @@ def __init__(self, ShortCircuit=True, **kwargs): self.Failed = False self.FinalState = None + for f in self._all_wf_methods(): + if not hasattr(f, '__workflowtag__'): + raise AttributeError("%s isn't a workflow method!" % f.__name__) + def _all_wf_methods(self, default_priority=0): """Get all workflow methods @@ -90,6 +95,17 @@ def __call__(self, it, success_callback=None, fail_callback=None): else: yield success_callback(self) + @staticmethod + def tagFunction(f): + setattr(f, '__workflowtag__', None) + +def no_requirements(f): + def decorated(self, *args, **kwargs): + f(self, *args, **kwargs) + return _executed + Workflow.tagFunction(decorated) + return decorated + class requires(object): """Decorator that executes a function if requirements are met""" def __init__(self, IsValid=True, Option=None, Values=_missing): @@ -131,8 +147,9 @@ def decorated_with_option(dec_self, *args, **kwargs): if self.doShortCircuit(dec_self): return - value = dec_self.Options.get(self.Option, _missing) - if value in self.Values: + s_opt = self.Option + ds_opts = dec_self.Options + if s_opt in ds_opts and ds_opts[s_opt] in self.Values: f(dec_self, *args, **kwargs) return _executed @@ -147,12 +164,14 @@ def decorated_without_option(dec_self, *args, **kwargs): f(dec_self, *args, **kwargs) return _executed + Workflow.tagFunction(decorated_with_option) + Workflow.tagFunction(decorated_without_option) + if self.Option is None: return update_wrapper(decorated_without_option, f) else: return update_wrapper(decorated_with_option, f) - class priority(object): """Sets a function priority""" def __init__(self, Priority): diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index a4cd266a31..27b1ff9852 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -1,7 +1,8 @@ #!/usr/bin/env python from itertools import izip -from qiime.workflow.core import Workflow, requires, priority +from qiime.workflow.core import (Workflow, requires, priority, + no_requirements) from unittest import TestCase, main __author__ = "Daniel McDonald" @@ -77,7 +78,7 @@ def methodB2(self, item): else: self.FinalState = (name, item) - @requires(IsValid=True) + @no_requirements def methodC1(self, item): name = 'C1' self.Stats[name] += 1 @@ -99,6 +100,17 @@ def setUp(self): self.obj_noshort = MockWorkflow(ShortCircuit=False, **{'A':True, 'C':True}) + def test_untagged_wf_method(self): + class WFTest(Workflow): + @no_requirements + def wf_1(self): + pass + def wf_2(self): + pass + + with self.assertRaises(AttributeError): + _ = WFTest() + def test_get_workflow(self): gen = single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) exp_wf = [self.obj_short.wf_groupA, self.obj_short.wf_groupC] From 8eeb7a4fe1dfcce23d328b8de232c87377287a2e Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 20:09:51 -0700 Subject: [PATCH 10/61] updated docs, tagging wf_ functions --- qiime/workflow/core.py | 70 ++++++++++++++++++++++++++++---- tests/test_workflow/test_core.py | 9 ++-- 2 files changed, 68 insertions(+), 11 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 699c9c3b17..ea338df5f3 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -1,5 +1,54 @@ #!/usr/bin/env python +"""Perform multiple method calls, determined at runtime, on independent items + +Construct arbitrarily complex workflows in which the specific methods run are +determined at runtime. These methods are applied to items that are assumed to +be independent. + +As an example: + +class MyWorkflow(Workflow): + @priority(100) + @no_requirements + def wf_mul(self, item): + self.FinalState *= item + + @priority(10) + @requires(Option='add_value') + def wf_add(self, item): + self.FinalState += item + + @priority(10) + @requires(Option='sub_value', Values=[1,5,10]) + def wf_sub(self, item): + self.FinalState -= item + self.FinalState -= self.Options['sub_value'] + + @priority(1000) + @requires(IsValid=False) + def wf_init(self, item): + self.FinalState = item + +# (i * i) + i - i - 5 +wf = MyWorkflow(Options={'add_value':None, 'sub_value':5}) +gen = (i for i in range(10)) +for i in wf(gen): + print i + +# (i * i) - i - 10 +wf = MyWorkflow(Options={'sub_value':10}) +gen = (i for i in range(10)) +for i in wf(gen): + print i + +# (i * i) +wf = MyWorkflow() +gen = (i for i in range(10)) +for i in wf(gen): + print i +""" + from itertools import chain from functools import update_wrapper from collections import Iterable, defaultdict @@ -25,16 +74,22 @@ def __contains__(self, item): class Workflow(object): """Arbitrary worflow support structure""" - def __init__(self, ShortCircuit=True, **kwargs): + def __init__(self, ShortCircuit=True, Debug=True, Options=None): """Build thy self ShortCiruit : if True, enables ignoring function groups when a given item has failed + Options : runtime options, {'option':values} - kwargs are stored as self.Options. Support for arbitrary Stats is - implicit + All workflow methods (i.e., those starting with "wk_") must be decorated + by either "no_requirements" or "requires". This ensures that the methods + support the automatic workflow determination mechanism. """ - self.Options = kwargs + if Options is None: + self.Options = {} + else: + self.Options = Options + self.Stats = defaultdict(int) self.ShortCircuit = ShortCircuit self.Failed = False @@ -82,11 +137,11 @@ def __call__(self, it, success_callback=None, fail_callback=None): success_callback = lambda x: x.FinalState it, workflow = self._get_workflow(it) - + for item in it: self.Failed = False self.FinalState = None - + for f in workflow: f(item) @@ -104,7 +159,7 @@ def decorated(self, *args, **kwargs): f(self, *args, **kwargs) return _executed Workflow.tagFunction(decorated) - return decorated + return update_wrapper(decorated, f) class requires(object): """Decorator that executes a function if requirements are met""" @@ -149,6 +204,7 @@ def decorated_with_option(dec_self, *args, **kwargs): s_opt = self.Option ds_opts = dec_self.Options + if s_opt in ds_opts and ds_opts[s_opt] in self.Values: f(dec_self, *args, **kwargs) return _executed diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index 27b1ff9852..5366040c6c 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -96,8 +96,9 @@ def methodC2(self, item): class WorkflowTests(TestCase): def setUp(self): - self.obj_short = MockWorkflow(**{'A':True, 'C':True}) - self.obj_noshort = MockWorkflow(ShortCircuit=False, **{'A':True, + self.obj_short = MockWorkflow(Options={'A':True, 'C':True}) + self.obj_noshort = MockWorkflow(ShortCircuit=False, Options=\ + {'A':True, 'C':True}) def test_untagged_wf_method(self): @@ -237,7 +238,7 @@ def test_methodb1(self): def test_methodb2_accept(self): # methodb2 is setup to be valid when foo is in [1,2,3], make sure we # can execute - obj = MockWorkflow(**{'foo':1}) + obj = MockWorkflow(Options={'foo':1}) obj.methodB2('test') self.assertEqual(obj.FinalState, ('B2', 'test')) self.assertEqual(obj.Stats, {'B2':1}) @@ -251,7 +252,7 @@ def test_methodb2_accept(self): def test_methodb2_ignore(self): # methodb2 is setup to be valid when foo is in [1, 2, 3], make sure # we do not execute - obj = MockWorkflow(**{'foo':'bar'}) + obj = MockWorkflow(Options={'foo':'bar'}) obj.methodB2('test') self.assertEqual(obj.FinalState, None) self.assertEqual(obj.Stats, {}) From 6e7fe9bf5e196d2bcd439478895165d59c9ec73d Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 10 Dec 2013 20:13:37 -0700 Subject: [PATCH 11/61] removed a priority from the example --- qiime/workflow/core.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index ea338df5f3..b6ad397a59 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -19,7 +19,6 @@ def wf_mul(self, item): def wf_add(self, item): self.FinalState += item - @priority(10) @requires(Option='sub_value', Values=[1,5,10]) def wf_sub(self, item): self.FinalState -= item From c71cd07d6c7c48b6327a122671eb4932c0a7b3af Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 11 Dec 2013 00:04:05 -0700 Subject: [PATCH 12/61] possible parallel support, doc/example --- qiime/workflow/core.py | 51 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 3 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index b6ad397a59..d66590ac2b 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -46,6 +46,40 @@ def wf_init(self, item): gen = (i for i in range(10)) for i in wf(gen): print i + + +# assumes MyWorkflow is in a separate module +As a multiprocess example: + +# assumes: "ipcluster start -n 4" +from IPython.parallel import Client +from time import sleep +from example import MyWorkflow + +def exec_wf(): + result = [] + for i in wf(gen): + result.append(i) + return result + +c = Client() +dv = c[:] +opts = {'add_value':None, 'sub_value':5} +nprocs = len(c) +for rank, worker in enumerate(c): + worker.execute("from example import MyWorkflow") + worker.execute("gen = (i for i in range(10))") + worker['rank'] = rank + worker['opts'] = opts + worker['nprocs'] = nprocs + worker.execute("wf = MyWorkflow(Options=opts, Rank=rank, NProcs=nprocs)") +ar = dv.apply_async(exec_wf) +while not ar.ready(): + sleep(1) + +# not merged, but same result as the first single core example +for foo in ar.get(): + print foo """ from itertools import chain @@ -73,7 +107,8 @@ def __contains__(self, item): class Workflow(object): """Arbitrary worflow support structure""" - def __init__(self, ShortCircuit=True, Debug=True, Options=None): + def __init__(self, ShortCircuit=True, Debug=True, Options=None, Rank=0, + NProcs=1, Finalize=None): """Build thy self ShortCiruit : if True, enables ignoring function groups when a given @@ -93,6 +128,9 @@ def __init__(self, ShortCircuit=True, Debug=True, Options=None): self.ShortCircuit = ShortCircuit self.Failed = False self.FinalState = None + self.Rank = Rank + self.NProcs = NProcs + self.Finalize = Finalize for f in self._all_wf_methods(): if not hasattr(f, '__workflowtag__'): @@ -137,17 +175,24 @@ def __call__(self, it, success_callback=None, fail_callback=None): it, workflow = self._get_workflow(it) + count = 0 for item in it: self.Failed = False self.FinalState = None - for f in workflow: - f(item) + if count % self.NProcs == self.Rank: + for f in workflow: + f(item) if self.Failed and fail_callback is not None: yield fail_callback(self) else: yield success_callback(self) + + count += 1 + + if self.Finalize is not None: + self.Finalize(self) @staticmethod def tagFunction(f): From c2429e8c7d16cfc990d86992a0d8f10967f6e9f1 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 11 Dec 2013 10:08:47 -0700 Subject: [PATCH 13/61] full python path for import example --- qiime/workflow/core.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index d66590ac2b..b844820d6e 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -54,7 +54,7 @@ def wf_init(self, item): # assumes: "ipcluster start -n 4" from IPython.parallel import Client from time import sleep -from example import MyWorkflow +from qiime.workflow.example import MyWorkflow def exec_wf(): result = [] @@ -67,13 +67,13 @@ def exec_wf(): opts = {'add_value':None, 'sub_value':5} nprocs = len(c) for rank, worker in enumerate(c): - worker.execute("from example import MyWorkflow") + worker.execute("from qiime.workflow.example import MyWorkflow") worker.execute("gen = (i for i in range(10))") worker['rank'] = rank worker['opts'] = opts worker['nprocs'] = nprocs worker.execute("wf = MyWorkflow(Options=opts, Rank=rank, NProcs=nprocs)") -ar = dv.apply_async(exec_wf) +ar = dv.apply_sync(exec_wf) while not ar.ready(): sleep(1) @@ -183,6 +183,8 @@ def __call__(self, it, success_callback=None, fail_callback=None): if count % self.NProcs == self.Rank: for f in workflow: f(item) + else: + continue if self.Failed and fail_callback is not None: yield fail_callback(self) From 0653803d6470a9ce462c3837c58edfd2ed9cc5d6 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 11 Dec 2013 13:04:45 -0700 Subject: [PATCH 14/61] merging in more methods --- qiime/quality_filter_fasta.py | 104 +++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/qiime/quality_filter_fasta.py b/qiime/quality_filter_fasta.py index 80b8a169c8..70ca78fd48 100644 --- a/qiime/quality_filter_fasta.py +++ b/qiime/quality_filter_fasta.py @@ -6,15 +6,29 @@ from itertools import chain, izip def _fasta_qual_strict(fasta_gen, qual_gen): + """Yield fasta and qual together + + Raises ValueError if the sequence IDs and quality IDs are not in the same + order. Raises ValueError if the sequence length does not match the length + of the quality score. + """ for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): if seq_id != qual_id: raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) if len(seq) != len(qual): raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) - yield (seq_id, seq, qual_id, qual) + yield (seq_id, seq, qual) def fasta_qual_iterator(fasta_fps, qual_fps=None): + """Yield fasta and qual data + + Expects file-like objects. If qual_fps is not None, quality scores are + yielded, otherwise None is yielded for the quality. Specifically, the + tuple yielded is always of the form: + + (seq_id, seq, qual) + """ fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) if qual_fps is not None: @@ -22,16 +36,102 @@ def fasta_qual_iterator(fasta_fps, qual_fps=None): gen = _fasta_qual_strict(fasta_gens, qual_gens) else: qual_gens = None - gen = ((seq_id, seq, None, None) for seq_id, seq in fasta_gens) + gen = ((seq_id, seq, None) for seq_id, seq in fasta_gens) return gen +SEQ_ID_INDEX = 0 +SEQ_INDEX = 1 +QUAL_INDEX = 2 + class QualFilterFastaWorkflow(Workflow): + FinalState = {'fwd_primer':None, + 'rev_primer':None, + 'seq':None, + 'qual':None, + 'original_barcode':None, + 'corrected_barcode':None} + + @priority(1000) + @no_requirements + def wf_init(self, item): + # reset final state + for k in self.FinalState: + self.FinalState[k] = None + @priority(90) @requires(Option='min_seq_len') def wf_length_check(self, item): + """Checks minimum sequence length""" seq_id, seq, qual_id, qual = item if len(seq) < self.Options['min_seq_len']: self.Failed = True self.Stats['min_seq_len'] += 1 + + @priority(89) + @requires(IsValid=True) + def wf_check_primer(self, item): + """ """ + self._set_primers(item) + + self._local_align_forward_primer(item) + self._ + @requires(IsValid=False, Option='ids_primers') + def _set_primers(self, item): + """ """ + seq_id = item[SEQ_ID_INDEX] + + if self.Options['suppress_sample_id_check']: + primers = self.Options['ids_primers']['all_primers'] + else: + seq_label = seq_id.split('_')[0] + if seq_label not in self.Options['ids_primers']: + self.Stats['seq_id_not_in_mapping'] += 1 + self.Failed = True + else: + primers = self.Options['ids_primers'][seq_label] + else: + primers = ids_primers['all_primers'] + + self._primers = primers + + @requires(Option='local_align_forward_primer', Values=False) + @requires(Option='max_primer_mismatch') + @requires(Option='retain_primer', Values=False) + def _count_mismatches(self, item): + """ """ + seq = item[SEQ_INDEX] + for primer in self._primers: + exceeds_mismatch = count_mismatches(seq, primer, + self.Options['max_primer_mismatch']) + if not exceeds_mismatch: + self.Stats['exceeds_max_primer_mismatch'] += 1 + if not retain_primer: + fasta_seq = fasta_seq[len(primer):] + qual_seq = qual_seq[len(primer):] + failed = False + break + @requires(Option='local_align_forward_primer', Values=True) + @requires(Option='max_primer_mismatch') + def _local_align_forward_primer(self, item): + seq = item[SEQ_INDEX] + qual = item[QUAL_INDEX] + + failed = True + max_primer_mismatch = self.Options['max_primer_mismatch'] + for primer in self._primers: + mismatches, hit_start = local_align_primer_seq(primer, fasta_seq) + if mismatches <= max_primer_mismatch: + seq = seq[hit_start + len(primer):] + qual = seq[hit_start + len(primer):] + failed = False + break + + if failed: + self.Stats['max_primer_mismatch'] += 1 + self.Stats['exceeds_max_primer_mismatch'] = 1 + else: + self.FinalState['fwd_primer'] = primer + self.FinalState['seq'] = seq + self.FinalState['qual'] = qual From cab54f4cd34d1a6d2c5c068f0f9919e6a1eab0d4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 17 Dec 2013 09:50:30 -0700 Subject: [PATCH 15/61] added more methods --- qiime/quality_filter_fasta.py | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/qiime/quality_filter_fasta.py b/qiime/quality_filter_fasta.py index 70ca78fd48..a635de14e5 100644 --- a/qiime/quality_filter_fasta.py +++ b/qiime/quality_filter_fasta.py @@ -52,6 +52,8 @@ class QualFilterFastaWorkflow(Workflow): 'original_barcode':None, 'corrected_barcode':None} + ### Start Workflow methods + @priority(1000) @no_requirements def wf_init(self, item): @@ -76,7 +78,10 @@ def wf_check_primer(self, item): self._set_primers(item) self._local_align_forward_primer(item) - self._ + self._count_mismatches(item) + + ### End Workflow methods + @requires(IsValid=False, Option='ids_primers') def _set_primers(self, item): """ """ @@ -102,19 +107,33 @@ def _set_primers(self, item): def _count_mismatches(self, item): """ """ seq = item[SEQ_INDEX] + qual = item[QUAL_INDEX] + + failed = True for primer in self._primers: exceeds_mismatch = count_mismatches(seq, primer, self.Options['max_primer_mismatch']) if not exceeds_mismatch: self.Stats['exceeds_max_primer_mismatch'] += 1 if not retain_primer: - fasta_seq = fasta_seq[len(primer):] - qual_seq = qual_seq[len(primer):] + seq = seq[len(primer):] + qual = qual[len(primer):] failed = False break + + ### should decompose this + if failed: + self.Stats['max_primer_mismatch'] += 1 + self.Stats['exceeds_max_primer_mismatch'] = 1 + else: + self.FinalState['fwd_primer'] = primer + self.FinalState['seq'] = seq + ### + @requires(Option='local_align_forward_primer', Values=True) @requires(Option='max_primer_mismatch') def _local_align_forward_primer(self, item): + """ """ seq = item[SEQ_INDEX] qual = item[QUAL_INDEX] From 8bdf43d1bbbc10ed326a24d69b815857c536b541 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 17 Dec 2013 13:33:15 -0700 Subject: [PATCH 16/61] removed parallel, added staging function --- qiime/workflow/core.py | 63 ++++++++---------------------------------- 1 file changed, 12 insertions(+), 51 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index b844820d6e..769fe522d6 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -46,40 +46,6 @@ def wf_init(self, item): gen = (i for i in range(10)) for i in wf(gen): print i - - -# assumes MyWorkflow is in a separate module -As a multiprocess example: - -# assumes: "ipcluster start -n 4" -from IPython.parallel import Client -from time import sleep -from qiime.workflow.example import MyWorkflow - -def exec_wf(): - result = [] - for i in wf(gen): - result.append(i) - return result - -c = Client() -dv = c[:] -opts = {'add_value':None, 'sub_value':5} -nprocs = len(c) -for rank, worker in enumerate(c): - worker.execute("from qiime.workflow.example import MyWorkflow") - worker.execute("gen = (i for i in range(10))") - worker['rank'] = rank - worker['opts'] = opts - worker['nprocs'] = nprocs - worker.execute("wf = MyWorkflow(Options=opts, Rank=rank, NProcs=nprocs)") -ar = dv.apply_sync(exec_wf) -while not ar.ready(): - sleep(1) - -# not merged, but same result as the first single core example -for foo in ar.get(): - print foo """ from itertools import chain @@ -107,13 +73,17 @@ def __contains__(self, item): class Workflow(object): """Arbitrary worflow support structure""" - def __init__(self, ShortCircuit=True, Debug=True, Options=None, Rank=0, - NProcs=1, Finalize=None): + def __init__(self, ShortCircuit=True, Debug=True, Options=None, + Mapping=None, StagingFunction=None): """Build thy self ShortCiruit : if True, enables ignoring function groups when a given item has failed + Debug : Enable debug mode Options : runtime options, {'option':values} + Mapping : Optional metadata mapping + StagingFunction : Optional staging function that can setup additional + state in self, such as providing self.Barcodes, etc All workflow methods (i.e., those starting with "wk_") must be decorated by either "no_requirements" or "requires". This ensures that the methods @@ -128,14 +98,15 @@ def __init__(self, ShortCircuit=True, Debug=True, Options=None, Rank=0, self.ShortCircuit = ShortCircuit self.Failed = False self.FinalState = None - self.Rank = Rank - self.NProcs = NProcs - self.Finalize = Finalize + self.Mapping = Mapping for f in self._all_wf_methods(): if not hasattr(f, '__workflowtag__'): raise AttributeError("%s isn't a workflow method!" % f.__name__) + if StagingFunction is not None: + StagingFunction(self) + def _all_wf_methods(self, default_priority=0): """Get all workflow methods @@ -175,27 +146,17 @@ def __call__(self, it, success_callback=None, fail_callback=None): it, workflow = self._get_workflow(it) - count = 0 for item in it: self.Failed = False - self.FinalState = None - if count % self.NProcs == self.Rank: - for f in workflow: - f(item) - else: - continue + for f in workflow: + f(item) if self.Failed and fail_callback is not None: yield fail_callback(self) else: yield success_callback(self) - count += 1 - - if self.Finalize is not None: - self.Finalize(self) - @staticmethod def tagFunction(f): setattr(f, '__workflowtag__', None) From f1ae29cc031878bd73b8a5ff37db6c6ea7f01093 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 17 Dec 2013 13:35:21 -0700 Subject: [PATCH 17/61] added sanity_check --- qiime/process_seqs.py | 185 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 qiime/process_seqs.py diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py new file mode 100644 index 0000000000..eb3947e406 --- /dev/null +++ b/qiime/process_seqs.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python + +from qiime.workflow.core import Workflow, requires, priority, _continuous +from cogent.parse.fasta import MinimalFastaParser +from qiime.parse import MinimalQualParser +from itertools import chain, izip + +def _fasta_qual_strict(fasta_gen, qual_gen): + """Yield fasta and qual together + + Raises ValueError if the sequence IDs and quality IDs are not in the same + order. Raises ValueError if the sequence length does not match the length + of the quality score. + """ + for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): + if seq_id != qual_id: + raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) + if len(seq) != len(qual): + raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) + + yield (seq_id, seq, qual) + +def fasta_qual_iterator(fasta_fps, qual_fps=None): + """Yield fasta and qual data + + Expects file-like objects. If qual_fps is not None, quality scores are + yielded, otherwise None is yielded for the quality. Specifically, the + tuple yielded is always of the form: + + (seq_id, seq, qual) + """ + fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) + + if qual_fps is not None: + qual_gens = chain(*map(MinimalQualParser, qual_fps)) + gen = _fasta_qual_strict(fasta_gens, qual_gens) + else: + qual_gens = None + gen = ((seq_id, seq, None) for seq_id, seq in fasta_gens) + + return gen + +SEQ_ID_INDEX = 0 +SEQ_INDEX = 1 +QUAL_INDEX = 2 + +class SequenceWorkflow(Workflow): + FinalState = {'fwd_primer':None, + 'rev_primer':None, + 'seq':None, + 'qual':None, + 'original_barcode':None, + 'corrected_barcode':None} + + def _sanity_check(self): + name = self.__name__ + if not hasattr(self, 'Mapping'): + raise AttributeError("%s is missing Mapping!" % name) + if not hasattr(self, 'Barcodes'): + raise AttributeError("%s is missing Mapping!" % name) + + ### Start Workflow methods + + @priority(1000) + @no_requirements + def wf_init(self, item): + self._init_final_state(item) + + @priority(900) + @requires(Option='barcode_type', Values=['hamming_8','golay_12','variable']) + def wf_demultiplex(self, item): + self._correct_golay12(item) + self._correct_hamming8(item) + self._correct_variable(item) + + @priority(90) + @requires(Option='min_seq_len') + def wf_length_check(self, item): + """Checks minimum sequence length""" + seq_id, seq, qual_id, qual = item + + if len(seq) < self.Options['min_seq_len']: + self.Failed = True + self.Stats['min_seq_len'] += 1 + + @priority(89) + @requires(IsValid=True) + def wf_check_primer(self, item): + """ """ + self._set_primers(item) + + self._local_align_forward_primer(item) + self._count_mismatches(item) + + ### End Workflow methods + + @requires(Option='barcode_type', Values='golay_12') + def _correct_golay12(self, item): + pass + + @requires(Option='barcode_type', Values='hamming_8') + def _correct_hamming8(self, item): + pass + + @requires(Option='barcode_type', Values='variable') + def _correct_variable(self, item): + pass + + def _init_final_state(self, item): + """Reset final state""" + for k in self.FinalState: + self.FinalState[k] = None + + @requires(IsValid=False, Option='ids_primers') + def _set_primers(self, item): + """ """ + seq_id = item[SEQ_ID_INDEX] + + if self.Options['suppress_sample_id_check']: + primers = self.Options['ids_primers']['all_primers'] + else: + seq_label = seq_id.split('_')[0] + if seq_label not in self.Options['ids_primers']: + self.Stats['seq_id_not_in_mapping'] += 1 + self.Failed = True + else: + primers = self.Options['ids_primers'][seq_label] + else: + primers = ids_primers['all_primers'] + + self._primers = primers + + @requires(Option='local_align_forward_primer', Values=False) + @requires(Option='max_primer_mismatch') + @requires(Option='retain_primer', Values=False) + def _count_mismatches(self, item): + """ """ + seq = item[SEQ_INDEX] + qual = item[QUAL_INDEX] + + failed = True + for primer in self._primers: + exceeds_mismatch = count_mismatches(seq, primer, + self.Options['max_primer_mismatch']) + if not exceeds_mismatch: + self.Stats['exceeds_max_primer_mismatch'] += 1 + if not retain_primer: + seq = seq[len(primer):] + qual = qual[len(primer):] + failed = False + break + + ### should decompose this + if failed: + self.Stats['max_primer_mismatch'] += 1 + self.Stats['exceeds_max_primer_mismatch'] = 1 + else: + self.FinalState['fwd_primer'] = primer + self.FinalState['seq'] = seq + ### + + @requires(Option='local_align_forward_primer', Values=True) + @requires(Option='max_primer_mismatch') + def _local_align_forward_primer(self, item): + """ """ + seq = item[SEQ_INDEX] + qual = item[QUAL_INDEX] + + failed = True + max_primer_mismatch = self.Options['max_primer_mismatch'] + for primer in self._primers: + mismatches, hit_start = local_align_primer_seq(primer, fasta_seq) + if mismatches <= max_primer_mismatch: + seq = seq[hit_start + len(primer):] + qual = seq[hit_start + len(primer):] + failed = False + break + + if failed: + self.Stats['max_primer_mismatch'] += 1 + self.Stats['exceeds_max_primer_mismatch'] = 1 + else: + self.FinalState['fwd_primer'] = primer + self.FinalState['seq'] = seq + self.FinalState['qual'] = qual From b98a02d2340144710e6c63b5b81bfc84175676b1 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 17 Dec 2013 13:53:16 -0700 Subject: [PATCH 18/61] sanity check, staging method --- qiime/workflow/core.py | 28 ++++++++++++++++++++-------- tests/test_workflow/test_core.py | 3 +++ 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 769fe522d6..6a7e408fde 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -73,17 +73,14 @@ def __contains__(self, item): class Workflow(object): """Arbitrary worflow support structure""" - def __init__(self, ShortCircuit=True, Debug=True, Options=None, - Mapping=None, StagingFunction=None): + def __init__(self, ShortCircuit=True, Debug=True, Options=None, **kwargs): """Build thy self ShortCiruit : if True, enables ignoring function groups when a given item has failed Debug : Enable debug mode Options : runtime options, {'option':values} - Mapping : Optional metadata mapping - StagingFunction : Optional staging function that can setup additional - state in self, such as providing self.Barcodes, etc + kwargs : Additional arguments will be added to self All workflow methods (i.e., those starting with "wk_") must be decorated by either "no_requirements" or "requires". This ensures that the methods @@ -98,14 +95,29 @@ def __init__(self, ShortCircuit=True, Debug=True, Options=None, self.ShortCircuit = ShortCircuit self.Failed = False self.FinalState = None - self.Mapping = Mapping + + for k,v in kwargs.iteritems(): + if hasattr(self, k): + raise AttributeError("%s exists in self!" % k) + setattr(self, k, v) for f in self._all_wf_methods(): if not hasattr(f, '__workflowtag__'): raise AttributeError("%s isn't a workflow method!" % f.__name__) - if StagingFunction is not None: - StagingFunction(self) + self._stage_state() + self._sanity_check() + + def _stage_state(self): + """Stage any additional data necessary for the workflow + + This does not need to be overloaded + """ + pass + + def _sanity_check(self): + """Perform a sanity check on self""" + raise NotImplementedError("Must implement a sanity check!") def _all_wf_methods(self, default_priority=0): """Get all workflow methods diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index 5366040c6c..f2bb5b90ad 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -26,6 +26,9 @@ def construct_iterator(**kwargs): return izip(*to_gen) class MockWorkflow(Workflow): + def _sanity_check(self): + pass + @priority(90) @requires(Option='A', Values=True) def wf_groupA(self, item): From 61ec0d953a6396d0e2af8c8544cbb588295afe54 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 17 Dec 2013 18:08:02 -0700 Subject: [PATCH 19/61] further progress --- qiime/process_seqs.py | 153 +++++++++++++------- qiime/quality_filter_fasta.py | 156 -------------------- qiime/workflow/core.py | 237 +++++++++++++++++++------------ tests/test_workflow/test_core.py | 34 ++++- 4 files changed, 277 insertions(+), 303 deletions(-) delete mode 100644 qiime/quality_filter_fasta.py diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index eb3947e406..6df8012390 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -4,6 +4,10 @@ from cogent.parse.fasta import MinimalFastaParser from qiime.parse import MinimalQualParser from itertools import chain, izip +from qiime.util import MetadataMap + +from qiime.hamming import decode as decode_hamming_8 +from qiime.golay import decode as decode_golay_12 def _fasta_qual_strict(fasta_gen, qual_gen): """Yield fasta and qual together @@ -40,6 +44,10 @@ def fasta_qual_iterator(fasta_fps, qual_fps=None): return gen +def _count_mismatches(seq1, seq2): + """Counts mismatches between two sequences""" + return sum([a == b for a,b in zip(seq1, seq2)]) + SEQ_ID_INDEX = 0 SEQ_INDEX = 1 QUAL_INDEX = 2 @@ -49,15 +57,32 @@ class SequenceWorkflow(Workflow): 'rev_primer':None, 'seq':None, 'qual':None, + 'sample':None, 'original_barcode':None, - 'corrected_barcode':None} - + 'corrected_barcode':None, + 'final_barcode':None, + 'corrected_barcode_errors':None} + + def _stage_state(self): + """Fish out barcodes from the mapping data""" + # set all the barcodes + bcs = {} + for sample in self.Mapping.SampleIds: + sample_bc = self.Mapping.getCategoryValue(sample, 'barcode') + if sample_bc in bcs: + raise ValueError("Duplicate barcode found for sample %s" \ + % sample) + else: + bcs[sample_bc] = sample + self.Barcodes = frozenset(bcs) + def _sanity_check(self): name = self.__name__ if not hasattr(self, 'Mapping'): raise AttributeError("%s is missing Mapping!" % name) - if not hasattr(self, 'Barcodes'): - raise AttributeError("%s is missing Mapping!" % name) + + if not isinstance(self.Mapping, MetadataMap): + raise AttributeError("self.Mapping is not of type MetadataMap") ### Start Workflow methods @@ -67,10 +92,21 @@ def wf_init(self, item): self._init_final_state(item) @priority(900) - @requires(Option='barcode_type', Values=['hamming_8','golay_12','variable']) - def wf_demultiplex(self, item): + @requires(Option='max_bc_errors') + @requires(Option='barcode_type', Values=['hamming_8','golay_12']) + def wf_demultiplex_fixed(self, item): self._correct_golay12(item) self._correct_hamming8(item) + + bc_errors = self.Options['max_bc_errors'] + if self.FinalState['corrected_barcode_errors'] > bc_errors: + self.Failed = True + self.Stats['exceeds_bc_errors'] += 1 + + @priority(900) + @requires(Option='barcode_type', Values='variable') + def wf_demultiplex_variable(self, item): + raise NotImplementedError("variable length barcodes not supported yet") self._correct_variable(item) @priority(90) @@ -84,81 +120,88 @@ def wf_length_check(self, item): self.Stats['min_seq_len'] += 1 @priority(89) - @requires(IsValid=True) + @requires(Option='instrument-type', Values='454') + @requires(Option='disable_primer_check', Values=False) def wf_check_primer(self, item): """ """ - self._set_primers(item) - - self._local_align_forward_primer(item) self._count_mismatches(item) + self._local_align_forward_primer(item) ### End Workflow methods + def _check_exact_barcode(self): + """Check for a match""" + return self.FinalState['original_barcode'] in self.Barcodes + @requires(Option='barcode_type', Values='golay_12') def _correct_golay12(self, item): - pass + """ """ + self._correct_encoded_barcode(item, decode_golay_12, 12) @requires(Option='barcode_type', Values='hamming_8') def _correct_hamming8(self, item): - pass + """ """ + self._correct_encoded_barcode(item, decode_hamming_8, 8) - @requires(Option='barcode_type', Values='variable') - def _correct_variable(self, item): - pass + def _correct_encoded_barcode(self, item, method, bc_length): + putative_bc = item[SEQ_INDEX][:bc_length] + self.FinalState['original_barcode'] = putative_bc + + if self._check_exact_barcode(): + self.FinalState['corrected_barcode_errors'] = 0 + final_bc = putative_bc + sample = self.Barcodes.get(putative_bc, None) + else: + corrected, num_errors = method(putative_bc) + final_bc = corrected + + self.FinalState['corrected_barcode'] = corrected + self.FinalState['corrected_barcode_errors'] = num_errors + self.Stats['barcodes_corrected'] += 1 + sample = self.Barcodes.get(corrected, None) + + self.FinalState['final_barcode'] = final_bc + + if sample is None: + self.Failed = True + else: + self.FinalState['sample'] = sample def _init_final_state(self, item): """Reset final state""" for k in self.FinalState: self.FinalState[k] = None - @requires(IsValid=False, Option='ids_primers') - def _set_primers(self, item): - """ """ - seq_id = item[SEQ_ID_INDEX] - - if self.Options['suppress_sample_id_check']: - primers = self.Options['ids_primers']['all_primers'] - else: - seq_label = seq_id.split('_')[0] - if seq_label not in self.Options['ids_primers']: - self.Stats['seq_id_not_in_mapping'] += 1 - self.Failed = True - else: - primers = self.Options['ids_primers'][seq_label] - else: - primers = ids_primers['all_primers'] - - self._primers = primers - - @requires(Option='local_align_forward_primer', Values=False) + ##### the requires are likely wrong here @requires(Option='max_primer_mismatch') - @requires(Option='retain_primer', Values=False) def _count_mismatches(self, item): """ """ seq = item[SEQ_INDEX] qual = item[QUAL_INDEX] + + exp_primer = self.Mapping.getCategoryValue(self.FinalState['sample'], + 'LinkerPrimerSequence')) + len_primer = len(exp_primer) + obs_primer = seq[:len_primer] + + mismatches = _count_mismatches(obs_primer, exp_primer) + + if not self.Options['retain_primer']: + seq = seq[len_primer:] + qual = qual[len_primer:] - failed = True - for primer in self._primers: - exceeds_mismatch = count_mismatches(seq, primer, - self.Options['max_primer_mismatch']) - if not exceeds_mismatch: - self.Stats['exceeds_max_primer_mismatch'] += 1 - if not retain_primer: - seq = seq[len(primer):] - qual = qual[len(primer):] - failed = False - break - - ### should decompose this - if failed: + if mismatches > self.Options['max_primer_mismatch']: + self.Failed = True self.Stats['max_primer_mismatch'] += 1 self.Stats['exceeds_max_primer_mismatch'] = 1 - else: - self.FinalState['fwd_primer'] = primer - self.FinalState['seq'] = seq - ### + + self.FinalState['fwd_primer'] = obs_primer + self.FinalState['seq'] = seq + ##### for truncating i believe, but isn't clear why we need to attempt to + ##### align against all possible primers instead of just the one we expect + + ### THIS IS STILL IN PROGRESS @requires(Option='local_align_forward_primer', Values=True) @requires(Option='max_primer_mismatch') def _local_align_forward_primer(self, item): @@ -183,3 +226,5 @@ def _local_align_forward_primer(self, item): self.FinalState['fwd_primer'] = primer self.FinalState['seq'] = seq self.FinalState['qual'] = qual + + diff --git a/qiime/quality_filter_fasta.py b/qiime/quality_filter_fasta.py deleted file mode 100644 index a635de14e5..0000000000 --- a/qiime/quality_filter_fasta.py +++ /dev/null @@ -1,156 +0,0 @@ -#!/usr/bin/env python - -from qiime.workflow.core import Workflow, requires, priority, _continuous -from cogent.parse.fasta import MinimalFastaParser -from qiime.parse import MinimalQualParser -from itertools import chain, izip - -def _fasta_qual_strict(fasta_gen, qual_gen): - """Yield fasta and qual together - - Raises ValueError if the sequence IDs and quality IDs are not in the same - order. Raises ValueError if the sequence length does not match the length - of the quality score. - """ - for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): - if seq_id != qual_id: - raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) - if len(seq) != len(qual): - raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) - - yield (seq_id, seq, qual) - -def fasta_qual_iterator(fasta_fps, qual_fps=None): - """Yield fasta and qual data - - Expects file-like objects. If qual_fps is not None, quality scores are - yielded, otherwise None is yielded for the quality. Specifically, the - tuple yielded is always of the form: - - (seq_id, seq, qual) - """ - fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) - - if qual_fps is not None: - qual_gens = chain(*map(MinimalQualParser, qual_fps)) - gen = _fasta_qual_strict(fasta_gens, qual_gens) - else: - qual_gens = None - gen = ((seq_id, seq, None) for seq_id, seq in fasta_gens) - - return gen - -SEQ_ID_INDEX = 0 -SEQ_INDEX = 1 -QUAL_INDEX = 2 - -class QualFilterFastaWorkflow(Workflow): - FinalState = {'fwd_primer':None, - 'rev_primer':None, - 'seq':None, - 'qual':None, - 'original_barcode':None, - 'corrected_barcode':None} - - ### Start Workflow methods - - @priority(1000) - @no_requirements - def wf_init(self, item): - # reset final state - for k in self.FinalState: - self.FinalState[k] = None - - @priority(90) - @requires(Option='min_seq_len') - def wf_length_check(self, item): - """Checks minimum sequence length""" - seq_id, seq, qual_id, qual = item - - if len(seq) < self.Options['min_seq_len']: - self.Failed = True - self.Stats['min_seq_len'] += 1 - - @priority(89) - @requires(IsValid=True) - def wf_check_primer(self, item): - """ """ - self._set_primers(item) - - self._local_align_forward_primer(item) - self._count_mismatches(item) - - ### End Workflow methods - - @requires(IsValid=False, Option='ids_primers') - def _set_primers(self, item): - """ """ - seq_id = item[SEQ_ID_INDEX] - - if self.Options['suppress_sample_id_check']: - primers = self.Options['ids_primers']['all_primers'] - else: - seq_label = seq_id.split('_')[0] - if seq_label not in self.Options['ids_primers']: - self.Stats['seq_id_not_in_mapping'] += 1 - self.Failed = True - else: - primers = self.Options['ids_primers'][seq_label] - else: - primers = ids_primers['all_primers'] - - self._primers = primers - - @requires(Option='local_align_forward_primer', Values=False) - @requires(Option='max_primer_mismatch') - @requires(Option='retain_primer', Values=False) - def _count_mismatches(self, item): - """ """ - seq = item[SEQ_INDEX] - qual = item[QUAL_INDEX] - - failed = True - for primer in self._primers: - exceeds_mismatch = count_mismatches(seq, primer, - self.Options['max_primer_mismatch']) - if not exceeds_mismatch: - self.Stats['exceeds_max_primer_mismatch'] += 1 - if not retain_primer: - seq = seq[len(primer):] - qual = qual[len(primer):] - failed = False - break - - ### should decompose this - if failed: - self.Stats['max_primer_mismatch'] += 1 - self.Stats['exceeds_max_primer_mismatch'] = 1 - else: - self.FinalState['fwd_primer'] = primer - self.FinalState['seq'] = seq - ### - - @requires(Option='local_align_forward_primer', Values=True) - @requires(Option='max_primer_mismatch') - def _local_align_forward_primer(self, item): - """ """ - seq = item[SEQ_INDEX] - qual = item[QUAL_INDEX] - - failed = True - max_primer_mismatch = self.Options['max_primer_mismatch'] - for primer in self._primers: - mismatches, hit_start = local_align_primer_seq(primer, fasta_seq) - if mismatches <= max_primer_mismatch: - seq = seq[hit_start + len(primer):] - qual = seq[hit_start + len(primer):] - failed = False - break - - if failed: - self.Stats['max_primer_mismatch'] += 1 - self.Stats['exceeds_max_primer_mismatch'] = 1 - else: - self.FinalState['fwd_primer'] = primer - self.FinalState['seq'] = seq - self.FinalState['qual'] = qual diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 6a7e408fde..e3c95b8bf8 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -51,6 +51,7 @@ def wf_init(self, item): from itertools import chain from functools import update_wrapper from collections import Iterable, defaultdict +from types import MethodType __author__ = "Daniel McDonald" __copyright__ = "Copyright 2013, The QIIME Project" @@ -70,10 +71,108 @@ def __contains__(self, item): return True option_exists = Exists() +def _debug_trace_wrapper(obj, f): + """Trace a function call""" + def wrapped(self, *args, **kwargs): + if not hasattr(obj, 'DebugTrace'): + raise AttributeError("%s does not have DebugTrace!" % obj.__class__) + + obj.DebugTrace.append(f.__name__) + return f(self, *args, **kwargs) + + return update_wrapper(wrapped, f) + +def _tag_function(f): + """Tag, you're it""" + setattr(f, '__workflowtag__', None) + +class priority(object): + """Sets a function priority""" + def __init__(self, Priority): + self.Priority = Priority + + def __call__(self, f): + f.Priority = self.Priority + return f + +def no_requirements(f): + def decorated(self, *args, **kwargs): + f(self, *args, **kwargs) + return _executed + _tag_function(decorated) + return update_wrapper(decorated, f) + +class requires(object): + """Decorator that executes a function if requirements are met""" + def __init__(self, IsValid=True, Option=None, Values=_missing): + """ + IsValid : execute the function if self.Failed is False + Option : a required option + Values : required values associated with an option + """ + # self here is the requires object + self.IsValid = IsValid + self.Option = Option + + if Values is _missing: + self.Values = option_exists + elif not isinstance(Values, set): + if isinstance(Values, Iterable): + self.Values = set(Values) + else: + self.Values = set([Values]) + else: + self.Values = Values + + def doShortCircuit(self, wrapped): + if self.IsValid and (wrapped.Failed and wrapped.ShortCircuit): + return True + else: + return False + + def __call__(self, f): + """Wrap a function + + f : the function to wrap + """ + def decorated_with_option(dec_self, *args, **kwargs): + """A decorated function that has an option to validate + + dec_self : this is "self" for the decorated function + """ + if self.doShortCircuit(dec_self): + return + + s_opt = self.Option + ds_opts = dec_self.Options + + if s_opt in ds_opts and ds_opts[s_opt] in self.Values: + f(dec_self, *args, **kwargs) + return _executed + + def decorated_without_option(dec_self, *args, **kwargs): + """A decorated function that does not have an option to validate + + dec_self : this is "self" for the decorated function + """ + if self.doShortCircuit(dec_self): + return + + f(dec_self, *args, **kwargs) + return _executed + + _tag_function(decorated_with_option) + _tag_function(decorated_without_option) + + if self.Option is None: + return update_wrapper(decorated_without_option, f) + else: + return update_wrapper(decorated_with_option, f) + class Workflow(object): """Arbitrary worflow support structure""" - def __init__(self, ShortCircuit=True, Debug=True, Options=None, **kwargs): + def __init__(self, ShortCircuit=True, Debug=False, Options=None, **kwargs): """Build thy self ShortCiruit : if True, enables ignoring function groups when a given @@ -91,11 +190,16 @@ def __init__(self, ShortCircuit=True, Debug=True, Options=None, **kwargs): else: self.Options = Options + ### collections.Counter instead? self.Stats = defaultdict(int) self.ShortCircuit = ShortCircuit self.Failed = False self.FinalState = None - + self.Debug = Debug + + if self.Debug: + self.DebugTrace = [] + for k,v in kwargs.iteritems(): if hasattr(self, k): raise AttributeError("%s exists in self!" % k) @@ -105,8 +209,28 @@ def __init__(self, ShortCircuit=True, Debug=True, Options=None, **kwargs): if not hasattr(f, '__workflowtag__'): raise AttributeError("%s isn't a workflow method!" % f.__name__) - self._stage_state() self._sanity_check() + self._stage_state() + self._setup_debug() + + def _setup_debug(self): + """Wrap all methods with debug trace support""" + if not self.Debug: + return + + _ignore = set(['_get_workflow','_all_wf_methods','_sanity_check', + '_stage_state']) + + for attrname in dir(self): + if attrname.startswith('__'): + continue + if attrname in _ignore: + continue + + attr = getattr(self, attrname) + + if isinstance(attr, MethodType): + setattr(self, attrname, _debug_trace_wrapper(self, attr)) def _stage_state(self): """Stage any additional data necessary for the workflow @@ -126,7 +250,20 @@ def _all_wf_methods(self, default_priority=0): """ methods = [getattr(self, f) for f in dir(self) if f.startswith('wf_')] key = lambda x: getattr(x, 'Priority', default_priority) - return sorted(methods, key=key, reverse=True) + methods_sorted = sorted(methods, key=key, reverse=True) + + if methods_sorted[0] != self.wf_SETUP_DEBUG_TRACE: + name = methods_sorted[0].__name__ + debug_prio = self.wf_SETUP_DEBUG_TRACE.Priority + + raise AttributeError("Method %s has a higher priority than the " + "debug trace method. Please set its priority " + "below %d." % (name, debug_prio)) + + if not self.Debug: + methods_sorted.pop(0) + + return methods_sorted def _get_workflow(self, it): """Get the methods executed, sorted by priority""" @@ -145,6 +282,11 @@ def _get_workflow(self, it): return generator_reset, executed + @priority(99999999) + @no_requirements + def wf_SETUP_DEBUG_TRACE(self, item): + self.DebugTrace = [] + def __call__(self, it, success_callback=None, fail_callback=None): """Operate on all the data @@ -168,90 +310,3 @@ def __call__(self, it, success_callback=None, fail_callback=None): yield fail_callback(self) else: yield success_callback(self) - - @staticmethod - def tagFunction(f): - setattr(f, '__workflowtag__', None) - -def no_requirements(f): - def decorated(self, *args, **kwargs): - f(self, *args, **kwargs) - return _executed - Workflow.tagFunction(decorated) - return update_wrapper(decorated, f) - -class requires(object): - """Decorator that executes a function if requirements are met""" - def __init__(self, IsValid=True, Option=None, Values=_missing): - """ - IsValid : execute the function if self.Failed is False - Option : a required option - Values : required values associated with an option - """ - # self here is the requires object - self.IsValid = IsValid - self.Option = Option - - if Values is _missing: - self.Values = option_exists - elif not isinstance(Values, set): - if isinstance(Values, Iterable): - self.Values = set(Values) - else: - self.Values = set([Values]) - else: - self.Values = Values - - def doShortCircuit(self, wrapped): - if self.IsValid and (wrapped.Failed and wrapped.ShortCircuit): - return True - else: - return False - - def __call__(self, f): - """Wrap a function - - f : the function to wrap - """ - def decorated_with_option(dec_self, *args, **kwargs): - """A decorated function that has an option to validate - - dec_self : this is "self" for the decorated function - """ - if self.doShortCircuit(dec_self): - return - - s_opt = self.Option - ds_opts = dec_self.Options - - if s_opt in ds_opts and ds_opts[s_opt] in self.Values: - f(dec_self, *args, **kwargs) - return _executed - - def decorated_without_option(dec_self, *args, **kwargs): - """A decorated function that does not have an option to validate - - dec_self : this is "self" for the decorated function - """ - if self.doShortCircuit(dec_self): - return - - f(dec_self, *args, **kwargs) - return _executed - - Workflow.tagFunction(decorated_with_option) - Workflow.tagFunction(decorated_without_option) - - if self.Option is None: - return update_wrapper(decorated_without_option, f) - else: - return update_wrapper(decorated_with_option, f) - -class priority(object): - """Sets a function priority""" - def __init__(self, Priority): - self.Priority = Priority - - def __call__(self, f): - f.Priority = self.Priority - return f diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index f2bb5b90ad..d732a3e09a 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -100,10 +100,11 @@ def methodC2(self, item): class WorkflowTests(TestCase): def setUp(self): self.obj_short = MockWorkflow(Options={'A':True, 'C':True}) + self.obj_debug = MockWorkflow(Debug=True, Options={'A':True, 'C':True}) self.obj_noshort = MockWorkflow(ShortCircuit=False, Options=\ {'A':True, 'C':True}) - + def test_untagged_wf_method(self): class WFTest(Workflow): @no_requirements @@ -115,8 +116,37 @@ def wf_2(self): with self.assertRaises(AttributeError): _ = WFTest() + def test_get_workflow_debug(self): + gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + exp_wf = [self.obj_debug.wf_SETUP_DEBUG_TRACE, self.obj_debug.wf_groupA, + self.obj_debug.wf_groupC] + obs_gen, obs_wf = self.obj_debug._get_workflow(gen) + + self.assertEqual(obs_wf, exp_wf) + self.assertEqual(list(obs_gen), [1,2,3,4,5]) + + self.assertEqual(self.obj_debug.Stats, {}) + self.assertTrue(self.obj_debug.ShortCircuit) + + def test_debug_trace(self): + gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + obj = self.obj_debug(gen) + + exp = ('C1',1) + obs = obj.next() + self.assertEqual(obs, exp) + + exp = ['wf_groupA', + 'methodA1', + 'methodA2', + 'wf_groupC', + 'methodC1', + 'methodC2'] + obs = self.obj_debug.DebugTrace + self.assertEqual(obs, exp) + def test_get_workflow(self): - gen = single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) exp_wf = [self.obj_short.wf_groupA, self.obj_short.wf_groupC] obs_gen, obs_wf = self.obj_short._get_workflow(gen) From 357fbaff0958c3a2980bfbce555c3d8c04b1b137 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 18 Dec 2013 09:46:03 -0700 Subject: [PATCH 20/61] added a ValidData method --- qiime/workflow/core.py | 20 ++++++++++++++++- tests/test_workflow/test_core.py | 38 +++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 2 deletions(-) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index e3c95b8bf8..4adc440e80 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -104,15 +104,21 @@ def decorated(self, *args, **kwargs): class requires(object): """Decorator that executes a function if requirements are met""" - def __init__(self, IsValid=True, Option=None, Values=_missing): + def __init__(self, IsValid=True, Option=None, Values=_missing, + ValidData=None): """ IsValid : execute the function if self.Failed is False Option : a required option Values : required values associated with an option + ValidData : data level requirements, this must be a function with the + following signature: f(*args, **kwargs) returning True. NOTE: if + ValidData returns False on the first item evaluated, the decorated + function will be removed from the remaining workflow. """ # self here is the requires object self.IsValid = IsValid self.Option = Option + self.ValidData = ValidData if Values is _missing: self.Values = option_exists @@ -135,6 +141,10 @@ def __call__(self, f): f : the function to wrap """ + ### not sure how I feel about having multiple functions in here. + ### also, the handling of Data is a bit dirty as it is now replicated + ### over these functions. It is ideal to keep the functions slim, thus + ### the multiple functions, but this could explode if not careful def decorated_with_option(dec_self, *args, **kwargs): """A decorated function that has an option to validate @@ -143,6 +153,10 @@ def decorated_with_option(dec_self, *args, **kwargs): if self.doShortCircuit(dec_self): return + if self.ValidData is not None: + if not self.ValidData(*args, **kwargs): + return + s_opt = self.Option ds_opts = dec_self.Options @@ -158,6 +172,10 @@ def decorated_without_option(dec_self, *args, **kwargs): if self.doShortCircuit(dec_self): return + if self.ValidData is not None: + if not self.ValidData(*args, **kwargs): + return + f(dec_self, *args, **kwargs) return _executed diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py index d732a3e09a..baf23f23e7 100644 --- a/tests/test_workflow/test_core.py +++ b/tests/test_workflow/test_core.py @@ -104,7 +104,7 @@ def setUp(self): self.obj_noshort = MockWorkflow(ShortCircuit=False, Options=\ {'A':True, 'C':True}) - + def test_untagged_wf_method(self): class WFTest(Workflow): @no_requirements @@ -250,7 +250,43 @@ def test_call_AC_fail_noshort(self): self.assertEqual(self.obj_noshort.Stats, exp_stats) +class MockWorkflowReqTest(Workflow): + def _sanity_check(self): + pass + + @priority(5) + @requires(ValidData=lambda x: x < 3) + def wf_needs_data(self, item): + name = 'needs_data' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + + @priority(10) + @no_requirements + def wf_always_run(self, item): + name = 'always_run' + self.Stats[name] += 1 + if item == 'fail %s' % name: + self.Failed = True + self.FinalState = (name, item) + class RequiresTests(TestCase): + def test_validdata(self): + obj = MockWorkflowReqTest() + single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) + + exp_stats = {'needs_data':2, 'always_run':5} + # C2 isn't executed as its requirements aren't met in the Options + exp_result = [('needs_data',1), ('needs_data',2), ('always_run',3), + ('always_run',4), ('always_run', 5)] + + obs_result = list(obj(single_iter)) + + self.assertEqual(obs_result, exp_result) + self.assertEqual(obj.Stats, exp_stats) + def test_methodb1(self): obj = MockWorkflow() obj.methodB1('test') From 19776b6dcdff7321e7b669fd7a5260fa65f27e1f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 22 Dec 2013 11:41:30 -0700 Subject: [PATCH 21/61] fasta/fastq iterators --- qiime/process_seqs.py | 247 ++++++++++++++++++++++++++++--------- tests/test_process_seqs.py | 247 +++++++++++++++++++++++++++++++++++++ 2 files changed, 436 insertions(+), 58 deletions(-) create mode 100644 tests/test_process_seqs.py diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 6df8012390..c8580e612c 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -1,89 +1,216 @@ #!/usr/bin/env python -from qiime.workflow.core import Workflow, requires, priority, _continuous +from qiime.workflow.core import Workflow, requires, priority, no_requirements from cogent.parse.fasta import MinimalFastaParser +from cogent.parse.fastq import MinimalFastqParser from qiime.parse import MinimalQualParser from itertools import chain, izip from qiime.util import MetadataMap - -from qiime.hamming import decode as decode_hamming_8 +from qiime.parse import is_casava_v180_or_later +from qiime.split_libraries import expand_degeneracies +from qiime.hamming import decode_barcode_8 as decode_hamming_8 from qiime.golay import decode as decode_golay_12 +from qiime.quality import ascii_to_phred33, ascii_to_phred64 +from numpy import array + +# pre allocate the iterable return. This is done for performance reasons to +# avoid frequent reallocations and to ensure a consistent object type +_iter_prealloc = {'SequenceID':None, + 'Sequence':None, + 'Qual':None, + 'Barcode':None} -def _fasta_qual_strict(fasta_gen, qual_gen): +def _reset_iter_prealloc(): + for k in _iter_prealloc: + _iter_prealloc[k] = None + +def _fasta_qual_gen(fasta_gen, qual_gen): """Yield fasta and qual together Raises ValueError if the sequence IDs and quality IDs are not in the same order. Raises ValueError if the sequence length does not match the length of the quality score. + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. """ for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): if seq_id != qual_id: raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) if len(seq) != len(qual): raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) + + _iter_prealloc['SequenceID'] = seq_id + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = qual - yield (seq_id, seq, qual) + yield _iter_prealloc -def fasta_qual_iterator(fasta_fps, qual_fps=None): +def _fasta_gen(fasta_gens): + """Yield fasta data + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + for id_, seq in fasta_gens: + _iter_prealloc['SequenceID'] = id_ + _iter_prealloc['Sequence'] = seq + yield _iter_prealloc + +def fasta_iterator(fasta_fps, qual_fps=None): """Yield fasta and qual data Expects file-like objects. If qual_fps is not None, quality scores are - yielded, otherwise None is yielded for the quality. Specifically, the - tuple yielded is always of the form: + yielded. The return will either be: + + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([])} + + or - (seq_id, seq, qual) + {'SequenceID':foo, 'Sequence':bar, 'Qual':None} + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. """ + _reset_iter_prealloc() + fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) if qual_fps is not None: qual_gens = chain(*map(MinimalQualParser, qual_fps)) - gen = _fasta_qual_strict(fasta_gens, qual_gens) + gen = _fasta_qual_gen(fasta_gens, qual_gens) else: qual_gens = None - gen = ((seq_id, seq, None) for seq_id, seq in fasta_gens) + gen = _fasta_gen(fasta_gens) return gen +def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): + """Yield fastq and barcode data + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + _gen = izip(fastq_gens, barcode_gens) + for (seqid, seq, qual), (bc_seqid, bc_seq, bc_qual) in _gen: + if seqid != bc_seqid: + raise ValueError("%s is not equal to %s!" % (seqid, bc_seqid)) + _iter_prealloc['SequenceID'] = seqid + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = array(map(phred_f, qual)) + _iter_prealloc['Barcode'] = bc_seq + + yield _iter_prealloc + +def _fastq_gen(fastq_gens, phred_f): + """Yield fastq data + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + for (seqid, seq, qual) in fastq_gens: + _iter_prealloc['SequenceID'] = seqid + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = array(map(phred_f, qual)) + + yield _iter_prealloc + +def fastq_iterator(fastq_fps, barcode_fps=None): + """Yield fastq data + + Expects file-like objects. If barcode_fps is not None, barcodes are also + yielded. The return will either be: + + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':foobar} + + or + + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':None} + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + _reset_iter_prealloc() + + fastq_gens = chain(*map(MinimalFastqParser, fastq_fps)) + + # peek + first_item = fastq_gens.next() + seqid, seq, qual = first_item + fastq_gens = chain([first_item], fastq_gens) + + # from qiime.parse.parse_fastq_qual_score (v1.8.0) + if is_casava_v180_or_later('@%s' % seqid): + ascii_to_phred_f = ascii_to_phred33 + else: + ascii_to_phred_f = ascii_to_phred64 + + if barcode_fps: + barcode_gens = chain(*map(MinimalFastqParser, barcode_fps)) + gen = _fastq_barcode_gen(fastq_gens, barcode_gens, ascii_to_phred_f) + else: + gen = _fastq_gen(fastq_gens, ascii_to_phred_f) + + return gen + +### cythonize def _count_mismatches(seq1, seq2): """Counts mismatches between two sequences""" - return sum([a == b for a,b in zip(seq1, seq2)]) + return sum([a != b for a,b in zip(seq1, seq2)]) -SEQ_ID_INDEX = 0 -SEQ_INDEX = 1 -QUAL_INDEX = 2 +def _has_qual(item): + return item['Qual'] is not None class SequenceWorkflow(Workflow): - FinalState = {'fwd_primer':None, - 'rev_primer':None, - 'seq':None, - 'qual':None, - 'sample':None, - 'original_barcode':None, - 'corrected_barcode':None, - 'final_barcode':None, - 'corrected_barcode_errors':None} + FinalState = {'Forward primer':None, + 'Reverse primer':None, + 'Sequence':None, + 'Qual':None, + 'Sample':None, + 'Original barcode':None, + 'Corrected barcode':None, + 'Final barcode':None, + 'Corrected barcode errors':None} def _stage_state(self): - """Fish out barcodes from the mapping data""" - # set all the barcodes + """Fish out barcodes and primers from the mapping data""" bcs = {} + primers = {} for sample in self.Mapping.SampleIds: - sample_bc = self.Mapping.getCategoryValue(sample, 'barcode') + sample_bc = self.Mapping.getCategoryValue(sample, 'BarcodeSequence') if sample_bc in bcs: raise ValueError("Duplicate barcode found for sample %s" \ % sample) - else: - bcs[sample_bc] = sample - self.Barcodes = frozenset(bcs) + bcs[sample_bc] = sample + + sample_primers = self.Mapping.getCategoryValue(sample, + 'LinkerPrimerSequence') + all_sample_primers = sample_primers.split(',') + primers[sample_bc] = expand_degeneracies(all_sample_primers) + + self.Barcodes = bcs + self.Primers = primers def _sanity_check(self): - name = self.__name__ + name = self.__class__ if not hasattr(self, 'Mapping'): raise AttributeError("%s is missing Mapping!" % name) if not isinstance(self.Mapping, MetadataMap): raise AttributeError("self.Mapping is not of type MetadataMap") + if not hasattr(self, 'Barcodes'): + raise AttributeError("%s does not have Barcodes!" % name) + + if not hasattr(self, 'Primers'): + raise AttributeError("%s does not have Primers!" % name) + ### Start Workflow methods @priority(1000) @@ -91,7 +218,12 @@ def _sanity_check(self): def wf_init(self, item): self._init_final_state(item) - @priority(900) + @priority(200) + @requires(ValidData=_has_qual) + def wf_read_quality(self, item): + pass + + @priority(100) @requires(Option='max_bc_errors') @requires(Option='barcode_type', Values=['hamming_8','golay_12']) def wf_demultiplex_fixed(self, item): @@ -103,7 +235,7 @@ def wf_demultiplex_fixed(self, item): self.Failed = True self.Stats['exceeds_bc_errors'] += 1 - @priority(900) + @priority(100) @requires(Option='barcode_type', Values='variable') def wf_demultiplex_variable(self, item): raise NotImplementedError("variable length barcodes not supported yet") @@ -113,26 +245,22 @@ def wf_demultiplex_variable(self, item): @requires(Option='min_seq_len') def wf_length_check(self, item): """Checks minimum sequence length""" - seq_id, seq, qual_id, qual = item + seq_id, seq, qual = item if len(seq) < self.Options['min_seq_len']: self.Failed = True self.Stats['min_seq_len'] += 1 @priority(89) - @requires(Option='instrument-type', Values='454') + @requires(Option='instrument_type', Values='454') @requires(Option='disable_primer_check', Values=False) def wf_check_primer(self, item): """ """ - self._count_mismatches(item) - self._local_align_forward_primer(item) + self._count_primer_mismatches(item) + #self._local_align_forward_primer(item) ### End Workflow methods - def _check_exact_barcode(self): - """Check for a match""" - return self.FinalState['original_barcode'] in self.Barcodes - @requires(Option='barcode_type', Values='golay_12') def _correct_golay12(self, item): """ """ @@ -146,15 +274,14 @@ def _correct_hamming8(self, item): def _correct_encoded_barcode(self, item, method, bc_length): putative_bc = item[SEQ_INDEX][:bc_length] self.FinalState['original_barcode'] = putative_bc - - if self._check_exact_barcode(): + + if putative_bc in self.Barcodes: self.FinalState['corrected_barcode_errors'] = 0 final_bc = putative_bc sample = self.Barcodes.get(putative_bc, None) else: corrected, num_errors = method(putative_bc) final_bc = corrected - self.FinalState['corrected_barcode'] = corrected self.FinalState['corrected_barcode_errors'] = num_errors self.Stats['barcodes_corrected'] += 1 @@ -172,28 +299,32 @@ def _init_final_state(self, item): for k in self.FinalState: self.FinalState[k] = None - ##### the requires are likely wrong here @requires(Option='max_primer_mismatch') - def _count_mismatches(self, item): + def _count_primer_mismatches(self, item): """ """ seq = item[SEQ_INDEX] qual = item[QUAL_INDEX] - exp_primer = self.Mapping.getCategoryValue(self.FinalState['sample'], - 'LinkerPrimerSequence')) - len_primer = len(exp_primer) - obs_primer = seq[:len_primer] - - mismatches = _count_mismatches(obs_primer, exp_primer) - - if not self.Options['retain_primer']: - seq = seq[len_primer:] - qual = qual[len_primer:] + obs_barcode = self.FinalState['final_barcode'] + len_barcode = len(obs_barcode) + + exp_primers = self.Primers[obs_barcode] + len_primer = len(exp_primers[0]) - if mismatches > self.Options['max_primer_mismatch']: + obs_primer = seq[len_barcode:len_barcode + len_primer] + + mm = array([_count_mismatches(obs_primer, p) for p in exp_primers]) + + if (mm > self.Options['max_primer_mismatch']).all(): self.Failed = True self.Stats['max_primer_mismatch'] += 1 - self.Stats['exceeds_max_primer_mismatch'] = 1 + self.Stats['exceeds_max_primer_mismatch'] += 1 + + ### should decompose + if not self.Options['retain_primer']: + seq = seq[len_primer:] + if qual is not None: + qual = qual[len_primer:] self.FinalState['fwd_primer'] = obs_primer self.FinalState['seq'] = seq diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py new file mode 100644 index 0000000000..8bc69f519c --- /dev/null +++ b/tests/test_process_seqs.py @@ -0,0 +1,247 @@ +#!/usr/bin/env python + +from cogent.util.unit_test import TestCase, main +from cogent.parse.fasta import MinimalFastaParser +from cogent.parse.fastq import MinimalFastqParser +from qiime.parse import MinimalQualParser +from itertools import chain +from numpy import array +from qiime.process_seqs import (_fasta_qual_gen, + fasta_iterator, _fastq_barcode_gen, fastq_iterator, _fastq_gen) +from qiime.quality import ascii_to_phred64 +from itertools import izip + +class FastqIteratorTests(TestCase): + def setUp(self): + fastq1_gen = MinimalFastqParser(fastq1.splitlines()) + fastq2_gen = MinimalFastqParser(fastq2.splitlines()) + barcodes1_gen = MinimalFastqParser(barcodes1.splitlines()) + barcodes2_gen = MinimalFastqParser(barcodes2.splitlines()) + + self.fastq_gen = chain(fastq1_gen, fastq2_gen) + self.barcodes_gen = chain(barcodes1_gen, barcodes2_gen) + + self.reversed_fastq_gen = chain(fastq2_gen, fastq1_gen) + + def test_fastq_barcode_gen_simple(self): + exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), + ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), + ('c', '123123', 'test3', array([-15, -14, -13] * 2)), + ('x', 'abcdefg', 'test4', array([33,34,35,36,37,38,39])), + ('y', 'popopo', 'test5', array([48,47] * 3))] + exp = [] + for id_,seq,bc,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':bc}) + + obs = _fastq_barcode_gen(self.fastq_gen, self.barcodes_gen, + ascii_to_phred64) + for o,e in izip(obs,exp): + self.assertEqual(o['SequenceID'], e['SequenceID']) + self.assertEqual(o['Sequence'], e['Sequence']) + self.assertTrue((o['Qual'] == e['Qual']).all()) + self.assertEqual(o['Barcode'], e['Barcode']) + + def test_fasta_barcode_gen_mismatch_ids(self): + with self.assertRaises(ValueError): + g = _fasta_qual_gen(self.reversed_fastq_gen, self.barcodes_gen) + _ = list(g) + + def test_fastq_iterators_just_fastq(self): + exp_data = [('a', 'abcde', array([33,34,35,36,37])), + ('b', 'asdasdasd', array([33,51,36] * 3)), + ('c', '123123', array([-15, -14, -13] * 2)), + ('x', 'abcdefg', array([33,34,35,36,37,38,39])), + ('y', 'popopo', array([48,47] * 3))] + exp = [] + for id_,seq,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':None}) + + open_fps = map(lambda x: x.splitlines(), [fastq1, fastq2]) + obs = [d.copy() for d in fastq_iterator(open_fps)] + self.assertEqual(obs, exp) + + def test_fastq_iterators_barcodes(self): + exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), + ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), + ('c', '123123', 'test3', array([-15, -14, -13] * 2)), + ('x', 'abcdefg', 'test4', array([33,34,35,36,37,38,39])), + ('y', 'popopo', 'test5', array([48,47] * 3))] + exp = [] + for id_,seq,bc,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':bc}) + + splitter = lambda x: x.splitlines() + fastq_fps = map(splitter, [fastq1, fastq2]) + bc_fps = map(splitter, [barcodes1, barcodes2]) + + obs = fastq_iterator(fastq_fps, bc_fps) + for o,e in izip(obs,exp): + self.assertEqual(o['SequenceID'], e['SequenceID']) + self.assertEqual(o['Sequence'], e['Sequence']) + self.assertTrue((o['Qual'] == e['Qual']).all()) + self.assertEqual(o['Barcode'], e['Barcode']) + +class FastaIteratorTests(TestCase): + def setUp(self): + fasta1_gen = MinimalFastaParser(fasta1.splitlines()) + qual1_gen = MinimalQualParser(qual1.splitlines()) + fasta2_gen = MinimalFastaParser(fasta2.splitlines()) + qual2_gen = MinimalQualParser(qual2.splitlines()) + qual2_bad_gen = MinimalQualParser(qual2_bad.splitlines()) + + self.fasta_gen = chain(fasta1_gen, fasta2_gen) + self.qual_gen = chain(qual1_gen, qual2_gen) + + self.reversed_fasta_gen = chain(fasta2_gen, fasta1_gen) + self.qual_bad_gen = chain(qual1_gen, qual2_bad_gen) + + def test_fasta_qual_gen_simple(self): + exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), + ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), + ('c', '123123', array([2, 2, 2, 2, 2, 2])), + ('x', 'abcdefg', array([1, 2, 3, 4, 5, 6, 7])), + ('y', 'popopo', array([1, 1, 1, 1, 1, 1]))] + exp = [] + for id_,seq,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':None}) + + obs = _fasta_qual_gen(self.fasta_gen, self.qual_gen) + for o,e in izip(obs,exp): + self.assertEqual(o['SequenceID'], e['SequenceID']) + self.assertEqual(o['Sequence'], e['Sequence']) + self.assertTrue((o['Qual'] == e['Qual']).all()) + + def test_fasta_qual_gen_mismatch_ids(self): + with self.assertRaises(ValueError): + g = _fasta_qual_gen(self.reversed_fasta_gen, self.qual_gen) + _ = list(g) + + def test_fasta_qual_gen_mismatch_length(self): + with self.assertRaises(ValueError): + _ = list(_fasta_qual_gen(self.fasta_gen, self.qual_bad_gen)) + + def test_fasta_iterators_just_fasta(self): + exp_data = [('a', 'abcde', None), + ('b', 'asdasdasd', None), + ('c', '123123', None), + ('x', 'abcdefg', None), + ('y', 'popopo', None)] + + exp = [] + for id_,seq,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':None}) + + open_fps = map(lambda x: x.splitlines(), [fasta1, fasta2]) + obs = [d.copy() for d in fasta_iterator(open_fps)] + self.assertEqual(obs, exp) + + def test_fasta_iterators_fasta_qual(self): + exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), + ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), + ('c', '123123', array([2, 2, 2, 2, 2, 2])), + ('x', 'abcdefg', array([1, 2, 3, 4, 5, 6, 7])), + ('y', 'popopo', array([1, 1, 1, 1, 1, 1]))] + + exp = [] + for id_,seq,qual in exp_data: + exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, + 'Barcode':None}) + splitter = lambda x: x.splitlines() + fasta_fps = map(splitter, [fasta1, fasta2]) + qual_fps = map(splitter, [qual1, qual2]) + + obs = fasta_iterator(fasta_fps, qual_fps) + for o,e in izip(obs, exp): + self.assertEqual(o['SequenceID'], e['SequenceID']) + self.assertEqual(o['Sequence'], e['Sequence']) + self.assertTrue((o['Qual'] == e['Qual']).all()) + +fasta1 = """>a +abcde +>b +asdasdasd +>c +123123 +""" + +fasta2 = """>x +abcdefg +>y +popopo +""" + +qual1 = """>a +1 2 3 4 5 +>b +1 1 1 1 1 1 1 1 1 +>c +2 2 2 2 2 2 +""" + +qual2 = """>x +1 2 3 4 5 6 7 +>y +1 1 1 1 1 1 +""" + +qual2_bad = """>x +1 2 3 4 5 6 +>y +1 1 1 1 1 1 +""" + +fastq1 = """@a +abcde ++a +abcde +@b +asdasdasd ++b +asdasdasd +@c +123123 ++c +123123 +""" + +fastq2 = """@x +abcdefg ++x +abcdefg +@y +popopo ++y +popopo +""" + +barcodes1 = """@a +test1 ++a +1234 +@b +test2 ++b +12345 +@c +test3 ++c +aaccb +""" + +barcodes2 = """@x +test4 ++x +12312 +@y +test5 ++y +33333 +""" + +if __name__ == '__main__': + main() From d545782eba28785134ea3be9bab862a7624850e6 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 22 Dec 2013 11:47:58 -0700 Subject: [PATCH 22/61] methods now use item as a dict, update keys in finalstate --- qiime/process_seqs.py | 50 +++++++++++++++++++++---------------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index c8580e612c..03a5925919 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -231,7 +231,7 @@ def wf_demultiplex_fixed(self, item): self._correct_hamming8(item) bc_errors = self.Options['max_bc_errors'] - if self.FinalState['corrected_barcode_errors'] > bc_errors: + if self.FinalState['Corrected barcode errors'] > bc_errors: self.Failed = True self.Stats['exceeds_bc_errors'] += 1 @@ -242,12 +242,10 @@ def wf_demultiplex_variable(self, item): self._correct_variable(item) @priority(90) - @requires(Option='min_seq_len') + @requires(Option='min_seq_len', ValidData=_has_qual) def wf_length_check(self, item): """Checks minimum sequence length""" - seq_id, seq, qual = item - - if len(seq) < self.Options['min_seq_len']: + if len(item['Qual']) < self.Options['min_seq_len']: self.Failed = True self.Stats['min_seq_len'] += 1 @@ -272,27 +270,31 @@ def _correct_hamming8(self, item): self._correct_encoded_barcode(item, decode_hamming_8, 8) def _correct_encoded_barcode(self, item, method, bc_length): - putative_bc = item[SEQ_INDEX][:bc_length] - self.FinalState['original_barcode'] = putative_bc + if item['Barcode'] is not None: + putative_bc = item['Barcode'] + else: + putative_bc = item['Sequence'][:bc_length] + + self.FinalState['Original barcode'] = putative_bc if putative_bc in self.Barcodes: - self.FinalState['corrected_barcode_errors'] = 0 + self.FinalState['Corrected barcode errors'] = 0 final_bc = putative_bc sample = self.Barcodes.get(putative_bc, None) else: corrected, num_errors = method(putative_bc) final_bc = corrected - self.FinalState['corrected_barcode'] = corrected - self.FinalState['corrected_barcode_errors'] = num_errors - self.Stats['barcodes_corrected'] += 1 + self.FinalState['Corrected barcode'] = corrected + self.FinalState['Corrected barcode errors'] = num_errors + self.Stats['Barcodes corrected'] += 1 sample = self.Barcodes.get(corrected, None) - self.FinalState['final_barcode'] = final_bc + self.FinalState['Final barcode'] = final_bc if sample is None: self.Failed = True else: - self.FinalState['sample'] = sample + self.FinalState['Sample'] = sample def _init_final_state(self, item): """Reset final state""" @@ -302,10 +304,10 @@ def _init_final_state(self, item): @requires(Option='max_primer_mismatch') def _count_primer_mismatches(self, item): """ """ - seq = item[SEQ_INDEX] - qual = item[QUAL_INDEX] + seq = item['Sequence'] + qual = item['Qual'] - obs_barcode = self.FinalState['final_barcode'] + obs_barcode = self.FinalState['Final barcode'] len_barcode = len(obs_barcode) exp_primers = self.Primers[obs_barcode] @@ -326,8 +328,8 @@ def _count_primer_mismatches(self, item): if qual is not None: qual = qual[len_primer:] - self.FinalState['fwd_primer'] = obs_primer - self.FinalState['seq'] = seq + self.FinalState['Forward primer'] = obs_primer + self.FinalState['Sequence'] = seq ##### for truncating i believe, but isn't clear why we need to attempt to ##### align against all possible primers instead of just the one we expect @@ -337,8 +339,8 @@ def _count_primer_mismatches(self, item): @requires(Option='max_primer_mismatch') def _local_align_forward_primer(self, item): """ """ - seq = item[SEQ_INDEX] - qual = item[QUAL_INDEX] + seq = item['Sequence'] + qual = item['Qual'] failed = True max_primer_mismatch = self.Options['max_primer_mismatch'] @@ -354,8 +356,6 @@ def _local_align_forward_primer(self, item): self.Stats['max_primer_mismatch'] += 1 self.Stats['exceeds_max_primer_mismatch'] = 1 else: - self.FinalState['fwd_primer'] = primer - self.FinalState['seq'] = seq - self.FinalState['qual'] = qual - - + self.FinalState['Forward primer'] = primer + self.FinalState['Sequence'] = seq + self.FinalState['Qual'] = qual From 0ea41531803df97e80353c28b01fe7bd892c5921 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 30 Dec 2013 13:15:46 -0700 Subject: [PATCH 23/61] more workflow progress --- qiime/process_seqs.py | 8 +------- qiime/workflow/core.py | 10 ++++++---- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 03a5925919..45c47058a0 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -198,19 +198,13 @@ def _stage_state(self): self.Primers = primers def _sanity_check(self): - name = self.__class__ + name = self.__class__.__name__ if not hasattr(self, 'Mapping'): raise AttributeError("%s is missing Mapping!" % name) if not isinstance(self.Mapping, MetadataMap): raise AttributeError("self.Mapping is not of type MetadataMap") - if not hasattr(self, 'Barcodes'): - raise AttributeError("%s does not have Barcodes!" % name) - - if not hasattr(self, 'Primers'): - raise AttributeError("%s does not have Primers!" % name) - ### Start Workflow methods @priority(1000) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py index 4adc440e80..714498c7ef 100644 --- a/qiime/workflow/core.py +++ b/qiime/workflow/core.py @@ -123,7 +123,9 @@ def __init__(self, IsValid=True, Option=None, Values=_missing, if Values is _missing: self.Values = option_exists elif not isinstance(Values, set): - if isinstance(Values, Iterable): + if isinstance(Values, str): + self.Values = Values + elif isinstance(Values, Iterable): self.Values = set(Values) else: self.Values = set([Values]) @@ -212,7 +214,6 @@ def __init__(self, ShortCircuit=True, Debug=False, Options=None, **kwargs): self.Stats = defaultdict(int) self.ShortCircuit = ShortCircuit self.Failed = False - self.FinalState = None self.Debug = Debug if self.Debug: @@ -324,7 +325,8 @@ def __call__(self, it, success_callback=None, fail_callback=None): for f in workflow: f(item) - if self.Failed and fail_callback is not None: - yield fail_callback(self) + if self.Failed: + if fail_callback is not None: + yield fail_callback(self) else: yield success_callback(self) From 27dfca64fac836ce430abf1badfec300d482f4db Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 29 Jan 2014 18:49:30 -0700 Subject: [PATCH 24/61] resolving pep8 warnings/errors --- qiime/process_seqs.py | 111 +++++++++++++++++++++++------------------- 1 file changed, 61 insertions(+), 50 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 45c47058a0..13b82af370 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -1,5 +1,7 @@ #!/usr/bin/env python +"""Filter poor quality reads, trim barcodes/primers and assign to samples""" + from qiime.workflow.core import Workflow, requires, priority, no_requirements from cogent.parse.fasta import MinimalFastaParser from cogent.parse.fastq import MinimalFastqParser @@ -15,12 +17,13 @@ # pre allocate the iterable return. This is done for performance reasons to # avoid frequent reallocations and to ensure a consistent object type -_iter_prealloc = {'SequenceID':None, +_iter_prealloc = {'SequenceID':None, 'Sequence':None, 'Qual':None, 'Barcode':None} def _reset_iter_prealloc(): + """Reset the buffer""" for k in _iter_prealloc: _iter_prealloc[k] = None @@ -28,19 +31,20 @@ def _fasta_qual_gen(fasta_gen, qual_gen): """Yield fasta and qual together Raises ValueError if the sequence IDs and quality IDs are not in the same - order. Raises ValueError if the sequence length does not match the length - of the quality score. - + order. Raises ValueError if the sequence length does not match the length + of the quality score. + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): if seq_id != qual_id: raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) if len(seq) != len(qual): - raise ValueError("%s is not equal length to %s!" % (seq_id,qual_id)) - + raise ValueError("%s is not equal length to %s!" % (seq_id, + qual_id)) + _iter_prealloc['SequenceID'] = seq_id _iter_prealloc['Sequence'] = seq _iter_prealloc['Qual'] = qual @@ -49,9 +53,9 @@ def _fasta_qual_gen(fasta_gen, qual_gen): def _fasta_gen(fasta_gens): """Yield fasta data - + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ for id_, seq in fasta_gens: @@ -62,25 +66,25 @@ def _fasta_gen(fasta_gens): def fasta_iterator(fasta_fps, qual_fps=None): """Yield fasta and qual data - Expects file-like objects. If qual_fps is not None, quality scores are + Expects file-like objects. If qual_fps is not None, quality scores are yielded. The return will either be: {'SequenceID':foo, 'Sequence':bar, 'Qual':array([])} - or + or {'SequenceID':foo, 'Sequence':bar, 'Qual':None} - + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ _reset_iter_prealloc() - fasta_gens = chain(*map(MinimalFastaParser, fasta_fps)) - + fasta_gens = chain(*[MinimalFastaParser(f) for f in fasta_fps]) + if qual_fps is not None: - qual_gens = chain(*map(MinimalQualParser, qual_fps)) + qual_gens = chain(*[MinimalQualParser(f) for f in qual_fps]) gen = _fasta_qual_gen(fasta_gens, qual_gens) else: qual_gens = None @@ -90,9 +94,9 @@ def fasta_iterator(fasta_fps, qual_fps=None): def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): """Yield fastq and barcode data - + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ _gen = izip(fastq_gens, barcode_gens) @@ -101,22 +105,22 @@ def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): raise ValueError("%s is not equal to %s!" % (seqid, bc_seqid)) _iter_prealloc['SequenceID'] = seqid _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array(map(phred_f, qual)) + _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) _iter_prealloc['Barcode'] = bc_seq yield _iter_prealloc def _fastq_gen(fastq_gens, phred_f): """Yield fastq data - + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ for (seqid, seq, qual) in fastq_gens: _iter_prealloc['SequenceID'] = seqid _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array(map(phred_f, qual)) + _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) yield _iter_prealloc @@ -125,21 +129,21 @@ def fastq_iterator(fastq_fps, barcode_fps=None): Expects file-like objects. If barcode_fps is not None, barcodes are also yielded. The return will either be: - + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':foobar} - or + or {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':None} - + Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where + created on each iteration. This is done for performance reasons, where quick testing showed a 50% reduction in runtime. """ _reset_iter_prealloc() - fastq_gens = chain(*map(MinimalFastqParser, fastq_fps)) - + fastq_gens = chain(*[MinimalFastqParser(f) for f in fastq_fps]) + # peek first_item = fastq_gens.next() seqid, seq, qual = first_item @@ -152,22 +156,24 @@ def fastq_iterator(fastq_fps, barcode_fps=None): ascii_to_phred_f = ascii_to_phred64 if barcode_fps: - barcode_gens = chain(*map(MinimalFastqParser, barcode_fps)) + barcode_gens = chain(*[MinimalFastqParser(f) for f in barcode_fps]) gen = _fastq_barcode_gen(fastq_gens, barcode_gens, ascii_to_phred_f) else: gen = _fastq_gen(fastq_gens, ascii_to_phred_f) return gen -### cythonize +### can cythonize def _count_mismatches(seq1, seq2): """Counts mismatches between two sequences""" - return sum([a != b for a,b in zip(seq1, seq2)]) + return sum([a != b for a, b in zip(seq1, seq2)]) def _has_qual(item): + """Check if an item has Qual""" return item['Qual'] is not None class SequenceWorkflow(Workflow): + """Implement the sequence processing workflow""" FinalState = {'Forward primer':None, 'Reverse primer':None, 'Sequence':None, @@ -177,7 +183,7 @@ class SequenceWorkflow(Workflow): 'Corrected barcode':None, 'Final barcode':None, 'Corrected barcode errors':None} - + def _stage_state(self): """Fish out barcodes and primers from the mapping data""" bcs = {} @@ -193,7 +199,7 @@ def _stage_state(self): 'LinkerPrimerSequence') all_sample_primers = sample_primers.split(',') primers[sample_bc] = expand_degeneracies(all_sample_primers) - + self.Barcodes = bcs self.Primers = primers @@ -201,7 +207,7 @@ def _sanity_check(self): name = self.__class__.__name__ if not hasattr(self, 'Mapping'): raise AttributeError("%s is missing Mapping!" % name) - + if not isinstance(self.Mapping, MetadataMap): raise AttributeError("self.Mapping is not of type MetadataMap") @@ -210,17 +216,20 @@ def _sanity_check(self): @priority(1000) @no_requirements def wf_init(self, item): + """Perform per sequence state initialization""" self._init_final_state(item) @priority(200) @requires(ValidData=_has_qual) def wf_read_quality(self, item): + """Check sequence quality""" pass @priority(100) @requires(Option='max_bc_errors') - @requires(Option='barcode_type', Values=['hamming_8','golay_12']) + @requires(Option='barcode_type', Values=['hamming_8', 'golay_12']) def wf_demultiplex_fixed(self, item): + """Demultiplex fixed length barcodes""" self._correct_golay12(item) self._correct_hamming8(item) @@ -228,10 +237,11 @@ def wf_demultiplex_fixed(self, item): if self.FinalState['Corrected barcode errors'] > bc_errors: self.Failed = True self.Stats['exceeds_bc_errors'] += 1 - + @priority(100) @requires(Option='barcode_type', Values='variable') def wf_demultiplex_variable(self, item): + """Demultiplex variable length barcodes""" raise NotImplementedError("variable length barcodes not supported yet") self._correct_variable(item) @@ -247,7 +257,7 @@ def wf_length_check(self, item): @requires(Option='instrument_type', Values='454') @requires(Option='disable_primer_check', Values=False) def wf_check_primer(self, item): - """ """ + """Check for a valid primer""" self._count_primer_mismatches(item) #self._local_align_forward_primer(item) @@ -255,26 +265,27 @@ def wf_check_primer(self, item): @requires(Option='barcode_type', Values='golay_12') def _correct_golay12(self, item): - """ """ + """Correct and decode a Golay 12nt barcode""" self._correct_encoded_barcode(item, decode_golay_12, 12) @requires(Option='barcode_type', Values='hamming_8') def _correct_hamming8(self, item): - """ """ + """Correct and decode a Hamming 8nt barcode""" self._correct_encoded_barcode(item, decode_hamming_8, 8) def _correct_encoded_barcode(self, item, method, bc_length): + """Correct and decode an encoded barcode""" if item['Barcode'] is not None: putative_bc = item['Barcode'] else: putative_bc = item['Sequence'][:bc_length] self.FinalState['Original barcode'] = putative_bc - + if putative_bc in self.Barcodes: self.FinalState['Corrected barcode errors'] = 0 final_bc = putative_bc - sample = self.Barcodes.get(putative_bc, None) + sample = self.Barcodes[putative_bc] else: corrected, num_errors = method(putative_bc) final_bc = corrected @@ -291,16 +302,16 @@ def _correct_encoded_barcode(self, item, method, bc_length): self.FinalState['Sample'] = sample def _init_final_state(self, item): - """Reset final state""" + """Reset per sequence state""" for k in self.FinalState: self.FinalState[k] = None - + @requires(Option='max_primer_mismatch') def _count_primer_mismatches(self, item): - """ """ + """Assess primer mismatches""" seq = item['Sequence'] qual = item['Qual'] - + obs_barcode = self.FinalState['Final barcode'] len_barcode = len(obs_barcode) @@ -308,20 +319,20 @@ def _count_primer_mismatches(self, item): len_primer = len(exp_primers[0]) obs_primer = seq[len_barcode:len_barcode + len_primer] - + mm = array([_count_mismatches(obs_primer, p) for p in exp_primers]) - + if (mm > self.Options['max_primer_mismatch']).all(): self.Failed = True self.Stats['max_primer_mismatch'] += 1 self.Stats['exceeds_max_primer_mismatch'] += 1 - + ### should decompose if not self.Options['retain_primer']: seq = seq[len_primer:] if qual is not None: qual = qual[len_primer:] - + self.FinalState['Forward primer'] = obs_primer self.FinalState['Sequence'] = seq @@ -339,7 +350,7 @@ def _local_align_forward_primer(self, item): failed = True max_primer_mismatch = self.Options['max_primer_mismatch'] for primer in self._primers: - mismatches, hit_start = local_align_primer_seq(primer, fasta_seq) + mismatches, hit_start = local_align_primer_seq(primer, seq) if mismatches <= max_primer_mismatch: seq = seq[hit_start + len(primer):] qual = seq[hit_start + len(primer):] From 4cf60da3534c9f1f244c3784de140ac2f8504585 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 29 Jan 2014 19:12:35 -0700 Subject: [PATCH 25/61] starting ProcessSeqsWorkflowTests --- tests/test_process_seqs.py | 453 ++++++++++++++++++++++++++++++++++--- 1 file changed, 425 insertions(+), 28 deletions(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 8bc69f519c..3c825831a8 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -1,22 +1,23 @@ #!/usr/bin/env python +from itertools import chain, izip +from numpy import array from cogent.util.unit_test import TestCase, main from cogent.parse.fasta import MinimalFastaParser from cogent.parse.fastq import MinimalFastqParser from qiime.parse import MinimalQualParser -from itertools import chain -from numpy import array from qiime.process_seqs import (_fasta_qual_gen, - fasta_iterator, _fastq_barcode_gen, fastq_iterator, _fastq_gen) + fasta_iterator, _fastq_barcode_gen, fastq_iterator, _fastq_gen, + SequenceWorkflow) from qiime.quality import ascii_to_phred64 -from itertools import izip +from qiime.util import MetadataMap class FastqIteratorTests(TestCase): def setUp(self): - fastq1_gen = MinimalFastqParser(fastq1.splitlines()) - fastq2_gen = MinimalFastqParser(fastq2.splitlines()) - barcodes1_gen = MinimalFastqParser(barcodes1.splitlines()) - barcodes2_gen = MinimalFastqParser(barcodes2.splitlines()) + fastq1_gen = MinimalFastqParser(fastq1_simple.splitlines()) + fastq2_gen = MinimalFastqParser(fastq2_simple.splitlines()) + barcodes1_gen = MinimalFastqParser(barcodes1_simple.splitlines()) + barcodes2_gen = MinimalFastqParser(barcodes2_simple.splitlines()) self.fastq_gen = chain(fastq1_gen, fastq2_gen) self.barcodes_gen = chain(barcodes1_gen, barcodes2_gen) @@ -58,7 +59,7 @@ def test_fastq_iterators_just_fastq(self): exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, 'Barcode':None}) - open_fps = map(lambda x: x.splitlines(), [fastq1, fastq2]) + open_fps = map(lambda x: x.splitlines(), [fastq1_simple, fastq2_simple]) obs = [d.copy() for d in fastq_iterator(open_fps)] self.assertEqual(obs, exp) @@ -74,8 +75,8 @@ def test_fastq_iterators_barcodes(self): 'Barcode':bc}) splitter = lambda x: x.splitlines() - fastq_fps = map(splitter, [fastq1, fastq2]) - bc_fps = map(splitter, [barcodes1, barcodes2]) + fastq_fps = map(splitter, [fastq1_simple, fastq2_simple]) + bc_fps = map(splitter, [barcodes1_simple, barcodes2_simple]) obs = fastq_iterator(fastq_fps, bc_fps) for o,e in izip(obs,exp): @@ -86,11 +87,11 @@ def test_fastq_iterators_barcodes(self): class FastaIteratorTests(TestCase): def setUp(self): - fasta1_gen = MinimalFastaParser(fasta1.splitlines()) - qual1_gen = MinimalQualParser(qual1.splitlines()) - fasta2_gen = MinimalFastaParser(fasta2.splitlines()) - qual2_gen = MinimalQualParser(qual2.splitlines()) - qual2_bad_gen = MinimalQualParser(qual2_bad.splitlines()) + fasta1_gen = MinimalFastaParser(fasta1_simple.splitlines()) + qual1_gen = MinimalQualParser(qual1_simple.splitlines()) + fasta2_gen = MinimalFastaParser(fasta2_simple.splitlines()) + qual2_gen = MinimalQualParser(qual2_simple.splitlines()) + qual2_bad_gen = MinimalQualParser(qual2_simple_bad.splitlines()) self.fasta_gen = chain(fasta1_gen, fasta2_gen) self.qual_gen = chain(qual1_gen, qual2_gen) @@ -136,7 +137,7 @@ def test_fasta_iterators_just_fasta(self): exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, 'Barcode':None}) - open_fps = map(lambda x: x.splitlines(), [fasta1, fasta2]) + open_fps = map(lambda x: x.splitlines(), [fasta1_simple, fasta2_simple]) obs = [d.copy() for d in fasta_iterator(open_fps)] self.assertEqual(obs, exp) @@ -152,8 +153,8 @@ def test_fasta_iterators_fasta_qual(self): exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, 'Barcode':None}) splitter = lambda x: x.splitlines() - fasta_fps = map(splitter, [fasta1, fasta2]) - qual_fps = map(splitter, [qual1, qual2]) + fasta_fps = map(splitter, [fasta1_simple, fasta2_simple]) + qual_fps = map(splitter, [qual1_simple, qual2_simple]) obs = fasta_iterator(fasta_fps, qual_fps) for o,e in izip(obs, exp): @@ -161,7 +162,34 @@ def test_fasta_iterators_fasta_qual(self): self.assertEqual(o['Sequence'], e['Sequence']) self.assertTrue((o['Qual'] == e['Qual']).all()) -fasta1 = """>a + +class ProcessSeqsWorkflowTests(TestCase): + """Basing structure off of test_split_libraries_fastq.py""" + def setUp(self): + self.fastq1 = fastq1.split('\n') + self.barcode_fastq1 = barcode_fastq1.split('\n') + self.fastq2 = fastq2.split('\n') + self.barcode_fastq2 = barcode_fastq2.split('\n') + self.fastq1_expected_no_qual_unassigned = fastq1_expected_no_qual_unassigned + self.fastq1_expected_default = fastq1_expected_default + self.fastq2_expected_default = fastq2_expected_default + self.fastq1_expected_single_barcode = fastq1_expected_single_barcode + self.mapping = mapping + + def _make_workflow_obj(self, options): + return SequenceWorkflow(options, Mapping=self.mapping) + + def test_workflow_construction(self): + x = self._make_workflow_obj({'foo':'bar'}) + self.assertEqual(x.Stats, {}) + + def test_wf_init(self): + wf_obj = self._make_workflow_obj({'foo':'bar'}) + wf_obj.FinalState['Sequence'] = 'w00t' + wf_obj.wf_init(None) + self.assertEqual(set(wf_obj.FinalState.values()), set([None])) + +fasta1_simple = """>a abcde >b asdasdasd @@ -169,13 +197,13 @@ def test_fasta_iterators_fasta_qual(self): 123123 """ -fasta2 = """>x +fasta2_simple = """>x abcdefg >y popopo """ -qual1 = """>a +qual1_simple = """>a 1 2 3 4 5 >b 1 1 1 1 1 1 1 1 1 @@ -183,19 +211,19 @@ def test_fasta_iterators_fasta_qual(self): 2 2 2 2 2 2 """ -qual2 = """>x +qual2_simple = """>x 1 2 3 4 5 6 7 >y 1 1 1 1 1 1 """ -qual2_bad = """>x +qual2_simple_bad = """>x 1 2 3 4 5 6 >y 1 1 1 1 1 1 """ -fastq1 = """@a +fastq1_simple = """@a abcde +a abcde @@ -209,7 +237,7 @@ def test_fasta_iterators_fasta_qual(self): 123123 """ -fastq2 = """@x +fastq2_simple = """@x abcdefg +x abcdefg @@ -219,7 +247,7 @@ def test_fasta_iterators_fasta_qual(self): popopo """ -barcodes1 = """@a +barcodes1_simple = """@a test1 +a 1234 @@ -233,7 +261,7 @@ def test_fasta_iterators_fasta_qual(self): aaccb """ -barcodes2 = """@x +barcodes2_simple = """@x test4 +x 12312 @@ -243,5 +271,374 @@ def test_fasta_iterators_fasta_qual(self): 33333 """ +mapping = MetadataMap( + {'s1':{'BarcodeSequence':'AAAAAAAAAAAA', 'LinkerPrimerSequence':''}, + 's2':{'BarcodeSequence':'AAAAAAAAAAAC', 'LinkerPrimerSequence':''}, + 's3':{'BarcodeSequence':'AAAAAAAAAAAG', 'LinkerPrimerSequence':''}, + 's4':{'BarcodeSequence':'AAAAAAAAAAAT', 'LinkerPrimerSequence':''} + }, []) + +fastq1 = """@990:2:4:11271:5323#1/1 +GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC ++ +bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U` +@990:2:4:11271:5323#1/1 +GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG ++ +bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT +@990:2:4:11272:9538#1/1 +GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG ++ +b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa +@990:2:4:11272:9538#1/1 +GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC ++ +bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O +@990:2:4:11272:7447#1/1 +GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGGCAGTCTAACCGCAAGGAGGACGCTGTCG ++ +b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:7447#1/1 +GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACAGCGTCCTCCTTGCTGTTAGACTTCCGGC ++ +b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:19991#1/1 +GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGGTGGGGCAACCGGCTGTCCCTTTTAGCGG ++ +bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:19991#1/1 +GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGGCTGGCCCTTTCCACCCA ++ +bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOcBBBBBBBBBBBBBBBBB +@990:2:4:11272:4315#1/1 +GTACTCACCGCCCGTCACGCCATGGGAGTTGGGCTTACCTGAAGCCCGCGAGCTAACCGGAAAGGGGGGGATGTGG ++ +bbbb_bbbbbbbbbb```Q```BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:4315#1/1 +GGCTACCTTGTTACGACTTCACCCCCGTCGCTCGGCGTACCTTCGACCGCTGCCTCCTTTTGGTTATATCTCCGGG ++ +``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:5533#1/1 +GCACACACCGCCCGTCACACCACGAGAGTCGGCAACACCCGAAGTCGGTGAGGTAACCCCGAAAGGGGAGCCAGCC ++ +``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB +@990:2:4:11272:5533#0/1 +GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCGGCTGGCTCCCCTTTCGGGGGTACCTCAC ++ +bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB +""" + +barcode_fastq1 = """@990:2:4:11271:5323#1/2 +AAAAAAAAAAAA ++ +bbbbbbbbbbbb +@990:2:4:11271:5323#1/2 +AAAAAAAAAAAC ++ +bbcbbbbbbbbb +@990:2:4:11272:9538#1/2 +AAAAAAAAAAAA ++ +b_bbbbbbbbbb +@990:2:4:11272:9538#1/2 +AAAAAAAAAAAT ++ +bb^bbbbbbbbb +@990:2:4:11272:7447#1/2 +AAAAAAAAAAAA ++ +b`bbbbbbbbbb +@990:2:4:11272:7447#1/2 +AAAAAAAAAAAA ++ +b`bbbbbbbbbb +@990:2:4:11272:19991#1/2 +AAAAAAAAAAAC ++ +bbbbbbbbbbbb +@990:2:4:11272:19991#1/2 +AAAAAAAAAAAC ++ +bbbbbbbbbbbb +@990:2:4:11272:4315#1/2 +AAAAAAAAAAAT ++ +bbbb_bbbbbbb +@990:2:4:11272:4315#1/2 +AAAAAAAAAAAT ++ +``Q``````_`` +@990:2:4:11272:5533#1/2 +GAAAAAAAAAAT ++ +``Q``````_`` +@990:2:4:11272:5533#0/2 +AAAAAAAAAAAT ++ +bbbbbbbbbbbb +""" + +fastq2 = """@M00176:17:000000000-A0CNA:1:1:15487:1773 1:N:0:0 +GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC ++ +bbbbbbbbbbBBBBBBBBBBBBBBBY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U` +@M00176:17:000000000-A0CNA:1:1:17088:1773 1:N:0:0 +GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG ++ +bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT +@M00176:17:000000000-A0CNA:1:1:16738:1773 1:N:0:0 +GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG ++ +b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa +@M00176:17:000000000-A0CNA:1:1:12561:1773 1:N:0:0 +GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC ++ +bb^bbbBBBBbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O +@M00176:17:000000000-A0CNA:1:1:14596:1773 1:N:0:0 +GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGGCAGTCTAACCGCAAGGAGGACGCTGTCG ++ +b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`############################ +@M00176:17:000000000-A0CNA:1:1:12515:1774 1:N:0:0 +GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACAGCGTCCTCCTTGCTGTTAGACTTCCGGC ++ +b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`############################ +@M00176:17:000000000-A0CNA:1:1:17491:1774 1:N:0:0 +GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGGTGGGGCAACCGGCTGTCCCTTTTAGCGG ++ +bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb############################ +@M00176:17:000000000-A0CNA:1:1:16427:1774 1:N:0:0 +GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGGCTGGCCCTTTCCACCCA ++ +bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc################# +@M00176:17:000000000-A0CNA:1:1:13372:1775 1:N:0:0 +GTACTCACCGCCCGTCACGCCATGGGAGTTGGGCTTACCTGAAGCCCGCGAGCTAACCGGAAAGGGGGGGATGTGG ++ +bbbb_bbbbbbbbbb```Q```###################################################### +@M00176:17:000000000-A0CNA:1:1:14806:1775 1:N:0:0 +GGCTACCTTGTTACGACTTCACCCCCGTCGCTCGGCGTACCTTCGACCGCTGCCTCCTTTTGGTTATATCTCCGGG ++ +``Q``````_``BBBB````K]]a#################################################### +@M00176:17:000000000-A0CNA:1:1:13533:1775 1:N:0:0 +GCACACACCGCCCGTCACACCACGAGAGTCGGCAACACCCGAAGTCGGTGAGGTAACCCCGAAAGGGGAGCCAGCC ++ +``Q``````_``````````K]]a#################################################### +@M00176:17:000000000-A0CNA:1:1:18209:1775 1:N:0:0 +GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCGGCTGGCTCCCCTTTCGGGGGTACCTCAC ++ +bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb############################ +""" + +barcode_fastq2 = """@M00176:17:000000000-A0CNA:1:1:15487:1773 2:N:0:0 +AAAAAAAAAAAA ++ +bbbbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:17088:1773 2:N:0:0 +AAAAAAAAAAAC ++ +bbcbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:16738:1773 2:N:0:0 +AAAAAAAAAAAA ++ +b_bbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:12561:1773 2:N:0:0 +AAAAAAAAAAAT ++ +bb^bbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:14596:1773 2:N:0:0 +AAAAAAAAAAAA ++ +b`bbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:12515:1774 2:N:0:0 +AAAAAAAAAAAA ++ +b`bbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:17491:1774 2:N:0:0 +AAAAAAAAAAAC ++ +bbbbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:16427:1774 2:N:0:0 +AAAAAAAAAAAC ++ +bbbbbbbbbbbb +@M00176:17:000000000-A0CNA:1:1:13372:1775 2:N:0:0 +AAAAAAAAAAAT ++ +bbbb_bbbbbbb +@M00176:17:000000000-A0CNA:1:1:14806:1775 2:N:0:0 +AAAAAAAAAAAT ++ +``Q``````_`` +@M00176:17:000000000-A0CNA:1:1:13533:1775 2:N:0:0 +GAAAAAAAAAAT ++ +``Q``````_`` +@M00176:17:000000000-A0CNA:1:1:18209:1775 2:N:0:0 +AAAAAAAAAAAT ++ +bbbbbbbbbbbb +""" + + +fastq1_expected_no_qual_unassigned = [ + ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", + "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", + 0), + ("s2_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", + "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", + 1), + ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", + "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", + 2), + ("s4_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", + "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", + 3), + ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGGCAGTCTAACCGCAAGGAGGACGCTGTCG", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 4), + ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACAGCGTCCTCCTTGCTGTTAGACTTCCGGC", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 5), + ("s2_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGGTGGGGCAACCGGCTGTCCCTTTTAGCGG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 6), + ("s2_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGGCTGGCCCTTTCCACCCA", + "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOcBBBBBBBBBBBBBBBBB", + 7), + ("s4_8 990:2:4:11272:4315#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GTACTCACCGCCCGTCACGCCATGGGAGTTGGGCTTACCTGAAGCCCGCGAGCTAACCGGAAAGGGGGGGATGTGG", + "bbbb_bbbbbbbbbb```Q```BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 8), + ("s4_9 990:2:4:11272:4315#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGCTACCTTGTTACGACTTCACCCCCGTCGCTCGGCGTACCTTCGACCGCTGCCTCCTTTTGGTTATATCTCCGGG", + "``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 9), + ("Unassigned_10 990:2:4:11272:5533#1/1 orig_bc=GAAAAAAAAAAT new_bc=GAAAAAAAAAAT bc_diffs=0", + "GCACACACCGCCCGTCACACCACGAGAGTCGGCAACACCCGAAGTCGGTGAGGTAACCCCGAAAGGGGAGCCAGCC", + "``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 10), + ("s4_11 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCGGCTGGCTCCCCTTTCGGGGGTACCTCAC", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB", + 11)] + +fastq1_expected_default = [ + ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", + "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", + 0), + ("s2_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", + "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", + 1), + ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", + "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", + 2), + ("s4_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", + "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", + 3), + ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 4), + ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 5), + ("s2_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", + 6), + ("s2_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", + "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", + 7), + ("s4_8 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] + +fastq1_expected_single_barcode = [ + ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", + "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", + 0), + ("s1_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", + "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", + 1), + ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", + "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", + 2), + ("s1_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", + "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", + 3), + ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 4), + ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 5), + ("s1_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", + 6), + ("s1_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", + "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", + 7), + ("s1_8 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] + +fastq2_expected_default = [ + ("s1_0 M00176:17:000000000-A0CNA:1:1:15487:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", + "bbbbbbbbbbBBBBBBBBBBBBBBBY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", + 0), + ("s2_1 M00176:17:000000000-A0CNA:1:1:17088:1773 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", + "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", + 1), + ("s1_2 M00176:17:000000000-A0CNA:1:1:16738:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", + "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", + 2), + ("s4_3 M00176:17:000000000-A0CNA:1:1:12561:1773 1:N:0:0 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", + "bb^bbbBBBBbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", + 3), + ("s1_4 M00176:17:000000000-A0CNA:1:1:14596:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 4), + ("s1_5 M00176:17:000000000-A0CNA:1:1:12515:1774 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", + "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", + 5), + ("s2_6 M00176:17:000000000-A0CNA:1:1:17491:1774 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", + 6), + ("s2_7 M00176:17:000000000-A0CNA:1:1:16427:1774 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", + "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", + "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", + 7), + ("s4_8 M00176:17:000000000-A0CNA:1:1:18209:1775 1:N:0:0 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", + "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", + "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] + if __name__ == '__main__': main() From e38c252089da9cef179af0d8306fba8eb03bb005 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 29 Jan 2014 19:52:47 -0700 Subject: [PATCH 26/61] reorganizing groups --- qiime/process_seqs.py | 122 +++++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 37 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 13b82af370..86fa012f88 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -173,7 +173,15 @@ def _has_qual(item): return item['Qual'] is not None class SequenceWorkflow(Workflow): - """Implement the sequence processing workflow""" + """Implement the sequence processing workflow + + All workflow methods expect an item that is dict-like with the following + keys and value types: + SequenceID : str + Sequence : str + Qual : np.array or None + Barcode : str or None + """ FinalState = {'Forward primer':None, 'Reverse primer':None, 'Sequence':None, @@ -212,26 +220,45 @@ def _sanity_check(self): raise AttributeError("self.Mapping is not of type MetadataMap") ### Start Workflow methods + ### NEED TO ADD STRONG DEFINITIONS OF EXPECTED STATE CHANGES @priority(1000) @no_requirements def wf_init(self, item): - """Perform per sequence state initialization""" + """Perform per sequence state initialization + + This workflow group will reset FinalState. + """ self._init_final_state(item) @priority(200) @requires(ValidData=_has_qual) - def wf_read_quality(self, item): + def wf_quality(self, item): """Check sequence quality""" pass - @priority(100) - @requires(Option='max_bc_errors') - @requires(Option='barcode_type', Values=['hamming_8', 'golay_12']) - def wf_demultiplex_fixed(self, item): - """Demultiplex fixed length barcodes""" - self._correct_golay12(item) - self._correct_hamming8(item) + @priority(150) + @requires(Option='demultiplex', Values=True) + def wf_demultiplex(self, item): + """Demultiplex a sequence + + If the sequence has not Failed, the following fields in FinalState will + be set: + + Sample + Original barcode + Final barcode + + In addition, the following field may be set: + + Corrected barcode + Corrected barcode errors + + This workflow group can trigger Failed and update Stats + """ + self._demultiplex_golay12(item) + self._demultiplex_hamming8(item) + self._demultiplex_other(item) bc_errors = self.Options['max_bc_errors'] if self.FinalState['Corrected barcode errors'] > bc_errors: @@ -239,41 +266,48 @@ def wf_demultiplex_fixed(self, item): self.Stats['exceeds_bc_errors'] += 1 @priority(100) - @requires(Option='barcode_type', Values='variable') - def wf_demultiplex_variable(self, item): - """Demultiplex variable length barcodes""" - raise NotImplementedError("variable length barcodes not supported yet") - self._correct_variable(item) - - @priority(90) - @requires(Option='min_seq_len', ValidData=_has_qual) - def wf_length_check(self, item): - """Checks minimum sequence length""" - if len(item['Qual']) < self.Options['min_seq_len']: - self.Failed = True - self.Stats['min_seq_len'] += 1 + @requires(Option='check_primer', Values=True) + def wf_primer(self, item): + """Perform primer validation - @priority(89) - @requires(Option='instrument_type', Values='454') - @requires(Option='disable_primer_check', Values=False) - def wf_check_primer(self, item): - """Check for a valid primer""" - self._count_primer_mismatches(item) - #self._local_align_forward_primer(item) + Primer validation may update the following keys in FinalState: + + Forward primer + Reverse primer + + This workflow group can trigger Failed and update Stats + """ + self._primer_instrument_454(item) + + @priority(50) + @no_requirements + def wf_sequence(self, item): + """Final sequence level checks + + Sequence level checks will not alter FinalState but may trigger Failed + and update Stats + """ + self._sequence_ambiguous_count(item) + self._sequence_length_check(item) ### End Workflow methods @requires(Option='barcode_type', Values='golay_12') - def _correct_golay12(self, item): + def _demultiplex_golay12(self, item): """Correct and decode a Golay 12nt barcode""" - self._correct_encoded_barcode(item, decode_golay_12, 12) + self._demultiplex_encoded_barcode(item, decode_golay_12, 12) @requires(Option='barcode_type', Values='hamming_8') - def _correct_hamming8(self, item): + def _demultiplex_hamming8(self, item): """Correct and decode a Hamming 8nt barcode""" - self._correct_encoded_barcode(item, decode_hamming_8, 8) + self._demultiplex_encoded_barcode(item, decode_hamming_8, 8) - def _correct_encoded_barcode(self, item, method, bc_length): + @requires(Option='barcode_type', Values='variable') + def _demultiplex_other(self, item): + """Decode a variable length barcode""" + raise NotImplementedError + + def _demultiplex_encoded_barcode(self, item, method, bc_length): """Correct and decode an encoded barcode""" if item['Barcode'] is not None: putative_bc = item['Barcode'] @@ -306,8 +340,14 @@ def _init_final_state(self, item): for k in self.FinalState: self.FinalState[k] = None + @requires(Option='instrument_type', Values='454') + def wf_check_primer(self, item): + """Check for a valid primer""" + self._count_primer_mismatches(item) + #self._local_align_forward_primer(item) + @requires(Option='max_primer_mismatch') - def _count_primer_mismatches(self, item): + def _primer_count_mismatches(self, item): """Assess primer mismatches""" seq = item['Sequence'] qual = item['Qual'] @@ -342,7 +382,7 @@ def _count_primer_mismatches(self, item): ### THIS IS STILL IN PROGRESS @requires(Option='local_align_forward_primer', Values=True) @requires(Option='max_primer_mismatch') - def _local_align_forward_primer(self, item): + def _primer_local_align_forward(self, item): """ """ seq = item['Sequence'] qual = item['Qual'] @@ -364,3 +404,11 @@ def _local_align_forward_primer(self, item): self.FinalState['Forward primer'] = primer self.FinalState['Sequence'] = seq self.FinalState['Qual'] = qual + + @requires(Option='min_seq_len') + def _sequence_length_check(self, item): + """Checks minimum sequence length""" + if len(item['Sequence']) < self.Options['min_seq_len']: + self.Failed = True + self.Stats['min_seq_len'] += 1 + From f899099479a0241a0fd6ecaf87411ce14806f992 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 17:50:29 -0700 Subject: [PATCH 27/61] split_libraries_fastq methods in place --- qiime/process_seqs.py | 173 +++++++++++++++++++++++++++++++----------- 1 file changed, 127 insertions(+), 46 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 86fa012f88..80b67e8aec 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -172,6 +172,32 @@ def _has_qual(item): """Check if an item has Qual""" return item['Qual'] is not None +### notes on splitlib fastq options: +# barcode_read_fps: via Command +# store_qual_scores: via Command +# sample_ids: via Command +# store_demultiplexed_fastq: via Command +# retain_unassigned_reads: via Command (Failed == False, Sample == None) +# max_bad_run_length: via wf_quality, +# UNTESTED +# min_per_read_length_fraction: via wf_quality, if truncation happens, do +# in place update on item +# STUBBED OUT +# sequence_max_n: via wf_sequence +# STUBBED OUT (ambiguous_count), UNTESTED +# start_seq_id: via Command, but also hopefully deprecated in favor of +# HDF5 format +# rev_comp_barcode: via Command and iterators? only if the barcodes are separate +# then it is possible to do at the iterator level... +# rev_comp_mapping_barcodes: via Command +# rev_comp: via Command and iterators +# phred_quality_threshold: via wf_quality +# STUBBED OUT, basically implemented? split_libraries_fastq is difficult to read... +# barcode_type: via wf_demultiplex +# DONE +# max_barcode_error: via wf_demultiplex +# phred_offset: via Command and iterators + class SequenceWorkflow(Workflow): """Implement the sequence processing workflow @@ -220,22 +246,30 @@ def _sanity_check(self): raise AttributeError("self.Mapping is not of type MetadataMap") ### Start Workflow methods - ### NEED TO ADD STRONG DEFINITIONS OF EXPECTED STATE CHANGES @priority(1000) @no_requirements def wf_init(self, item): """Perform per sequence state initialization - - This workflow group will reset FinalState. + + This workflow group will reset FinalState and will set the following in + FinalState: + + Sequence """ self._init_final_state(item) @priority(200) @requires(ValidData=_has_qual) def wf_quality(self, item): - """Check sequence quality""" - pass + """Check sequence quality + + This workflow group may update _item_ in the event of a sequence + truncation due to quality! + + """ + self._quality_max_bad_run_length(item) + self._quality_min_per_read_length_fraction(item) @priority(150) @requires(Option='demultiplex', Values=True) @@ -248,23 +282,20 @@ def wf_demultiplex(self, item): Sample Original barcode Final barcode - + In addition, the following field may be set: Corrected barcode Corrected barcode errors - + This workflow group can trigger Failed and update Stats """ self._demultiplex_golay12(item) self._demultiplex_hamming8(item) self._demultiplex_other(item) + self._demultiplex_max_barcode_error(item) - bc_errors = self.Options['max_bc_errors'] - if self.FinalState['Corrected barcode errors'] > bc_errors: - self.Failed = True - self.Stats['exceeds_bc_errors'] += 1 - + ### should this be wf_instrument for instriument specific checks? @priority(100) @requires(Option='check_primer', Values=True) def wf_primer(self, item): @@ -272,6 +303,7 @@ def wf_primer(self, item): Primer validation may update the following keys in FinalState: + Sequence Forward primer Reverse primer @@ -283,15 +315,62 @@ def wf_primer(self, item): @no_requirements def wf_sequence(self, item): """Final sequence level checks - + Sequence level checks will not alter FinalState but may trigger Failed and update Stats """ - self._sequence_ambiguous_count(item) self._sequence_length_check(item) + self._sequence_ambiguous_count(item) ### End Workflow methods + ### Start quality methods + + @requires(Option='phred_quality_threshold') + @requires(Option='max_bad_run_length') + def _quality_max_bad_run_length(self, item): + """Fail sequence if there is a poor quality run + + Warning: this method can modify item in place + """ + max_bad_run_length = self.Options['max_bad_run_length'] + phred_quality_threshold = self.Options['phred_quality_threshold'] + + # can cythonize + run_length = 0 + max_run_length = 0 + run_start_idx = 0 + max_run_start_idx = 0 + for idx, v in enumerate(item['Qual']): + if v <= phred_quality_threshold: + max_run_length += 1 + else: + if run_length > max_run_length: + max_run_length = run_length + max_run_start_idx = run_start_idx + run_length = 0 + run_start_idx = idx + + if max_run_length > max_bad_run_length: + item['Qual'] = item['Qual'][:max_run_start_idx] + item['Seq'] = item['Sequence'][:max_run_start_idx] + self.Stats['_quality_max_bad_run_length'] += 1 + + @requires(Option='phred_quality_threshold') + @requires(Option='min_per_read_length_fraction') + def _quality_min_per_read_length_fraction(self, item): + """Fail a sequence if a percentage of bad quality calls exist""" + bad_bases = item['Qual'] < self.Options['phred_quality_threshold'] + bad_bases_count = bad_bases.sum(dtype=float) + threshold = self.Options['min_per_read_length_fraction'] + + if (bad_bases_count / len(item['Sequence'])) < threshold: + self.Failed = True + self.Stats['min_per_read_length_fraction'] += 1 + + ### End quality methods + + ### Start demultiplex methods @requires(Option='barcode_type', Values='golay_12') def _demultiplex_golay12(self, item): """Correct and decode a Golay 12nt barcode""" @@ -307,7 +386,9 @@ def _demultiplex_other(self, item): """Decode a variable length barcode""" raise NotImplementedError - def _demultiplex_encoded_barcode(self, item, method, bc_length): + #### use kwargs for method and bc_length + def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, + bc_length=12): """Correct and decode an encoded barcode""" if item['Barcode'] is not None: putative_bc = item['Barcode'] @@ -334,17 +415,33 @@ def _demultiplex_encoded_barcode(self, item, method, bc_length): self.Failed = True else: self.FinalState['Sample'] = sample + + @requires(Option='max_barcode_error') + def _demultiplex_max_barcode_error(self, item): + """ """ + bc_errors = self.Options['max_bc_errors'] + if self.FinalState['Corrected barcode errors'] > bc_errors: + self.Failed = True + self.Stats['exceeds_bc_errors'] += 1 + + ### End demultiplex methods + + ### Start init methods def _init_final_state(self, item): """Reset per sequence state""" for k in self.FinalState: self.FinalState[k] = None + self.FinalState['Sequence'] = item['Sequence'] + + ### End init methods + + ### Start primer methods @requires(Option='instrument_type', Values='454') - def wf_check_primer(self, item): + def _primer_instrument_454(self, item): """Check for a valid primer""" - self._count_primer_mismatches(item) - #self._local_align_forward_primer(item) + self._primer_count_mismatches(item) @requires(Option='max_primer_mismatch') def _primer_count_mismatches(self, item): @@ -376,35 +473,10 @@ def _primer_count_mismatches(self, item): self.FinalState['Forward primer'] = obs_primer self.FinalState['Sequence'] = seq - ##### for truncating i believe, but isn't clear why we need to attempt to - ##### align against all possible primers instead of just the one we expect + ### End primer methods + + ### Start sequence methods - ### THIS IS STILL IN PROGRESS - @requires(Option='local_align_forward_primer', Values=True) - @requires(Option='max_primer_mismatch') - def _primer_local_align_forward(self, item): - """ """ - seq = item['Sequence'] - qual = item['Qual'] - - failed = True - max_primer_mismatch = self.Options['max_primer_mismatch'] - for primer in self._primers: - mismatches, hit_start = local_align_primer_seq(primer, seq) - if mismatches <= max_primer_mismatch: - seq = seq[hit_start + len(primer):] - qual = seq[hit_start + len(primer):] - failed = False - break - - if failed: - self.Stats['max_primer_mismatch'] += 1 - self.Stats['exceeds_max_primer_mismatch'] = 1 - else: - self.FinalState['Forward primer'] = primer - self.FinalState['Sequence'] = seq - self.FinalState['Qual'] = qual - @requires(Option='min_seq_len') def _sequence_length_check(self, item): """Checks minimum sequence length""" @@ -412,3 +484,12 @@ def _sequence_length_check(self, item): self.Failed = True self.Stats['min_seq_len'] += 1 + @requires(Option='ambiguous_count') + def _sequence_ambiguous_count(self, item): + """Fail if the number of N characters is greater than threshold""" + count = item['Sequence'].count('N') + if count > self.Options['ambiguous_count']: + self.Failed = True + self.Stats['ambiguous_count'] += 1 + + ### End sequence methods From b7c6a8f6087662d700721c1a955352a689acf79e Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 19:47:31 -0700 Subject: [PATCH 28/61] tests for quality_max_bad_run_length --- qiime/process_seqs.py | 9 +++++++-- tests/test_process_seqs.py | 30 ++++++++++++++++++++++++++---- 2 files changed, 33 insertions(+), 6 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 80b67e8aec..50e6435867 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -341,6 +341,7 @@ def _quality_max_bad_run_length(self, item): max_run_length = 0 run_start_idx = 0 max_run_start_idx = 0 + for idx, v in enumerate(item['Qual']): if v <= phred_quality_threshold: max_run_length += 1 @@ -348,12 +349,16 @@ def _quality_max_bad_run_length(self, item): if run_length > max_run_length: max_run_length = run_length max_run_start_idx = run_start_idx + run_length = 0 run_start_idx = idx + if max_run_length == 0: + max_run_start_idx = run_start_idx + if max_run_length > max_bad_run_length: - item['Qual'] = item['Qual'][:max_run_start_idx] - item['Seq'] = item['Sequence'][:max_run_start_idx] + item['Qual'] = item['Qual'][:max_run_start_idx+1] + item['Sequence'] = item['Sequence'][:max_run_start_idx+1] self.Stats['_quality_max_bad_run_length'] += 1 @requires(Option='phred_quality_threshold') diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 3c825831a8..4859fd6376 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -177,17 +177,39 @@ def setUp(self): self.mapping = mapping def _make_workflow_obj(self, options): - return SequenceWorkflow(options, Mapping=self.mapping) + return SequenceWorkflow(Options=options, Mapping=self.mapping) def test_workflow_construction(self): x = self._make_workflow_obj({'foo':'bar'}) - self.assertEqual(x.Stats, {}) def test_wf_init(self): wf_obj = self._make_workflow_obj({'foo':'bar'}) wf_obj.FinalState['Sequence'] = 'w00t' - wf_obj.wf_init(None) - self.assertEqual(set(wf_obj.FinalState.values()), set([None])) + wf_obj.wf_init({'Sequence':'foo'}) + self.assertEqual(set(wf_obj.FinalState.values()), set([None, 'foo'])) + + def test_quality_max_bad_run_length(self): + wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, + 'max_bad_run_length':3}) + item1 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} + exp1 = item1.copy() + + item2 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 6, 6, 1, 1, 6, 6, 6])} + exp2 = item2.copy() + + item3 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} + exp3 = {'Sequence':'AA', 'Qual':array([6, 6])} + + wf_obj._quality_max_bad_run_length(item1) + wf_obj._quality_max_bad_run_length(item2) + wf_obj._quality_max_bad_run_length(item3) + + self.assertEqual(item1, exp1) + self.assertEqual(item2, exp2) + self.assertEqual(item3, exp3) fasta1_simple = """>a abcde From a58c1545a586bf838f12af3dd3f79aac66ac40e4 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 19:53:25 -0700 Subject: [PATCH 29/61] tests for min_per_read_length_fraction --- qiime/process_seqs.py | 6 +++--- tests/test_process_seqs.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 50e6435867..f94e762629 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -367,9 +367,9 @@ def _quality_min_per_read_length_fraction(self, item): """Fail a sequence if a percentage of bad quality calls exist""" bad_bases = item['Qual'] < self.Options['phred_quality_threshold'] bad_bases_count = bad_bases.sum(dtype=float) - threshold = self.Options['min_per_read_length_fraction'] - - if (bad_bases_count / len(item['Sequence'])) < threshold: + threshold = 1 - self.Options['min_per_read_length_fraction'] + + if (bad_bases_count / len(item['Sequence'])) > threshold: self.Failed = True self.Stats['min_per_read_length_fraction'] += 1 diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 4859fd6376..37ca81efbc 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -211,6 +211,34 @@ def test_quality_max_bad_run_length(self): self.assertEqual(item2, exp2) self.assertEqual(item3, exp3) + def test_quality_min_per_read_length_fraction(self): + wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, + 'min_per_read_length_fraction':0.6}) + item1 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} + exp1 = item1.copy() + + item2 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 1, 6, 1, 1, 6, 6, 6])} + exp2 = item2.copy() + + item3 = {'Sequence':'AATTGGCC', + 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} + exp3 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} + + wf_obj._quality_min_per_read_length_fraction(item1) + self.assertFalse(wf_obj.Failed) + + wf_obj._quality_min_per_read_length_fraction(item2) + self.assertFalse(wf_obj.Failed) + + wf_obj._quality_min_per_read_length_fraction(item3) + self.assertTrue(wf_obj.Failed) + + self.assertEqual(item1, exp1) + self.assertEqual(item2, exp2) + self.assertEqual(item3, exp3) + fasta1_simple = """>a abcde >b From d02e017668954d48467e20463981b910f85c55e8 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 21:57:43 -0700 Subject: [PATCH 30/61] added tests for _demultiplex_encoded_barcode --- qiime/process_seqs.py | 1 + tests/test_process_seqs.py | 51 +++++++++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index f94e762629..993e919d0c 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -418,6 +418,7 @@ def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, if sample is None: self.Failed = True + self.Stats['Unknown barcode'] += 1 else: self.FinalState['Sample'] = sample diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 37ca81efbc..af84f31505 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -238,7 +238,55 @@ def test_quality_min_per_read_length_fraction(self): self.assertEqual(item1, exp1) self.assertEqual(item2, exp2) self.assertEqual(item3, exp3) + + def test_demultiplex_golay12(self): + # this is a wrapper, tested in test_deultiplex_encoded_barcode + pass + + def test_demultiplex_hamming8(self): + # this is a wrapper, tested in test_deultiplex_encoded_barcode + pass + + def test_demultiplex_encoded_barcode(self): + wf_obj = self._make_workflow_obj({}) + + needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} + exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} + from_sequence = {'Barcode':None, 'Sequence':'GGAGACAAGGGAAATTAATT'} + unknown_barcode = {'Barcode':'ACACCTGGTGAT', 'Sequence':'AATTGGCC'} + + wf_obj.wf_init(needs_a_fix) + wf_obj._demultiplex_encoded_barcode(needs_a_fix) + self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGT') + self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 1) + self.assertEqual(wf_obj.FinalState['Corrected barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.FinalState['Sample'], 's5') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(exact) + wf_obj._demultiplex_encoded_barcode(exact) + self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) + self.assertEqual(wf_obj.FinalState['Corrected barcode'], None) + self.assertEqual(wf_obj.FinalState['Sample'], 's5') + self.assertFalse(wf_obj.Failed) + wf_obj.wf_init(from_sequence) + wf_obj._demultiplex_encoded_barcode(from_sequence) + self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) + self.assertEqual(wf_obj.FinalState['Corrected barcode'], None) + self.assertEqual(wf_obj.FinalState['Sample'], 's5') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(unknown_barcode) + wf_obj._demultiplex_encoded_barcode(unknown_barcode) + self.assertEqual(wf_obj.FinalState['Original barcode'], 'ACACCTGGTGAT') + self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) + self.assertEqual(wf_obj.FinalState['Corrected barcode'], 'ACACCTGGTGAT') + self.assertEqual(wf_obj.FinalState['Sample'], None) + self.assertTrue(wf_obj.Failed) + fasta1_simple = """>a abcde >b @@ -325,7 +373,8 @@ def test_quality_min_per_read_length_fraction(self): {'s1':{'BarcodeSequence':'AAAAAAAAAAAA', 'LinkerPrimerSequence':''}, 's2':{'BarcodeSequence':'AAAAAAAAAAAC', 'LinkerPrimerSequence':''}, 's3':{'BarcodeSequence':'AAAAAAAAAAAG', 'LinkerPrimerSequence':''}, - 's4':{'BarcodeSequence':'AAAAAAAAAAAT', 'LinkerPrimerSequence':''} + 's4':{'BarcodeSequence':'AAAAAAAAAAAT', 'LinkerPrimerSequence':''}, + 's5':{'BarcodeSequence':'GGAGACAAGGGA', 'LinkerPrimerSequence':''} }, []) fastq1 = """@990:2:4:11271:5323#1/1 From 5b7d60b8175794eefdb9ca0e0ae7f69329a752d1 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 22:00:47 -0700 Subject: [PATCH 31/61] added tests for _demultiplex_max_barcode_error --- qiime/process_seqs.py | 2 +- tests/test_process_seqs.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 993e919d0c..5b0c55bbec 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -425,7 +425,7 @@ def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, @requires(Option='max_barcode_error') def _demultiplex_max_barcode_error(self, item): """ """ - bc_errors = self.Options['max_bc_errors'] + bc_errors = self.Options['max_barcode_error'] if self.FinalState['Corrected barcode errors'] > bc_errors: self.Failed = True self.Stats['exceeds_bc_errors'] += 1 diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index af84f31505..e5abef3899 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -287,6 +287,23 @@ def test_demultiplex_encoded_barcode(self): self.assertEqual(wf_obj.FinalState['Sample'], None) self.assertTrue(wf_obj.Failed) + def test_demultiplex_max_barcode_error(self): + wf_obj = self._make_workflow_obj({'max_barcode_error':0}) + + needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} + exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} + + wf_obj.wf_init(exact) + wf_obj._demultiplex_encoded_barcode(exact) + wf_obj._demultiplex_max_barcode_error(exact) + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(needs_a_fix) + wf_obj._demultiplex_encoded_barcode(needs_a_fix) + self.assertFalse(wf_obj.Failed) + wf_obj._demultiplex_max_barcode_error(needs_a_fix) + self.assertTrue(wf_obj.Failed) + fasta1_simple = """>a abcde >b From c44aee1095de573191e48687051a84563e7f9600 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 30 Jan 2014 22:04:54 -0700 Subject: [PATCH 32/61] added test docstrings --- tests/test_process_seqs.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index e5abef3899..4219178671 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -25,6 +25,7 @@ def setUp(self): self.reversed_fastq_gen = chain(fastq2_gen, fastq1_gen) def test_fastq_barcode_gen_simple(self): + """Test simple fastq/barcode generation""" exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), ('c', '123123', 'test3', array([-15, -14, -13] * 2)), @@ -43,12 +44,14 @@ def test_fastq_barcode_gen_simple(self): self.assertTrue((o['Qual'] == e['Qual']).all()) self.assertEqual(o['Barcode'], e['Barcode']) - def test_fasta_barcode_gen_mismatch_ids(self): + def test_fastq_barcode_gen_mismatch_ids(self): + """Verify fastq barcode mismatch error""" with self.assertRaises(ValueError): g = _fasta_qual_gen(self.reversed_fastq_gen, self.barcodes_gen) _ = list(g) def test_fastq_iterators_just_fastq(self): + """Test iterating fastq without barcodes""" exp_data = [('a', 'abcde', array([33,34,35,36,37])), ('b', 'asdasdasd', array([33,51,36] * 3)), ('c', '123123', array([-15, -14, -13] * 2)), @@ -64,6 +67,7 @@ def test_fastq_iterators_just_fastq(self): self.assertEqual(obs, exp) def test_fastq_iterators_barcodes(self): + """Test iterating fastq with barcodes""" exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), ('c', '123123', 'test3', array([-15, -14, -13] * 2)), @@ -100,6 +104,7 @@ def setUp(self): self.qual_bad_gen = chain(qual1_gen, qual2_bad_gen) def test_fasta_qual_gen_simple(self): + """Test fasta/qual gen""" exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), ('c', '123123', array([2, 2, 2, 2, 2, 2])), @@ -117,15 +122,18 @@ def test_fasta_qual_gen_simple(self): self.assertTrue((o['Qual'] == e['Qual']).all()) def test_fasta_qual_gen_mismatch_ids(self): + """Verify fasta/qual id mismatch error""" with self.assertRaises(ValueError): g = _fasta_qual_gen(self.reversed_fasta_gen, self.qual_gen) _ = list(g) def test_fasta_qual_gen_mismatch_length(self): + """Verify fasta/qual mismatch error""" with self.assertRaises(ValueError): _ = list(_fasta_qual_gen(self.fasta_gen, self.qual_bad_gen)) def test_fasta_iterators_just_fasta(self): + """Test that we can iterate over just fasta""" exp_data = [('a', 'abcde', None), ('b', 'asdasdasd', None), ('c', '123123', None), @@ -142,6 +150,7 @@ def test_fasta_iterators_just_fasta(self): self.assertEqual(obs, exp) def test_fasta_iterators_fasta_qual(self): + """Test that we can iterate over fasta with qual""" exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), ('c', '123123', array([2, 2, 2, 2, 2, 2])), @@ -177,18 +186,22 @@ def setUp(self): self.mapping = mapping def _make_workflow_obj(self, options): + """Helper method for creating workflows""" return SequenceWorkflow(Options=options, Mapping=self.mapping) def test_workflow_construction(self): + """Make sure we can construct using our helper method""" x = self._make_workflow_obj({'foo':'bar'}) def test_wf_init(self): + """Check the initialization method""" wf_obj = self._make_workflow_obj({'foo':'bar'}) wf_obj.FinalState['Sequence'] = 'w00t' wf_obj.wf_init({'Sequence':'foo'}) self.assertEqual(set(wf_obj.FinalState.values()), set([None, 'foo'])) def test_quality_max_bad_run_length(self): + """Verify max bad run length quality trimming""" wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, 'max_bad_run_length':3}) item1 = {'Sequence':'AATTGGCC', @@ -212,6 +225,7 @@ def test_quality_max_bad_run_length(self): self.assertEqual(item3, exp3) def test_quality_min_per_read_length_fraction(self): + """Verify minimum quality per read length""" wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, 'min_per_read_length_fraction':0.6}) item1 = {'Sequence':'AATTGGCC', @@ -248,6 +262,7 @@ def test_demultiplex_hamming8(self): pass def test_demultiplex_encoded_barcode(self): + """Verify decoding barcodes""" wf_obj = self._make_workflow_obj({}) needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} @@ -288,6 +303,7 @@ def test_demultiplex_encoded_barcode(self): self.assertTrue(wf_obj.Failed) def test_demultiplex_max_barcode_error(self): + """Verify failing max_barcode_error checking""" wf_obj = self._make_workflow_obj({'max_barcode_error':0}) needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} From 2fe6baf0666c610fac57cbe29361ea6cef4ca45a Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 08:46:49 -0700 Subject: [PATCH 33/61] tests for primer_check_forward --- qiime/process_seqs.py | 29 +++++++++--- tests/test_process_seqs.py | 94 +++++++++++++++++++++++++++++++++++++- 2 files changed, 116 insertions(+), 7 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 5b0c55bbec..9546e85e20 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -399,6 +399,8 @@ def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, putative_bc = item['Barcode'] else: putative_bc = item['Sequence'][:bc_length] + ### if this case happens, need to update item['Sequence'] to + ### trim off the barcode! self.FinalState['Original barcode'] = putative_bc @@ -422,6 +424,8 @@ def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, else: self.FinalState['Sample'] = sample + ### really need the requires to be a nonnone value: + # @requires(Option='max_barcode_error', Values=_not_none) @requires(Option='max_barcode_error') def _demultiplex_max_barcode_error(self, item): """ """ @@ -447,21 +451,30 @@ def _init_final_state(self, item): @requires(Option='instrument_type', Values='454') def _primer_instrument_454(self, item): """Check for a valid primer""" - self._primer_count_mismatches(item) + self._primer_check_forward(item) + @requires(Option='retain_primer') @requires(Option='max_primer_mismatch') - def _primer_count_mismatches(self, item): - """Assess primer mismatches""" + def _primer_check_forward(self, item): + """Attempt to determine if the forward primer exists and trim if there + + Warning: this method may do an in place update on item if retain primer + False. + """ seq = item['Sequence'] qual = item['Qual'] obs_barcode = self.FinalState['Final barcode'] - len_barcode = len(obs_barcode) + exp_primers = self.Primers.get(obs_barcode, None) + + if exp_primers is None: + self.Stats['unknown_primer_barcode_pair'] += 1 + self.Failed = True + return - exp_primers = self.Primers[obs_barcode] len_primer = len(exp_primers[0]) - obs_primer = seq[len_barcode:len_barcode + len_primer] + obs_primer = seq[:len_primer] mm = array([_count_mismatches(obs_primer, p) for p in exp_primers]) @@ -469,15 +482,19 @@ def _primer_count_mismatches(self, item): self.Failed = True self.Stats['max_primer_mismatch'] += 1 self.Stats['exceeds_max_primer_mismatch'] += 1 + return ### should decompose if not self.Options['retain_primer']: seq = seq[len_primer:] + item['Sequence'] = seq if qual is not None: qual = qual[len_primer:] + item['Qual'] = qual self.FinalState['Forward primer'] = obs_primer self.FinalState['Sequence'] = seq + self.FinalState['Qual'] = qual ### End primer methods diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 4219178671..6c20b70a60 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -320,6 +320,98 @@ def test_demultiplex_max_barcode_error(self): wf_obj._demultiplex_max_barcode_error(needs_a_fix) self.assertTrue(wf_obj.Failed) + def test_primer_instrument_454(self): + # individual tests for each method call by this function + pass + + def test_primer_check_forward(self): + """ """ + wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, + 'retain_primer':False}) + item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', + 'Qual':array([7,8])} + exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', + 'Qual':array([7,8])} + exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + + # item is modified in place in these operations as retain_primer is False + wf_obj.wf_init(item1) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item1) + self.assertEqual(item1, exp_item1) + self.assertEqual(wf_obj.FinalState['Sequence'], 'CC') + self.assertEqual(wf_obj.FinalState['Qual'], array([7,8])) + self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGG') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(item2) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item2) + self.assertEqual(item2, exp_item2) + self.assertEqual(wf_obj.FinalState['Sequence'], 'CC') + self.assertEqual(wf_obj.FinalState['Qual'], array([7,8])) + self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGC') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(item3) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item3) + self.assertEqual(item3, exp_item3) + self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') + self.assertEqual(wf_obj.FinalState['Qual'], None) + self.assertEqual(wf_obj.FinalState['Forward primer'], None) + self.assertTrue(wf_obj.Failed) + + # item is not modified in place as retain priemr is True + wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, + 'retain_primer':True}) + item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':array([1,2,3,4,5,6,7,8])} + + wf_obj.wf_init(item1) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item1) + self.assertEqual(item1, exp_item1) + self.assertEqual(wf_obj.FinalState['Sequence'], 'AATTGGCC') + self.assertEqual(wf_obj.FinalState['Qual'], array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGG') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(item2) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item2) + self.assertEqual(item2, exp_item2) + self.assertEqual(wf_obj.FinalState['Sequence'], 'AATTGCCC') + self.assertEqual(wf_obj.FinalState['Qual'], array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGC') + self.assertFalse(wf_obj.Failed) + + wf_obj.wf_init(item3) + wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' + wf_obj._primer_check_forward(item3) + self.assertEqual(item3, exp_item3) + self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') + self.assertEqual(wf_obj.FinalState['Qual'], None) + self.assertEqual(wf_obj.FinalState['Forward primer'], None) + self.assertTrue(wf_obj.Failed) + fasta1_simple = """>a abcde >b @@ -403,7 +495,7 @@ def test_demultiplex_max_barcode_error(self): """ mapping = MetadataMap( - {'s1':{'BarcodeSequence':'AAAAAAAAAAAA', 'LinkerPrimerSequence':''}, + {'s1':{'BarcodeSequence':'AAAAAAAAAAAA', 'LinkerPrimerSequence':'AATTGG,AATTCC'}, 's2':{'BarcodeSequence':'AAAAAAAAAAAC', 'LinkerPrimerSequence':''}, 's3':{'BarcodeSequence':'AAAAAAAAAAAG', 'LinkerPrimerSequence':''}, 's4':{'BarcodeSequence':'AAAAAAAAAAAT', 'LinkerPrimerSequence':''}, From 2ef930d50a6b38a74e1d2e0aaa4649c423a0004d Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 08:52:27 -0700 Subject: [PATCH 34/61] tests for sequence_length_check --- tests/test_process_seqs.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 6c20b70a60..a82c023aed 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -325,7 +325,7 @@ def test_primer_instrument_454(self): pass def test_primer_check_forward(self): - """ """ + """Pull the forward primer as expected""" wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, 'retain_primer':False}) item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', @@ -411,7 +411,22 @@ def test_primer_check_forward(self): self.assertEqual(wf_obj.FinalState['Qual'], None) self.assertEqual(wf_obj.FinalState['Forward primer'], None) self.assertTrue(wf_obj.Failed) - + + def test_sequence_length_check(self): + """Check the length of the sequence""" + wf_obj = self._make_workflow_obj({'min_seq_len':5}) + item1 = {'Sequence':'AATTGGCC'} + item2 = {'Sequence':'AATT'} + + wf_obj._sequence_length_check(item1) + self.assertFalse(wf_obj.Failed) + + wf_obj._sequence_length_check(item2) + self.assertTrue(wf_obj.Failed) + + def test_sequence_ambiguous_count(self): + pass + fasta1_simple = """>a abcde >b From ac0a3920c00ae0e8bfd49cc9ffa3c5dc6b1e4a2a Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 08:56:59 -0700 Subject: [PATCH 35/61] tests for sequence_ambiguous_count and force init of wf_obj.Failed --- tests/test_process_seqs.py | 34 +++++++++++++++++++++++++++++++++- 1 file changed, 33 insertions(+), 1 deletion(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index a82c023aed..29e5635b54 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -240,12 +240,15 @@ def test_quality_min_per_read_length_fraction(self): 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} exp3 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._quality_min_per_read_length_fraction(item1) self.assertFalse(wf_obj.Failed) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._quality_min_per_read_length_fraction(item2) self.assertFalse(wf_obj.Failed) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._quality_min_per_read_length_fraction(item3) self.assertTrue(wf_obj.Failed) @@ -271,6 +274,7 @@ def test_demultiplex_encoded_barcode(self): unknown_barcode = {'Barcode':'ACACCTGGTGAT', 'Sequence':'AATTGGCC'} wf_obj.wf_init(needs_a_fix) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(needs_a_fix) self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGT') self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 1) @@ -279,6 +283,7 @@ def test_demultiplex_encoded_barcode(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(exact) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(exact) self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) @@ -287,6 +292,7 @@ def test_demultiplex_encoded_barcode(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(from_sequence) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(from_sequence) self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) @@ -295,6 +301,7 @@ def test_demultiplex_encoded_barcode(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(unknown_barcode) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(unknown_barcode) self.assertEqual(wf_obj.FinalState['Original barcode'], 'ACACCTGGTGAT') self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) @@ -315,8 +322,10 @@ def test_demultiplex_max_barcode_error(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(needs_a_fix) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(needs_a_fix) self.assertFalse(wf_obj.Failed) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_max_barcode_error(needs_a_fix) self.assertTrue(wf_obj.Failed) @@ -343,6 +352,7 @@ def test_primer_check_forward(self): # item is modified in place in these operations as retain_primer is False wf_obj.wf_init(item1) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item1) self.assertEqual(item1, exp_item1) @@ -352,6 +362,7 @@ def test_primer_check_forward(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(item2) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item2) self.assertEqual(item2, exp_item2) @@ -361,6 +372,7 @@ def test_primer_check_forward(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(item3) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item3) self.assertEqual(item3, exp_item3) @@ -386,6 +398,7 @@ def test_primer_check_forward(self): 'Qual':array([1,2,3,4,5,6,7,8])} wf_obj.wf_init(item1) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item1) self.assertEqual(item1, exp_item1) @@ -395,6 +408,7 @@ def test_primer_check_forward(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(item2) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item2) self.assertEqual(item2, exp_item2) @@ -404,6 +418,7 @@ def test_primer_check_forward(self): self.assertFalse(wf_obj.Failed) wf_obj.wf_init(item3) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' wf_obj._primer_check_forward(item3) self.assertEqual(item3, exp_item3) @@ -418,14 +433,31 @@ def test_sequence_length_check(self): item1 = {'Sequence':'AATTGGCC'} item2 = {'Sequence':'AATT'} + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._sequence_length_check(item1) self.assertFalse(wf_obj.Failed) + wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._sequence_length_check(item2) self.assertTrue(wf_obj.Failed) def test_sequence_ambiguous_count(self): - pass + wf_obj = self._make_workflow_obj({'ambiguous_count':2}) + item1 = {'Sequence':'AATTGGCC'} + item2 = {'Sequence':'AANNNTT'} + item3 = {'Sequence':'AANTT'} + + wf_obj.Failed = False # note, normally handled by Workflow.__call__ + wf_obj._sequence_ambiguous_count(item1) + self.assertFalse(wf_obj.Failed) + + wf_obj.Failed = False # note, normally handled by Workflow.__call__ + wf_obj._sequence_ambiguous_count(item2) + self.assertTrue(wf_obj.Failed) + + wf_obj.Failed = False # note, normally handled by Workflow.__call__ + wf_obj._sequence_ambiguous_count(item3) + self.assertFalse(wf_obj.Failed) fasta1_simple = """>a abcde From 3c11bc08e9808425f6011c49720509817e168174 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 09:18:22 -0700 Subject: [PATCH 36/61] tests for _count_mismatches --- tests/test_process_seqs.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 29e5635b54..cda6f6439b 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -8,7 +8,7 @@ from qiime.parse import MinimalQualParser from qiime.process_seqs import (_fasta_qual_gen, fasta_iterator, _fastq_barcode_gen, fastq_iterator, _fastq_gen, - SequenceWorkflow) + SequenceWorkflow, _count_mismatches) from qiime.quality import ascii_to_phred64 from qiime.util import MetadataMap @@ -171,6 +171,19 @@ def test_fasta_iterators_fasta_qual(self): self.assertEqual(o['Sequence'], e['Sequence']) self.assertTrue((o['Qual'] == e['Qual']).all()) +class SupportTests(TestCase): + def setUp(self): + pass + + def test_count_mismatches(self): + """Count mismatches in sequences""" + s1 = "AATTGGCC" + s2 = "AATTCCCC" + + self.assertEqual(_count_mismatches(s1, s1), 0) + self.assertEqual(_count_mismatches(s1, s2), 2) + self.assertEqual(_count_mismatches(s2, s1), 2) + self.assertEqual(_count_mismatches(s2, s2), 0) class ProcessSeqsWorkflowTests(TestCase): """Basing structure off of test_split_libraries_fastq.py""" From e9357db9ad17be2cf345eaa749e85f15f4bbaaf1 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 09:41:49 -0700 Subject: [PATCH 37/61] whitespace --- qiime/process_seqs.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 9546e85e20..a633527318 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -200,11 +200,11 @@ def _has_qual(item): class SequenceWorkflow(Workflow): """Implement the sequence processing workflow - + All workflow methods expect an item that is dict-like with the following keys and value types: SequenceID : str - Sequence : str + Sequence : str Qual : np.array or None Barcode : str or None """ @@ -368,7 +368,7 @@ def _quality_min_per_read_length_fraction(self, item): bad_bases = item['Qual'] < self.Options['phred_quality_threshold'] bad_bases_count = bad_bases.sum(dtype=float) threshold = 1 - self.Options['min_per_read_length_fraction'] - + if (bad_bases_count / len(item['Sequence'])) > threshold: self.Failed = True self.Stats['min_per_read_length_fraction'] += 1 @@ -423,12 +423,12 @@ def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, self.Stats['Unknown barcode'] += 1 else: self.FinalState['Sample'] = sample - + ### really need the requires to be a nonnone value: # @requires(Option='max_barcode_error', Values=_not_none) @requires(Option='max_barcode_error') def _demultiplex_max_barcode_error(self, item): - """ """ + """Fail a sequence if it exceeds a max number of barcode errors""" bc_errors = self.Options['max_barcode_error'] if self.FinalState['Corrected barcode errors'] > bc_errors: self.Failed = True From 90a8cf8594184226bc9ba405fb7c64ba3c30c3bb Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Fri, 31 Jan 2014 10:25:34 -0700 Subject: [PATCH 38/61] removed some extraneous comments --- qiime/process_seqs.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index a633527318..09e4d451a0 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -178,24 +178,12 @@ def _has_qual(item): # sample_ids: via Command # store_demultiplexed_fastq: via Command # retain_unassigned_reads: via Command (Failed == False, Sample == None) -# max_bad_run_length: via wf_quality, -# UNTESTED -# min_per_read_length_fraction: via wf_quality, if truncation happens, do -# in place update on item -# STUBBED OUT -# sequence_max_n: via wf_sequence -# STUBBED OUT (ambiguous_count), UNTESTED # start_seq_id: via Command, but also hopefully deprecated in favor of # HDF5 format # rev_comp_barcode: via Command and iterators? only if the barcodes are separate # then it is possible to do at the iterator level... # rev_comp_mapping_barcodes: via Command # rev_comp: via Command and iterators -# phred_quality_threshold: via wf_quality -# STUBBED OUT, basically implemented? split_libraries_fastq is difficult to read... -# barcode_type: via wf_demultiplex -# DONE -# max_barcode_error: via wf_demultiplex # phred_offset: via Command and iterators class SequenceWorkflow(Workflow): From 7d5d040b5f29ce60aad45e05e7a0445eae772196 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 16 Mar 2014 13:55:04 -0600 Subject: [PATCH 39/61] stuff and stuff --- qiime/process_seqs.py | 249 ++++++++++++++++++++++-------------------- 1 file changed, 133 insertions(+), 116 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 09e4d451a0..8490c9dd22 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -15,153 +15,170 @@ from qiime.quality import ascii_to_phred33, ascii_to_phred64 from numpy import array -# pre allocate the iterable return. This is done for performance reasons to -# avoid frequent reallocations and to ensure a consistent object type -_iter_prealloc = {'SequenceID':None, - 'Sequence':None, - 'Qual':None, - 'Barcode':None} +class CannotHandleData(exception): + pass -def _reset_iter_prealloc(): - """Reset the buffer""" - for k in _iter_prealloc: - _iter_prealloc[k] = None +class SequenceIterator(object): + """Provide a standard API for interacting with sequence files + + + """ + # pre allocate the iterable return. This is done for performance reasons to + # avoid frequent reallocations and to ensure a consistent object type + _iter_prealloc = {'SequenceID':None, + 'Sequence':None, + 'Qual':None, + 'Barcode':None} -def _fasta_qual_gen(fasta_gen, qual_gen): - """Yield fasta and qual together + def __init__(self, fasta=None, qual=None, fastq=None, rc=False, + rc_barcodes=None): + if fasta is not None and fastq is not None: + raise CannotHandleData("Cannot handle both fasta and fastq files") - Raises ValueError if the sequence IDs and quality IDs are not in the same - order. Raises ValueError if the sequence length does not match the length - of the quality score. + if fasta is None qual is not None: + raise CannotHandleData("Cannot process qual without fasta data") - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): - if seq_id != qual_id: - raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) - if len(seq) != len(qual): - raise ValueError("%s is not equal length to %s!" % (seq_id, - qual_id)) - _iter_prealloc['SequenceID'] = seq_id - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = qual + def _reset_iter_prealloc(): + """Reset the buffer""" + for k in _iter_prealloc: + _iter_prealloc[k] = None - yield _iter_prealloc + def _fasta_qual_gen(fasta_gen, qual_gen): + """Yield fasta and qual together -def _fasta_gen(fasta_gens): - """Yield fasta data + Raises ValueError if the sequence IDs and quality IDs are not in the same + order. Raises ValueError if the sequence length does not match the length + of the quality score. - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for id_, seq in fasta_gens: - _iter_prealloc['SequenceID'] = id_ - _iter_prealloc['Sequence'] = seq - yield _iter_prealloc + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): + if seq_id != qual_id: + raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) + if len(seq) != len(qual): + raise ValueError("%s is not equal length to %s!" % (seq_id, + qual_id)) -def fasta_iterator(fasta_fps, qual_fps=None): - """Yield fasta and qual data + _iter_prealloc['SequenceID'] = seq_id + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = qual - Expects file-like objects. If qual_fps is not None, quality scores are - yielded. The return will either be: + yield _iter_prealloc - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([])} + def _fasta_gen(fasta_gens): + """Yield fasta data - or + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + for id_, seq in fasta_gens: + _iter_prealloc['SequenceID'] = id_ + _iter_prealloc['Sequence'] = seq + yield _iter_prealloc - {'SequenceID':foo, 'Sequence':bar, 'Qual':None} + def fasta_iterator(fasta_fps, qual_fps=None): + """Yield fasta and qual data - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _reset_iter_prealloc() + Expects file-like objects. If qual_fps is not None, quality scores are + yielded. The return will either be: - fasta_gens = chain(*[MinimalFastaParser(f) for f in fasta_fps]) + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([])} - if qual_fps is not None: - qual_gens = chain(*[MinimalQualParser(f) for f in qual_fps]) - gen = _fasta_qual_gen(fasta_gens, qual_gens) - else: - qual_gens = None - gen = _fasta_gen(fasta_gens) + or - return gen + {'SequenceID':foo, 'Sequence':bar, 'Qual':None} -def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): - """Yield fastq and barcode data + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + _reset_iter_prealloc() - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _gen = izip(fastq_gens, barcode_gens) - for (seqid, seq, qual), (bc_seqid, bc_seq, bc_qual) in _gen: - if seqid != bc_seqid: - raise ValueError("%s is not equal to %s!" % (seqid, bc_seqid)) - _iter_prealloc['SequenceID'] = seqid - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) - _iter_prealloc['Barcode'] = bc_seq - - yield _iter_prealloc - -def _fastq_gen(fastq_gens, phred_f): - """Yield fastq data - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for (seqid, seq, qual) in fastq_gens: - _iter_prealloc['SequenceID'] = seqid - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) + fasta_gens = chain(*[MinimalFastaParser(f) for f in fasta_fps]) - yield _iter_prealloc + if qual_fps is not None: + qual_gens = chain(*[MinimalQualParser(f) for f in qual_fps]) + gen = _fasta_qual_gen(fasta_gens, qual_gens) + else: + qual_gens = None + gen = _fasta_gen(fasta_gens) + + return gen -def fastq_iterator(fastq_fps, barcode_fps=None): - """Yield fastq data + def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): + """Yield fastq and barcode data - Expects file-like objects. If barcode_fps is not None, barcodes are also - yielded. The return will either be: + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + _gen = izip(fastq_gens, barcode_gens) + for (seqid, seq, qual), (bc_seqid, bc_seq, bc_qual) in _gen: + if seqid != bc_seqid: + raise ValueError("%s is not equal to %s!" % (seqid, bc_seqid)) + _iter_prealloc['SequenceID'] = seqid + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) + _iter_prealloc['Barcode'] = bc_seq + + yield _iter_prealloc + + def _fastq_gen(fastq_gens, phred_f): + """Yield fastq data + + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + for (seqid, seq, qual) in fastq_gens: + _iter_prealloc['SequenceID'] = seqid + _iter_prealloc['Sequence'] = seq + _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':foobar} + yield _iter_prealloc - or + def fastq_iterator(fastq_fps, barcode_fps=None): + """Yield fastq data - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':None} + Expects file-like objects. If barcode_fps is not None, barcodes are also + yielded. The return will either be: - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _reset_iter_prealloc() + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':foobar} - fastq_gens = chain(*[MinimalFastqParser(f) for f in fastq_fps]) + or - # peek - first_item = fastq_gens.next() - seqid, seq, qual = first_item - fastq_gens = chain([first_item], fastq_gens) + {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':None} - # from qiime.parse.parse_fastq_qual_score (v1.8.0) - if is_casava_v180_or_later('@%s' % seqid): - ascii_to_phred_f = ascii_to_phred33 - else: - ascii_to_phred_f = ascii_to_phred64 + Note: object yielded is updated on each iteration. A new object is _not_ + created on each iteration. This is done for performance reasons, where + quick testing showed a 50% reduction in runtime. + """ + _reset_iter_prealloc() - if barcode_fps: - barcode_gens = chain(*[MinimalFastqParser(f) for f in barcode_fps]) - gen = _fastq_barcode_gen(fastq_gens, barcode_gens, ascii_to_phred_f) - else: - gen = _fastq_gen(fastq_gens, ascii_to_phred_f) + fastq_gens = chain(*[MinimalFastqParser(f) for f in fastq_fps]) + + # peek + first_item = fastq_gens.next() + seqid, seq, qual = first_item + fastq_gens = chain([first_item], fastq_gens) + + # from qiime.parse.parse_fastq_qual_score (v1.8.0) + if is_casava_v180_or_later('@%s' % seqid): + ascii_to_phred_f = ascii_to_phred33 + else: + ascii_to_phred_f = ascii_to_phred64 + + if barcode_fps: + barcode_gens = chain(*[MinimalFastqParser(f) for f in barcode_fps]) + gen = _fastq_barcode_gen(fastq_gens, barcode_gens, ascii_to_phred_f) + else: + gen = _fastq_gen(fastq_gens, ascii_to_phred_f) - return gen + return gen ### can cythonize def _count_mismatches(seq1, seq2): From 2adbff1313b512aebd6b53075f3e3883720f9017 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 20 Apr 2014 18:11:22 -0600 Subject: [PATCH 40/61] MAINT: reflecting finalized workflow and iterators in skbio --- qiime/process_seqs.py | 553 ++++++++++++++++-------------------------- 1 file changed, 212 insertions(+), 341 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 8490c9dd22..fd84f823df 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -2,192 +2,25 @@ """Filter poor quality reads, trim barcodes/primers and assign to samples""" -from qiime.workflow.core import Workflow, requires, priority, no_requirements -from cogent.parse.fasta import MinimalFastaParser -from cogent.parse.fastq import MinimalFastqParser -from qiime.parse import MinimalQualParser -from itertools import chain, izip -from qiime.util import MetadataMap -from qiime.parse import is_casava_v180_or_later -from qiime.split_libraries import expand_degeneracies -from qiime.hamming import decode_barcode_8 as decode_hamming_8 -from qiime.golay import decode as decode_golay_12 -from qiime.quality import ascii_to_phred33, ascii_to_phred64 -from numpy import array - -class CannotHandleData(exception): - pass - -class SequenceIterator(object): - """Provide a standard API for interacting with sequence files - - - """ - # pre allocate the iterable return. This is done for performance reasons to - # avoid frequent reallocations and to ensure a consistent object type - _iter_prealloc = {'SequenceID':None, - 'Sequence':None, - 'Qual':None, - 'Barcode':None} - - def __init__(self, fasta=None, qual=None, fastq=None, rc=False, - rc_barcodes=None): - if fasta is not None and fastq is not None: - raise CannotHandleData("Cannot handle both fasta and fastq files") - - if fasta is None qual is not None: - raise CannotHandleData("Cannot process qual without fasta data") - - - def _reset_iter_prealloc(): - """Reset the buffer""" - for k in _iter_prealloc: - _iter_prealloc[k] = None - - def _fasta_qual_gen(fasta_gen, qual_gen): - """Yield fasta and qual together - - Raises ValueError if the sequence IDs and quality IDs are not in the same - order. Raises ValueError if the sequence length does not match the length - of the quality score. - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for (seq_id, seq), (qual_id, qual) in izip(fasta_gen, qual_gen): - if seq_id != qual_id: - raise ValueError("%s is not equal to %s!" % (seq_id, qual_id)) - if len(seq) != len(qual): - raise ValueError("%s is not equal length to %s!" % (seq_id, - qual_id)) - - _iter_prealloc['SequenceID'] = seq_id - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = qual - - yield _iter_prealloc - - def _fasta_gen(fasta_gens): - """Yield fasta data - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for id_, seq in fasta_gens: - _iter_prealloc['SequenceID'] = id_ - _iter_prealloc['Sequence'] = seq - yield _iter_prealloc - - def fasta_iterator(fasta_fps, qual_fps=None): - """Yield fasta and qual data - - Expects file-like objects. If qual_fps is not None, quality scores are - yielded. The return will either be: - - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([])} - - or - - {'SequenceID':foo, 'Sequence':bar, 'Qual':None} - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _reset_iter_prealloc() - - fasta_gens = chain(*[MinimalFastaParser(f) for f in fasta_fps]) - - if qual_fps is not None: - qual_gens = chain(*[MinimalQualParser(f) for f in qual_fps]) - gen = _fasta_qual_gen(fasta_gens, qual_gens) - else: - qual_gens = None - gen = _fasta_gen(fasta_gens) - - return gen - - def _fastq_barcode_gen(fastq_gens, barcode_gens, phred_f): - """Yield fastq and barcode data - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _gen = izip(fastq_gens, barcode_gens) - for (seqid, seq, qual), (bc_seqid, bc_seq, bc_qual) in _gen: - if seqid != bc_seqid: - raise ValueError("%s is not equal to %s!" % (seqid, bc_seqid)) - _iter_prealloc['SequenceID'] = seqid - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) - _iter_prealloc['Barcode'] = bc_seq - - yield _iter_prealloc - - def _fastq_gen(fastq_gens, phred_f): - """Yield fastq data - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - for (seqid, seq, qual) in fastq_gens: - _iter_prealloc['SequenceID'] = seqid - _iter_prealloc['Sequence'] = seq - _iter_prealloc['Qual'] = array([phred_f(q) for q in qual]) - - yield _iter_prealloc - - def fastq_iterator(fastq_fps, barcode_fps=None): - """Yield fastq data - - Expects file-like objects. If barcode_fps is not None, barcodes are also - yielded. The return will either be: - - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':foobar} - - or - - {'SequenceID':foo, 'Sequence':bar, 'Qual':array([]), 'Barcode':None} - - Note: object yielded is updated on each iteration. A new object is _not_ - created on each iteration. This is done for performance reasons, where - quick testing showed a 50% reduction in runtime. - """ - _reset_iter_prealloc() +import numpy as np - fastq_gens = chain(*[MinimalFastqParser(f) for f in fastq_fps]) +from collections import Counter +from itertools import izip - # peek - first_item = fastq_gens.next() - seqid, seq, qual = first_item - fastq_gens = chain([first_item], fastq_gens) +from skbio.core.workflow import Workflow, requires, method, not_none +from qiime.hamming import decode_barcode_8 as decode_hamming_8 +from qiime.golay import decode as decode_golay_12 - # from qiime.parse.parse_fastq_qual_score (v1.8.0) - if is_casava_v180_or_later('@%s' % seqid): - ascii_to_phred_f = ascii_to_phred33 - else: - ascii_to_phred_f = ascii_to_phred64 - if barcode_fps: - barcode_gens = chain(*[MinimalFastqParser(f) for f in barcode_fps]) - gen = _fastq_barcode_gen(fastq_gens, barcode_gens, ascii_to_phred_f) - else: - gen = _fastq_gen(fastq_gens, ascii_to_phred_f) +def count_mismatches(seq1, seq2): + """Counts mismatches between two sequences""" + return sum(a != b for a, b in izip(seq1, seq2)) - return gen -### can cythonize -def _count_mismatches(seq1, seq2): - """Counts mismatches between two sequences""" - return sum([a != b for a, b in zip(seq1, seq2)]) +def has_qual(state): + """Check if state has Qual""" + return state['Qual'] is not None -def _has_qual(item): - """Check if an item has Qual""" - return item['Qual'] is not None ### notes on splitlib fastq options: # barcode_read_fps: via Command @@ -195,7 +28,7 @@ def _has_qual(item): # sample_ids: via Command # store_demultiplexed_fastq: via Command # retain_unassigned_reads: via Command (Failed == False, Sample == None) -# start_seq_id: via Command, but also hopefully deprecated in favor of +# start_seq_id: via Command, but also hopefully deprecated in favor of # HDF5 format # rev_comp_barcode: via Command and iterators? only if the barcodes are separate # then it is possible to do at the iterator level... @@ -206,138 +39,187 @@ def _has_qual(item): class SequenceWorkflow(Workflow): """Implement the sequence processing workflow - All workflow methods expect an item that is dict-like with the following - keys and value types: - SequenceID : str - Sequence : str - Qual : np.array or None - Barcode : str or None - """ - FinalState = {'Forward primer':None, - 'Reverse primer':None, - 'Sequence':None, - 'Qual':None, - 'Sample':None, - 'Original barcode':None, - 'Corrected barcode':None, - 'Final barcode':None, - 'Corrected barcode errors':None} - - def _stage_state(self): - """Fish out barcodes and primers from the mapping data""" - bcs = {} - primers = {} - for sample in self.Mapping.SampleIds: - sample_bc = self.Mapping.getCategoryValue(sample, 'BarcodeSequence') - if sample_bc in bcs: - raise ValueError("Duplicate barcode found for sample %s" \ - % sample) - bcs[sample_bc] = sample - - sample_primers = self.Mapping.getCategoryValue(sample, - 'LinkerPrimerSequence') - all_sample_primers = sample_primers.split(',') - primers[sample_bc] = expand_degeneracies(all_sample_primers) - - self.Barcodes = bcs - self.Primers = primers - - def _sanity_check(self): - name = self.__class__.__name__ - if not hasattr(self, 'Mapping'): - raise AttributeError("%s is missing Mapping!" % name) - - if not isinstance(self.Mapping, MetadataMap): - raise AttributeError("self.Mapping is not of type MetadataMap") + The sequence processing workflow manages the following tasks, executed in + the following order:: + + 1. Quality filtering and trimming of primary sequence data + 2. Demultiplexing and assigning reads to samples + 3. Validating primers + 4. Sequence level quality checks (e.g., ambiguous bases) + + Execution of a task will only happen if it is relevant for the data. For + instance, quality checks are only performed if the data being operated on + has quality scores associated. Runtime control through options are also + supported, such that, for instance, the Golay decoder is only executed if + indicated by the options passed to the `SequenceWorkflow`. + + Any task can trigger `failed` and update `stats`. + + Parameters + ---------- + options : dict + Runtime options. See ``Options`` for more details + barcodes : dict + Mapping of barcode nucleotide sequence to a sample ID + primers : dict + Mapping of nucleotide sequence to enumerated possible primers + + Options + ------- + ## DESCRIBE EACH OPTION THAT CAN AFFECT WHAT METHODS ARE EXECUTED + + Attributes + ---------- + state + stats + options + barcodes + primers + + """ + + def __init__(self, *args, **kwargs): + if 'barcodes' not in kwargs: + kwargs['barcodes'] = {} + if 'primers' not in kwargs: + kwargs['primers'] = {} + + kwargs['state'] = {'Forward primer': None, + 'Reverse primer': None, + 'Sequence': None, + 'Qual': None, + 'Barcode': None, + 'Barcode Qual': None, + 'Sample': None, + 'Original barcode': None, + 'Corrected barcode': None, + 'Final barcode': None, + 'Corrected barcode errors': None} + + kwargs['stats'] = Counter() + + super(SequenceWorkflow, self).__init__(self, *args, **kwargs) + + def initialize_state(self, item): + """Reset `state` and update with the current `item` + + Parameters + ---------- + item : dict + An item from the `Workflow` generator + """ + for k in self.state: + self.state[k] = None + self.state.update(item) ### Start Workflow methods - @priority(1000) - @no_requirements - def wf_init(self, item): - """Perform per sequence state initialization - - This workflow group will reset FinalState and will set the following in - FinalState: - - Sequence - """ - self._init_final_state(item) - - @priority(200) - @requires(ValidData=_has_qual) - def wf_quality(self, item): + @method(priority=200) + @requires(state=has_qual) + def wf_quality(self): """Check sequence quality - This workflow group may update _item_ in the event of a sequence - truncation due to quality! - + Changes to `state` + ------------------ + This workflow group may trim `state['Sequence']` and `state['Qual']` if + quality trimming is enabled. + + Triggers for `failed` + --------------------- + - If to many nucleotides in `Sequence` are of poor quality. + + Impacted `stats` + ---------------- + quality_max_bad_run_length + Incremented if the read contained a run of poor quality bases + min_per_read_length_fraction + Incrememted if to many positions in `Sequence` are of poor quality """ - self._quality_max_bad_run_length(item) - self._quality_min_per_read_length_fraction(item) + self._quality_max_bad_run_length() + self._quality_min_per_read_length_fraction() - @priority(150) - @requires(Option='demultiplex', Values=True) - def wf_demultiplex(self, item): + @method(priority=150) + @requires(option='demultiplex', values=True) + def wf_demultiplex(self): """Demultiplex a sequence - If the sequence has not Failed, the following fields in FinalState will - be set: - - Sample - Original barcode - Final barcode - - In addition, the following field may be set: - - Corrected barcode - Corrected barcode errors - - This workflow group can trigger Failed and update Stats + Changes to `state` + ------------------ + Sample + Original barcode + Final barcode + Barcode errors + + Triggers for `failed` + --------------------- + - If a sequence could not be associated to a sample + - If the number of errors observed in the barcode exceed tolerance + + Impacted `stats` + ---------------- + barcode_corrected + Incremented if a barcode was corrected + unknown_barcode + Incremented if an unknown barcode was observed + exceed_barcode_error + Incremented if the number of observed barcode + errors exceeded tolerance """ - self._demultiplex_golay12(item) - self._demultiplex_hamming8(item) - self._demultiplex_other(item) - self._demultiplex_max_barcode_error(item) + self._demultiplex_golay12() + self._demultiplex_hamming8() + self._demultiplex_other() + self._demultiplex_max_barcode_error() ### should this be wf_instrument for instriument specific checks? - @priority(100) - @requires(Option='check_primer', Values=True) - def wf_primer(self, item): + @method(priority=100) + @requires(option='check_primer', values=True) + def wf_primer(self): """Perform primer validation - Primer validation may update the following keys in FinalState: - - Sequence - Forward primer - Reverse primer - - This workflow group can trigger Failed and update Stats + Changes to `state` + ------------------ + Sequence + Qual + Forward primer + Reverse primer + + Triggers for `failed` + --------------------- + - If the `primer` mapping does not contain primers associated with the + nucleotide barcode + Impacted `stats` + ---------------- + unknown_primer_barcode_pair """ - self._primer_instrument_454(item) + self._primer_instrument_454() - @priority(50) - @no_requirements - def wf_sequence(self, item): + @method(priority=50) + def wf_sequence(self): """Final sequence level checks Sequence level checks will not alter FinalState but may trigger Failed and update Stats + + Changes to `state` + ------------------ + + Triggers for `failed` + --------------------- + + Impacted `stats` + ---------------- """ - self._sequence_length_check(item) - self._sequence_ambiguous_count(item) + self._sequence_length_check() + self._sequence_ambiguous_count() ### End Workflow methods ### Start quality methods - @requires(Option='phred_quality_threshold') - @requires(Option='max_bad_run_length') - def _quality_max_bad_run_length(self, item): - """Fail sequence if there is a poor quality run - - Warning: this method can modify item in place - """ + @requires(option='phred_quality_threshold') + @requires(option='max_bad_run_length') + def _quality_max_bad_run_length(self): + """Fail sequence if there is a poor quality run""" max_bad_run_length = self.Options['max_bad_run_length'] phred_quality_threshold = self.Options['phred_quality_threshold'] @@ -347,7 +229,7 @@ def _quality_max_bad_run_length(self, item): run_start_idx = 0 max_run_start_idx = 0 - for idx, v in enumerate(item['Qual']): + for idx, v in enumerate(self.state['Qual']): if v <= phred_quality_threshold: max_run_length += 1 else: @@ -362,112 +244,101 @@ def _quality_max_bad_run_length(self, item): max_run_start_idx = run_start_idx if max_run_length > max_bad_run_length: - item['Qual'] = item['Qual'][:max_run_start_idx+1] - item['Sequence'] = item['Sequence'][:max_run_start_idx+1] - self.Stats['_quality_max_bad_run_length'] += 1 + self.state['Qual'] = self.state['Qual'][:max_run_start_idx+1] + self.state['Sequence'] = self.state['Sequence'][:max_run_start_idx+1] + self.stats['_quality_max_bad_run_length'] += 1 @requires(Option='phred_quality_threshold') @requires(Option='min_per_read_length_fraction') - def _quality_min_per_read_length_fraction(self, item): + def _quality_min_per_read_length_fraction(self): """Fail a sequence if a percentage of bad quality calls exist""" - bad_bases = item['Qual'] < self.Options['phred_quality_threshold'] + bad_bases = self.state['Qual'] < self.Options['phred_quality_threshold'] bad_bases_count = bad_bases.sum(dtype=float) threshold = 1 - self.Options['min_per_read_length_fraction'] - if (bad_bases_count / len(item['Sequence'])) > threshold: - self.Failed = True - self.Stats['min_per_read_length_fraction'] += 1 + if (bad_bases_count / len(self.state['Sequence'])) > threshold: + self.failed = True + self.stats['min_per_read_length_fraction'] += 1 ### End quality methods ### Start demultiplex methods @requires(Option='barcode_type', Values='golay_12') - def _demultiplex_golay12(self, item): + def _demultiplex_golay12(self): """Correct and decode a Golay 12nt barcode""" - self._demultiplex_encoded_barcode(item, decode_golay_12, 12) + self._demultiplex_encoded_barcode(decode_golay_12, 12) @requires(Option='barcode_type', Values='hamming_8') - def _demultiplex_hamming8(self, item): + def _demultiplex_hamming8(self): """Correct and decode a Hamming 8nt barcode""" - self._demultiplex_encoded_barcode(item, decode_hamming_8, 8) + self._demultiplex_encoded_barcode(decode_hamming_8, 8) @requires(Option='barcode_type', Values='variable') - def _demultiplex_other(self, item): + def _demultiplex_other(self): """Decode a variable length barcode""" raise NotImplementedError #### use kwargs for method and bc_length - def _demultiplex_encoded_barcode(self, item, method=decode_golay_12, - bc_length=12): + def _demultiplex_encoded_barcode(self, method, bc_length): """Correct and decode an encoded barcode""" - if item['Barcode'] is not None: - putative_bc = item['Barcode'] + if self.state['Barcode'] is not None: + from_sequence = False + putative_bc = self.state['Barcode'] else: - putative_bc = item['Sequence'][:bc_length] - ### if this case happens, need to update item['Sequence'] to - ### trim off the barcode! + from_sequence = True + putative_bc = self.state['Sequence'][:bc_length] self.FinalState['Original barcode'] = putative_bc if putative_bc in self.Barcodes: - self.FinalState['Corrected barcode errors'] = 0 + self.FinalState['Barcode errors'] = 0 final_bc = putative_bc sample = self.Barcodes[putative_bc] else: corrected, num_errors = method(putative_bc) final_bc = corrected - self.FinalState['Corrected barcode'] = corrected - self.FinalState['Corrected barcode errors'] = num_errors - self.Stats['Barcodes corrected'] += 1 + self.FinalState['Barcode errors'] = num_errors + self.Stats['barcode_corrected'] += 1 sample = self.Barcodes.get(corrected, None) self.FinalState['Final barcode'] = final_bc + if from_sequence: + self.state['Sequence'] = self.state['Sequence'][bc_length:] + if sample is None: self.Failed = True - self.Stats['Unknown barcode'] += 1 + self.Stats['unknown_barcode'] += 1 else: self.FinalState['Sample'] = sample - ### really need the requires to be a nonnone value: - # @requires(Option='max_barcode_error', Values=_not_none) - @requires(Option='max_barcode_error') - def _demultiplex_max_barcode_error(self, item): + @requires(Option='max_barcode_error', Values=not_none) + def _demultiplex_max_barcode_error(self): """Fail a sequence if it exceeds a max number of barcode errors""" bc_errors = self.Options['max_barcode_error'] - if self.FinalState['Corrected barcode errors'] > bc_errors: + if self.FinalState['Barcode errors'] > bc_errors: self.Failed = True - self.Stats['exceeds_bc_errors'] += 1 + self.Stats['exceed_barcode_error'] += 1 ### End demultiplex methods - ### Start init methods - - def _init_final_state(self, item): - """Reset per sequence state""" - for k in self.FinalState: - self.FinalState[k] = None - self.FinalState['Sequence'] = item['Sequence'] - - ### End init methods - ### Start primer methods @requires(Option='instrument_type', Values='454') - def _primer_instrument_454(self, item): + def _primer_instrument_454(self): """Check for a valid primer""" - self._primer_check_forward(item) + self._primer_check_forward() @requires(Option='retain_primer') @requires(Option='max_primer_mismatch') - def _primer_check_forward(self, item): + def _primer_check_forward(self): """Attempt to determine if the forward primer exists and trim if there - + Warning: this method may do an in place update on item if retain primer False. """ - seq = item['Sequence'] - qual = item['Qual'] + seq = self.state['Sequence'] + qual = self.state['Qual'] obs_barcode = self.FinalState['Final barcode'] exp_primers = self.Primers.get(obs_barcode, None) @@ -481,7 +352,7 @@ def _primer_check_forward(self, item): obs_primer = seq[:len_primer] - mm = array([_count_mismatches(obs_primer, p) for p in exp_primers]) + mm = np.array([count_mismatches(obs_primer, p) for p in exp_primers]) if (mm > self.Options['max_primer_mismatch']).all(): self.Failed = True @@ -492,10 +363,10 @@ def _primer_check_forward(self, item): ### should decompose if not self.Options['retain_primer']: seq = seq[len_primer:] - item['Sequence'] = seq + self.state['Sequence'] = seq if qual is not None: qual = qual[len_primer:] - item['Qual'] = qual + self.state['Qual'] = qual self.FinalState['Forward primer'] = obs_primer self.FinalState['Sequence'] = seq @@ -506,16 +377,16 @@ def _primer_check_forward(self, item): ### Start sequence methods @requires(Option='min_seq_len') - def _sequence_length_check(self, item): + def _sequence_length_check(self): """Checks minimum sequence length""" - if len(item['Sequence']) < self.Options['min_seq_len']: + if len(self.state['Sequence']) < self.Options['min_seq_len']: self.Failed = True self.Stats['min_seq_len'] += 1 @requires(Option='ambiguous_count') - def _sequence_ambiguous_count(self, item): + def _sequence_ambiguous_count(self): """Fail if the number of N characters is greater than threshold""" - count = item['Sequence'].count('N') + count = self.state['Sequence'].count('N') if count > self.Options['ambiguous_count']: self.Failed = True self.Stats['ambiguous_count'] += 1 From 9018eaf0a87c0091da682891d0b45864eb7e2ac9 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 20 Apr 2014 18:27:04 -0600 Subject: [PATCH 41/61] MAINT: more updated to reflect finalized workflow --- qiime/process_seqs.py | 102 ++++++++++++++++++++---------------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index fd84f823df..98300c7447 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -197,8 +197,8 @@ def wf_primer(self): def wf_sequence(self): """Final sequence level checks - Sequence level checks will not alter FinalState but may trigger Failed - and update Stats + Sequence level checks will not alter `state` but may trigger Failed + and update `stats` Changes to `state` ------------------ @@ -220,8 +220,8 @@ def wf_sequence(self): @requires(option='max_bad_run_length') def _quality_max_bad_run_length(self): """Fail sequence if there is a poor quality run""" - max_bad_run_length = self.Options['max_bad_run_length'] - phred_quality_threshold = self.Options['phred_quality_threshold'] + max_bad_run_length = self.options['max_bad_run_length'] + phred_quality_threshold = self.options['phred_quality_threshold'] # can cythonize run_length = 0 @@ -248,13 +248,13 @@ def _quality_max_bad_run_length(self): self.state['Sequence'] = self.state['Sequence'][:max_run_start_idx+1] self.stats['_quality_max_bad_run_length'] += 1 - @requires(Option='phred_quality_threshold') - @requires(Option='min_per_read_length_fraction') + @requires(option='phred_quality_threshold') + @requires(option='min_per_read_length_fraction') def _quality_min_per_read_length_fraction(self): """Fail a sequence if a percentage of bad quality calls exist""" - bad_bases = self.state['Qual'] < self.Options['phred_quality_threshold'] + bad_bases = self.state['Qual'] < self.options['phred_quality_threshold'] bad_bases_count = bad_bases.sum(dtype=float) - threshold = 1 - self.Options['min_per_read_length_fraction'] + threshold = 1 - self.options['min_per_read_length_fraction'] if (bad_bases_count / len(self.state['Sequence'])) > threshold: self.failed = True @@ -263,17 +263,17 @@ def _quality_min_per_read_length_fraction(self): ### End quality methods ### Start demultiplex methods - @requires(Option='barcode_type', Values='golay_12') + @requires(option='barcode_type', values='golay_12') def _demultiplex_golay12(self): """Correct and decode a Golay 12nt barcode""" self._demultiplex_encoded_barcode(decode_golay_12, 12) - @requires(Option='barcode_type', Values='hamming_8') + @requires(option='barcode_type', values='hamming_8') def _demultiplex_hamming8(self): """Correct and decode a Hamming 8nt barcode""" self._demultiplex_encoded_barcode(decode_hamming_8, 8) - @requires(Option='barcode_type', Values='variable') + @requires(option='barcode_type', values='variable') def _demultiplex_other(self): """Decode a variable length barcode""" raise NotImplementedError @@ -288,49 +288,49 @@ def _demultiplex_encoded_barcode(self, method, bc_length): from_sequence = True putative_bc = self.state['Sequence'][:bc_length] - self.FinalState['Original barcode'] = putative_bc + self.state['Original barcode'] = putative_bc - if putative_bc in self.Barcodes: - self.FinalState['Barcode errors'] = 0 + if putative_bc in self.barcodes: + self.state['Barcode errors'] = 0 final_bc = putative_bc - sample = self.Barcodes[putative_bc] + sample = self.barcodes[putative_bc] else: corrected, num_errors = method(putative_bc) final_bc = corrected - self.FinalState['Barcode errors'] = num_errors - self.Stats['barcode_corrected'] += 1 - sample = self.Barcodes.get(corrected, None) + self.state['Barcode errors'] = num_errors + self.stats['barcode_corrected'] += 1 + sample = self.barcodes.get(corrected, None) - self.FinalState['Final barcode'] = final_bc + self.state['Final barcode'] = final_bc if from_sequence: self.state['Sequence'] = self.state['Sequence'][bc_length:] if sample is None: - self.Failed = True - self.Stats['unknown_barcode'] += 1 + self.failed = True + self.stats['unknown_barcode'] += 1 else: - self.FinalState['Sample'] = sample + self.state['Sample'] = sample - @requires(Option='max_barcode_error', Values=not_none) + @requires(option='max_barcode_error', values=not_none) def _demultiplex_max_barcode_error(self): """Fail a sequence if it exceeds a max number of barcode errors""" - bc_errors = self.Options['max_barcode_error'] - if self.FinalState['Barcode errors'] > bc_errors: - self.Failed = True - self.Stats['exceed_barcode_error'] += 1 + bc_errors = self.options['max_barcode_error'] + if self.state['Barcode errors'] > bc_errors: + self.failed = True + self.stats['exceed_barcode_error'] += 1 ### End demultiplex methods ### Start primer methods - @requires(Option='instrument_type', Values='454') + @requires(option='instrument_type', values='454') def _primer_instrument_454(self): """Check for a valid primer""" self._primer_check_forward() - @requires(Option='retain_primer') - @requires(Option='max_primer_mismatch') + @requires(option='retain_primer') + @requires(option='max_primer_mismatch', values=not_none) def _primer_check_forward(self): """Attempt to determine if the forward primer exists and trim if there @@ -340,12 +340,12 @@ def _primer_check_forward(self): seq = self.state['Sequence'] qual = self.state['Qual'] - obs_barcode = self.FinalState['Final barcode'] - exp_primers = self.Primers.get(obs_barcode, None) + obs_barcode = self.state['Final barcode'] + exp_primers = self.primers.get(obs_barcode, None) if exp_primers is None: - self.Stats['unknown_primer_barcode_pair'] += 1 - self.Failed = True + self.stats['unknown_primer_barcode_pair'] += 1 + self.failed = True return len_primer = len(exp_primers[0]) @@ -354,41 +354,37 @@ def _primer_check_forward(self): mm = np.array([count_mismatches(obs_primer, p) for p in exp_primers]) - if (mm > self.Options['max_primer_mismatch']).all(): - self.Failed = True - self.Stats['max_primer_mismatch'] += 1 - self.Stats['exceeds_max_primer_mismatch'] += 1 + if (mm > self.options['max_primer_mismatch']).all(): + self.failed = True + self.stats['exceeds_max_primer_mismatch'] += 1 return - ### should decompose - if not self.Options['retain_primer']: + if not self.options['retain_primer']: seq = seq[len_primer:] - self.state['Sequence'] = seq if qual is not None: qual = qual[len_primer:] - self.state['Qual'] = qual - self.FinalState['Forward primer'] = obs_primer - self.FinalState['Sequence'] = seq - self.FinalState['Qual'] = qual + self.state['Forward primer'] = obs_primer + self.state['Sequence'] = seq + self.state['Qual'] = qual ### End primer methods ### Start sequence methods - @requires(Option='min_seq_len') + @requires(option='min_seq_len') def _sequence_length_check(self): """Checks minimum sequence length""" - if len(self.state['Sequence']) < self.Options['min_seq_len']: - self.Failed = True - self.Stats['min_seq_len'] += 1 + if len(self.state['Sequence']) < self.options['min_seq_len']: + self.failed = True + self.stats['min_seq_len'] += 1 - @requires(Option='ambiguous_count') + @requires(option='ambiguous_count') def _sequence_ambiguous_count(self): """Fail if the number of N characters is greater than threshold""" count = self.state['Sequence'].count('N') - if count > self.Options['ambiguous_count']: - self.Failed = True - self.Stats['ambiguous_count'] += 1 + if count > self.options['ambiguous_count']: + self.failed = True + self.stats['ambiguous_count'] += 1 ### End sequence methods From c48893295b6bd413d054a5cd78a44ddbe65a11a1 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Sun, 20 Apr 2014 21:31:37 -0600 Subject: [PATCH 42/61] ENH: new Seqs object for generating sequence data for the workflow --- qiime/process_seqs.py | 102 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 101 insertions(+), 1 deletion(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 98300c7447..7ed53838e2 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -7,6 +7,7 @@ from collections import Counter from itertools import izip +from skbio.factory.sequence import factory from skbio.core.workflow import Workflow, requires, method, not_none from qiime.hamming import decode_barcode_8 as decode_hamming_8 from qiime.golay import decode as decode_golay_12 @@ -17,11 +18,16 @@ def count_mismatches(seq1, seq2): return sum(a != b for a, b in izip(seq1, seq2)) -def has_qual(state): +def has_sequence_qual(state): """Check if state has Qual""" return state['Qual'] is not None +def has_barcode_qual(state): + """Check if state has Barcode Qual""" + return state['Barcode Qual'] is not None + + ### notes on splitlib fastq options: # barcode_read_fps: via Command # store_qual_scores: via Command @@ -36,6 +42,100 @@ def has_qual(state): # rev_comp: via Command and iterators # phred_offset: via Command and iterators + +class Seqs(object): + """Augmented sequence iterators + + This sequence iterator allows for optionally combining sequence reads with + barcode data, as well as performing transforms independently on the reads + or the barcode data. Barcode quality, if available, is also yielded. + + Attributes + ---------- + reads + barcodes + + Examples + -------- + >>> import os + >>> out = open('test_barcodes.fna', 'w') + >>> out.write(">s1\nAT\n>s2\nGC\n") + >>> out.close() + >>> out = open('test_seqs.fq', 'w') + >>> out.write("@s1\nAAAT\n+\nghgh\n@s2\nTTGG\n+\nfggh\n") + >>> outgz.close() + + >>> from qiime.process_seqs import Seqs + >>> it = Seqs(seq='test_seqs.fq', barcode='test_barcodes.fna') + >>> for rec in it: + ... print rec['SequenceID'] + ... print rec['Sequence'] + ... print rec['Qual'] + ... print rec['BarcodeID'] + ... print rec['Barcode'] + ... print rec['BarcodeQual'] + s1 + AAAT + [39 40 39 40] + s1 + AT + None + s2 + TTGG + [38 39 39 40] + s2 + GC + None + >>> os.remove('test_seqs.fq') + >>> os.remove('test_barcodes.fna') + + """ + + def __init__(self, seq, qual=None, barcode=None, barcode_qual=None, + seq_kwargs=None, barcode_kwargs=None): + + seq_kwargs = {} if seq_kwargs is None else seq_kwargs + self.reads = factory(seq=seq, qual=qual, **seq_kwargs) + + if barcode is None: + self.barcodes = None + else: + barcode_kwargs = {} if barcode_kwargs is None else barcode_kwargs + self.barcodes = factory(seq=barcode, qual=barcode_qual, + **barcode_kwargs) + + def __iter__(self): + remap = (('SequenceID', 'BarcodeID'), + ('Sequence', 'Barcode'), + ('QualID', 'BarcodeQualID'), + ('Qual', 'BarcodeQual')) + + rec = {'SequenceID': None, + 'Sequence': None, + 'QualID': None, + 'Qual': None, + 'BarcodeID': None, + 'Barcode': None, + 'BarcodeQualID': None, + 'BarcodeQual': None} + + if self.barcodes is None: + for seq in self.reads: + rec.update(seq) + yield rec + else: + for seq, barcode in izip(self.reads, self.barcodes): + rec.update(seq) + rec.update({new_k: barcode[old_k] for new_k, old_k in remap}) + + if rec['SequenceID'] != rec['BarcodeID']: + raise ValueError("ID mismatch. SequenceID: %s, " + "BarcodeID: %s" % (rec['SequenceID'], + rec['BarcodeID'])) + + yield rec + + class SequenceWorkflow(Workflow): """Implement the sequence processing workflow From c5f9f33199441a15491560b7c587a0b3e0205358 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Mon, 21 Apr 2014 10:08:23 -0600 Subject: [PATCH 43/61] ENH: added IterAdapter and tests --- qiime/process_seqs.py | 53 ++++---- tests/test_process_seqs.py | 249 ++++++++----------------------------- 2 files changed, 83 insertions(+), 219 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 7ed53838e2..5201375674 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -43,22 +43,25 @@ def has_barcode_qual(state): # phred_offset: via Command and iterators -class Seqs(object): - """Augmented sequence iterators +class IterAdapter(object): + """Sequence iterator adapter This sequence iterator allows for optionally combining sequence reads with barcode data, as well as performing transforms independently on the reads or the barcode data. Barcode quality, if available, is also yielded. + Essentially, this object augments the yielded type from the standard + scikit-bio `SequenceIterator` objects as to include optional information + about barcodes. + Attributes ---------- - reads - barcodes + seq + barcode Examples -------- - >>> import os - >>> out = open('test_barcodes.fna', 'w') + >>> from skbio.core.iterator import FastqIterator >>> out.write(">s1\nAT\n>s2\nGC\n") >>> out.close() >>> out = open('test_seqs.fq', 'w') @@ -91,18 +94,9 @@ class Seqs(object): """ - def __init__(self, seq, qual=None, barcode=None, barcode_qual=None, - seq_kwargs=None, barcode_kwargs=None): - - seq_kwargs = {} if seq_kwargs is None else seq_kwargs - self.reads = factory(seq=seq, qual=qual, **seq_kwargs) - - if barcode is None: - self.barcodes = None - else: - barcode_kwargs = {} if barcode_kwargs is None else barcode_kwargs - self.barcodes = factory(seq=barcode, qual=barcode_qual, - **barcode_kwargs) + def __init__(self, seq, barcode=None): + self.seq = seq + self.barcode = barcode def __iter__(self): remap = (('SequenceID', 'BarcodeID'), @@ -119,23 +113,34 @@ def __iter__(self): 'BarcodeQualID': None, 'BarcodeQual': None} - if self.barcodes is None: - for seq in self.reads: + if self.barcode is None: + for seq in self.seq: rec.update(seq) yield rec else: - for seq, barcode in izip(self.reads, self.barcodes): + for seq, barcode in izip(self.seq, self.barcode): rec.update(seq) - rec.update({new_k: barcode[old_k] for new_k, old_k in remap}) + rec.update({new_k: barcode[old_k] for old_k, new_k in remap}) + + base_seq_id = self._base_id(rec['SequenceID']) + base_bc_id = self._base_id(rec['BarcodeID']) - if rec['SequenceID'] != rec['BarcodeID']: + if base_seq_id != base_bc_id: raise ValueError("ID mismatch. SequenceID: %s, " "BarcodeID: %s" % (rec['SequenceID'], rec['BarcodeID'])) yield rec + def _base_id(self, id_): + """Fetch the base ID from a FASTQ sequence ID""" + base_pre180 = id_.split('/', 1)[0] + base_post180 = id_.split(' ', 1)[0] + if len(base_pre180) < len(base_post180): + return base_pre180 + else: + return base_post180 class SequenceWorkflow(Workflow): """Implement the sequence processing workflow @@ -215,7 +220,7 @@ def initialize_state(self, item): ### Start Workflow methods @method(priority=200) - @requires(state=has_qual) + @requires(state=has_sequence_qual) def wf_quality(self): """Check sequence quality diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index cda6f6439b..7406af9ac8 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -1,175 +1,16 @@ #!/usr/bin/env python -from itertools import chain, izip -from numpy import array -from cogent.util.unit_test import TestCase, main -from cogent.parse.fasta import MinimalFastaParser -from cogent.parse.fastq import MinimalFastqParser -from qiime.parse import MinimalQualParser -from qiime.process_seqs import (_fasta_qual_gen, - fasta_iterator, _fastq_barcode_gen, fastq_iterator, _fastq_gen, - SequenceWorkflow, _count_mismatches) -from qiime.quality import ascii_to_phred64 +from unittest import TestCase, main + +import numpy as np + +from future.builtins import zip + +from skbio.core.iterator import FastqIterator +from skbio.parse.sequences import parse_fastq +from qiime.process_seqs import IterAdapter, SequenceWorkflow, count_mismatches from qiime.util import MetadataMap -class FastqIteratorTests(TestCase): - def setUp(self): - fastq1_gen = MinimalFastqParser(fastq1_simple.splitlines()) - fastq2_gen = MinimalFastqParser(fastq2_simple.splitlines()) - barcodes1_gen = MinimalFastqParser(barcodes1_simple.splitlines()) - barcodes2_gen = MinimalFastqParser(barcodes2_simple.splitlines()) - - self.fastq_gen = chain(fastq1_gen, fastq2_gen) - self.barcodes_gen = chain(barcodes1_gen, barcodes2_gen) - - self.reversed_fastq_gen = chain(fastq2_gen, fastq1_gen) - - def test_fastq_barcode_gen_simple(self): - """Test simple fastq/barcode generation""" - exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), - ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), - ('c', '123123', 'test3', array([-15, -14, -13] * 2)), - ('x', 'abcdefg', 'test4', array([33,34,35,36,37,38,39])), - ('y', 'popopo', 'test5', array([48,47] * 3))] - exp = [] - for id_,seq,bc,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':bc}) - - obs = _fastq_barcode_gen(self.fastq_gen, self.barcodes_gen, - ascii_to_phred64) - for o,e in izip(obs,exp): - self.assertEqual(o['SequenceID'], e['SequenceID']) - self.assertEqual(o['Sequence'], e['Sequence']) - self.assertTrue((o['Qual'] == e['Qual']).all()) - self.assertEqual(o['Barcode'], e['Barcode']) - - def test_fastq_barcode_gen_mismatch_ids(self): - """Verify fastq barcode mismatch error""" - with self.assertRaises(ValueError): - g = _fasta_qual_gen(self.reversed_fastq_gen, self.barcodes_gen) - _ = list(g) - - def test_fastq_iterators_just_fastq(self): - """Test iterating fastq without barcodes""" - exp_data = [('a', 'abcde', array([33,34,35,36,37])), - ('b', 'asdasdasd', array([33,51,36] * 3)), - ('c', '123123', array([-15, -14, -13] * 2)), - ('x', 'abcdefg', array([33,34,35,36,37,38,39])), - ('y', 'popopo', array([48,47] * 3))] - exp = [] - for id_,seq,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':None}) - - open_fps = map(lambda x: x.splitlines(), [fastq1_simple, fastq2_simple]) - obs = [d.copy() for d in fastq_iterator(open_fps)] - self.assertEqual(obs, exp) - - def test_fastq_iterators_barcodes(self): - """Test iterating fastq with barcodes""" - exp_data = [('a', 'abcde', 'test1', array([33,34,35,36,37])), - ('b', 'asdasdasd', 'test2', array([33,51,36] * 3)), - ('c', '123123', 'test3', array([-15, -14, -13] * 2)), - ('x', 'abcdefg', 'test4', array([33,34,35,36,37,38,39])), - ('y', 'popopo', 'test5', array([48,47] * 3))] - exp = [] - for id_,seq,bc,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':bc}) - - splitter = lambda x: x.splitlines() - fastq_fps = map(splitter, [fastq1_simple, fastq2_simple]) - bc_fps = map(splitter, [barcodes1_simple, barcodes2_simple]) - - obs = fastq_iterator(fastq_fps, bc_fps) - for o,e in izip(obs,exp): - self.assertEqual(o['SequenceID'], e['SequenceID']) - self.assertEqual(o['Sequence'], e['Sequence']) - self.assertTrue((o['Qual'] == e['Qual']).all()) - self.assertEqual(o['Barcode'], e['Barcode']) - -class FastaIteratorTests(TestCase): - def setUp(self): - fasta1_gen = MinimalFastaParser(fasta1_simple.splitlines()) - qual1_gen = MinimalQualParser(qual1_simple.splitlines()) - fasta2_gen = MinimalFastaParser(fasta2_simple.splitlines()) - qual2_gen = MinimalQualParser(qual2_simple.splitlines()) - qual2_bad_gen = MinimalQualParser(qual2_simple_bad.splitlines()) - - self.fasta_gen = chain(fasta1_gen, fasta2_gen) - self.qual_gen = chain(qual1_gen, qual2_gen) - - self.reversed_fasta_gen = chain(fasta2_gen, fasta1_gen) - self.qual_bad_gen = chain(qual1_gen, qual2_bad_gen) - - def test_fasta_qual_gen_simple(self): - """Test fasta/qual gen""" - exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), - ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), - ('c', '123123', array([2, 2, 2, 2, 2, 2])), - ('x', 'abcdefg', array([1, 2, 3, 4, 5, 6, 7])), - ('y', 'popopo', array([1, 1, 1, 1, 1, 1]))] - exp = [] - for id_,seq,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':None}) - - obs = _fasta_qual_gen(self.fasta_gen, self.qual_gen) - for o,e in izip(obs,exp): - self.assertEqual(o['SequenceID'], e['SequenceID']) - self.assertEqual(o['Sequence'], e['Sequence']) - self.assertTrue((o['Qual'] == e['Qual']).all()) - - def test_fasta_qual_gen_mismatch_ids(self): - """Verify fasta/qual id mismatch error""" - with self.assertRaises(ValueError): - g = _fasta_qual_gen(self.reversed_fasta_gen, self.qual_gen) - _ = list(g) - - def test_fasta_qual_gen_mismatch_length(self): - """Verify fasta/qual mismatch error""" - with self.assertRaises(ValueError): - _ = list(_fasta_qual_gen(self.fasta_gen, self.qual_bad_gen)) - - def test_fasta_iterators_just_fasta(self): - """Test that we can iterate over just fasta""" - exp_data = [('a', 'abcde', None), - ('b', 'asdasdasd', None), - ('c', '123123', None), - ('x', 'abcdefg', None), - ('y', 'popopo', None)] - - exp = [] - for id_,seq,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':None}) - - open_fps = map(lambda x: x.splitlines(), [fasta1_simple, fasta2_simple]) - obs = [d.copy() for d in fasta_iterator(open_fps)] - self.assertEqual(obs, exp) - - def test_fasta_iterators_fasta_qual(self): - """Test that we can iterate over fasta with qual""" - exp_data = [('a', 'abcde', array([1, 2, 3, 4, 5])), - ('b', 'asdasdasd', array([1,1,1,1,1,1,1,1,1])), - ('c', '123123', array([2, 2, 2, 2, 2, 2])), - ('x', 'abcdefg', array([1, 2, 3, 4, 5, 6, 7])), - ('y', 'popopo', array([1, 1, 1, 1, 1, 1]))] - - exp = [] - for id_,seq,qual in exp_data: - exp.append({'SequenceID':id_, 'Sequence':seq, 'Qual':qual, - 'Barcode':None}) - splitter = lambda x: x.splitlines() - fasta_fps = map(splitter, [fasta1_simple, fasta2_simple]) - qual_fps = map(splitter, [qual1_simple, qual2_simple]) - - obs = fasta_iterator(fasta_fps, qual_fps) - for o,e in izip(obs, exp): - self.assertEqual(o['SequenceID'], e['SequenceID']) - self.assertEqual(o['Sequence'], e['Sequence']) - self.assertTrue((o['Qual'] == e['Qual']).all()) class SupportTests(TestCase): def setUp(self): @@ -180,10 +21,28 @@ def test_count_mismatches(self): s1 = "AATTGGCC" s2 = "AATTCCCC" - self.assertEqual(_count_mismatches(s1, s1), 0) - self.assertEqual(_count_mismatches(s1, s2), 2) - self.assertEqual(_count_mismatches(s2, s1), 2) - self.assertEqual(_count_mismatches(s2, s2), 0) + self.assertEqual(count_mismatches(s1, s1), 0) + self.assertEqual(count_mismatches(s1, s2), 2) + self.assertEqual(count_mismatches(s2, s1), 2) + self.assertEqual(count_mismatches(s2, s2), 0) + +class IterAdapterTests(TestCase): + def test_iter(self): + seq_raw = fastq1.splitlines() + bc_raw = barcode_fastq1.splitlines() + + seq = FastqIterator([seq_raw]) + barcode = FastqIterator([bc_raw]) + it = IterAdapter(seq=seq, barcode=barcode) + + for rec, s, b in zip(it, parse_fastq(seq_raw), parse_fastq(bc_raw)): + self.assertEqual(rec['SequenceID'], s[0]) + self.assertEqual(rec['Sequence'], s[1]) + np.testing.assert_equal(rec['Qual'], s[2]) + self.assertEqual(rec['BarcodeID'], b[0]) + self.assertEqual(rec['Barcode'], b[1]) + np.testing.assert_equal(rec['BarcodeQual'], b[2]) + class ProcessSeqsWorkflowTests(TestCase): """Basing structure off of test_split_libraries_fastq.py""" @@ -205,7 +64,7 @@ def _make_workflow_obj(self, options): def test_workflow_construction(self): """Make sure we can construct using our helper method""" x = self._make_workflow_obj({'foo':'bar'}) - + def test_wf_init(self): """Check the initialization method""" wf_obj = self._make_workflow_obj({'foo':'bar'}) @@ -220,7 +79,7 @@ def test_quality_max_bad_run_length(self): item1 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} exp1 = item1.copy() - + item2 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 6, 1, 1, 6, 6, 6])} exp2 = item2.copy() @@ -244,7 +103,7 @@ def test_quality_min_per_read_length_fraction(self): item1 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} exp1 = item1.copy() - + item2 = {'Sequence':'AATTGGCC', 'Qual':array([6, 1, 6, 1, 1, 6, 6, 6])} exp2 = item2.copy() @@ -268,7 +127,7 @@ def test_quality_min_per_read_length_fraction(self): self.assertEqual(item1, exp1) self.assertEqual(item2, exp2) self.assertEqual(item3, exp3) - + def test_demultiplex_golay12(self): # this is a wrapper, tested in test_deultiplex_encoded_barcode pass @@ -280,7 +139,7 @@ def test_demultiplex_hamming8(self): def test_demultiplex_encoded_barcode(self): """Verify decoding barcodes""" wf_obj = self._make_workflow_obj({}) - + needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} from_sequence = {'Barcode':None, 'Sequence':'GGAGACAAGGGAAATTAATT'} @@ -303,7 +162,7 @@ def test_demultiplex_encoded_barcode(self): self.assertEqual(wf_obj.FinalState['Corrected barcode'], None) self.assertEqual(wf_obj.FinalState['Sample'], 's5') self.assertFalse(wf_obj.Failed) - + wf_obj.wf_init(from_sequence) wf_obj.Failed = False # note, normally handled by Workflow.__call__ wf_obj._demultiplex_encoded_barcode(from_sequence) @@ -348,19 +207,19 @@ def test_primer_instrument_454(self): def test_primer_check_forward(self): """Pull the forward primer as expected""" - wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, + wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, 'retain_primer':False}) - item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', 'Qual':array([1,2,3,4,5,6,7,8])} - item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} - item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', + exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', 'Qual':array([7,8])} - exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', + exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', 'Qual':array([7,8])} - exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} # item is modified in place in these operations as retain_primer is False @@ -390,24 +249,24 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward(item3) self.assertEqual(item3, exp_item3) self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') - self.assertEqual(wf_obj.FinalState['Qual'], None) + self.assertEqual(wf_obj.FinalState['Qual'], None) self.assertEqual(wf_obj.FinalState['Forward primer'], None) self.assertTrue(wf_obj.Failed) # item is not modified in place as retain priemr is True - wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, + wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, 'retain_primer':True}) - item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', 'Qual':array([1,2,3,4,5,6,7,8])} - item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} - item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', 'Qual':array([1,2,3,4,5,6,7,8])} wf_obj.wf_init(item1) @@ -436,10 +295,10 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward(item3) self.assertEqual(item3, exp_item3) self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') - self.assertEqual(wf_obj.FinalState['Qual'], None) + self.assertEqual(wf_obj.FinalState['Qual'], None) self.assertEqual(wf_obj.FinalState['Forward primer'], None) self.assertTrue(wf_obj.Failed) - + def test_sequence_length_check(self): """Check the length of the sequence""" wf_obj = self._make_workflow_obj({'min_seq_len':5}) From 27b8ef4dc8b7bd3f723a428dfc6f7b99fcd86138 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 12:10:47 -0600 Subject: [PATCH 44/61] MAINT: back inline with skbio changes --- qiime/process_seqs.py | 54 ++--- tests/test_process_seqs.py | 397 +++++++++++++++++++------------------ 2 files changed, 222 insertions(+), 229 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 5201375674..9d9f5b1b16 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -2,13 +2,13 @@ """Filter poor quality reads, trim barcodes/primers and assign to samples""" -import numpy as np from collections import Counter from itertools import izip -from skbio.factory.sequence import factory +import numpy as np from skbio.core.workflow import Workflow, requires, method, not_none + from qiime.hamming import decode_barcode_8 as decode_hamming_8 from qiime.golay import decode as decode_golay_12 @@ -28,21 +28,6 @@ def has_barcode_qual(state): return state['Barcode Qual'] is not None -### notes on splitlib fastq options: -# barcode_read_fps: via Command -# store_qual_scores: via Command -# sample_ids: via Command -# store_demultiplexed_fastq: via Command -# retain_unassigned_reads: via Command (Failed == False, Sample == None) -# start_seq_id: via Command, but also hopefully deprecated in favor of -# HDF5 format -# rev_comp_barcode: via Command and iterators? only if the barcodes are separate -# then it is possible to do at the iterator level... -# rev_comp_mapping_barcodes: via Command -# rev_comp: via Command and iterators -# phred_offset: via Command and iterators - - class IterAdapter(object): """Sequence iterator adapter @@ -61,15 +46,15 @@ class IterAdapter(object): Examples -------- - >>> from skbio.core.iterator import FastqIterator + >>> out = open("test_barcodes.fna", 'w') >>> out.write(">s1\nAT\n>s2\nGC\n") >>> out.close() >>> out = open('test_seqs.fq', 'w') >>> out.write("@s1\nAAAT\n+\nghgh\n@s2\nTTGG\n+\nfggh\n") >>> outgz.close() - >>> from qiime.process_seqs import Seqs - >>> it = Seqs(seq='test_seqs.fq', barcode='test_barcodes.fna') + >>> from qiime.process_seqs import IterAdapter + >>> it = IterAdapter(seq='test_seqs.fq', barcode='test_barcodes.fna') >>> for rec in it: ... print rec['SequenceID'] ... print rec['Sequence'] @@ -94,7 +79,7 @@ class IterAdapter(object): """ - def __init__(self, seq, barcode=None): + def __init__(self, seq, barcode=None, **kwargs): self.seq = seq self.barcode = barcode @@ -141,6 +126,8 @@ def _base_id(self, id_): return base_pre180 else: return base_post180 + + class SequenceWorkflow(Workflow): """Implement the sequence processing workflow @@ -186,24 +173,23 @@ class SequenceWorkflow(Workflow): def __init__(self, *args, **kwargs): if 'barcodes' not in kwargs: kwargs['barcodes'] = {} + if 'primers' not in kwargs: kwargs['primers'] = {} - kwargs['state'] = {'Forward primer': None, - 'Reverse primer': None, - 'Sequence': None, - 'Qual': None, - 'Barcode': None, - 'Barcode Qual': None, - 'Sample': None, - 'Original barcode': None, - 'Corrected barcode': None, - 'Final barcode': None, - 'Corrected barcode errors': None} + state = {'Forward primer': None, + 'Reverse primer': None, + 'Sequence': None, + 'Qual': None, + 'Barcode': None, + 'Barcode Qual': None, + 'Sample': None, + 'Original barcode': None, + 'Final barcode': None, + 'Barcode errors': None} kwargs['stats'] = Counter() - - super(SequenceWorkflow, self).__init__(self, *args, **kwargs) + super(SequenceWorkflow, self).__init__(state, *args, **kwargs) def initialize_state(self, item): """Reset `state` and update with the current `item` diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 7406af9ac8..94fe1502ce 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -3,11 +3,12 @@ from unittest import TestCase, main import numpy as np +import numpy.testing as npt from future.builtins import zip +from skbio import FastqIterator +from skbio import parse_fastq -from skbio.core.iterator import FastqIterator -from skbio.parse.sequences import parse_fastq from qiime.process_seqs import IterAdapter, SequenceWorkflow, count_mismatches from qiime.util import MetadataMap @@ -31,11 +32,11 @@ def test_iter(self): seq_raw = fastq1.splitlines() bc_raw = barcode_fastq1.splitlines() - seq = FastqIterator([seq_raw]) - barcode = FastqIterator([bc_raw]) + seq = FastqIterator([seq_raw], phred_offset=64) + barcode = FastqIterator([bc_raw], phred_offset=64) it = IterAdapter(seq=seq, barcode=barcode) - for rec, s, b in zip(it, parse_fastq(seq_raw), parse_fastq(bc_raw)): + for rec, s, b in zip(it, parse_fastq(seq_raw, phred_offset=64), parse_fastq(bc_raw, phred_offset=64)): self.assertEqual(rec['SequenceID'], s[0]) self.assertEqual(rec['Sequence'], s[1]) np.testing.assert_equal(rec['Qual'], s[2]) @@ -56,77 +57,90 @@ def setUp(self): self.fastq2_expected_default = fastq2_expected_default self.fastq1_expected_single_barcode = fastq1_expected_single_barcode self.mapping = mapping + self.primers = \ + {v['BarcodeSequence']: v['LinkerPrimerSequence'].split(',') + for v in mapping._metadata.values()} + self.barcodes = {v['BarcodeSequence']: k + for k, v in mapping._metadata.items()} def _make_workflow_obj(self, options): """Helper method for creating workflows""" - return SequenceWorkflow(Options=options, Mapping=self.mapping) + return SequenceWorkflow(options=options, mapping=self.mapping, + primers=self.primers, barcodes=self.barcodes) def test_workflow_construction(self): """Make sure we can construct using our helper method""" x = self._make_workflow_obj({'foo':'bar'}) - def test_wf_init(self): + def test_initialize_state(self): """Check the initialization method""" wf_obj = self._make_workflow_obj({'foo':'bar'}) - wf_obj.FinalState['Sequence'] = 'w00t' - wf_obj.wf_init({'Sequence':'foo'}) - self.assertEqual(set(wf_obj.FinalState.values()), set([None, 'foo'])) + wf_obj.state['Sequence'] = 'w00t' + wf_obj.initialize_state({'Sequence':'foo'}) + self.assertEqual(set(wf_obj.state.values()), set([None, 'foo'])) def test_quality_max_bad_run_length(self): """Verify max bad run length quality trimming""" - wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, - 'max_bad_run_length':3}) - item1 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} + wf_obj = self._make_workflow_obj({'phred_quality_threshold': 5, + 'max_bad_run_length': 3}) + item1 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 6, 6, 6, 6, 6, 6])} exp1 = item1.copy() - item2 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 6, 6, 1, 1, 6, 6, 6])} + item2 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 6, 1, 1, 6, 6, 6])} exp2 = item2.copy() - item3 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} - exp3 = {'Sequence':'AA', 'Qual':array([6, 6])} + item3 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 1, 1, 1, 1, 6, 6])} + exp3 = {'Sequence': 'AA', 'Qual': np.array([6, 6])} - wf_obj._quality_max_bad_run_length(item1) - wf_obj._quality_max_bad_run_length(item2) - wf_obj._quality_max_bad_run_length(item3) + wf_obj.state = item1 + wf_obj._quality_max_bad_run_length() + wf_obj.state = item2 + wf_obj._quality_max_bad_run_length() + wf_obj.state = item3 + wf_obj._quality_max_bad_run_length() - self.assertEqual(item1, exp1) - self.assertEqual(item2, exp2) - self.assertEqual(item3, exp3) + npt.assert_equal(item1, exp1) + npt.assert_equal(item2, exp2) + npt.assert_equal(item3, exp3) def test_quality_min_per_read_length_fraction(self): """Verify minimum quality per read length""" - wf_obj = self._make_workflow_obj({'phred_quality_threshold':5, - 'min_per_read_length_fraction':0.6}) - item1 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 6, 6, 6, 6, 6, 6, 6])} + wf_obj = self._make_workflow_obj({'phred_quality_threshold': 5, + 'min_per_read_length_fraction': 0.6}) + item1 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 6, 6, 6, 6, 6, 6])} exp1 = item1.copy() - item2 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 1, 6, 1, 1, 6, 6, 6])} + item2 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 1, 6, 1, 1, 6, 6, 6])} exp2 = item2.copy() - item3 = {'Sequence':'AATTGGCC', - 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} - exp3 = {'Sequence':'AATTGGCC', 'Qual':array([6, 6, 1, 1, 1, 1, 6, 6])} + item3 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 1, 1, 1, 1, 6, 6])} + exp3 = {'Sequence': 'AATTGGCC', + 'Qual': np.array([6, 6, 1, 1, 1, 1, 6, 6])} - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._quality_min_per_read_length_fraction(item1) - self.assertFalse(wf_obj.Failed) + wf_obj.state = item1 + wf_obj.failed = False + wf_obj._quality_min_per_read_length_fraction() + self.assertFalse(wf_obj.failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._quality_min_per_read_length_fraction(item2) - self.assertFalse(wf_obj.Failed) + wf_obj.state = item2 + wf_obj.failed = False + wf_obj._quality_min_per_read_length_fraction() + self.assertFalse(wf_obj.failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._quality_min_per_read_length_fraction(item3) - self.assertTrue(wf_obj.Failed) + wf_obj.state = item3 + wf_obj.failed = False + wf_obj._quality_min_per_read_length_fraction() + self.assertTrue(wf_obj.failed) - self.assertEqual(item1, exp1) - self.assertEqual(item2, exp2) - self.assertEqual(item3, exp3) + npt.assert_equal(item1, exp1) + npt.assert_equal(item2, exp2) + npt.assert_equal(item3, exp3) def test_demultiplex_golay12(self): # this is a wrapper, tested in test_deultiplex_encoded_barcode @@ -138,68 +152,72 @@ def test_demultiplex_hamming8(self): def test_demultiplex_encoded_barcode(self): """Verify decoding barcodes""" - wf_obj = self._make_workflow_obj({}) + wf_obj = self._make_workflow_obj({'demultiplex': True, + 'barcode_type': 'golay_12'}) needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} from_sequence = {'Barcode':None, 'Sequence':'GGAGACAAGGGAAATTAATT'} unknown_barcode = {'Barcode':'ACACCTGGTGAT', 'Sequence':'AATTGGCC'} - wf_obj.wf_init(needs_a_fix) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_encoded_barcode(needs_a_fix) - self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGT') - self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 1) - self.assertEqual(wf_obj.FinalState['Corrected barcode'], 'GGAGACAAGGGA') - self.assertEqual(wf_obj.FinalState['Sample'], 's5') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(exact) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_encoded_barcode(exact) - self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') - self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) - self.assertEqual(wf_obj.FinalState['Corrected barcode'], None) - self.assertEqual(wf_obj.FinalState['Sample'], 's5') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(from_sequence) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_encoded_barcode(from_sequence) - self.assertEqual(wf_obj.FinalState['Original barcode'], 'GGAGACAAGGGA') - self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) - self.assertEqual(wf_obj.FinalState['Corrected barcode'], None) - self.assertEqual(wf_obj.FinalState['Sample'], 's5') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(unknown_barcode) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_encoded_barcode(unknown_barcode) - self.assertEqual(wf_obj.FinalState['Original barcode'], 'ACACCTGGTGAT') - self.assertEqual(wf_obj.FinalState['Corrected barcode errors'], 0) - self.assertEqual(wf_obj.FinalState['Corrected barcode'], 'ACACCTGGTGAT') - self.assertEqual(wf_obj.FinalState['Sample'], None) - self.assertTrue(wf_obj.Failed) + wf_obj.initialize_state(needs_a_fix) + wf_obj.failed = False + wf_obj.wf_demultiplex() + + self.assertEqual(wf_obj.state['Original barcode'], 'GGAGACAAGGGT') + self.assertEqual(wf_obj.state['Barcode errors'], 1) + self.assertEqual(wf_obj.state['Final barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.state['Sample'], 's5') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(exact) + wf_obj.failed = False + wf_obj.wf_demultiplex() + + self.assertEqual(wf_obj.state['Original barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.state['Barcode errors'], 0) + self.assertEqual(wf_obj.state['Final barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.state['Sample'], 's5') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(from_sequence) + wf_obj.failed = False + wf_obj.wf_demultiplex() + + self.assertEqual(wf_obj.state['Original barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.state['Barcode errors'], 0) + self.assertEqual(wf_obj.state['Final barcode'], 'GGAGACAAGGGA') + self.assertEqual(wf_obj.state['Sample'], 's5') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(unknown_barcode) + wf_obj.failed = False + wf_obj.wf_demultiplex() + + self.assertEqual(wf_obj.state['Original barcode'], 'ACACCTGGTGAT') + self.assertEqual(wf_obj.state['Barcode errors'], 0) + self.assertEqual(wf_obj.state['Final barcode'], 'ACACCTGGTGAT') + self.assertEqual(wf_obj.state['Sample'], None) + self.assertTrue(wf_obj.failed) def test_demultiplex_max_barcode_error(self): """Verify failing max_barcode_error checking""" - wf_obj = self._make_workflow_obj({'max_barcode_error':0}) + wf_obj = self._make_workflow_obj({'demultiplex': True, + 'barcode_type': 'golay_12', + 'max_barcode_error':0}) needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} - wf_obj.wf_init(exact) - wf_obj._demultiplex_encoded_barcode(exact) - wf_obj._demultiplex_max_barcode_error(exact) - self.assertFalse(wf_obj.Failed) + wf_obj.failed = False + wf_obj.initialize_state(exact) + wf_obj.wf_demultiplex() + self.assertFalse(wf_obj.failed) - wf_obj.wf_init(needs_a_fix) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_encoded_barcode(needs_a_fix) - self.assertFalse(wf_obj.Failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._demultiplex_max_barcode_error(needs_a_fix) - self.assertTrue(wf_obj.Failed) + wf_obj.initialize_state(needs_a_fix) + wf_obj.failed = False + wf_obj.wf_demultiplex() + self.assertTrue(wf_obj.failed) def test_primer_instrument_454(self): # individual tests for each method call by this function @@ -207,111 +225,96 @@ def test_primer_instrument_454(self): def test_primer_check_forward(self): """Pull the forward primer as expected""" + # primer details sourced from self.mapping + wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, 'retain_primer':False}) - item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', - 'Qual':array([7,8])} - exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'CC', - 'Qual':array([7,8])} - exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - - # item is modified in place in these operations as retain_primer is False - wf_obj.wf_init(item1) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item1) - self.assertEqual(item1, exp_item1) - self.assertEqual(wf_obj.FinalState['Sequence'], 'CC') - self.assertEqual(wf_obj.FinalState['Qual'], array([7,8])) - self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGG') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(item2) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item2) - self.assertEqual(item2, exp_item2) - self.assertEqual(wf_obj.FinalState['Sequence'], 'CC') - self.assertEqual(wf_obj.FinalState['Qual'], array([7,8])) - self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGC') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(item3) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item3) - self.assertEqual(item3, exp_item3) - self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') - self.assertEqual(wf_obj.FinalState['Qual'], None) - self.assertEqual(wf_obj.FinalState['Forward primer'], None) - self.assertTrue(wf_obj.Failed) + item1 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + item2 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + item3 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + + wf_obj.initialize_state(item1) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'CC') + npt.assert_equal(wf_obj.state['Qual'], np.array([7,8])) + self.assertEqual(wf_obj.state['Forward primer'], 'AATTGG') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(item2) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'CC') + npt.assert_equal(wf_obj.state['Qual'], np.array([7,8])) + self.assertEqual(wf_obj.state['Forward primer'], 'AATTGC') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(item3) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'GGTTGCCC') + npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.state['Forward primer'], None) + self.assertTrue(wf_obj.failed) # item is not modified in place as retain priemr is True wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, 'retain_primer':True}) - item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item1 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item2 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - exp_item3 = {'Barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':array([1,2,3,4,5,6,7,8])} - - wf_obj.wf_init(item1) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item1) - self.assertEqual(item1, exp_item1) - self.assertEqual(wf_obj.FinalState['Sequence'], 'AATTGGCC') - self.assertEqual(wf_obj.FinalState['Qual'], array([1,2,3,4,5,6,7,8])) - self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGG') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(item2) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item2) - self.assertEqual(item2, exp_item2) - self.assertEqual(wf_obj.FinalState['Sequence'], 'AATTGCCC') - self.assertEqual(wf_obj.FinalState['Qual'], array([1,2,3,4,5,6,7,8])) - self.assertEqual(wf_obj.FinalState['Forward primer'], 'AATTGC') - self.assertFalse(wf_obj.Failed) - - wf_obj.wf_init(item3) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj.FinalState['Final barcode'] = 'AAAAAAAAAAAA' - wf_obj._primer_check_forward(item3) - self.assertEqual(item3, exp_item3) - self.assertEqual(wf_obj.FinalState['Sequence'], 'GGTTGCCC') - self.assertEqual(wf_obj.FinalState['Qual'], None) - self.assertEqual(wf_obj.FinalState['Forward primer'], None) - self.assertTrue(wf_obj.Failed) + item1 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + item2 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + item3 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', + 'Qual':np.array([1,2,3,4,5,6,7,8])} + + wf_obj.initialize_state(item1) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'AATTGGCC') + npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.state['Forward primer'], 'AATTGG') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(item2) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'AATTGCCC') + npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.state['Forward primer'], 'AATTGC') + self.assertFalse(wf_obj.failed) + + wf_obj.initialize_state(item3) + wf_obj.failed = False + wf_obj._primer_check_forward() + + self.assertEqual(wf_obj.state['Sequence'], 'GGTTGCCC') + npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + self.assertEqual(wf_obj.state['Forward primer'], None) + self.assertTrue(wf_obj.failed) def test_sequence_length_check(self): """Check the length of the sequence""" - wf_obj = self._make_workflow_obj({'min_seq_len':5}) + wf_obj = self._make_workflow_obj(options={'min_seq_len':5}) item1 = {'Sequence':'AATTGGCC'} item2 = {'Sequence':'AATT'} - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._sequence_length_check(item1) - self.assertFalse(wf_obj.Failed) + wf_obj.state = item1 + wf_obj.failed = False # note, normally handled by Workflow.__call__ + wf_obj._sequence_length_check() + self.assertFalse(wf_obj.failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._sequence_length_check(item2) - self.assertTrue(wf_obj.Failed) + wf_obj.state = item2 + wf_obj.failed = False # note, normally handled by Workflow.__call__ + wf_obj._sequence_length_check() + self.assertTrue(wf_obj.failed) def test_sequence_ambiguous_count(self): wf_obj = self._make_workflow_obj({'ambiguous_count':2}) @@ -319,17 +322,21 @@ def test_sequence_ambiguous_count(self): item2 = {'Sequence':'AANNNTT'} item3 = {'Sequence':'AANTT'} - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._sequence_ambiguous_count(item1) - self.assertFalse(wf_obj.Failed) + wf_obj.state = item1 + wf_obj.failed = False + wf_obj._sequence_ambiguous_count() + self.assertFalse(wf_obj.failed) + + wf_obj.state = item2 + wf_obj.failed = False + wf_obj._sequence_ambiguous_count() + self.assertTrue(wf_obj.failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._sequence_ambiguous_count(item2) - self.assertTrue(wf_obj.Failed) + wf_obj.state = item3 + wf_obj.failed = False + wf_obj._sequence_ambiguous_count() + self.assertFalse(wf_obj.failed) - wf_obj.Failed = False # note, normally handled by Workflow.__call__ - wf_obj._sequence_ambiguous_count(item3) - self.assertFalse(wf_obj.Failed) fasta1_simple = """>a abcde From 5ce61066ecd73c3a1c1756331a1bdac12fca5b00 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 13:12:49 -0600 Subject: [PATCH 45/61] DOC: major docstring updates --- qiime/process_seqs.py | 177 ++++++++++++++++++++++++++++++------------ 1 file changed, 127 insertions(+), 50 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 9d9f5b1b16..6510bcf0fc 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -150,16 +150,78 @@ class SequenceWorkflow(Workflow): Parameters ---------- options : dict - Runtime options. See ``Options`` for more details + Runtime options. See Options for more details barcodes : dict - Mapping of barcode nucleotide sequence to a sample ID + Mapping of nucleotide barcode sequence to sample IDs primers : dict - Mapping of nucleotide sequence to enumerated possible primers + Mapping of nucleotide barcode sequences to possible primers Options ------- ## DESCRIBE EACH OPTION THAT CAN AFFECT WHAT METHODS ARE EXECUTED + State + ----- + The following keys are available in ``state``: + + Forward primer : str or None + The forward primer if applicable and if found. + Reverse primer : str or None + The reverse primer if applicable and if found. + Sequence : str + The sequence, trimmed as defined by runtime options (e.g., barcode, + quality, etc). + Qual : np.array(int) or None + Quality scores, trimmed as defined by runtime options (e.g., barcode, + quality, etc) or None if quality scores are not associated with the + sequences. + Barcode: str or None + The corresponding barcode if available prior to processing as may be + done with index reads. + Barcode qual: np.array(int) or None + The corresponding barcode quality if available prior to processing as + may be done with index reads. + Sample: str or None + The sample the sequence is associated with if a sample was determined + Original barcode: str or None + The original barcode observed in the sequence if the barcode is part of + the sequence, the index read, or None if no barcodes are in the data. + Final barcode: str or None + The final barcode which maybe error corrected or None if barcodes are + not applicable. + Barcode errors: int or None + The number of observed errors in the barcode sequence or None if + barcodes are not applicable. + + Stats + ----- + The following counts are tracked during a run. Note, because the + SequenceWorkflow short circuits if a failure is observed during processing, + and the specific steps executed are dependent on the runtime options and + data, the stats may be dependent on runtime conditions. For instance, since + demultiplexing is performed prior to sequence quality checks, if a failure + occurs during demultiplexing then quality stats, such as min_seq_len, will + not get incremented for those sequences. + + quality_max_bad_run_length + Number of sequences containing a run of poor quality bases + min_per_read_length_fraction + Number of sequences containing excessive poor quality bases + barcode_corrected + Number of sequences in which a barcode was corrected + unknown_barcode + Number of sequences in which an unknown barcode was observed + exceed_barcode_error + Number of barcodes with errors that exceeded tolerance + unknown_primer_barcode_pair + Number of unknown primer barcode pairs + exceeds_max_primer_mismatch + Number of primer mismatches exceeds tolerance + min_seq_len + Number of sequences whose length did not meet tolerance + ambiguous_count + Number of sequences that contained to many ambiguous characters + Attributes ---------- state @@ -168,7 +230,7 @@ class SequenceWorkflow(Workflow): barcodes primers - """ + """ def __init__(self, *args, **kwargs): if 'barcodes' not in kwargs: @@ -188,7 +250,17 @@ def __init__(self, *args, **kwargs): 'Final barcode': None, 'Barcode errors': None} - kwargs['stats'] = Counter() + kwargs['stats'] = { + 'quality_max_bad_run_length': 0, + 'min_per_read_length_fraction': 0, + 'barcode_corrected': 0, + 'unknown_barcode': 0, + 'exceed_barcode_error': 0, + 'unknown_primer_barcode_pair': 0, + 'exceeds_max_primer_mismatch': 0, + 'min_seq_len': 0, + 'ambiguous_count': 0} + super(SequenceWorkflow, self).__init__(state, *args, **kwargs) def initialize_state(self, item): @@ -210,21 +282,19 @@ def initialize_state(self, item): def wf_quality(self): """Check sequence quality + Notes + ----- + Changes to `state` - ------------------ - This workflow group may trim `state['Sequence']` and `state['Qual']` if - quality trimming is enabled. + ################## + + * `Sequence` and `Qual` may be trimmed if quality trimming is enabled. Triggers for `failed` - --------------------- - - If to many nucleotides in `Sequence` are of poor quality. - - Impacted `stats` - ---------------- - quality_max_bad_run_length - Incremented if the read contained a run of poor quality bases - min_per_read_length_fraction - Incrememted if to many positions in `Sequence` are of poor quality + ##################### + + * If to many nucleotides in `Sequence` are of poor quality. + """ self._quality_max_bad_run_length() self._quality_min_per_read_length_fraction() @@ -234,53 +304,57 @@ def wf_quality(self): def wf_demultiplex(self): """Demultiplex a sequence + Notes + ----- + Changes to `state` - ------------------ - Sample - Original barcode - Final barcode - Barcode errors + ################## + + * `Sample` will be set if an associated sample could be determined. + * `Original barcode` will be set to the original barcode regardless of + if the barcode occurred within sequence or as an index. + * `Final barcode` will be set to the final barcode with correction if + applicable. + * `Barcode errors` will contain the number of observed barcode errors. Triggers for `failed` - --------------------- - - If a sequence could not be associated to a sample - - If the number of errors observed in the barcode exceed tolerance - - Impacted `stats` - ---------------- - barcode_corrected - Incremented if a barcode was corrected - unknown_barcode - Incremented if an unknown barcode was observed - exceed_barcode_error - Incremented if the number of observed barcode - errors exceeded tolerance + ##################### + + * If a `Sequence` could not be associated to a sample. + * If the number of errors observed in the `Original barcode` exceed + tolerance. + """ self._demultiplex_golay12() self._demultiplex_hamming8() self._demultiplex_other() self._demultiplex_max_barcode_error() - ### should this be wf_instrument for instriument specific checks? + # Should this be wf_instrument for instriument specific checks? @method(priority=100) @requires(option='check_primer', values=True) def wf_primer(self): """Perform primer validation + Notes + ----- + Changes to `state` - ------------------ - Sequence - Qual - Forward primer - Reverse primer + ################## + + * `Sequence` may be trimmed if a primer is found, and if the runtime + option `retain_primer` is `False`. + * `Qual` will be trimmed if `Sequence` is trimmed. + * `Forward primer` will be set if a forward primer is identified. + * `Reverse primer` will be set if a reverse primer is identified. Triggers for `failed` - --------------------- - - If the `primer` mapping does not contain primers associated with the - nucleotide barcode - Impacted `stats` - ---------------- - unknown_primer_barcode_pair + ##################### + + * If the `primer` mapping does not contain primers associated with the + nucleotide barcode. + * If the number of primer mismatches exceeds tolerance. + """ self._primer_instrument_454() @@ -294,11 +368,14 @@ def wf_sequence(self): Changes to `state` ------------------ + No changes to state are made. + Triggers for `failed` --------------------- - Impacted `stats` - ---------------- + * If a sequence does not mean `min_seq_len`. + * If the number of ambiguous bases exceed `ambiguous_count`. + """ self._sequence_length_check() self._sequence_ambiguous_count() @@ -337,7 +414,7 @@ def _quality_max_bad_run_length(self): if max_run_length > max_bad_run_length: self.state['Qual'] = self.state['Qual'][:max_run_start_idx+1] self.state['Sequence'] = self.state['Sequence'][:max_run_start_idx+1] - self.stats['_quality_max_bad_run_length'] += 1 + self.stats['quality_max_bad_run_length'] += 1 @requires(option='phred_quality_threshold') @requires(option='min_per_read_length_fraction') From c8d0f9c82eb50fa2df6a5db15ce55ab257da3672 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 14:10:56 -0600 Subject: [PATCH 46/61] ENH/DOC: once through for cleanup, added quick method to count runs of poor qual --- qiime/process_seqs.py | 140 +++++++++++++++++++++++-------------- tests/test_process_seqs.py | 16 ++++- 2 files changed, 102 insertions(+), 54 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 6510bcf0fc..2fc18c48b7 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -2,20 +2,55 @@ """Filter poor quality reads, trim barcodes/primers and assign to samples""" - -from collections import Counter -from itertools import izip - import numpy as np +from future.builtins import zip from skbio.core.workflow import Workflow, requires, method, not_none from qiime.hamming import decode_barcode_8 as decode_hamming_8 from qiime.golay import decode as decode_golay_12 +def runs_of_ones(bits): + """Find positions and lengths of all runs of 1s + + Notes + ----- + Based on this SO post: + + http://stackoverflow.com/questions/1066758/find-length-of-sequences-of-ide\ + ntical-values-in-a-numpy-array + + Parameters + ---------- + bits : np.array(bool) + The vector to check for runs + + Returns + ------- + run_starts : np.array(int) + The index positions of the start of any observed runs (inclusive) + run_ends : np.array(int) + The index positions of the end of any observed runs (exclusive) + run_lengths : np.array(int) + The length of each run in index order with respect to run_starts and + run_ends + """ + # make sure all runs of ones are well-bounded + bounded = np.hstack(([0], bits, [0])) + + # get 1 at run starts and -1 at run ends + difs = np.diff(bounded) + run_starts, = np.where(difs > 0) + run_ends, = np.where(difs < 0) + + # because of the diff, the run_starts and run_ends are offset and need to + # be corrected to reflect index positions in `bits` + return (run_starts, run_ends, run_ends - run_starts) + + def count_mismatches(seq1, seq2): """Counts mismatches between two sequences""" - return sum(a != b for a, b in izip(seq1, seq2)) + return sum(a != b for a, b in zip(seq1, seq2)) def has_sequence_qual(state): @@ -31,6 +66,9 @@ def has_barcode_qual(state): class IterAdapter(object): """Sequence iterator adapter + Notes + ----- + This sequence iterator allows for optionally combining sequence reads with barcode data, as well as performing transforms independently on the reads or the barcode data. Barcode quality, if available, is also yielded. @@ -44,6 +82,12 @@ class IterAdapter(object): seq barcode + Raises + ------ + ValueError + If the sequence ID and barcode ID do not match (if barcodes are + provided). + Examples -------- >>> out = open("test_barcodes.fna", 'w') @@ -79,7 +123,7 @@ class IterAdapter(object): """ - def __init__(self, seq, barcode=None, **kwargs): + def __init__(self, seq, barcode=None): self.seq = seq self.barcode = barcode @@ -103,7 +147,7 @@ def __iter__(self): rec.update(seq) yield rec else: - for seq, barcode in izip(self.seq, self.barcode): + for seq, barcode in zip(self.seq, self.barcode): rec.update(seq) rec.update({new_k: barcode[old_k] for old_k, new_k in remap}) @@ -131,6 +175,8 @@ def _base_id(self, id_): class SequenceWorkflow(Workflow): """Implement the sequence processing workflow + Notes + ----- The sequence processing workflow manages the following tasks, executed in the following order:: @@ -199,9 +245,9 @@ class SequenceWorkflow(Workflow): SequenceWorkflow short circuits if a failure is observed during processing, and the specific steps executed are dependent on the runtime options and data, the stats may be dependent on runtime conditions. For instance, since - demultiplexing is performed prior to sequence quality checks, if a failure - occurs during demultiplexing then quality stats, such as min_seq_len, will - not get incremented for those sequences. + demultiplexing is performed after sequence quality checks, if a failure + occurs during quality checks then barcode stats, such as + exceed_barcode_error, will not be reflective of those sequences. quality_max_bad_run_length Number of sequences containing a run of poor quality bases @@ -284,17 +330,15 @@ def wf_quality(self): Notes ----- + Overall sequence quality checks and trimming. Changes to `state` ################## - * `Sequence` and `Qual` may be trimmed if quality trimming is enabled. Triggers for `failed` ##################### - * If to many nucleotides in `Sequence` are of poor quality. - """ self._quality_max_bad_run_length() self._quality_min_per_read_length_fraction() @@ -306,10 +350,10 @@ def wf_demultiplex(self): Notes ----- + Demultiplexing methods to assign sequences back to samples. Changes to `state` ################## - * `Sample` will be set if an associated sample could be determined. * `Original barcode` will be set to the original barcode regardless of if the barcode occurred within sequence or as an index. @@ -319,11 +363,9 @@ def wf_demultiplex(self): Triggers for `failed` ##################### - * If a `Sequence` could not be associated to a sample. * If the number of errors observed in the `Original barcode` exceed tolerance. - """ self._demultiplex_golay12() self._demultiplex_hamming8() @@ -338,10 +380,10 @@ def wf_primer(self): Notes ----- + Primer validation methods. Changes to `state` ################## - * `Sequence` may be trimmed if a primer is found, and if the runtime option `retain_primer` is `False`. * `Qual` will be trimmed if `Sequence` is trimmed. @@ -350,11 +392,9 @@ def wf_primer(self): Triggers for `failed` ##################### - * If the `primer` mapping does not contain primers associated with the nucleotide barcode. * If the number of primer mismatches exceeds tolerance. - """ self._primer_instrument_454() @@ -362,17 +402,17 @@ def wf_primer(self): def wf_sequence(self): """Final sequence level checks - Sequence level checks will not alter `state` but may trigger Failed - and update `stats` + Notes + ----- + Sequence level checks will not alter `state` but may trigger `failed` + and update `stats`. Changes to `state` - ------------------ - + ################## No changes to state are made. Triggers for `failed` - --------------------- - + ##################### * If a sequence does not mean `min_seq_len`. * If the number of ambiguous bases exceed `ambiguous_count`. @@ -380,48 +420,43 @@ def wf_sequence(self): self._sequence_length_check() self._sequence_ambiguous_count() - ### End Workflow methods + ### End Workflow groups methods ### Start quality methods @requires(option='phred_quality_threshold') @requires(option='max_bad_run_length') def _quality_max_bad_run_length(self): - """Fail sequence if there is a poor quality run""" + """Fail if there is a poor quality run""" max_bad_run_length = self.options['max_bad_run_length'] phred_quality_threshold = self.options['phred_quality_threshold'] - # can cythonize - run_length = 0 - max_run_length = 0 - run_start_idx = 0 - max_run_start_idx = 0 - - for idx, v in enumerate(self.state['Qual']): - if v <= phred_quality_threshold: - max_run_length += 1 - else: - if run_length > max_run_length: - max_run_length = run_length - max_run_start_idx = run_start_idx - - run_length = 0 - run_start_idx = idx - - if max_run_length == 0: - max_run_start_idx = run_start_idx - - if max_run_length > max_bad_run_length: - self.state['Qual'] = self.state['Qual'][:max_run_start_idx+1] - self.state['Sequence'] = self.state['Sequence'][:max_run_start_idx+1] + # cythonizable + poor_quality = self.state['Qual'] < phred_quality_threshold + poor_start, poor_stop, poor_length = runs_of_ones(poor_quality) + + if poor_length.size: + worst_idx = np.argmax(poor_length) + worst_len = poor_length[worst_idx] + worst_start_idx = poor_start[worst_idx] + else: + worst_idx = None + worst_len = -1 + worst_start_idx = None + + if worst_len > max_bad_run_length: + self.state['Qual'] = self.state['Qual'][:worst_start_idx] + self.state['Sequence'] = self.state['Sequence'][:worst_start_idx] self.stats['quality_max_bad_run_length'] += 1 @requires(option='phred_quality_threshold') @requires(option='min_per_read_length_fraction') def _quality_min_per_read_length_fraction(self): """Fail a sequence if a percentage of bad quality calls exist""" - bad_bases = self.state['Qual'] < self.options['phred_quality_threshold'] + phred_quality_threshold = self.options['phred_quality_threshold'] + bad_bases = self.state['Qual'] < phred_quality_threshold bad_bases_count = bad_bases.sum(dtype=float) + threshold = 1 - self.options['min_per_read_length_fraction'] if (bad_bases_count / len(self.state['Sequence'])) > threshold: @@ -446,7 +481,6 @@ def _demultiplex_other(self): """Decode a variable length barcode""" raise NotImplementedError - #### use kwargs for method and bc_length def _demultiplex_encoded_barcode(self, method, bc_length): """Correct and decode an encoded barcode""" if self.state['Barcode'] is not None: @@ -536,7 +570,7 @@ def _primer_check_forward(self): self.state['Sequence'] = seq self.state['Qual'] = qual - ### End primer methods + # End primer methods ### Start sequence methods diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 94fe1502ce..f3ff10f67d 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -9,7 +9,8 @@ from skbio import FastqIterator from skbio import parse_fastq -from qiime.process_seqs import IterAdapter, SequenceWorkflow, count_mismatches +from qiime.process_seqs import (IterAdapter, SequenceWorkflow, + count_mismatches, runs_of_ones) from qiime.util import MetadataMap @@ -27,6 +28,19 @@ def test_count_mismatches(self): self.assertEqual(count_mismatches(s2, s1), 2) self.assertEqual(count_mismatches(s2, s2), 0) + def test_runs_of_ones(self): + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + bits = np.array([0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 1, 0]) + exp_starts = np.array([2, 5, 9]) + exp_ends = np.array([4, 6, 13]) + exp_lengths = np.array([2, 1, 4]) + + obs_starts, obs_ends, obs_lengths = runs_of_ones(bits) + + npt.assert_equal(obs_starts, exp_starts) + npt.assert_equal(obs_ends, exp_ends) + npt.assert_equal(obs_lengths, exp_lengths) + class IterAdapterTests(TestCase): def test_iter(self): seq_raw = fastq1.splitlines() From a05cf865f4f4f8d9d67331af4b11fb7eed2bea9f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 14:31:20 -0600 Subject: [PATCH 47/61] STY: pep8 --- qiime/process_seqs.py | 24 +--- tests/test_process_seqs.py | 275 ++++++++----------------------------- 2 files changed, 60 insertions(+), 239 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 2fc18c48b7..bad1f3ad6e 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -120,7 +120,6 @@ class IterAdapter(object): None >>> os.remove('test_seqs.fq') >>> os.remove('test_barcodes.fna') - """ def __init__(self, seq, barcode=None): @@ -321,8 +320,6 @@ def initialize_state(self, item): self.state[k] = None self.state.update(item) - ### Start Workflow methods - @method(priority=200) @requires(state=has_sequence_qual) def wf_quality(self): @@ -420,10 +417,6 @@ def wf_sequence(self): self._sequence_length_check() self._sequence_ambiguous_count() - ### End Workflow groups methods - - ### Start quality methods - @requires(option='phred_quality_threshold') @requires(option='max_bad_run_length') def _quality_max_bad_run_length(self): @@ -463,9 +456,6 @@ def _quality_min_per_read_length_fraction(self): self.failed = True self.stats['min_per_read_length_fraction'] += 1 - ### End quality methods - - ### Start demultiplex methods @requires(option='barcode_type', values='golay_12') def _demultiplex_golay12(self): """Correct and decode a Golay 12nt barcode""" @@ -481,7 +471,7 @@ def _demultiplex_other(self): """Decode a variable length barcode""" raise NotImplementedError - def _demultiplex_encoded_barcode(self, method, bc_length): + def _demultiplex_encoded_barcode(self, decode_method, bc_length): """Correct and decode an encoded barcode""" if self.state['Barcode'] is not None: from_sequence = False @@ -497,7 +487,7 @@ def _demultiplex_encoded_barcode(self, method, bc_length): final_bc = putative_bc sample = self.barcodes[putative_bc] else: - corrected, num_errors = method(putative_bc) + corrected, num_errors = decode_method(putative_bc) final_bc = corrected self.state['Barcode errors'] = num_errors self.stats['barcode_corrected'] += 1 @@ -522,10 +512,6 @@ def _demultiplex_max_barcode_error(self): self.failed = True self.stats['exceed_barcode_error'] += 1 - ### End demultiplex methods - - ### Start primer methods - @requires(option='instrument_type', values='454') def _primer_instrument_454(self): """Check for a valid primer""" @@ -570,10 +556,6 @@ def _primer_check_forward(self): self.state['Sequence'] = seq self.state['Qual'] = qual - # End primer methods - - ### Start sequence methods - @requires(option='min_seq_len') def _sequence_length_check(self): """Checks minimum sequence length""" @@ -588,5 +570,3 @@ def _sequence_ambiguous_count(self): if count > self.options['ambiguous_count']: self.failed = True self.stats['ambiguous_count'] += 1 - - ### End sequence methods diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index f3ff10f67d..c6dc492171 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -10,7 +10,7 @@ from skbio import parse_fastq from qiime.process_seqs import (IterAdapter, SequenceWorkflow, - count_mismatches, runs_of_ones) + count_mismatches, runs_of_ones) from qiime.util import MetadataMap @@ -41,6 +41,7 @@ def test_runs_of_ones(self): npt.assert_equal(obs_ends, exp_ends) npt.assert_equal(obs_lengths, exp_lengths) + class IterAdapterTests(TestCase): def test_iter(self): seq_raw = fastq1.splitlines() @@ -50,7 +51,8 @@ def test_iter(self): barcode = FastqIterator([bc_raw], phred_offset=64) it = IterAdapter(seq=seq, barcode=barcode) - for rec, s, b in zip(it, parse_fastq(seq_raw, phred_offset=64), parse_fastq(bc_raw, phred_offset=64)): + for rec, s, b in zip(it, parse_fastq(seq_raw, phred_offset=64), + parse_fastq(bc_raw, phred_offset=64)): self.assertEqual(rec['SequenceID'], s[0]) self.assertEqual(rec['Sequence'], s[1]) np.testing.assert_equal(rec['Qual'], s[2]) @@ -66,14 +68,10 @@ def setUp(self): self.barcode_fastq1 = barcode_fastq1.split('\n') self.fastq2 = fastq2.split('\n') self.barcode_fastq2 = barcode_fastq2.split('\n') - self.fastq1_expected_no_qual_unassigned = fastq1_expected_no_qual_unassigned - self.fastq1_expected_default = fastq1_expected_default - self.fastq2_expected_default = fastq2_expected_default - self.fastq1_expected_single_barcode = fastq1_expected_single_barcode self.mapping = mapping self.primers = \ - {v['BarcodeSequence']: v['LinkerPrimerSequence'].split(',') - for v in mapping._metadata.values()} + {v['BarcodeSequence']: v['LinkerPrimerSequence'].split(',') + for v in mapping._metadata.values()} self.barcodes = {v['BarcodeSequence']: k for k, v in mapping._metadata.items()} @@ -84,13 +82,13 @@ def _make_workflow_obj(self, options): def test_workflow_construction(self): """Make sure we can construct using our helper method""" - x = self._make_workflow_obj({'foo':'bar'}) + self._make_workflow_obj({'foo': 'bar'}) def test_initialize_state(self): """Check the initialization method""" - wf_obj = self._make_workflow_obj({'foo':'bar'}) + wf_obj = self._make_workflow_obj({'foo': 'bar'}) wf_obj.state['Sequence'] = 'w00t' - wf_obj.initialize_state({'Sequence':'foo'}) + wf_obj.initialize_state({'Sequence': 'foo'}) self.assertEqual(set(wf_obj.state.values()), set([None, 'foo'])) def test_quality_max_bad_run_length(self): @@ -169,10 +167,10 @@ def test_demultiplex_encoded_barcode(self): wf_obj = self._make_workflow_obj({'demultiplex': True, 'barcode_type': 'golay_12'}) - needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} - exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} - from_sequence = {'Barcode':None, 'Sequence':'GGAGACAAGGGAAATTAATT'} - unknown_barcode = {'Barcode':'ACACCTGGTGAT', 'Sequence':'AATTGGCC'} + needs_a_fix = {'Barcode': 'GGAGACAAGGGT', 'Sequence': 'AATTGGCC'} + exact = {'Barcode': 'GGAGACAAGGGA', 'Sequence': 'AATTGGCC'} + from_sequence = {'Barcode': None, 'Sequence': 'GGAGACAAGGGAAATTAATT'} + unknown_barcode = {'Barcode': 'ACACCTGGTGAT', 'Sequence': 'AATTGGCC'} wf_obj.initialize_state(needs_a_fix) wf_obj.failed = False @@ -218,10 +216,10 @@ def test_demultiplex_max_barcode_error(self): """Verify failing max_barcode_error checking""" wf_obj = self._make_workflow_obj({'demultiplex': True, 'barcode_type': 'golay_12', - 'max_barcode_error':0}) + 'max_barcode_error': 0}) - needs_a_fix = {'Barcode':'GGAGACAAGGGT', 'Sequence':'AATTGGCC'} - exact = {'Barcode':'GGAGACAAGGGA', 'Sequence':'AATTGGCC'} + needs_a_fix = {'Barcode': 'GGAGACAAGGGT', 'Sequence': 'AATTGGCC'} + exact = {'Barcode': 'GGAGACAAGGGA', 'Sequence': 'AATTGGCC'} wf_obj.failed = False wf_obj.initialize_state(exact) @@ -241,21 +239,21 @@ def test_primer_check_forward(self): """Pull the forward primer as expected""" # primer details sourced from self.mapping - wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, - 'retain_primer':False}) - item1 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} - item2 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} - item3 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} + wf_obj = self._make_workflow_obj({'max_primer_mismatch': 2, + 'retain_primer': False}) + item1 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'AATTGGCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} + item2 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'AATTGCCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} + item3 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'GGTTGCCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} wf_obj.initialize_state(item1) wf_obj.failed = False wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'CC') - npt.assert_equal(wf_obj.state['Qual'], np.array([7,8])) + npt.assert_equal(wf_obj.state['Qual'], np.array([7, 8])) self.assertEqual(wf_obj.state['Forward primer'], 'AATTGG') self.assertFalse(wf_obj.failed) @@ -264,7 +262,7 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'CC') - npt.assert_equal(wf_obj.state['Qual'], np.array([7,8])) + npt.assert_equal(wf_obj.state['Qual'], np.array([7, 8])) self.assertEqual(wf_obj.state['Forward primer'], 'AATTGC') self.assertFalse(wf_obj.failed) @@ -273,26 +271,28 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'GGTTGCCC') - npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + npt.assert_equal(wf_obj.state['Qual'], + np.array([1, 2, 3, 4, 5, 6, 7, 8])) self.assertEqual(wf_obj.state['Forward primer'], None) self.assertTrue(wf_obj.failed) # item is not modified in place as retain priemr is True - wf_obj = self._make_workflow_obj({'max_primer_mismatch':2, - 'retain_primer':True}) - item1 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGGCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} - item2 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'AATTGCCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} - item3 = {'Final barcode':'AAAAAAAAAAAA', 'Sequence':'GGTTGCCC', - 'Qual':np.array([1,2,3,4,5,6,7,8])} + wf_obj = self._make_workflow_obj({'max_primer_mismatch': 2, + 'retain_primer': True}) + item1 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'AATTGGCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} + item2 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'AATTGCCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} + item3 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'GGTTGCCC', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} wf_obj.initialize_state(item1) wf_obj.failed = False wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'AATTGGCC') - npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + npt.assert_equal(wf_obj.state['Qual'], + np.array([1, 2, 3, 4, 5, 6, 7, 8])) self.assertEqual(wf_obj.state['Forward primer'], 'AATTGG') self.assertFalse(wf_obj.failed) @@ -301,7 +301,8 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'AATTGCCC') - npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + npt.assert_equal(wf_obj.state['Qual'], + np.array([1, 2, 3, 4, 5, 6, 7, 8])) self.assertEqual(wf_obj.state['Forward primer'], 'AATTGC') self.assertFalse(wf_obj.failed) @@ -310,31 +311,32 @@ def test_primer_check_forward(self): wf_obj._primer_check_forward() self.assertEqual(wf_obj.state['Sequence'], 'GGTTGCCC') - npt.assert_equal(wf_obj.state['Qual'], np.array([1,2,3,4,5,6,7,8])) + npt.assert_equal(wf_obj.state['Qual'], + np.array([1, 2, 3, 4, 5, 6, 7, 8])) self.assertEqual(wf_obj.state['Forward primer'], None) self.assertTrue(wf_obj.failed) def test_sequence_length_check(self): """Check the length of the sequence""" - wf_obj = self._make_workflow_obj(options={'min_seq_len':5}) - item1 = {'Sequence':'AATTGGCC'} - item2 = {'Sequence':'AATT'} + wf_obj = self._make_workflow_obj(options={'min_seq_len': 5}) + item1 = {'Sequence': 'AATTGGCC'} + item2 = {'Sequence': 'AATT'} wf_obj.state = item1 - wf_obj.failed = False # note, normally handled by Workflow.__call__ + wf_obj.failed = False wf_obj._sequence_length_check() self.assertFalse(wf_obj.failed) wf_obj.state = item2 - wf_obj.failed = False # note, normally handled by Workflow.__call__ + wf_obj.failed = False wf_obj._sequence_length_check() self.assertTrue(wf_obj.failed) def test_sequence_ambiguous_count(self): - wf_obj = self._make_workflow_obj({'ambiguous_count':2}) - item1 = {'Sequence':'AATTGGCC'} - item2 = {'Sequence':'AANNNTT'} - item3 = {'Sequence':'AANTT'} + wf_obj = self._make_workflow_obj({'ambiguous_count': 2}) + item1 = {'Sequence': 'AATTGGCC'} + item2 = {'Sequence': 'AANNNTT'} + item3 = {'Sequence': 'AANTT'} wf_obj.state = item1 wf_obj.failed = False @@ -435,12 +437,13 @@ def test_sequence_ambiguous_count(self): """ mapping = MetadataMap( - {'s1':{'BarcodeSequence':'AAAAAAAAAAAA', 'LinkerPrimerSequence':'AATTGG,AATTCC'}, - 's2':{'BarcodeSequence':'AAAAAAAAAAAC', 'LinkerPrimerSequence':''}, - 's3':{'BarcodeSequence':'AAAAAAAAAAAG', 'LinkerPrimerSequence':''}, - 's4':{'BarcodeSequence':'AAAAAAAAAAAT', 'LinkerPrimerSequence':''}, - 's5':{'BarcodeSequence':'GGAGACAAGGGA', 'LinkerPrimerSequence':''} - }, []) + {'s1': {'BarcodeSequence': 'AAAAAAAAAAAA', + 'LinkerPrimerSequence': 'AATTGG,AATTCC'}, + 's2': {'BarcodeSequence': 'AAAAAAAAAAAC', 'LinkerPrimerSequence': ''}, + 's3': {'BarcodeSequence': 'AAAAAAAAAAAG', 'LinkerPrimerSequence': ''}, + 's4': {'BarcodeSequence': 'AAAAAAAAAAAT', 'LinkerPrimerSequence': ''}, + 's5': {'BarcodeSequence': 'GGAGACAAGGGA', 'LinkerPrimerSequence': ''}}, + []) fastq1 = """@990:2:4:11271:5323#1/1 GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC @@ -642,167 +645,5 @@ def test_sequence_ambiguous_count(self): bbbbbbbbbbbb """ - -fastq1_expected_no_qual_unassigned = [ - ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", - "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", - 0), - ("s2_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", - "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", - 1), - ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", - "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", - 2), - ("s4_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", - "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", - 3), - ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGGCAGTCTAACCGCAAGGAGGACGCTGTCG", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 4), - ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACAGCGTCCTCCTTGCTGTTAGACTTCCGGC", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`BBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 5), - ("s2_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGGTGGGGCAACCGGCTGTCCCTTTTAGCGG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 6), - ("s2_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGGCTGGCCCTTTCCACCCA", - "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOcBBBBBBBBBBBBBBBBB", - 7), - ("s4_8 990:2:4:11272:4315#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GTACTCACCGCCCGTCACGCCATGGGAGTTGGGCTTACCTGAAGCCCGCGAGCTAACCGGAAAGGGGGGGATGTGG", - "bbbb_bbbbbbbbbb```Q```BBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 8), - ("s4_9 990:2:4:11272:4315#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGCTACCTTGTTACGACTTCACCCCCGTCGCTCGGCGTACCTTCGACCGCTGCCTCCTTTTGGTTATATCTCCGGG", - "``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 9), - ("Unassigned_10 990:2:4:11272:5533#1/1 orig_bc=GAAAAAAAAAAT new_bc=GAAAAAAAAAAT bc_diffs=0", - "GCACACACCGCCCGTCACACCACGAGAGTCGGCAACACCCGAAGTCGGTGAGGTAACCCCGAAAGGGGAGCCAGCC", - "``Q``````_``````````K]]aBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 10), - ("s4_11 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCGGCTGGCTCCCCTTTCGGGGGTACCTCAC", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`TbBBBBBBBBBBBBBBBBBBBBBBBBBBBB", - 11)] - -fastq1_expected_default = [ - ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", - "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", - 0), - ("s2_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", - "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", - 1), - ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", - "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", - 2), - ("s4_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", - "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", - 3), - ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 4), - ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 5), - ("s2_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", - 6), - ("s2_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", - "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", - 7), - ("s4_8 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] - -fastq1_expected_single_barcode = [ - ("s1_0 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", - "bbbbbbbbbbbbbbbbbbbbbbbbbY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", - 0), - ("s1_1 990:2:4:11271:5323#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", - "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", - 1), - ("s1_2 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", - "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", - 2), - ("s1_3 990:2:4:11272:9538#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", - "bb^bbbbbbbbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", - 3), - ("s1_4 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 4), - ("s1_5 990:2:4:11272:7447#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 5), - ("s1_6 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", - 6), - ("s1_7 990:2:4:11272:19991#1/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", - "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", - 7), - ("s1_8 990:2:4:11272:5533#0/1 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] - -fastq2_expected_default = [ - ("s1_0 M00176:17:000000000-A0CNA:1:1:15487:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC", - "bbbbbbbbbbBBBBBBBBBBBBBBBY``\`bbbbbbbbbbbbb`bbbbab`a`_[ba_aa]b^_bIWTTQ^YR^U`", - 0), - ("s2_1 M00176:17:000000000-A0CNA:1:1:17088:1773 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG", - "bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT", - 1), - ("s1_2 M00176:17:000000000-A0CNA:1:1:16738:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG", - "b_bbbbbbbbbbbbbbbbbbbbbbbbbbabaa^a`[bbbb`bbbbTbbabb]b][_a`a]acaaacbaca_a^`aa", - 2), - ("s4_3 M00176:17:000000000-A0CNA:1:1:12561:1773 1:N:0:0 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTTGCGGTTAGACTACCGGC", - "bb^bbbBBBBbbbbbbbbbbbbbbbbabbbb``bbb`__bbbbbbIWRXX`R``\`\Y\^__ba^a[Saaa_]O]O", - 3), - ("s1_4 M00176:17:000000000-A0CNA:1:1:14596:1773 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGG", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 4), - ("s1_5 M00176:17:000000000-A0CNA:1:1:12515:1774 1:N:0:0 orig_bc=AAAAAAAAAAAA new_bc=AAAAAAAAAAAA bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACA", - "b`bbbbbbbbbbbbbbb`^bbbbbYbbbbb\___`_bbab^aaaU^\`", - 5), - ("s2_6 M00176:17:000000000-A0CNA:1:1:17491:1774 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", - 6), - ("s2_7 M00176:17:000000000-A0CNA:1:1:16427:1774 1:N:0:0 orig_bc=AAAAAAAAAAAC new_bc=AAAAAAAAAAAC bc_diffs=0", - "GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG", - "bbbbbbbbbbbbbbbbbbbba`bbbbbbbbbb`abb_aacbbbbb]___]\[\^^[aOc", - 7), - ("s4_8 M00176:17:000000000-A0CNA:1:1:18209:1775 1:N:0:0 orig_bc=AAAAAAAAAAAT new_bc=AAAAAAAAAAAT bc_diffs=0", - "GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCG", - "bbbbbbbbbbbbbbbbbbbbbXbbb_bbbabbb`aZ[U]\OTYXV`Tb", 8)] - if __name__ == '__main__': main() From ccfc59606a7bb314522b82aa943e2558c21c4602 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 14:42:38 -0600 Subject: [PATCH 48/61] MAINT: removing unused variables --- tests/test_process_seqs.py | 82 -------------------------------------- 1 file changed, 82 deletions(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index c6dc492171..921477e108 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -354,88 +354,6 @@ def test_sequence_ambiguous_count(self): self.assertFalse(wf_obj.failed) -fasta1_simple = """>a -abcde ->b -asdasdasd ->c -123123 -""" - -fasta2_simple = """>x -abcdefg ->y -popopo -""" - -qual1_simple = """>a -1 2 3 4 5 ->b -1 1 1 1 1 1 1 1 1 ->c -2 2 2 2 2 2 -""" - -qual2_simple = """>x -1 2 3 4 5 6 7 ->y -1 1 1 1 1 1 -""" - -qual2_simple_bad = """>x -1 2 3 4 5 6 ->y -1 1 1 1 1 1 -""" - -fastq1_simple = """@a -abcde -+a -abcde -@b -asdasdasd -+b -asdasdasd -@c -123123 -+c -123123 -""" - -fastq2_simple = """@x -abcdefg -+x -abcdefg -@y -popopo -+y -popopo -""" - -barcodes1_simple = """@a -test1 -+a -1234 -@b -test2 -+b -12345 -@c -test3 -+c -aaccb -""" - -barcodes2_simple = """@x -test4 -+x -12312 -@y -test5 -+y -33333 -""" - mapping = MetadataMap( {'s1': {'BarcodeSequence': 'AAAAAAAAAAAA', 'LinkerPrimerSequence': 'AATTGG,AATTCC'}, From a44062fee8db5843b281f294e6897cfa080f7033 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 16:35:15 -0600 Subject: [PATCH 49/61] MAINT: removing unused code --- qiime/process_seqs.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index bad1f3ad6e..d9e52bfb3c 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -58,11 +58,6 @@ def has_sequence_qual(state): return state['Qual'] is not None -def has_barcode_qual(state): - """Check if state has Barcode Qual""" - return state['Barcode Qual'] is not None - - class IterAdapter(object): """Sequence iterator adapter From b7ff5b1d6fb99baf15bd6897d2b840ea2808c77b Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 16:36:16 -0600 Subject: [PATCH 50/61] TST: inc. coverage, simple full workflow test --- tests/test_process_seqs.py | 186 ++++++++++++++++++++++++++++++++++++- 1 file changed, 185 insertions(+), 1 deletion(-) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 921477e108..2dada4f5c7 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -247,6 +247,8 @@ def test_primer_check_forward(self): 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} item3 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'GGTTGCCC', 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} + item4 = {'Final barcode': 'AAAAABCAAAAA', 'Sequence': 'AATTGCCCA', + 'Qual': np.array([1, 2, 3, 4, 5, 6, 7, 8])} wf_obj.initialize_state(item1) wf_obj.failed = False @@ -276,7 +278,18 @@ def test_primer_check_forward(self): self.assertEqual(wf_obj.state['Forward primer'], None) self.assertTrue(wf_obj.failed) - # item is not modified in place as retain priemr is True + wf_obj.initialize_state(item4) + wf_obj.failed = False + wf_obj._primer_check_forward() + + # have the primer, but the barcode isn't associated with the primer + self.assertEqual(wf_obj.state['Sequence'], 'AATTGCCCA') + npt.assert_equal(wf_obj.state['Qual'], + np.array([1, 2, 3, 4, 5, 6, 7, 8])) + self.assertEqual(wf_obj.state['Forward primer'], None) + self.assertTrue(wf_obj.failed) + + # item is not modified in place as retain primer is True wf_obj = self._make_workflow_obj({'max_primer_mismatch': 2, 'retain_primer': True}) item1 = {'Final barcode': 'AAAAAAAAAAAA', 'Sequence': 'AATTGGCC', @@ -353,6 +366,22 @@ def test_sequence_ambiguous_count(self): wf_obj._sequence_ambiguous_count() self.assertFalse(wf_obj.failed) + def test_full_process_simple(self): + """Just demux""" + wf_obj = self._make_workflow_obj({'demultiplex': True, + 'barcode_type': 'golay_12'}) + + seq_raw = fastq1.splitlines() + bc_raw = barcode_fastq1.splitlines() + + seq = FastqIterator([seq_raw], phred_offset=64) + barcode = FastqIterator([bc_raw], phred_offset=64) + it = IterAdapter(seq=seq, barcode=barcode) + + for obs, exp in zip(wf_obj(it), fastq1_expected): + for k in exp: + npt.assert_equal(obs[k], exp[k]) + mapping = MetadataMap( {'s1': {'BarcodeSequence': 'AAAAAAAAAAAA', @@ -363,6 +392,161 @@ def test_sequence_ambiguous_count(self): 's5': {'BarcodeSequence': 'GGAGACAAGGGA', 'LinkerPrimerSequence': ''}}, []) +# just demultiplex +fastq1_expected = [ + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAA', + 'Original barcode': 'AAAAAAAAAAAA', + 'Qual': np.array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 25, 32, 32, + 28, 32, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 32, 34, 34, 34, 34, 33, 34, 32, 33, 32, 31, 27, 34, + 33, 31, 33, 33, 29, 34, 30, 31, 34, 9, 23, 20, 20, 17, + 30, 25, 18, 30, 21, 32], dtype=np.int8), + 'Sequence': ('GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCT' + 'TTTAGGAGTCAGCTGTC'), + 'SequenceID': '990:2:4:11271:5323#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAC', + 'Original barcode': 'AAAAAAAAAAAC', + 'Qual': np.array([34, 34, 35, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 31, 34, 34, 34, 34, 34, 34, 34, 34, 33, 34, 33, 31, + 34, 30, 34, 25, 31, 32, 33, 33, 30, 34, 16, 34, 32, 34, + 34, 34, 34, 8, 25, 7, 25, 26, 20, 34, 34, 30, 31, 33, + 34, 27, 30, 34, 33, 20], dtype=np.int8), + 'Sequence': ('GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTA' + 'AAAGGTTATCTCACCGG'), + 'SequenceID': '990:2:4:11271:5323#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAA', + 'Original barcode': 'AAAAAAAAAAAA', + 'Qual': np.array([34, 31, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 33, 34, 33, 33, 30, 33, 32, 27, 34, 34, 34, 34, 32, 34, + 34, 34, 34, 20, 34, 34, 33, 34, 34, 29, 34, 29, 27, 31, + 33, 32, 33, 29, 33, 35, 33, 33, 33, 35, 34, 33, 35, 33, + 31, 33, 30, 32, 33, 33], dtype=np.int8), + 'Sequence': ('GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCG' + 'CAAGGAGGACGCTGTCG'), + 'SequenceID': '990:2:4:11272:9538#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAT', + 'Original barcode': 'AAAAAAAAAAAT', + 'Qual': np.array([34, 34, 30, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 33, 34, + 34, 34, 34, 32, 32, 34, 34, 34, 32, 31, 31, 34, 34, 34, + 34, 34, 34, 9, 23, 18, 24, 24, 32, 18, 32, 32, 28, 32, + 28, 25, 28, 30, 31, 31, 34, 33, 30, 33, 27, 19, 33, 33, + 33, 31, 29, 15, 29, 15], dtype=np.int8), + 'Sequence': ('GGCTACCTTGTTACGACTTCACCCTCCTCACTAAACGTACCTTCGACAGCGTCCTCCTT' + 'GCGGTTAGACTACCGGC'), + 'SequenceID': '990:2:4:11272:9538#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAA', + 'Original barcode': 'AAAAAAAAAAAA', + 'Qual': np.array([34, 32, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 32, 30, 34, 34, 34, 34, 34, 25, 34, 34, 34, + 34, 34, 28, 31, 31, 31, 32, 31, 34, 34, 33, 34, 30, 33, + 33, 33, 21, 30, 28, 32, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + dtype=np.int8), + 'Sequence': ('GCACACACCGCCCGTCACACCATCCGAGTTGGGGGTACCCGAAGCCGGCAGTCTAACCG' + 'CAAGGAGGACGCTGTCG'), + 'SequenceID': '990:2:4:11272:7447#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAA', + 'Original barcode': 'AAAAAAAAAAAA', + 'Qual': np.array([34, 32, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 32, 30, 34, 34, 34, 34, 34, 25, 34, 34, 34, + 34, 34, 28, 31, 31, 31, 32, 31, 34, 34, 33, 34, 30, 33, + 33, 33, 21, 30, 28, 32, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + dtype=np.int8), + 'Sequence': ('GGATACCTTGTTACGACTTCACCCTCCTCACTCATCGTACCCTCGACAGCGTCCTCCTT' + 'GCTGTTAGACTTCCGGC'), + 'SequenceID': '990:2:4:11272:7447#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAC', + 'Original barcode': 'AAAAAAAAAAAC', + 'Qual': np.array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 31, 34, 34, + 34, 33, 34, 34, 34, 32, 33, 26, 27, 21, 29, 28, 15, 20, + 25, 24, 22, 32, 20, 34, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + dtype=np.int8), + 'Sequence': ('GCACTCACCGCCCGTCACGCCACGGAAGCCGGCTGCACCTGAAGCCGGTGGGGCAACCG' + 'GCTGTCCCTTTTAGCGG'), + 'SequenceID': '990:2:4:11272:19991#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAC', + 'Original barcode': 'AAAAAAAAAAAC', + 'Qual': np.array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 33, 32, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 32, 33, 34, 34, 31, 33, 33, 35, 34, 34, + 34, 34, 34, 29, 31, 31, 31, 29, 28, 27, 28, 30, 30, 27, + 33, 15, 35, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2], dtype=np.int8), + 'Sequence': ('GGCTACCTTGTTACGACTTCGCCCCAGTCACCGACCACACCCTCGACGGCTGCCTCCGG' + 'CTGGCCCTTTCCACCCA'), + 'SequenceID': '990:2:4:11272:19991#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAT', + 'Original barcode': 'AAAAAAAAAAAT', + 'Qual': np.array([34, 34, 34, 34, 31, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 32, 32, 32, 17, 32, 32, 32, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2], dtype=np.int8), + 'Sequence': ('GTACTCACCGCCCGTCACGCCATGGGAGTTGGGCTTACCTGAAGCCCGCGAGCTAACCG' + 'GAAAGGGGGGGATGTGG'), + 'SequenceID': '990:2:4:11272:4315#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAT', + 'Original barcode': 'AAAAAAAAAAAT', + 'Qual': np.array([32, 32, 17, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 11, 29, 29, 33, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=np.int8), + 'Sequence': ('GGCTACCTTGTTACGACTTCACCCCCGTCGCTCGGCGTACCTTCGACCGCTGCCTCCTT' + 'TTGGTTATATCTCCGGG'), + 'SequenceID': '990:2:4:11272:4315#1/1'}, + + {'Barcode errors': 2, + 'Final barcode': 'AAAAAAAAAAAA', + 'Original barcode': 'GAAAAAAAAAAT', + 'Qual': np.array([32, 32, 17, 32, 32, 32, 32, 32, 32, 31, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 11, 29, 29, 33, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2], dtype=np.int8), + 'Sequence': ('GCACACACCGCCCGTCACACCACGAGAGTCGGCAACACCCGAAGTCGGTGAGGTAACCC' + 'CGAAAGGGGAGCCAGCC'), + 'SequenceID': '990:2:4:11272:5533#1/1'}, + + {'Barcode errors': 0, + 'Final barcode': 'AAAAAAAAAAAT', + 'Original barcode': 'AAAAAAAAAAAT', + 'Qual': np.array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 24, 34, 34, 34, 31, 34, 34, + 34, 33, 34, 34, 34, 32, 33, 26, 27, 21, 29, 28, 15, 20, + 25, 24, 22, 32, 20, 34, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2], + dtype=np.int8), + 'Sequence': ('GGATACCTTGTTACGACTTCACCCCAATCATCGACCCCACCTTCGGCGGCTGGCTCCCC' + 'TTTCGGGGGTACCTCAC'), + 'SequenceID': '990:2:4:11272:5533#0/1'}] + fastq1 = """@990:2:4:11271:5323#1/1 GCACTCACCGCCCGTCACACCACGAAAGTTGGTAACACCCGAAGCCGGTGAGATAACCTTTTAGGAGTCAGCTGTC + From 2040e6e6f260942be389dc3a86bf7c523782d192 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 17:07:31 -0600 Subject: [PATCH 51/61] DOC/BUG: added options, fixed min_per_read_length --- qiime/process_seqs.py | 57 ++++++++++++++++++++++++++++++-------- tests/test_process_seqs.py | 12 ++++---- 2 files changed, 51 insertions(+), 18 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index d9e52bfb3c..341fc3a2ae 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -198,7 +198,32 @@ class SequenceWorkflow(Workflow): Options ------- - ## DESCRIBE EACH OPTION THAT CAN AFFECT WHAT METHODS ARE EXECUTED + All options are considered optional. + + demultiplex : bool + Whether to attempt demultiplex or not. + check_primer : bool + Whether to attempt a primer check or not + phred_quality_threshold : int + Minimum PHRED quality score + instrument_type : {454} + Instrument specific checks + max_bad_run_length : int + Maximum number of low quality base calls allowed before truncation + min_per_read_length_fraction : float + Minimum fraction of consecutive high quality base calls to include + barcode_type : {golay_12, hamming_8} + The type of barcode used + max_barcode_error : int + The maximum number of allowed errors within a barcode + retain_primer : bool + Whether to remove or keep the primer in the sequence + max_primer_mismatch : int + Maximum number of mismatches allowed in the primer + min_seq_len : int + Minimum sequence length + max_ambig_count : int + Maximum number of ambiguous bases allowed State ----- @@ -259,7 +284,7 @@ class SequenceWorkflow(Workflow): Number of primer mismatches exceeds tolerance min_seq_len Number of sequences whose length did not meet tolerance - ambiguous_count + max_ambig_count Number of sequences that contained to many ambiguous characters Attributes @@ -299,7 +324,7 @@ def __init__(self, *args, **kwargs): 'unknown_primer_barcode_pair': 0, 'exceeds_max_primer_mismatch': 0, 'min_seq_len': 0, - 'ambiguous_count': 0} + 'max_ambig_count': 0} super(SequenceWorkflow, self).__init__(state, *args, **kwargs) @@ -406,7 +431,7 @@ def wf_sequence(self): Triggers for `failed` ##################### * If a sequence does not mean `min_seq_len`. - * If the number of ambiguous bases exceed `ambiguous_count`. + * If the number of ambiguous bases exceed `max_ambig_count`. """ self._sequence_length_check() @@ -440,14 +465,22 @@ def _quality_max_bad_run_length(self): @requires(option='phred_quality_threshold') @requires(option='min_per_read_length_fraction') def _quality_min_per_read_length_fraction(self): - """Fail a sequence if a percentage of bad quality calls exist""" + """Fail a sequence if it lacks a long high quality run""" + min_high_qual_read_frac = self.options['min_per_read_length_fraction'] phred_quality_threshold = self.options['phred_quality_threshold'] - bad_bases = self.state['Qual'] < phred_quality_threshold - bad_bases_count = bad_bases.sum(dtype=float) - threshold = 1 - self.options['min_per_read_length_fraction'] + # cythonizable + good_quality = self.state['Qual'] >= phred_quality_threshold + good_start, good_stop, good_length = runs_of_ones(good_quality) + + if good_length.size: + best_idx = np.argmax(good_length) + best_len = good_length[best_idx] + else: + best_idx = None + best_len = -1 - if (bad_bases_count / len(self.state['Sequence'])) > threshold: + if best_len < (min_high_qual_read_frac * len(self.state['Sequence'])): self.failed = True self.stats['min_per_read_length_fraction'] += 1 @@ -558,10 +591,10 @@ def _sequence_length_check(self): self.failed = True self.stats['min_seq_len'] += 1 - @requires(option='ambiguous_count') + @requires(option='max_ambig_count') def _sequence_ambiguous_count(self): """Fail if the number of N characters is greater than threshold""" count = self.state['Sequence'].count('N') - if count > self.options['ambiguous_count']: + if count > self.options['max_ambig_count']: self.failed = True - self.stats['ambiguous_count'] += 1 + self.stats['max_ambig_count'] += 1 diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 2dada4f5c7..8ee5fe28f0 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -127,13 +127,13 @@ def test_quality_min_per_read_length_fraction(self): exp1 = item1.copy() item2 = {'Sequence': 'AATTGGCC', - 'Qual': np.array([6, 1, 6, 1, 1, 6, 6, 6])} + 'Qual': np.array([6, 1, 6, 1, 6, 6, 1, 6])} exp2 = item2.copy() item3 = {'Sequence': 'AATTGGCC', - 'Qual': np.array([6, 6, 1, 1, 1, 1, 6, 6])} + 'Qual': np.array([1, 1, 1, 6, 6, 6, 6, 6])} exp3 = {'Sequence': 'AATTGGCC', - 'Qual': np.array([6, 6, 1, 1, 1, 1, 6, 6])} + 'Qual': np.array([1, 1, 1, 6, 6, 6, 6, 6])} wf_obj.state = item1 wf_obj.failed = False @@ -143,12 +143,12 @@ def test_quality_min_per_read_length_fraction(self): wf_obj.state = item2 wf_obj.failed = False wf_obj._quality_min_per_read_length_fraction() - self.assertFalse(wf_obj.failed) + self.assertTrue(wf_obj.failed) wf_obj.state = item3 wf_obj.failed = False wf_obj._quality_min_per_read_length_fraction() - self.assertTrue(wf_obj.failed) + self.assertFalse(wf_obj.failed) npt.assert_equal(item1, exp1) npt.assert_equal(item2, exp2) @@ -346,7 +346,7 @@ def test_sequence_length_check(self): self.assertTrue(wf_obj.failed) def test_sequence_ambiguous_count(self): - wf_obj = self._make_workflow_obj({'ambiguous_count': 2}) + wf_obj = self._make_workflow_obj({'max_ambig_count': 2}) item1 = {'Sequence': 'AATTGGCC'} item2 = {'Sequence': 'AANNNTT'} item3 = {'Sequence': 'AANTT'} From 71173a0423df883ef5cb3aa4dcc9513da1e8e9f8 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Tue, 12 Aug 2014 18:37:16 -0600 Subject: [PATCH 52/61] ENH/TST: ambig check on barcode --- qiime/process_seqs.py | 17 +++++++++++++++++ tests/test_process_seqs.py | 28 ++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 341fc3a2ae..d85dd3b850 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -58,6 +58,10 @@ def has_sequence_qual(state): return state['Qual'] is not None +def has_barcode(state): + return state['Barcode'] is not None + + class IterAdapter(object): """Sequence iterator adapter @@ -286,6 +290,8 @@ class SequenceWorkflow(Workflow): Number of sequences whose length did not meet tolerance max_ambig_count Number of sequences that contained to many ambiguous characters + index_ambig_count + The number of index reads that have ambiguous bases Attributes ---------- @@ -324,6 +330,7 @@ def __init__(self, *args, **kwargs): 'unknown_primer_barcode_pair': 0, 'exceeds_max_primer_mismatch': 0, 'min_seq_len': 0, + 'index_ambig_count': 0, 'max_ambig_count': 0} super(SequenceWorkflow, self).__init__(state, *args, **kwargs) @@ -356,9 +363,11 @@ def wf_quality(self): Triggers for `failed` ##################### * If to many nucleotides in `Sequence` are of poor quality. + * If the index barcode contains ambiguity """ self._quality_max_bad_run_length() self._quality_min_per_read_length_fraction() + self._quality_index_ambiguity() @method(priority=150) @requires(option='demultiplex', values=True) @@ -484,6 +493,14 @@ def _quality_min_per_read_length_fraction(self): self.failed = True self.stats['min_per_read_length_fraction'] += 1 + @requires(state=has_barcode) + def _quality_index_ambiguity(self): + barcode_characters = set(self.state['Barcode']) + valid_characters = set(['A', 'T', 'G', 'C', 'a', 't', 'g', 'c']) + if not barcode_characters.issubset(valid_characters): + self.failed = True + self.stats['index_ambig_count'] += 1 + @requires(option='barcode_type', values='golay_12') def _demultiplex_golay12(self): """Correct and decode a Golay 12nt barcode""" diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 8ee5fe28f0..30f40cf765 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -366,6 +366,34 @@ def test_sequence_ambiguous_count(self): wf_obj._sequence_ambiguous_count() self.assertFalse(wf_obj.failed) + def test_quality_index_ambiguity(self): + wf_obj = self._make_workflow_obj({}) + + item1 = {'Barcode': 'AATATATATATACA'} + item2 = {'Barcode': 'AATARATATATACA'} + item3 = {'Barcode': 'AATAATATATNCA'} + item4 = {'Barcode': 'ATAtagcta'} + + wf_obj.state = item1 + wf_obj.failed = False + wf_obj._quality_index_ambiguity() + self.assertFalse(wf_obj.failed) + + wf_obj.state = item2 + wf_obj.failed = False + wf_obj._quality_index_ambiguity() + self.assertTrue(wf_obj.failed) + + wf_obj.state = item3 + wf_obj.failed = False + wf_obj._quality_index_ambiguity() + self.assertTrue(wf_obj.failed) + + wf_obj.state = item4 + wf_obj.failed = False + wf_obj._quality_index_ambiguity() + self.assertFalse(wf_obj.failed) + def test_full_process_simple(self): """Just demux""" wf_obj = self._make_workflow_obj({'demultiplex': True, From 30a24ca19c48dcccf4829f323c2eb5fe6b732e92 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 13:17:34 -0600 Subject: [PATCH 53/61] ENH: success stats --- tests/test_process_seqs.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 30f40cf765..4a0a3a635d 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -410,6 +410,11 @@ def test_full_process_simple(self): for k in exp: npt.assert_equal(obs[k], exp[k]) + self.assertEqual(wf_obj.stats['sample_counts'], + {'s1': 5, 's2': 3, 's4': 4}) + self.assertEqual(wf_obj.stats['sequence_count'], 12) + self.assertEqual(wf_obj.stats['sequence_lengths'], {76: 12}) + mapping = MetadataMap( {'s1': {'BarcodeSequence': 'AAAAAAAAAAAA', From 0606b609a3b2eea1b09096c25aaa0092dff2dc3b Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 13:29:19 -0600 Subject: [PATCH 54/61] TST: checking of stats --- qiime/process_seqs.py | 48 ++++++++++++++++++++++++++++++-------- tests/test_process_seqs.py | 28 +++++++++++++++++++--- 2 files changed, 63 insertions(+), 13 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index d85dd3b850..1eb8500ba4 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -2,6 +2,8 @@ """Filter poor quality reads, trim barcodes/primers and assign to samples""" +from collections import Counter + import numpy as np from future.builtins import zip from skbio.core.workflow import Workflow, requires, method, not_none @@ -272,25 +274,35 @@ class SequenceWorkflow(Workflow): occurs during quality checks then barcode stats, such as exceed_barcode_error, will not be reflective of those sequences. - quality_max_bad_run_length + sequence_lengths : Counter + A `Counter` that contains a count of the number of times a sequence + length was observed. This count is incremented at the end of processing + and if the sequence did not fail processing. + sample_counts : Counter + A `Counter` that contains the observed samples and the number of + sequences per sample. This count is incremented at the end of + processing and if the sequence did not fail processing. + sequence_count : int + Total number of sequences processed regardless of success or failure. + quality_max_bad_run_length : int Number of sequences containing a run of poor quality bases - min_per_read_length_fraction + min_per_read_length_fraction : int Number of sequences containing excessive poor quality bases - barcode_corrected + barcode_corrected : int Number of sequences in which a barcode was corrected - unknown_barcode + unknown_barcode : int Number of sequences in which an unknown barcode was observed - exceed_barcode_error + exceed_barcode_error : int Number of barcodes with errors that exceeded tolerance - unknown_primer_barcode_pair + unknown_primer_barcode_pair : int Number of unknown primer barcode pairs - exceeds_max_primer_mismatch + exceeds_max_primer_mismatch : int Number of primer mismatches exceeds tolerance - min_seq_len + min_seq_len : int Number of sequences whose length did not meet tolerance - max_ambig_count + max_ambig_count : int Number of sequences that contained to many ambiguous characters - index_ambig_count + index_ambig_count : int The number of index reads that have ambiguous bases Attributes @@ -322,6 +334,9 @@ def __init__(self, *args, **kwargs): 'Barcode errors': None} kwargs['stats'] = { + 'sequence_lengths': Counter(), + 'sample_counts': Counter(), + 'sequence_count': 0, 'quality_max_bad_run_length': 0, 'min_per_read_length_fraction': 0, 'barcode_corrected': 0, @@ -346,6 +361,7 @@ def initialize_state(self, item): for k in self.state: self.state[k] = None self.state.update(item) + self.stats['sequence_count'] += 1 @method(priority=200) @requires(state=has_sequence_qual) @@ -446,6 +462,18 @@ def wf_sequence(self): self._sequence_length_check() self._sequence_ambiguous_count() + @method() + def wf_success_stats(self): + """Increment stats contingent on a successful sequence + + Notes + ----- + Only makes updates to `stats` and cannot change state or trigger + `failed`. + """ + self.stats['sequence_lengths'][len(self.state['Sequence'])] += 1 + self.stats['sample_counts'][self.state['Sample']] += 1 + @requires(option='phred_quality_threshold') @requires(option='max_bad_run_length') def _quality_max_bad_run_length(self): diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 4a0a3a635d..2394ea7fba 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -406,13 +406,18 @@ def test_full_process_simple(self): barcode = FastqIterator([bc_raw], phred_offset=64) it = IterAdapter(seq=seq, barcode=barcode) - for obs, exp in zip(wf_obj(it), fastq1_expected): + def failcb(obj): + return obj.state + + for obs, exp in zip(wf_obj(it, fail_callback=failcb), fastq1_expected): for k in exp: npt.assert_equal(obs[k], exp[k]) + self.assertEqual(wf_obj.stats['exceed_barcode_error'], 0) + self.assertEqual(wf_obj.stats['unknown_barcode'], 1) self.assertEqual(wf_obj.stats['sample_counts'], {'s1': 5, 's2': 3, 's4': 4}) - self.assertEqual(wf_obj.stats['sequence_count'], 12) + self.assertEqual(wf_obj.stats['sequence_count'], 13) self.assertEqual(wf_obj.stats['sequence_lengths'], {76: 12}) @@ -447,12 +452,21 @@ def test_full_process_simple(self): 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 31, 34, 34, 34, 34, 34, 34, 34, 34, 33, 34, 33, 31, 34, 30, 34, 25, 31, 32, 33, 33, 30, 34, 16, 34, 32, 34, - 34, 34, 34, 8, 25, 7, 25, 26, 20, 34, 34, 30, 31, 33, + 34, 34, 34, 8, 25, 7, 25, 26, 20, 34, 34, 30, 31, 33, 34, 27, 30, 34, 33, 20], dtype=np.int8), 'Sequence': ('GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTA' 'AAAGGTTATCTCACCGG'), 'SequenceID': '990:2:4:11271:5323#1/1'}, + {'Barcode errors': 3, + 'Final barcode': 'TTCCTTATATAC', + 'Original barcode': 'TTTTTTATATAT', + 'Qual': np.array([34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34, 34], + dtype=np.int8), + 'Sequence':'AATGAGAGTGATGAGTGATGATGATGA', + 'SequenceID': 'this_has_a_bad_barcode'}, + {'Barcode errors': 0, 'Final barcode': 'AAAAAAAAAAAA', 'Original barcode': 'AAAAAAAAAAAA', @@ -588,6 +602,10 @@ def test_full_process_simple(self): GGTTACCTTGTTACGACTTCACCCCAATCATCGGCCCCACCTTAGACAGCTGACTCCTAAAAGGTTATCTCACCGG + bbcbbbbbbbbbbbbbbbbbbbbbbbbbb_bbbbbbbbaba_b^bY_`aa^bPb`bbbbHYGYZTbb^_ab[^baT +@this_has_a_bad_barcode +AATGAGAGTGATGAGTGATGATGATGA ++ +bbbbbbbbbbbbbbbbbbbbbbbbbbb @990:2:4:11272:9538#1/1 GCACACACCGCCCGTCACACCATCCGAGTTGGAGGTACCCGAAGCCGGTAGTCTAACCGCAAGGAGGACGCTGTCG + @@ -638,6 +656,10 @@ def test_full_process_simple(self): AAAAAAAAAAAC + bbcbbbbbbbbb +@this_has_a_bad_barcode +TTTTTTATATAT ++ +bbbbbbbbbbbb @990:2:4:11272:9538#1/2 AAAAAAAAAAAA + From 21b9318bd6dde022089666a43d5a9cbbe41a3838 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 13:42:38 -0600 Subject: [PATCH 55/61] ENH: removed need to specify option demulitplex --- qiime/process_seqs.py | 4 +--- tests/test_process_seqs.py | 9 +++------ 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/qiime/process_seqs.py b/qiime/process_seqs.py index 1eb8500ba4..ff524d14ca 100644 --- a/qiime/process_seqs.py +++ b/qiime/process_seqs.py @@ -206,8 +206,6 @@ class SequenceWorkflow(Workflow): ------- All options are considered optional. - demultiplex : bool - Whether to attempt demultiplex or not. check_primer : bool Whether to attempt a primer check or not phred_quality_threshold : int @@ -386,7 +384,7 @@ def wf_quality(self): self._quality_index_ambiguity() @method(priority=150) - @requires(option='demultiplex', values=True) + @requires(option='barcode_type', values=['golay_12', 'hamming_8']) def wf_demultiplex(self): """Demultiplex a sequence diff --git a/tests/test_process_seqs.py b/tests/test_process_seqs.py index 2394ea7fba..dff3ad8159 100644 --- a/tests/test_process_seqs.py +++ b/tests/test_process_seqs.py @@ -164,8 +164,7 @@ def test_demultiplex_hamming8(self): def test_demultiplex_encoded_barcode(self): """Verify decoding barcodes""" - wf_obj = self._make_workflow_obj({'demultiplex': True, - 'barcode_type': 'golay_12'}) + wf_obj = self._make_workflow_obj({'barcode_type': 'golay_12'}) needs_a_fix = {'Barcode': 'GGAGACAAGGGT', 'Sequence': 'AATTGGCC'} exact = {'Barcode': 'GGAGACAAGGGA', 'Sequence': 'AATTGGCC'} @@ -214,8 +213,7 @@ def test_demultiplex_encoded_barcode(self): def test_demultiplex_max_barcode_error(self): """Verify failing max_barcode_error checking""" - wf_obj = self._make_workflow_obj({'demultiplex': True, - 'barcode_type': 'golay_12', + wf_obj = self._make_workflow_obj({'barcode_type': 'golay_12', 'max_barcode_error': 0}) needs_a_fix = {'Barcode': 'GGAGACAAGGGT', 'Sequence': 'AATTGGCC'} @@ -396,8 +394,7 @@ def test_quality_index_ambiguity(self): def test_full_process_simple(self): """Just demux""" - wf_obj = self._make_workflow_obj({'demultiplex': True, - 'barcode_type': 'golay_12'}) + wf_obj = self._make_workflow_obj({'barcode_type': 'golay_12'}) seq_raw = fastq1.splitlines() bc_raw = barcode_fastq1.splitlines() From 1a5dfdaebab18f7b88eb1437c582c8cf66036da0 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 14:25:02 -0600 Subject: [PATCH 56/61] ENH/MAINT: adding click interface, removing old files --- qiime/cli.py | 19 ++ qiime/click_commands.py | 129 +++++++++++ qiime/workflow/core.py | 332 ---------------------------- scripts/qiime | 9 + tests/test_quality_filter_fasta.py | 112 ---------- tests/test_workflow/test_core.py | 341 ----------------------------- 6 files changed, 157 insertions(+), 785 deletions(-) create mode 100644 qiime/cli.py create mode 100644 qiime/click_commands.py delete mode 100644 qiime/workflow/core.py create mode 100755 scripts/qiime delete mode 100644 tests/test_quality_filter_fasta.py delete mode 100644 tests/test_workflow/test_core.py diff --git a/qiime/cli.py b/qiime/cli.py new file mode 100644 index 0000000000..b50c88191e --- /dev/null +++ b/qiime/cli.py @@ -0,0 +1,19 @@ +import click + +import qiime + + +def print_version(ctx, param, value): + if not value or ctx.resilient_parsing: + return + click.echo('Version %s' % qiime.__version__) + ctx.exit() + + +@click.group() +@click.option('--version', is_flag=True, callback=print_version, + expose_value=False, is_eager=True) +@click.pass_context +def qiime_cli(ctx): + """QIIME, canonically pronounced 'chime'""" + pass diff --git a/qiime/click_commands.py b/qiime/click_commands.py new file mode 100644 index 0000000000..279341c959 --- /dev/null +++ b/qiime/click_commands.py @@ -0,0 +1,129 @@ +import os + +import click + +from .cli import qiime_cli + + +@qiime_cli.command() +# I/O options +@click.option('--sequence-read-fp', '-i', multiple=True, required=True, + type=click.Path(exists=True), help='Input sequence reads') +@click.option('--output-dir', '-o', type=click.Path(exists=False), + required=True) +@click.option('--mapping_fp', '-m', required=True, + type=click.File('U'), help='Mapping file') +@click.option('--barcode-read-fp', '-b', multiple=True, required=False, + type=click.Path(exists=True), help='Barcode read files') +@click.option('--rev-comp/--no-rev-comp', default=False, + help='Reverse complement sequences on output') +@click.option('--start-seq-id', type=int, default=0, + help='The starting unique ID for sequences') +# Iterator options +@click.option('--rev-comp-barcodes/--no-rev-comp-barcodes', default=False, + help='Reverse complement barcode reads') +@click.option('--phred-offset', type=click.Choice(['33', '64']), + help='The ASCII offset used to decode PHRED scores') +# Runtime options +@click.option('--phred-quality-threshold', '-q', default=3, type=int, + help='Minimum PHRED quality score') +@click.option('--barcode-type', help='The type of barcode used', default=None, + type=click.Choice(['golay_12', 'hamming_8', 'not-barcoded'])) +@click.option('--max-barcode-error', default=1.5, type=float, + help='The maximum number of barcode errors allowed') +@click.option('--retain-primer/--no-retain-primer', default=False, + help='Whether to retain the primers or not (if applicable)') +@click.option('--max-primer-mismatch', type=int, default=0, + help='Maximum mismatches allowed within the primers') +@click.option('--min-seq-len', type=int, + help='The minimum sequence length') +@click.option('--max-ambig-count', default=0, type=int, + help='Maximum ambiguous bases allowed') +# Other options +@click.option('--rev-comp-mapping-barcodes/--no-rev-comp-mapping-barcodes', + default=False, help='Reverse complement the mapping barcodes') +@click.pass_context +def slib(ctx, **kwargs): + """Quality filter and demultiplex sequences""" + from skbio import DNA + from skbio.parse.sequences.factory import load + + from qiime.parse import parse_mapping_file_to_dict + from qiime.process_seqs import SequenceWorkflow, IterAdapter + + # qiime_config is available under ctx.obj['qiime_config'] + + # reverse complement for reversing mapping barcodes + def rc(seq): + return str(DNA(seq).rc()) + + # reverse complement for reversing barcode reads + def rc_it(st): + st['Sequence'] = rc(st['Sequence']) + st['Qual'] = st['Qual'][::-1] if st['Qual'] is not None else None + + # id formatter for writing + def format_id(idx, state): + seq_id = "%s_%d" % (state['Sample'], idx) + ori_id = state['SequenceID'] + ori_bc = "orig_bc=%s" % state['Original barcode'] + new_bc = "new_bc=%s" % state['Final barcode'] + bc_diff = "bc_diffs=%d" % (state['Barcode errors'] or 0) + return " ".join([seq_id, ori_id, ori_bc, new_bc, bc_diff]) + + # should be sourced from skbio + def format_fasta(id_, seq): + return ">%s\n%s\n" % (id_, seq) + + # not defining fastq as the method should be sourced from skbio + # particularly dealing with qual + + # setup sequence iterator + seqs = load(kwargs.pop('sequence_read_fp')) + + # setup barcode iterator + barcode_read_fp = kwargs.pop('barcode_read_fp') + if barcode_read_fp: + transform = rc_it if kwargs.pop('rev_comp_barcodes') else None + barcodes = load(barcode_read_fp, transform=transform) + else: + barcodes = None + + # load mapping, setup barcode and primer maps + mapping, comments = parse_mapping_file_to_dict(kwargs.pop('mapping_fp')) + barcode_map = {v['BarcodeSequence']: k for k, v in mapping.items()} + primer_map = {v['BarcodeSequence']: v['LinkerPrimerSequence'].split(',') + for v in mapping.values()} + + # reverse complement barcodes if necessary + if kwargs.pop('rev_comp_mapping_barcodes'): + barcode_map = {rc(k): v for k, v in barcode_map.items()} + + # setup outputs and options + output_dir = kwargs.pop('output_dir') + if not os.path.exists(output_dir): + os.mkdir(output_dir) + + success_fp = os.path.join(output_dir, 'seqs.fna') + fail_fp = os.path.join(output_dir, 'unassigned.fna') + + if os.path.exists(success_fp): + raise IOError("%s already exists!" % success_fp) + + if os.path.exists(fail_fp): + raise IOError("%s already exists!" % fail_fp) + + # setup starting sequence ID and whether to RC on write + seq_id = kwargs.pop('start_seq_id') + rc = rc if kwargs.pop('rev_comp') else lambda x: x + + # setup sequence/barcode iterator + iter_ = IterAdapter(seqs, barcodes) + wf = SequenceWorkflow(options=kwargs, mapping=mapping, + barcodes=barcode_map, primers=primer_map) + + with open(success_fp, 'w') as success, open(fail_fp, 'w') as failed: + for idx, item in enumerate(wf(iter_, fail_callback=lambda x: x.state)): + id_ = format_id(seq_id + idx, item) + formatted = format_fasta(id_, item['Sequence']) + failed.write(formatted) if wf.failed else success.write(formatted) diff --git a/qiime/workflow/core.py b/qiime/workflow/core.py deleted file mode 100644 index 714498c7ef..0000000000 --- a/qiime/workflow/core.py +++ /dev/null @@ -1,332 +0,0 @@ -#!/usr/bin/env python - -"""Perform multiple method calls, determined at runtime, on independent items - -Construct arbitrarily complex workflows in which the specific methods run are -determined at runtime. These methods are applied to items that are assumed to -be independent. - -As an example: - -class MyWorkflow(Workflow): - @priority(100) - @no_requirements - def wf_mul(self, item): - self.FinalState *= item - - @priority(10) - @requires(Option='add_value') - def wf_add(self, item): - self.FinalState += item - - @requires(Option='sub_value', Values=[1,5,10]) - def wf_sub(self, item): - self.FinalState -= item - self.FinalState -= self.Options['sub_value'] - - @priority(1000) - @requires(IsValid=False) - def wf_init(self, item): - self.FinalState = item - -# (i * i) + i - i - 5 -wf = MyWorkflow(Options={'add_value':None, 'sub_value':5}) -gen = (i for i in range(10)) -for i in wf(gen): - print i - -# (i * i) - i - 10 -wf = MyWorkflow(Options={'sub_value':10}) -gen = (i for i in range(10)) -for i in wf(gen): - print i - -# (i * i) -wf = MyWorkflow() -gen = (i for i in range(10)) -for i in wf(gen): - print i -""" - -from itertools import chain -from functools import update_wrapper -from collections import Iterable, defaultdict -from types import MethodType - -__author__ = "Daniel McDonald" -__copyright__ = "Copyright 2013, The QIIME Project" -__credits__ = ["Daniel McDonald", "Tony Walters"] -__license__ = "BSD" # NOTE, this script does _not_ import GPL code -__version__ = "1.7.0-dev" -__maintainer__ = "Daniel McDonald" -__email__ = "mcdonadt@colorado.edu" -__status__ = "Development" - -# thank you Flask project... -_missing = object() -_executed = object() - -class Exists(object): - def __contains__(self, item): - return True -option_exists = Exists() - -def _debug_trace_wrapper(obj, f): - """Trace a function call""" - def wrapped(self, *args, **kwargs): - if not hasattr(obj, 'DebugTrace'): - raise AttributeError("%s does not have DebugTrace!" % obj.__class__) - - obj.DebugTrace.append(f.__name__) - return f(self, *args, **kwargs) - - return update_wrapper(wrapped, f) - -def _tag_function(f): - """Tag, you're it""" - setattr(f, '__workflowtag__', None) - -class priority(object): - """Sets a function priority""" - def __init__(self, Priority): - self.Priority = Priority - - def __call__(self, f): - f.Priority = self.Priority - return f - -def no_requirements(f): - def decorated(self, *args, **kwargs): - f(self, *args, **kwargs) - return _executed - _tag_function(decorated) - return update_wrapper(decorated, f) - -class requires(object): - """Decorator that executes a function if requirements are met""" - def __init__(self, IsValid=True, Option=None, Values=_missing, - ValidData=None): - """ - IsValid : execute the function if self.Failed is False - Option : a required option - Values : required values associated with an option - ValidData : data level requirements, this must be a function with the - following signature: f(*args, **kwargs) returning True. NOTE: if - ValidData returns False on the first item evaluated, the decorated - function will be removed from the remaining workflow. - """ - # self here is the requires object - self.IsValid = IsValid - self.Option = Option - self.ValidData = ValidData - - if Values is _missing: - self.Values = option_exists - elif not isinstance(Values, set): - if isinstance(Values, str): - self.Values = Values - elif isinstance(Values, Iterable): - self.Values = set(Values) - else: - self.Values = set([Values]) - else: - self.Values = Values - - def doShortCircuit(self, wrapped): - if self.IsValid and (wrapped.Failed and wrapped.ShortCircuit): - return True - else: - return False - - def __call__(self, f): - """Wrap a function - - f : the function to wrap - """ - ### not sure how I feel about having multiple functions in here. - ### also, the handling of Data is a bit dirty as it is now replicated - ### over these functions. It is ideal to keep the functions slim, thus - ### the multiple functions, but this could explode if not careful - def decorated_with_option(dec_self, *args, **kwargs): - """A decorated function that has an option to validate - - dec_self : this is "self" for the decorated function - """ - if self.doShortCircuit(dec_self): - return - - if self.ValidData is not None: - if not self.ValidData(*args, **kwargs): - return - - s_opt = self.Option - ds_opts = dec_self.Options - - if s_opt in ds_opts and ds_opts[s_opt] in self.Values: - f(dec_self, *args, **kwargs) - return _executed - - def decorated_without_option(dec_self, *args, **kwargs): - """A decorated function that does not have an option to validate - - dec_self : this is "self" for the decorated function - """ - if self.doShortCircuit(dec_self): - return - - if self.ValidData is not None: - if not self.ValidData(*args, **kwargs): - return - - f(dec_self, *args, **kwargs) - return _executed - - _tag_function(decorated_with_option) - _tag_function(decorated_without_option) - - if self.Option is None: - return update_wrapper(decorated_without_option, f) - else: - return update_wrapper(decorated_with_option, f) - -class Workflow(object): - """Arbitrary worflow support structure""" - - def __init__(self, ShortCircuit=True, Debug=False, Options=None, **kwargs): - """Build thy self - - ShortCiruit : if True, enables ignoring function groups when a given - item has failed - Debug : Enable debug mode - Options : runtime options, {'option':values} - kwargs : Additional arguments will be added to self - - All workflow methods (i.e., those starting with "wk_") must be decorated - by either "no_requirements" or "requires". This ensures that the methods - support the automatic workflow determination mechanism. - """ - if Options is None: - self.Options = {} - else: - self.Options = Options - - ### collections.Counter instead? - self.Stats = defaultdict(int) - self.ShortCircuit = ShortCircuit - self.Failed = False - self.Debug = Debug - - if self.Debug: - self.DebugTrace = [] - - for k,v in kwargs.iteritems(): - if hasattr(self, k): - raise AttributeError("%s exists in self!" % k) - setattr(self, k, v) - - for f in self._all_wf_methods(): - if not hasattr(f, '__workflowtag__'): - raise AttributeError("%s isn't a workflow method!" % f.__name__) - - self._sanity_check() - self._stage_state() - self._setup_debug() - - def _setup_debug(self): - """Wrap all methods with debug trace support""" - if not self.Debug: - return - - _ignore = set(['_get_workflow','_all_wf_methods','_sanity_check', - '_stage_state']) - - for attrname in dir(self): - if attrname.startswith('__'): - continue - if attrname in _ignore: - continue - - attr = getattr(self, attrname) - - if isinstance(attr, MethodType): - setattr(self, attrname, _debug_trace_wrapper(self, attr)) - - def _stage_state(self): - """Stage any additional data necessary for the workflow - - This does not need to be overloaded - """ - pass - - def _sanity_check(self): - """Perform a sanity check on self""" - raise NotImplementedError("Must implement a sanity check!") - - def _all_wf_methods(self, default_priority=0): - """Get all workflow methods - - Methods are sorted by priority - """ - methods = [getattr(self, f) for f in dir(self) if f.startswith('wf_')] - key = lambda x: getattr(x, 'Priority', default_priority) - methods_sorted = sorted(methods, key=key, reverse=True) - - if methods_sorted[0] != self.wf_SETUP_DEBUG_TRACE: - name = methods_sorted[0].__name__ - debug_prio = self.wf_SETUP_DEBUG_TRACE.Priority - - raise AttributeError("Method %s has a higher priority than the " - "debug trace method. Please set its priority " - "below %d." % (name, debug_prio)) - - if not self.Debug: - methods_sorted.pop(0) - - return methods_sorted - - def _get_workflow(self, it): - """Get the methods executed, sorted by priority""" - # save state - shortcircuit_state = self.ShortCircuit - self.ShortCircuit = False - stats = self.Stats.copy() - - peek = it.next() - executed = [f for f in self._all_wf_methods() if f(peek) is _executed] - - # restore state - self.ShortCircuit = shortcircuit_state - self.Stats = stats - generator_reset = chain([peek], it) - - return generator_reset, executed - - @priority(99999999) - @no_requirements - def wf_SETUP_DEBUG_TRACE(self, item): - self.DebugTrace = [] - - def __call__(self, it, success_callback=None, fail_callback=None): - """Operate on all the data - - it : an iterator - success_callback : method to call on a successful item prior to - yielding - fail_callback : method to call on a failed item prior to yielding - """ - if success_callback is None: - success_callback = lambda x: x.FinalState - - it, workflow = self._get_workflow(it) - - for item in it: - self.Failed = False - - for f in workflow: - f(item) - - if self.Failed: - if fail_callback is not None: - yield fail_callback(self) - else: - yield success_callback(self) diff --git a/scripts/qiime b/scripts/qiime new file mode 100755 index 0000000000..f1fdbaba28 --- /dev/null +++ b/scripts/qiime @@ -0,0 +1,9 @@ +#!/usr/bin/env python + +from qiime.cli import qiime_cli +from qiime.click_commands import slib +from qiime.util import load_qiime_config + + +if __name__ == '__main__': + qiime_cli(obj={'qiime_config': load_qiime_config()}) diff --git a/tests/test_quality_filter_fasta.py b/tests/test_quality_filter_fasta.py deleted file mode 100644 index 96e36b7d7b..0000000000 --- a/tests/test_quality_filter_fasta.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python - -from cogent.util.unit_test import TestCase, main -from cogent.parse.fasta import MinimalFastaParser -from qiime.parse import MinimalQualParser -from itertools import chain -from numpy import array -from qiime.quality_filter_fasta import (_fasta_qual_strict, - fasta_qual_iterator) - -class IteratorTests(TestCase): - def setUp(self): - fasta1_gen = MinimalFastaParser(fasta1.splitlines()) - qual1_gen = MinimalQualParser(qual1.splitlines()) - fasta2_gen = MinimalFastaParser(fasta2.splitlines()) - qual2_gen = MinimalQualParser(qual2.splitlines()) - qual2_bad_gen = MinimalQualParser(qual2_bad.splitlines()) - - self.fasta_gen = chain(fasta1_gen, fasta2_gen) - self.qual_gen = chain(qual1_gen, qual2_gen) - - self.reversed_fasta_gen = chain(fasta2_gen, fasta1_gen) - self.qual_bad_gen = chain(qual1_gen, qual2_bad_gen) - - def test_fasta_qual_strict_simple(self): - exp = [('a', 'abcde', 'a', array([1, 2, 3, 4, 5])), - ('b', 'asdasdasd', 'b', array([1,1,1,1,1,1,1,1,1])), - ('c', '123123', 'c', array([2, 2, 2, 2, 2, 2])), - ('x', 'abcdefg', 'x', array([1, 2, 3, 4, 5, 6, 7])), - ('y', 'popopo', 'y', array([1, 1, 1, 1, 1, 1]))] - - obs = _fasta_qual_strict(self.fasta_gen, self.qual_gen) - for o,e in zip(obs,exp): - osi, osd, oqi, oqd = o - esi, esd, eqi, eqd = e - self.assertEqual((osi, osd, oqi), (esi, esd, eqi)) - self.assertTrue((oqd == eqd).all()) - - def test_fasta_qual_strict_mismatch_ids(self): - with self.assertRaises(ValueError): - g = _fasta_qual_strict(self.reversed_fasta_gen, self.qual_gen) - _ = list(g) - - def test_fasta_qual_strict_mismatch_length(self): - with self.assertRaises(ValueError): - _ = list(_fasta_qual_strict(self.fasta_gen, self.qual_bad_gen)) - - def test_fasta_qual_iterators_just_fasta(self): - exp = [('a', 'abcde', None, None), - ('b', 'asdasdasd', None, None), - ('c', '123123', None, None), - ('x', 'abcdefg', None, None), - ('y', 'popopo', None, None)] - - open_fps = map(lambda x: x.splitlines(), [fasta1, fasta2]) - obs = list(fasta_qual_iterator(open_fps)) - self.assertEqual(obs, exp) - - def test_fasta_qual_iterators_fasta_qual(self): - exp = [('a', 'abcde', 'a', array([1, 2, 3, 4, 5])), - ('b', 'asdasdasd', 'b', array([1,1,1,1,1,1,1,1,1])), - ('c', '123123', 'c', array([2, 2, 2, 2, 2, 2])), - ('x', 'abcdefg', 'x', array([1, 2, 3, 4, 5, 6, 7])), - ('y', 'popopo', 'y', array([1, 1, 1, 1, 1, 1]))] - - splitter = lambda x: x.splitlines() - fasta_fps = map(splitter, [fasta1, fasta2]) - qual_fps = map(splitter, [qual1, qual2]) - - obs = fasta_qual_iterator(fasta_fps, qual_fps) - for o,e in zip(obs, exp): - osi, osd, oqi, oqd = o - esi, esd, eqi, eqd = e - self.assertEqual((osi, osd, oqi), (esi, esd, eqi)) - self.assertTrue((oqd == eqd).all()) - -fasta1 = """>a -abcde ->b -asdasdasd ->c -123123 -""" - -fasta2 = """>x -abcdefg ->y -popopo -""" - -qual1 = """>a -1 2 3 4 5 ->b -1 1 1 1 1 1 1 1 1 ->c -2 2 2 2 2 2 -""" - -qual2 = """>x -1 2 3 4 5 6 7 ->y -1 1 1 1 1 1 -""" - -qual2_bad = """>x -1 2 3 4 5 6 ->y -1 1 1 1 1 1 -""" - -if __name__ == '__main__': - main() diff --git a/tests/test_workflow/test_core.py b/tests/test_workflow/test_core.py deleted file mode 100644 index baf23f23e7..0000000000 --- a/tests/test_workflow/test_core.py +++ /dev/null @@ -1,341 +0,0 @@ -#!/usr/bin/env python - -from itertools import izip -from qiime.workflow.core import (Workflow, requires, priority, - no_requirements) -from unittest import TestCase, main - -__author__ = "Daniel McDonald" -__copyright__ = "Copyright 2013, The QIIME Project" -__credits__ = ["Daniel McDonald"] -__license__ = "BSD" # NOTE, does not import any GPL code -__version__ = "1.7.0-dev" -__maintainer__ = "Daniel McDonald" -__email__ = "mcdonadt@colorado.edu" -__status__ = "Development" - -def construct_iterator(**kwargs): - """make an iterator for testing purposes""" - to_gen = [] - for k in sorted(kwargs): - if k.startswith('iter'): - to_gen.append(kwargs[k]) - if len(to_gen) == 1: - return (x for x in to_gen[0]) - else: - return izip(*to_gen) - -class MockWorkflow(Workflow): - def _sanity_check(self): - pass - - @priority(90) - @requires(Option='A', Values=True) - def wf_groupA(self, item): - self.methodA1(item) - self.methodA2(item) - - @requires(Option='B', Values=True) - def wf_groupB(self, item): - self.methodB1(item) - self.methodB2(item) - - @priority(10) - @requires(Option='C', Values=True) - def wf_groupC(self, item): - self.methodC1(item) - self.methodC2(item) - - @requires(IsValid=False) # always execute - def methodA1(self, item): - name = 'A1' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - - def methodA2(self, item): - name = 'A2' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - - @requires(IsValid=False) - def methodB1(self, item): - name = 'B1' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = 'failed' - else: - self.FinalState = (name, item) - - @requires(Option='foo', Values=[1,2,3]) - def methodB2(self, item): - name = 'B2' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = 'failed' - else: - self.FinalState = (name, item) - - @no_requirements - def methodC1(self, item): - name = 'C1' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - - @requires(IsValid=True, Option='C2', Values=[1,2,3]) - def methodC2(self, item): - name = 'C2' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - -class WorkflowTests(TestCase): - def setUp(self): - self.obj_short = MockWorkflow(Options={'A':True, 'C':True}) - self.obj_debug = MockWorkflow(Debug=True, Options={'A':True, 'C':True}) - self.obj_noshort = MockWorkflow(ShortCircuit=False, Options=\ - {'A':True, - 'C':True}) - - def test_untagged_wf_method(self): - class WFTest(Workflow): - @no_requirements - def wf_1(self): - pass - def wf_2(self): - pass - - with self.assertRaises(AttributeError): - _ = WFTest() - - def test_get_workflow_debug(self): - gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) - exp_wf = [self.obj_debug.wf_SETUP_DEBUG_TRACE, self.obj_debug.wf_groupA, - self.obj_debug.wf_groupC] - obs_gen, obs_wf = self.obj_debug._get_workflow(gen) - - self.assertEqual(obs_wf, exp_wf) - self.assertEqual(list(obs_gen), [1,2,3,4,5]) - - self.assertEqual(self.obj_debug.Stats, {}) - self.assertTrue(self.obj_debug.ShortCircuit) - - def test_debug_trace(self): - gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) - obj = self.obj_debug(gen) - - exp = ('C1',1) - obs = obj.next() - self.assertEqual(obs, exp) - - exp = ['wf_groupA', - 'methodA1', - 'methodA2', - 'wf_groupC', - 'methodC1', - 'methodC2'] - obs = self.obj_debug.DebugTrace - self.assertEqual(obs, exp) - - def test_get_workflow(self): - gen = construct_iterator(**{'iter_x':[1,2,3,4,5]}) - exp_wf = [self.obj_short.wf_groupA, self.obj_short.wf_groupC] - obs_gen, obs_wf = self.obj_short._get_workflow(gen) - - self.assertEqual(obs_wf, exp_wf) - self.assertEqual(list(obs_gen), [1,2,3,4,5]) - - self.assertEqual(self.obj_short.Stats, {}) - self.assertTrue(self.obj_short.ShortCircuit) - - def test_init(self): - self.assertEqual(self.obj_short.Options, {'A':True, 'C':True}) - self.assertEqual(self.obj_short.Stats, {}) - self.assertTrue(self.obj_short.ShortCircuit) - self.assertEqual(self.obj_noshort.Options, {'A':True, 'C':True}) - self.assertEqual(self.obj_noshort.Stats, {}) - self.assertFalse(self.obj_noshort.ShortCircuit) - - def test_all_wf_methods(self): - # note on priority: groupA:90, groupC:10, groupB:0 (default) - exp = [self.obj_short.wf_groupA, self.obj_short.wf_groupC, - self.obj_short.wf_groupB] - obs = self.obj_short._all_wf_methods() - self.assertEqual(obs, exp) - - def test_call_AC_no_fail(self): - single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) - sf = lambda x: x.FinalState # success function - - exp_stats = {'A1':5, 'A2':5, 'C1':5} - # C2 isn't executed as its requirements aren't met in the Options - exp_result = [('C1',1), ('C1',2), ('C1',3), ('C1',4), ('C1', 5)] - - obs_result = list(self.obj_short(single_iter, sf, None)) - - self.assertEqual(obs_result, exp_result) - self.assertEqual(self.obj_short.Stats, exp_stats) - - def test_call_AC_fail(self): - single_iter = construct_iterator(**{'iter_x':[1,2,'fail A2',4,5]}) - sf = lambda x: x.FinalState # success function - ff = lambda x: x.FinalState # failed function - - exp_stats = {'A1':5, 'A2':5, 'C1':4, 'C2':4} - - self.obj_short.Options['C2'] = 1 - # pass in a failed callback to capture the result, and pause execution - gen = self.obj_short(single_iter, sf, ff) - - r1 = gen.next() - self.assertEqual(r1, ('C2', 1)) - self.assertFalse(self.obj_short.Failed) - - r2 = gen.next() - self.assertEqual(r2, ('C2', 2)) - self.assertFalse(self.obj_short.Failed) - - r3 = gen.next() - self.assertEqual(self.obj_short.FinalState, ('A2', 'fail A2')) - self.assertTrue(self.obj_short.Failed) - self.assertEqual(r3, ('A2', 'fail A2')) - - r4 = gen.next() - self.assertEqual(r4, ('C2', 4)) - self.assertFalse(self.obj_short.Failed) - - r5 = gen.next() - self.assertEqual(r5, ('C2', 5)) - self.assertFalse(self.obj_short.Failed) - - self.assertEqual(self.obj_short.Stats, exp_stats) - - def test_call_AC_fail_noshort(self): - single_iter = construct_iterator(**{'iter_x':[1,2,'fail A2',4,5]}) - sf = lambda x: x.FinalState # success function - ff = lambda x: x.FinalState # failed function - - exp_stats = {'A1':5, 'A2':5, 'C1':5} - - # pass in a failed callback to capture the result, and pause execution - gen = self.obj_noshort(single_iter, sf, ff) - - r1 = gen.next() - self.assertEqual(r1, ('C1', 1)) - self.assertFalse(self.obj_noshort.Failed) - - r2 = gen.next() - self.assertEqual(r2, ('C1', 2)) - self.assertFalse(self.obj_noshort.Failed) - - r3 = gen.next() - self.assertEqual(self.obj_noshort.FinalState, ('C1', 'fail A2')) - self.assertTrue(self.obj_noshort.Failed) - - r4 = gen.next() - self.assertEqual(r4, ('C1', 4)) - self.assertFalse(self.obj_noshort.Failed) - - r5 = gen.next() - self.assertEqual(r5, ('C1', 5)) - self.assertFalse(self.obj_noshort.Failed) - - self.assertEqual(self.obj_noshort.Stats, exp_stats) - -class MockWorkflowReqTest(Workflow): - def _sanity_check(self): - pass - - @priority(5) - @requires(ValidData=lambda x: x < 3) - def wf_needs_data(self, item): - name = 'needs_data' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - - @priority(10) - @no_requirements - def wf_always_run(self, item): - name = 'always_run' - self.Stats[name] += 1 - if item == 'fail %s' % name: - self.Failed = True - self.FinalState = (name, item) - -class RequiresTests(TestCase): - def test_validdata(self): - obj = MockWorkflowReqTest() - single_iter = construct_iterator(**{'iter_x':[1,2,3,4,5]}) - - exp_stats = {'needs_data':2, 'always_run':5} - # C2 isn't executed as its requirements aren't met in the Options - exp_result = [('needs_data',1), ('needs_data',2), ('always_run',3), - ('always_run',4), ('always_run', 5)] - - obs_result = list(obj(single_iter)) - - self.assertEqual(obs_result, exp_result) - self.assertEqual(obj.Stats, exp_stats) - - def test_methodb1(self): - obj = MockWorkflow() - obj.methodB1('test') - self.assertEqual(obj.FinalState, ('B1', 'test')) - self.assertFalse(obj.Failed) - - # methodb1 executes regardless of if self.Failed - obj.Failed = True - obj.methodB1('test 2') - self.assertEqual(obj.FinalState, ('B1', 'test 2')) - - obj.Failed = False - obj.methodB1('fail B1') - self.assertEqual(obj.FinalState, 'failed') - - self.assertEqual(obj.Stats, {'B1':3}) - - def test_methodb2_accept(self): - # methodb2 is setup to be valid when foo is in [1,2,3], make sure we - # can execute - obj = MockWorkflow(Options={'foo':1}) - obj.methodB2('test') - self.assertEqual(obj.FinalState, ('B2', 'test')) - self.assertEqual(obj.Stats, {'B2':1}) - - # methodb2 will not execute if self.Failed - obj.Failed = True - obj.methodB2('test 2') - self.assertEqual(obj.FinalState, ('B2', 'test')) - self.assertEqual(obj.Stats, {'B2':1}) - - def test_methodb2_ignore(self): - # methodb2 is setup to be valid when foo is in [1, 2, 3], make sure - # we do not execute - obj = MockWorkflow(Options={'foo':'bar'}) - obj.methodB2('test') - self.assertEqual(obj.FinalState, None) - self.assertEqual(obj.Stats, {}) - -class PriorityTests(TestCase): - def test_dec(self): - @priority(10) - def foo(x,y,z): - """doc check""" - return x+y+z - - self.assertEqual(foo.Priority, 10) - self.assertEqual(foo.__name__, 'foo') - self.assertEqual(foo.__doc__, 'doc check') - -if __name__ == '__main__': - main() From 97ac7b537df2ffa8f29ebf100b410a1d07e205ac Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 15:29:41 -0600 Subject: [PATCH 57/61] ENH: output fastq or fasta --- qiime/click_commands.py | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/qiime/click_commands.py b/qiime/click_commands.py index 279341c959..feb2b8cc0f 100644 --- a/qiime/click_commands.py +++ b/qiime/click_commands.py @@ -19,10 +19,12 @@ help='Reverse complement sequences on output') @click.option('--start-seq-id', type=int, default=0, help='The starting unique ID for sequences') +@click.option('--to-fastq', is_flag=True, + help='Write out in fastq') # Iterator options @click.option('--rev-comp-barcodes/--no-rev-comp-barcodes', default=False, help='Reverse complement barcode reads') -@click.option('--phred-offset', type=click.Choice(['33', '64']), +@click.option('--phred-offset', default='33', type=click.Choice(['33', '64']), help='The ASCII offset used to decode PHRED scores') # Runtime options @click.option('--phred-quality-threshold', '-q', default=3, type=int, @@ -47,12 +49,15 @@ def slib(ctx, **kwargs): """Quality filter and demultiplex sequences""" from skbio import DNA from skbio.parse.sequences.factory import load + from skbio.format.sequences.fastq import format_fastq_record from qiime.parse import parse_mapping_file_to_dict from qiime.process_seqs import SequenceWorkflow, IterAdapter # qiime_config is available under ctx.obj['qiime_config'] + phred_offset = int(kwargs.pop('phred_offset')) + # reverse complement for reversing mapping barcodes def rc(seq): return str(DNA(seq).rc()) @@ -71,21 +76,28 @@ def format_id(idx, state): bc_diff = "bc_diffs=%d" % (state['Barcode errors'] or 0) return " ".join([seq_id, ori_id, ori_bc, new_bc, bc_diff]) - # should be sourced from skbio - def format_fasta(id_, seq): - return ">%s\n%s\n" % (id_, seq) + # should be sourced from skbio but there doesn't appear to be a method that + # takes a single seq and ID + def format_fasta(id_, item): + return ">%s\n%s\n" % (id_, item['Sequence']) - # not defining fastq as the method should be sourced from skbio - # particularly dealing with qual + def make_format_fastq(offset): + def f(id_, state): + seq = state['Sequence'] + qual = state['Qual'] + return format_fastq_record(id_, seq, qual, offset) + return f + format_fastq = make_format_fastq(phred_offset) # setup sequence iterator - seqs = load(kwargs.pop('sequence_read_fp')) + seqs = load(kwargs.pop('sequence_read_fp'), phred_offset=phred_offset) # setup barcode iterator barcode_read_fp = kwargs.pop('barcode_read_fp') if barcode_read_fp: transform = rc_it if kwargs.pop('rev_comp_barcodes') else None - barcodes = load(barcode_read_fp, transform=transform) + barcodes = load(barcode_read_fp, transform=transform, + phred_offset=phred_offset) else: barcodes = None @@ -104,8 +116,11 @@ def format_fasta(id_, seq): if not os.path.exists(output_dir): os.mkdir(output_dir) - success_fp = os.path.join(output_dir, 'seqs.fna') - fail_fp = os.path.join(output_dir, 'unassigned.fna') + to_fastq = kwargs.pop('to_fastq') + ext = 'fq' if to_fastq else 'fna' + formatter = format_fastq if to_fastq else format_fasta + success_fp = os.path.join(output_dir, 'seqs.%s' % ext) + fail_fp = os.path.join(output_dir, 'unassigned.%s' % ext) if os.path.exists(success_fp): raise IOError("%s already exists!" % success_fp) @@ -125,5 +140,5 @@ def format_fasta(id_, seq): with open(success_fp, 'w') as success, open(fail_fp, 'w') as failed: for idx, item in enumerate(wf(iter_, fail_callback=lambda x: x.state)): id_ = format_id(seq_id + idx, item) - formatted = format_fasta(id_, item['Sequence']) + formatted = formatter(id_, item) failed.write(formatted) if wf.failed else success.write(formatted) From 27d801abefb787dfdaba87175815d71509de623c Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 17:27:09 -0600 Subject: [PATCH 58/61] TST: tests for click command --- tests/test_click_commands.py | 53 ++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 tests/test_click_commands.py diff --git a/tests/test_click_commands.py b/tests/test_click_commands.py new file mode 100644 index 0000000000..cab3e9a97b --- /dev/null +++ b/tests/test_click_commands.py @@ -0,0 +1,53 @@ +import inspect +import os +import re +from unittest import TestCase, main + +from click.testing import CliRunner + +from qiime.click_commands import slib + + +def get_qiime_test_data(name): + """Based off of skbio.util.testing.get_data_path""" + callers_filename = inspect.getouterframes(inspect.currentframe())[1][1] + path = os.path.dirname(os.path.abspath(callers_filename)) + data_path = os.path.join(path, '../qiime_test_data', name) + return data_path + + +class ClickMixin(object): + runner = CliRunner() + + def find_usage_commands(self, text): + pat = re.compile(r'\$ qiime.*(?:\n|$)') + + for match in pat.finditer(text): + yield match.group(0) + + def parse_usage_commands(self, command_name, command_str): + if not command_str.startswith('$ qiime %s' % command_name): + raise ValueError("Cannot interpret: %s" % command_str) + + _, args = command_str.split(command_name, 1) + args = args.replace('$PWD', self.testdata_dir) + return [arg.strip() for arg in args.split()] + + def test_usage(self): + for usage in self.find_usage_commands(self.command.help): + args = self.parse_usage_commands(self.command.name, usage) + + with self.runner.isolated_filesystem(): + result = self.runner.invoke(self.command, args=args) + if result.exit_code != 0: + self.fail(result.output) + + +class SlibTests(ClickMixin, TestCase): + def setUp(self): + self.testdata_dir = get_qiime_test_data('split_libraries_fastq') + self.command = slib + + +if __name__ == '__main__': + main() From 3319d97118829bb30d412a8ee8bcaa8081956632 Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Wed, 13 Aug 2014 17:27:24 -0600 Subject: [PATCH 59/61] DOC: usage exampleS --- qiime/click_commands.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/qiime/click_commands.py b/qiime/click_commands.py index feb2b8cc0f..da6eef645c 100644 --- a/qiime/click_commands.py +++ b/qiime/click_commands.py @@ -11,7 +11,7 @@ type=click.Path(exists=True), help='Input sequence reads') @click.option('--output-dir', '-o', type=click.Path(exists=False), required=True) -@click.option('--mapping_fp', '-m', required=True, +@click.option('--mapping-fp', '-m', required=True, type=click.File('U'), help='Mapping file') @click.option('--barcode-read-fp', '-b', multiple=True, required=False, type=click.Path(exists=True), help='Barcode read files') @@ -29,8 +29,9 @@ # Runtime options @click.option('--phred-quality-threshold', '-q', default=3, type=int, help='Minimum PHRED quality score') -@click.option('--barcode-type', help='The type of barcode used', default=None, - type=click.Choice(['golay_12', 'hamming_8', 'not-barcoded'])) +@click.option('--barcode-type', help='The type of barcode used', + default='golay_12', type=click.Choice(['golay_12', 'hamming_8', + 'not-barcoded'])) @click.option('--max-barcode-error', default=1.5, type=float, help='The maximum number of barcode errors allowed') @click.option('--retain-primer/--no-retain-primer', default=False, @@ -46,7 +47,25 @@ default=False, help='Reverse complement the mapping barcodes') @click.pass_context def slib(ctx, **kwargs): - """Quality filter and demultiplex sequences""" + """Quality filter and demultiplex sequences + + Examples + -------- + + Demultiplex and quality filter (at Phred >= Q20) one lane of Illumina fastq + data and write results to ./slout_q20: + + $ qiime slib -i $PWD/lane1_read1.fastq.gz -b $PWD/lane1_barcode.fastq.gz \ + -m $PWD/map.txt -o slout_q20 --rev-comp-mapping-barcodes -q 20 + + Demultiplex and quality filter (at Phred >= Q20) two lanes of Illumina + fastq data and write results to ./slout_q20: + + $ qiime slib -i $PWD/lane1_read1.fastq.gz -i $PWD/lane2_read1.fastq.gz \ + -b $PWD/lane1_barcode.fastq.gz -b $PWD/lane2_barcode.fastq.gz \ + -m $PWD/map.txt -o slout_q20 --rev-comp-mapping-barcodes -q 20 + """ + print kwargs from skbio import DNA from skbio.parse.sequences.factory import load from skbio.format.sequences.fastq import format_fastq_record From 417a54162ce96025ae941ed858f9c4fc8e116f0f Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 14 Aug 2014 15:05:09 -0600 Subject: [PATCH 60/61] BLD: adding click --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ad230a45c1..d25165f428 100644 --- a/setup.py +++ b/setup.py @@ -261,7 +261,7 @@ def app_available(app_name): 'biom-format == 2.1', 'emperor == 0.9.3-dev', 'scikit-bio == 0.1.4', 'brokit == 0.0.0-dev', 'pandas >= 0.13.1', 'future', 'h5py>=2.2.0', - 'burrito'], + 'burrito', 'click >= 3.0'], dependency_links=[ 'https://github.com/biocore/brokit/archive/master.zip#egg=brokit-0.0.0-dev', 'https://github.com/biocore/biom-format/archive/master.zip#egg=biom-format-2.0.1-dev', From 03a5da43e6eac2e70647a13295d8295f7932f4dd Mon Sep 17 00:00:00 2001 From: Daniel McDonald Date: Thu, 14 Aug 2014 15:05:49 -0600 Subject: [PATCH 61/61] MAINT: removed print --- qiime/click_commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/qiime/click_commands.py b/qiime/click_commands.py index da6eef645c..eeb23168bb 100644 --- a/qiime/click_commands.py +++ b/qiime/click_commands.py @@ -65,7 +65,6 @@ def slib(ctx, **kwargs): -b $PWD/lane1_barcode.fastq.gz -b $PWD/lane2_barcode.fastq.gz \ -m $PWD/map.txt -o slout_q20 --rev-comp-mapping-barcodes -q 20 """ - print kwargs from skbio import DNA from skbio.parse.sequences.factory import load from skbio.format.sequences.fastq import format_fastq_record