diff --git a/CHANGELOG.md b/CHANGELOG.md
index 95049093b..09214d7a9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,12 +3,15 @@
 ## Version 0.4.2-dev (changes since 0.4.2 go here)
 
 ### Features
+* Added `skbio.sequence.distance.kmer_distance` for computing the kmer distance between two sequences. ([#913](https://github.com/biocore/scikit-bio/issues/913))
 
 ### Backward-incompatible changes [stable]
 
 ### Backward-incompatible changes [experimental]
 
 ### Bug fixes
+* Fixed bug when using `Sequence.iter_kmers` on an empty `Sequence` object. Previously this raised a `ValueError`; now it returns
+an empty generator.
 
 ### Miscellaneous
 
diff --git a/skbio/sequence/_sequence.py b/skbio/sequence/_sequence.py
index ed15cbcd6..60f24d967 100644
--- a/skbio/sequence/_sequence.py
+++ b/skbio/sequence/_sequence.py
@@ -1874,7 +1874,7 @@ def iter_kmers(self, k, overlap=True):
             step = k
             count = len(self) // k
 
-        if self.has_positional_metadata():
+        if len(self) == 0 or self.has_positional_metadata():
             for i in range(0, len(self) - k + 1, step):
                 yield self[i:i+k]
         # Optimized path when no positional metadata
diff --git a/skbio/sequence/distance.py b/skbio/sequence/distance.py
index 568db42b9..5d35e62ea 100644
--- a/skbio/sequence/distance.py
+++ b/skbio/sequence/distance.py
@@ -17,6 +17,7 @@
    :toctree: generated/
 
    hamming
+   kmer_distance
 
 """
 
@@ -87,16 +88,7 @@ def hamming(seq1, seq2):
     0.5
 
     """
-    for seq in seq1, seq2:
-        if not isinstance(seq, skbio.Sequence):
-            raise TypeError(
-                "`seq1` and `seq2` must be Sequence instances, not %r"
-                % type(seq).__name__)
-
-    if type(seq1) is not type(seq2):
-        raise TypeError(
-            "Sequences must have matching type. Type %r does not match type %r"
-            % (type(seq1).__name__, type(seq2).__name__))
+    _check_seqs(seq1, seq2)
 
     # Hamming requires equal length sequences. We are checking this here
     # because the error you would get otherwise is cryptic.
@@ -113,3 +105,89 @@ def hamming(seq1, seq2):
 
     distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)
     return float(distance)
+
+
+@experimental(as_of='0.4.2-dev')
+def kmer_distance(seq1, seq2, k, overlap=True):
+    """Compute the kmer distance between a pair of sequences
+
+    The kmer distance between two sequences is the fraction of kmers that are
+    unique to either sequence.
+
+    Parameters
+    ----------
+    seq1, seq2 : Sequence
+        Sequences to compute kmer distance between.
+    k : int
+        The kmer length.
+    overlap : bool, optional
+        Defines whether the kmers should be overlapping or not.
+
+    Returns
+    -------
+    float
+        kmer distance between `seq1` and `seq2`.
+
+    Raises
+    ------
+    ValueError
+        If `k` is less than 1.
+    TypeError
+        If `seq1` and `seq2` are not ``Sequence`` instances.
+    TypeError
+        If `seq1` and `seq2` are not the same type.
+
+    Notes
+    -----
+    kmer counts are not incorporated in this distance metric.
+
+    ``np.nan`` will be returned if there are no kmers defined for the
+    sequences.
+
+    Examples
+    --------
+    >>> from skbio import Sequence
+    >>> seq1 = Sequence('ATCGGCGAT')
+    >>> seq2 = Sequence('GCAGATGTG')
+    >>> kmer_distance(seq1, seq2, 3)  # doctest: +ELLIPSIS
+    0.9230769230...
+
+    """
+    _check_seqs(seq1, seq2)
+    # Kmers are compared by their string form and collapsed into sets, so a
+    # kmer repeated within one sequence is counted only once.
+    seq1_kmers = set(map(str, seq1.iter_kmers(k, overlap=overlap)))
+    seq2_kmers = set(map(str, seq2.iter_kmers(k, overlap=overlap)))
+    all_kmers = seq1_kmers | seq2_kmers
+    # Neither sequence produced a kmer: the fraction below would be 0 / 0.
+    if not all_kmers:
+        return np.nan
+    shared_kmers = seq1_kmers & seq2_kmers
+    # Kmers found in exactly one of the two sequences.
+    number_unique = len(all_kmers) - len(shared_kmers)
+    fraction_unique = number_unique / len(all_kmers)
+    return fraction_unique
+
+
+def _check_seqs(seq1, seq2):
+    """Validate the two sequences given to a sequence distance function.
+
+    Raises
+    ------
+    TypeError
+        If either argument is not a ``skbio.Sequence`` instance, or if the
+        two arguments are not of exactly the same type.
+
+    """
+    # Both arguments must be skbio.Sequence instances (subclasses included)
+    for seq in seq1, seq2:
+        if not isinstance(seq, skbio.Sequence):
+            raise TypeError(
+                "`seq1` and `seq2` must be Sequence instances, not %r"
+                % type(seq).__name__)
+
+    # Exact type match is required, so e.g. Sequence vs. DNA is rejected
+    if type(seq1) is not type(seq2):
+        raise TypeError(
+            "Sequences must have matching type. Type %r does not match type %r"
+            % (type(seq1).__name__, type(seq2).__name__))
diff --git a/skbio/sequence/tests/test_distance.py b/skbio/sequence/tests/test_distance.py
index 905c35939..8abbbe85f 100644
--- a/skbio/sequence/tests/test_distance.py
+++ b/skbio/sequence/tests/test_distance.py
@@ -16,7 +16,7 @@
 import numpy.testing as npt
 
 from skbio import Sequence, DNA
-from skbio.sequence.distance import hamming
+from skbio.sequence.distance import hamming, kmer_distance
 
 
 class TestHamming(unittest.TestCase):
@@ -126,5 +126,125 @@ def test_sequences_with_metadata(self):
         self.assertEqual(distance, 0.75)
 
 
+class TestKmerDistance(unittest.TestCase):
+    def test_default_kwargs(self):
+        seq1 = Sequence('AACCTAGCAATGGAT')
+        seq2 = Sequence('CAGGCAGTTCTCACC')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 0.9130434782608695
+        self.assertAlmostEqual(obs, exp)
+
+    def test_nondefault_k(self):
+        seq1 = Sequence('GCTTATGGAGAGAGA')
+        seq2 = Sequence('CTCGAACTCCAGCCA')
+        obs = kmer_distance(seq1, seq2, 2)
+        exp = 0.7333333333333333
+        self.assertAlmostEqual(obs, exp)
+        seq1 = Sequence('EADDECAEECDEACD')
+        seq2 = Sequence('DCBCBADADABCCDA')
+        obs = kmer_distance(seq1, seq2, 1)
+        exp = 0.4
+        self.assertAlmostEqual(obs, exp)
+
+    def test_overlap_false(self):
+        seq1 = Sequence('CGTTATGTCTGTGAT')
+        seq2 = Sequence('CTGAATCGGTAGTGT')
+        obs = kmer_distance(seq1, seq2, 3, overlap=False)
+        exp = 0.8888888888888888
+        self.assertAlmostEqual(obs, exp)
+
+    def test_entirely_different_sequences(self):
+        seq1 = Sequence('CCGTGGTCGTATAAG')
+        seq2 = Sequence('CGCCTTCCACATCAG')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 1.0
+        self.assertEqual(obs, exp)
+
+    def test_same_sequence(self):
+        seq1 = Sequence('CTGCGACAGTTGGTA')
+        seq2 = Sequence('CTGCGACAGTTGGTA')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 0.0
+        self.assertEqual(obs, exp)
+
+    def test_differing_length_seqs(self):
+        seq1 = Sequence('AGAAATCTGAGCAAGGATCA')
+        seq2 = Sequence('TTAGTGCGTAATCCG')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 0.9285714285714286
+        self.assertAlmostEqual(obs, exp)
+
+    def test_with_sequence_subclass(self):
+        seq1 = DNA('GATGGTACTGTAGGT')
+        seq2 = DNA('AGGGTGAAGGTATCA')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 0.8421052631578947
+        self.assertAlmostEqual(obs, exp)
+
+    def test_with_metadata_sanity(self):
+        # Same sequences and expected value as test_default_kwargs: metadata
+        # and positional metadata must not change the result
+        seq1 = Sequence('AACCTAGCAATGGAT',
+                        metadata={'Name': 'Kestrel Gorlick'},
+                        positional_metadata={'seq': list('ACTCAAGCTACGAAG')})
+        seq2 = Sequence('CAGGCAGTTCTCACC')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 0.9130434782608695
+        self.assertAlmostEqual(obs, exp)
+
+    def test_return_type(self):
+        # The result must be a float even when the distance is exactly zero
+        seq1 = Sequence('ATCG')
+        seq2 = Sequence('ATCG')
+        obs = kmer_distance(seq1, seq2, 3)
+        self.assertIsInstance(obs, float)
+        self.assertEqual(obs, 0.0)
+
+    def test_empty_sequences(self):
+        # Neither sequence has any kmers, so nan is expected
+        seq1 = Sequence('')
+        seq2 = Sequence('')
+        obs = kmer_distance(seq1, seq2, 3)
+        npt.assert_equal(obs, np.nan)
+
+    def test_one_empty_sequence(self):
+        # All kmers come from seq2 alone, so each one is unique to a sequence
+        seq1 = Sequence('')
+        seq2 = Sequence('CGGGCAGCTCCTACCTGCTA')
+        obs = kmer_distance(seq1, seq2, 3)
+        exp = 1.0
+        self.assertAlmostEqual(obs, exp)
+
+    def test_no_kmers_found(self):
+        # k is longer than both sequences, so neither one yields any kmers
+        seq1 = Sequence('ATCG')
+        seq2 = Sequence('ACGT')
+        obs = kmer_distance(seq1, seq2, 5)
+        npt.assert_equal(obs, np.nan)
+
+    def test_k_less_than_one_error(self):
+        # kmer_distance has no k check of its own; the error presumably comes
+        # from Sequence.iter_kmers (TODO confirm)
+        seq1 = Sequence('ATCG')
+        seq2 = Sequence('ACTG')
+        with six.assertRaisesRegex(self, ValueError,
+                                   'k must be greater than 0.'):
+            kmer_distance(seq1, seq2, 0)
+
+    def test_type_mismatch_error(self):
+        seq1 = Sequence('ABC')
+        seq2 = DNA('ATC')
+        with six.assertRaisesRegex(self, TypeError,
+                                   "Type 'Sequence'.*type 'DNA'"):
+            kmer_distance(seq1, seq2, 3)
+
+    def test_non_sequence_error(self):
+        seq1 = Sequence('ATCG')
+        seq2 = 'ATCG'
+        with six.assertRaisesRegex(self, TypeError,
+                                   "not 'str'"):
+            kmer_distance(seq1, seq2, 3)
+
+
 if __name__ == "__main__":
     unittest.main()
diff --git a/skbio/sequence/tests/test_sequence.py b/skbio/sequence/tests/test_sequence.py
index 477e6578e..4146171dc 100644
--- a/skbio/sequence/tests/test_sequence.py
+++ b/skbio/sequence/tests/test_sequence.py
@@ -1707,6 +1707,18 @@ def test_iter_kmers_different_sequences_no_positional_metadata(self):
         ]
         self._compare_kmers_results(seq.iter_kmers(3, overlap=False), expected)
 
+    def test_iter_kmers_empty_sequence(self):
+        # Iterating kmers of an empty sequence must yield nothing, not raise
+        seq = Sequence('')
+        expected = []
+        self._compare_kmers_results(seq.iter_kmers(3, overlap=False), expected)
+
+    def test_iter_kmers_empty_sequence_with_positional_metadata(self):
+        # Same expectation when positional metadata is present
+        seq = Sequence('', positional_metadata={'quality': []})
+        expected = []
+        self._compare_kmers_results(seq.iter_kmers(3, overlap=False), expected)
+
     def test_kmer_frequencies_empty_sequence(self):
         seq = Sequence('')
 