Skip to content

Commit

Permalink
Merge pull request scikit-bio#1289 from Kleptobismol/issue-913
Browse files Browse the repository at this point in the history
Adds `kmer_distance` function to skbio.sequence.distance
  • Loading branch information
jairideout committed Feb 26, 2016
2 parents aacca27 + 4471b39 commit ab357f4
Show file tree
Hide file tree
Showing 5 changed files with 202 additions and 12 deletions.
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
## Version 0.4.2-dev (changes since 0.4.2 go here)

### Features
* Added `skbio.sequence.distance.kmer_distance` for computing the kmer distance between two sequences. ([#913](https://github.com/biocore/scikit-bio/issues/913))

### Backward-incompatible changes [stable]

### Backward-incompatible changes [experimental]

### Bug fixes
* Fixed bug when using `Sequence.iter_kmers` on an empty `Sequence` object. Previously this raised a `ValueError`; now it returns
an empty generator.

### Miscellaneous

Expand Down
2 changes: 1 addition & 1 deletion skbio/sequence/_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1874,7 +1874,7 @@ def iter_kmers(self, k, overlap=True):
step = k
count = len(self) // k

if self.has_positional_metadata():
if len(self) == 0 or self.has_positional_metadata():
for i in range(0, len(self) - k + 1, step):
yield self[i:i+k]
# Optimized path when no positional metadata
Expand Down
85 changes: 75 additions & 10 deletions skbio/sequence/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
:toctree: generated/
hamming
kmer_distance
"""

Expand Down Expand Up @@ -87,16 +88,7 @@ def hamming(seq1, seq2):
0.5
"""
for seq in seq1, seq2:
if not isinstance(seq, skbio.Sequence):
raise TypeError(
"`seq1` and `seq2` must be Sequence instances, not %r"
% type(seq).__name__)

if type(seq1) is not type(seq2):
raise TypeError(
"Sequences must have matching type. Type %r does not match type %r"
% (type(seq1).__name__, type(seq2).__name__))
_check_seqs(seq1, seq2)

# Hamming requires equal length sequences. We are checking this here
# because the error you would get otherwise is cryptic.
Expand All @@ -113,3 +105,76 @@ def hamming(seq1, seq2):
distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)

return float(distance)


@experimental(as_of='0.4.2-dev')
def kmer_distance(seq1, seq2, k, overlap=True):
    """Compute the kmer distance between a pair of sequences.

    The kmer distance is the fraction of the distinct kmers observed in
    either sequence that occur in only one of the two sequences.

    Parameters
    ----------
    seq1, seq2 : Sequence
        Sequences to compute kmer distance between.
    k : int
        The kmer length.
    overlap : bool, optional
        Whether the kmers extracted from each sequence may overlap. Passed
        through to ``Sequence.iter_kmers``.

    Returns
    -------
    float
        kmer distance between `seq1` and `seq2`.

    Raises
    ------
    ValueError
        If `k` is less than 1 (propagated from ``Sequence.iter_kmers``).
    TypeError
        If `seq1` and `seq2` are not ``Sequence`` instances.
    TypeError
        If `seq1` and `seq2` are not the same type.

    Notes
    -----
    Only the presence or absence of each kmer is considered; kmer counts
    are not incorporated in this distance metric.

    ``np.nan`` is returned when neither sequence yields any kmers.

    Examples
    --------
    >>> from skbio import Sequence
    >>> seq1 = Sequence('ATCGGCGAT')
    >>> seq2 = Sequence('GCAGATGTG')
    >>> kmer_distance(seq1, seq2, 3) # doctest: +ELLIPSIS
    0.9230769230...

    """
    _check_seqs(seq1, seq2)

    # Reduce each sequence to the set of distinct kmer strings it contains.
    kmers1 = {str(kmer) for kmer in seq1.iter_kmers(k, overlap=overlap)}
    kmers2 = {str(kmer) for kmer in seq2.iter_kmers(k, overlap=overlap)}

    union_size = len(kmers1 | kmers2)
    if union_size == 0:
        # No kmers at all: the fraction is undefined.
        return np.nan

    # The symmetric difference holds exactly the kmers found in one
    # sequence but not the other (union minus intersection).
    unshared_kmers = kmers1 ^ kmers2
    return len(unshared_kmers) / union_size


def _check_seqs(seq1, seq2):
    """Validate that two objects are ``Sequence`` instances of one type.

    Raises
    ------
    TypeError
        If either argument is not a ``skbio.Sequence`` instance.
    TypeError
        If the two arguments are not of exactly the same type.

    """
    # Both inputs must be skbio.Sequence objects (subclasses included).
    for candidate in (seq1, seq2):
        if isinstance(candidate, skbio.Sequence):
            continue
        raise TypeError(
            "`seq1` and `seq2` must be Sequence instances, not %r"
            % type(candidate).__name__)

    # The exact types must match; e.g. a Sequence paired with a DNA is
    # rejected even though DNA is a Sequence subclass.
    first_type, second_type = type(seq1), type(seq2)
    if first_type is second_type:
        return
    raise TypeError(
        "Sequences must have matching type. Type %r does not match type %r"
        % (first_type.__name__, second_type.__name__))
114 changes: 113 additions & 1 deletion skbio/sequence/tests/test_distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import numpy.testing as npt

from skbio import Sequence, DNA
from skbio.sequence.distance import hamming
from skbio.sequence.distance import hamming, kmer_distance


class TestHamming(unittest.TestCase):
Expand Down Expand Up @@ -126,5 +126,117 @@ def test_sequences_with_metadata(self):
self.assertEqual(distance, 0.75)


class TestKmerDistance(unittest.TestCase):
    """Unit tests for ``skbio.sequence.distance.kmer_distance``."""

    def test_default_kwargs(self):
        # Relies on the default ``overlap=True``.
        first = Sequence('AACCTAGCAATGGAT')
        second = Sequence('CAGGCAGTTCTCACC')
        self.assertAlmostEqual(kmer_distance(first, second, 3),
                               0.9130434782608695)

    def test_nondefault_k(self):
        # (seq1 characters, seq2 characters, k, expected distance)
        cases = (
            ('GCTTATGGAGAGAGA', 'CTCGAACTCCAGCCA', 2, 0.7333333333333333),
            ('EADDECAEECDEACD', 'DCBCBADADABCCDA', 1, 0.4),
        )
        for first_chars, second_chars, k, expected in cases:
            first = Sequence(first_chars)
            second = Sequence(second_chars)
            self.assertAlmostEqual(kmer_distance(first, second, k), expected)

    def test_overlap_false(self):
        first = Sequence('CGTTATGTCTGTGAT')
        second = Sequence('CTGAATCGGTAGTGT')
        observed = kmer_distance(first, second, 3, overlap=False)
        self.assertAlmostEqual(observed, 0.8888888888888888)

    def test_entirely_different_sequences(self):
        first = Sequence('CCGTGGTCGTATAAG')
        second = Sequence('CGCCTTCCACATCAG')
        self.assertEqual(kmer_distance(first, second, 3), 1.0)

    def test_same_sequence(self):
        first = Sequence('CTGCGACAGTTGGTA')
        second = Sequence('CTGCGACAGTTGGTA')
        self.assertEqual(kmer_distance(first, second, 3), 0.0)

    def test_differing_length_seqs(self):
        longer = Sequence('AGAAATCTGAGCAAGGATCA')
        shorter = Sequence('TTAGTGCGTAATCCG')
        self.assertAlmostEqual(kmer_distance(longer, shorter, 3),
                               0.9285714285714286)

    def test_with_sequence_subclass(self):
        first = DNA('GATGGTACTGTAGGT')
        second = DNA('AGGGTGAAGGTATCA')
        self.assertAlmostEqual(kmer_distance(first, second, 3),
                               0.8421052631578947)

    def test_with_metadata_sanity(self):
        # Same characters and expected value as test_default_kwargs, but
        # the first sequence also carries metadata.
        annotated = Sequence(
            'AACCTAGCAATGGAT',
            metadata={'Name': 'Kestrel Gorlick'},
            positional_metadata={'seq': list('ACTCAAGCTACGAAG')})
        plain = Sequence('CAGGCAGTTCTCACC')
        self.assertAlmostEqual(kmer_distance(annotated, plain, 3),
                               0.9130434782608695)

    def test_return_type(self):
        first = Sequence('ATCG')
        second = Sequence('ATCG')
        observed = kmer_distance(first, second, 3)
        self.assertIsInstance(observed, float)
        self.assertEqual(observed, 0.0)

    def test_empty_sequences(self):
        first = Sequence('')
        second = Sequence('')
        npt.assert_equal(kmer_distance(first, second, 3), np.nan)

    def test_one_empty_sequence(self):
        empty = Sequence('')
        nonempty = Sequence('CGGGCAGCTCCTACCTGCTA')
        self.assertAlmostEqual(kmer_distance(empty, nonempty, 3), 1.0)

    def test_no_kmers_found(self):
        # k exceeds the length of both sequences.
        first = Sequence('ATCG')
        second = Sequence('ACGT')
        npt.assert_equal(kmer_distance(first, second, 5), np.nan)

    def test_k_less_than_one_error(self):
        first = Sequence('ATCG')
        second = Sequence('ACTG')
        with six.assertRaisesRegex(self, ValueError,
                                   'k must be greater than 0.'):
            kmer_distance(first, second, 0)

    def test_type_mismatch_error(self):
        generic = Sequence('ABC')
        dna = DNA('ATC')
        with six.assertRaisesRegex(self, TypeError,
                                   "Type 'Sequence'.*type 'DNA'"):
            kmer_distance(generic, dna, 3)

    def test_non_sequence_error(self):
        valid = Sequence('ATCG')
        not_a_sequence = 'ATCG'
        with six.assertRaisesRegex(self, TypeError,
                                   "not 'str'"):
            kmer_distance(valid, not_a_sequence, 3)


if __name__ == "__main__":
unittest.main()
10 changes: 10 additions & 0 deletions skbio/sequence/tests/test_sequence.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,6 +1707,16 @@ def test_iter_kmers_different_sequences_no_positional_metadata(self):
]
self._compare_kmers_results(seq.iter_kmers(3, overlap=False), expected)

def test_iter_kmers_empty_sequence(self):
    """Non-overlapping kmers of an empty sequence: expect no results."""
    empty = Sequence('')
    self._compare_kmers_results(empty.iter_kmers(3, overlap=False), [])

def test_iter_kmers_empty_sequence_with_positional_metadata(self):
    """Same as the plain empty case, but with positional metadata set."""
    empty = Sequence('', positional_metadata={'quality': []})
    self._compare_kmers_results(empty.iter_kmers(3, overlap=False), [])

def test_kmer_frequencies_empty_sequence(self):
seq = Sequence('')

Expand Down

0 comments on commit ab357f4

Please sign in to comment.