-
Notifications
You must be signed in to change notification settings - Fork 80
/
Copy pathcompute-dna-mh-another-way.py
executable file
·60 lines (46 loc) · 1.42 KB
/
compute-dna-mh-another-way.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#! /usr/bin/env python
"""
Use the MurmurHash library mmh3 and separate Python code to calculate
a MinHash signature for an input DNA sequence, as a way to do an
external check on our C++ implementation.
The output of this is used in test_sourmash.py to verify our C++ code.
"""
# Lookup table mapping each nucleotide code to its complement; 'N' maps
# to itself.
__complementTranslation = {
    "A": "T",
    "C": "G",
    "G": "C",
    "T": "A",
    "N": "N",
}


def complement(s):
    """
    Return complement of 's'.

    Every character of 's' is looked up in the module-level translation
    table and the results are joined, in the original order, into a new
    string.  A character with no entry in the table raises KeyError.
    """
    return "".join(map(__complementTranslation.__getitem__, s))
def reverse(s):
    """
    Return reverse of 's'.

    The items of 's' are walked from last to first and joined into a
    single new string; 's' itself is not modified.
    """
    return "".join(reversed(s))
def kmers(seq, k):
    """
    Yield every length-'k' slice (k-mer) of 'seq', from left to right.

    Successive k-mers start one position apart, so a sequence of length
    n yields n - k + 1 of them; nothing is yielded when 'seq' is shorter
    than 'k'.

    Raises ValueError if 'k' is less than 1.  Because this is a
    generator, the error surfaces when iteration starts, not when
    kmers() is called.
    """
    # Without this guard k == 0 yields len(seq) + 1 empty strings and a
    # negative k yields slices that are not k-mers at all.
    if k < 1:
        raise ValueError("k must be a positive integer, got %r" % (k,))

    for start in range(len(seq) - k + 1):
        yield seq[start:start + k]
###
# k-mer size used for the signature.
K = 21

import sys
import screed
import mmh3
import sourmash_lib
# Report which sourmash_lib was picked up before importing anything else
# from it, so the message is still emitted if a later import fails.
print('imported sourmash:', sourmash_lib, file=sys.stderr)
from sourmash_lib import MinHash
import sourmash_lib.signature

# Fail with a usage message instead of an IndexError traceback when the
# input file argument is missing.  Extra arguments are ignored.
if len(sys.argv) < 2:
    sys.exit('usage: %s <sequence file>' % sys.argv[0])

# Only the first record of the input file is used.
record = next(iter(screed.open(sys.argv[1])), None)
if record is None:
    sys.exit('no sequence records found in %s' % sys.argv[1])
print('loaded', record.name, file=sys.stderr)

E = sourmash_lib.Estimators(ksize=K, n=500, protein=False)
mh = E.mh

for fwd_kmer in kmers(record.sequence, K):
    rev_kmer = reverse(complement(fwd_kmer))

    # Hash the lexicographically smaller of the k-mer and its reverse
    # complement, so that both strands produce the same value.
    kmer = min(fwd_kmer, rev_kmer)

    # NOTE(review): mmh3.hash128 returns a 128-bit integer; confirm that
    # add_hash() expects (or truncates to) the width used by the C++ code.
    hashval = mmh3.hash128(kmer, seed=42)
    mh.add_hash(hashval)

# Wrap the estimator in a signature named after the record and write the
# serialized form to stdout.
s = sourmash_lib.signature.SourmashSignature('', E, name=record.name)
print(sourmash_lib.signature.save_signatures([s]))