-
Notifications
You must be signed in to change notification settings - Fork 0
/
genome.py
77 lines (68 loc) · 3.3 KB
/
genome.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
#!/usr/bin/env python
# Author: Imran Akbar (imran@osg.stanford.edu)
# Created: 1/25/2012
'''
brute force algorithm would search for the substrings at every character in the source sequence
would have O(mn) complexity
more sophisticated algorithms include Boyer-Moore-Horspool, Boyer-Moore, and Knuth-Morris-Pratt
however, these algorithms work best for a single pattern/matching sequence
whereas we have multiple sequences that we need to match
for multiple sequences that we want to match, we should use one of the Aho-Corasick, Commentz-Walter, or Rabin-Karp algorithms
for better average running time
the existing library for Aho-Corasick won't work with a streaming input file not in memory
and Rabin-Karp is simpler, so let's implement that
adapted from sample implementations here:
http://courses.csail.mit.edu/6.006/spring08/keynotes/rolling_hash.py
https://github.com/laurentluce/python-algorithms/blob/master/algorithms/string_matching.py
'''
import sys
patternsToMatch = []
def readFile():
try:
file = open('genome_input.txt','r')
parameters = map(int, file.readline().strip('\n').split(" ")) # k = number of sequences, m = length of each sequence, n = length genome
# read in and store each sequence to be matched
for i in range(parameters[0]):
patternsToMatch.append(file.readline().strip('\n'))
return [file, parameters]
except:
print "Error reading from file"
def computeHash(string, base):
# example: if the string = "abc", value = (97 x base^2) + (98 x base^1) + (99 x base^0), where 97 is the ASCII value of a
value = 0
p = len(string)-1
for i in range(p+1):
value += ord(string[i]) * (base ** p) # ** is exponential (^)
p -= 1
return value
def stringMatch(pattern,textStream,m,n,hash_base=256):
matchAtPositions = []
homePosition = textStream.tell()
htext = computeHash(textStream.read(m), hash_base) # will read m characters from the beginning of genome stream
textStream.seek(homePosition) # go back to start of genome stream
hpattern = computeHash(pattern, hash_base)
for i in range(n-m+1):
if htext == hpattern: # the hashes match
textStream.seek(homePosition + i) # move to position in stream
if textStream.read(m) == pattern: # verify that strings actually match (instead of a hash collision)
matchAtPositions.append(i) # add this position to the results
if i < n-m: # haven't reached the end yet
textStream.seek(homePosition + i) # move to position in stream
first_char = textStream.read(1)
textStream.seek(homePosition + i + m) # move to position in stream
last_char = textStream.read(1)
htext = (hash_base * (htext - (ord(first_char) * (hash_base ** (m-1))))) + ord(last_char)
if len(matchAtPositions)>0:
print "Found pattern match at position(s):" + str(matchAtPositions)
else:
print "No matches found for pattern"
textStream.seek(homePosition) # go back to start of genome stream, in case any further patterns need to be searched
def main():
file, parameters = readFile()
for pattern in patternsToMatch:
print pattern
stringMatch(pattern,file,parameters[1],parameters[2])
file.close()
if __name__ == '__main__':
main()
sys.exit()