forked from petercombs/MPRA_selection
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathExtractSequenceFromDataFile.py
40 lines (36 loc) · 1.3 KB
/
ExtractSequenceFromDataFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
from sys import argv, stderr
from collections import defaultdict
from textwrap import wrap
if __name__ == "__main__":
filetype = "?"
chrom = "?"
for line in open(argv[2]):
data = line.strip().split()
try:
pos = int(data[1])
except:
continue
if filetype == "?":
if len(data) == 10:
filetype = "kircher"
seq = defaultdict(lambda: set("N"))
elif len(data) == 5:
filetype = "patwardhan"
seq = defaultdict(lambda: set("ATGC"))
if filetype == "kircher":
if data[9] == argv[1]:
chrom = data[0]
seq[pos] = {data[2]}
elif filetype == "patwardhan":
if data[0].upper() == argv[1]:
seq[pos].discard(data[2])
#else:
#print("Skipping row for ", data[0].upper())
else:
raise ValueError("We cannot infer the correct filetype")
min = min(seq.keys())
max = max(seq.keys())
# We know that TCF7L2 has a missing base in the data file, so I'm going to
# use range instead of just looping over the bases.
seq = wrap("".join(seq[i].pop() for i in range(min, max+1)))
print(f">Homo_sapiens chr{chrom}:{min}-{max} 100.0", *seq, sep="\n")