-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathTruthBamToMatrix.py
executable file
·98 lines (73 loc) · 2.08 KB
/
TruthBamToMatrix.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env python
import argparse
import sys
# Command-line interface.  Both input files are mandatory: the rest of the
# script iterates over them unconditionally, so a missing option would
# otherwise surface as a TypeError on None instead of a usage message.
parser = argparse.ArgumentParser(description="")
parser.add_argument(
    "-m",
    "--mpileup",
    help="input mpileup, must be run with -r and --output-QNAME",
    type=argparse.FileType("r"),
    required=True,
)
parser.add_argument(
    "-n",
    "--nucfreq",
    help="input nucfreq file",
    type=argparse.FileType("r"),
    required=True,
)
# Optional positional output path; falls back to standard output.
parser.add_argument(
    "outfile",
    nargs="?",
    help="output matrix",
    type=argparse.FileType("w"),
    default=sys.stdout,
)
# NOTE(review): -d and --header are accepted but never read by the code in
# this file; kept so existing command lines continue to parse.
parser.add_argument("-d", action="store_true", default=False)
parser.add_argument("--header", action="store_true", default=False)
args = parser.parse_args()
import pysam
import pandas as pd
import re
# Read the nucfreq file.  Only the first two whitespace-separated columns are
# used: the reference name and an integer position.  `poses` keeps the
# positions in file order; that order defines the column order of the output.
poses = []
refs = set()
for line in args.nucfreq:
    tokens = line.split()
    if not tokens:
        # Tolerate blank lines (e.g. a trailing empty line) instead of
        # failing with an IndexError.
        continue
    poses.append(int(tokens[1]))
    refs.add(tokens[0])

# All positions must belong to a single reference.  This is checked with an
# explicit raise rather than `assert` so it still runs under `python -O`.
if len(refs) != 1:
    raise ValueError(
        "nucfreq file must name exactly one reference, found {}: {}".format(
            len(refs), sorted(refs)
        )
    )
ref = str(refs.pop())
# In the mpileup read-bases column a read start is written as "^" followed by
# one mapping-quality character, and a read end as "$".  These markers are
# attached to a read's base and do not stand for a read of their own.
_READ_EDGE_RE = re.compile(r"\^.|\$")
# An indel is written as "+" or "-", its length, then the affected bases.
_INDEL_RE = re.compile(r"[+-][0-9]+[ACGTNacgtn]+")


def parseopt(opt, numseqs):
    """Convert an mpileup read-bases string into one call per read.

    Args:
        opt: The read-bases column of one mpileup line.
        numseqs: The number of reads reported for that line.

    Returns:
        A list of ``numseqs`` single-character strings: ``"."`` where the
        read matches the reference (``.`` or ``,``), ``"1"`` for any other
        symbol, or ``"n"`` for every read when the column contains an indel
        (indel-bearing columns are not decoded).

    Raises:
        ValueError: If the number of decoded symbols differs from ``numseqs``.
    """
    # Drop read start/end markers first; otherwise each marker character
    # would be counted as an extra read and the length check would fail.
    bases = _READ_EDGE_RE.sub("", opt)

    if _INDEL_RE.search(bases):
        return ["n"] * numseqs

    out = ["." if char in ",." else "1" for char in bases]
    if len(out) != numseqs:
        raise ValueError(
            "expected {} reads but decoded {} from {!r}".format(
                numseqs, len(out), opt
            )
        )
    return out
# Collect one call per (read name, position) from the mpileup lines whose
# position is listed in `poses`, then write one row per read name.
out = {}
allnames = set()
# Set copy for constant-time membership tests; `poses` itself is still used
# below because it carries the output column order.
wanted = set(poses)
for line in args.mpileup:
    tokens = line.split()
    if not tokens:
        # Skip blank lines instead of failing with an IndexError.
        continue
    pos = int(tokens[1])
    if pos not in wanted:
        continue
    numseqs = int(tokens[3])
    if numseqs == 0:
        # No reads are reported for this position, so there is nothing to
        # record for any read name.
        continue
    opts = parseopt(tokens[4], numseqs)
    # Column 7 holds the comma-separated read names (hence the requirement
    # that mpileup be run with --output-QNAME).
    names = tokens[6].split(",")
    if len(names) < numseqs:
        raise ValueError(
            "position {}: expected {} read names, found {}".format(
                pos, numseqs, len(names)
            )
        )
    for name, opt in zip(names, opts):
        allnames.add(name)
        out.setdefault(name, {})[pos] = opt

# One row per read: a character per position in `poses` order ("n" where the
# read has no call), a tab, then the read name.  Names are sorted so the
# output is identical from run to run.
for name in sorted(allnames):
    cur = out[name]
    row = "".join(cur.get(pos, "n") for pos in poses)
    args.outfile.write(row + "\t" + name + "\n")

sys.exit()