-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathpodplot.py
executable file
·87 lines (71 loc) · 2.89 KB
/
podplot.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
import bisect
import argparse
import time
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use('Agg')
def log(message):
    """Print a timestamped message to standard output.

    The emitted line is ``<time.ctime()> --- <message>``. ``message`` must be
    a string, since it is concatenated onto the timestamp. Standard output is
    flushed on every call so progress lines show up immediately.
    """
    timestamp = time.ctime()
    line = timestamp + ' --- ' + message
    print(line, flush=True)
def load_labels(fofn_path):
    """Read the fai file-of-filenames and return a ``{fai_path: label}`` dict.

    Each non-blank line must hold exactly two tab-separated columns: the path
    of a .fai file and the label used for it in the plot legend. Blank lines
    are skipped; any other line that does not split into two columns raises
    ``ValueError``. If a path is listed twice, the last label wins.
    """
    labels = dict()
    with open(fofn_path, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                # Tolerate empty lines (e.g. a stray newline at end of file).
                continue
            fai_file, label = line.split("\t")
            labels[fai_file] = label
    return labels


def load_read_lengths(fai_path):
    """Return the lengths stored in the second tab-separated column of a .fai.

    Blank lines are skipped. A non-blank line without a second column raises
    ``IndexError`` and a non-integer length raises ``ValueError``.
    """
    read_lens = []
    with open(fai_path, 'r') as f:
        for line in f:
            line = line.rstrip()
            if not line:
                continue
            read_lens.append(int(line.split("\t")[1]))
    return read_lens


def sequence_at_min_lengths(read_lens, thresholds):
    """Sum the read lengths that reach each minimum-length threshold.

    ``thresholds`` must be sorted in ascending order (it is searched with
    ``bisect``). The result is a list parallel to ``thresholds`` whose j-th
    entry is the sum of every length ``>= thresholds[j]``. Lengths smaller
    than the first threshold contribute nothing.
    """
    totals = [0] * len(thresholds)
    for rlen in read_lens:
        # Number of thresholds that are <= rlen; the read counts toward all
        # of them, so record it once in the highest bin it reaches.
        idx = bisect.bisect_right(thresholds, rlen)
        if idx:
            totals[idx - 1] += rlen
    # A suffix sum turns the per-bin totals into "length >= threshold" totals.
    running = 0
    for j in reversed(range(len(totals))):
        running += totals[j]
        totals[j] = running
    return totals


def main():
    """Parse the command line, then plot one curve per listed .fai file.

    The x axis is the minimum read length (0 to 100000 in steps of 5000).
    The y axis is coverage (total sequence / genome size) when -g is given
    and non-zero, otherwise the percentage of each file's total sequence.
    The figure is saved under the name "podplot".
    """
    # The backslashes are escaped explicitly: a bare "\/" is an invalid escape
    # sequence and triggers a warning on recent Python versions. The printed
    # text is unchanged.
    whale = """\n _V__ _V__ _V__ _V__\n(____\\/{ (____\\/{ (____\\/{ (____\\/{\n"""
    parser = argparse.ArgumentParser(description='Plot minimum read length vs. coverage or % sequence. Coverage is plotted when -g is specified. ')
    parser.add_argument("fai", metavar="<fai.fofn>", type=str, help="A list of fai files. First column is the fai file and second column is the label.")
    parser.add_argument("-g", metavar="<genome_size>", type=int, default=0, help="expected genome size (no prefix symbols).")
    args = parser.parse_args()
    genome_size = args.g
    if genome_size < 0:
        parser.error("genome size (-g) must not be negative")
    print(whale)

    # Get all of the fai files
    labels = load_labels(args.fai)

    # Compute coverages and plot
    plt.figure(figsize=(7, 7))
    xvals = list(range(0, 105000, 5000))
    for fai, label in labels.items():
        log("Calculating coverage for " + fai)
        read_lens = load_read_lengths(fai)
        seqsums = sequence_at_min_lengths(read_lens, xvals)

        if genome_size:
            yvals = [seqsum / genome_size for seqsum in seqsums]
        else:
            total_seq = sum(read_lens)
            if not total_seq:
                # A percentage of zero total sequence is undefined; skip the
                # file instead of dividing by zero.
                log("No sequence found in " + fai + ", skipping")
                continue
            yvals = [(seqsum / total_seq) * 100 for seqsum in seqsums]

        # Markers and connecting line for this file; only the line is labelled
        # so each file appears once in the legend.
        plt.scatter(xvals, yvals, s=100, alpha=0.5)
        plt.plot(xvals, yvals, label=label, linewidth=3, alpha=0.5)

    plt.legend(loc=1, prop={'size': 10})
    plt.xlabel("Min Read Length (bp)")
    if genome_size:
        plt.ylabel("Coverage (X)")
    else:
        plt.ylabel("Percentage of Total Sequence")
    plt.grid()
    plt.savefig("podplot")
    log("Goodbye!")


if __name__ == "__main__":
    main()