-
Notifications
You must be signed in to change notification settings - Fork 42
/
RPKM_saturationSummary.py
executable file
·243 lines (209 loc) · 9.88 KB
/
RPKM_saturationSummary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 (chentong_biology@163.com).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = 'chentong_biology@163.com'
#=========================================================
desc = '''
Program description:
This is designed to summarize results output by `RPKM_saturation.py`.
'''
import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
import re
from tools import *
#from multiprocessing.dummy import Pool as ThreadPool
#from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
debug = 0
def fprint(content):
    """Pretty-print *content* to stdout as indented JSON.

    Args:
        content: any object serializable by ``json.dumps``
            (dict, list, str, numbers, ...).

    Returns:
        None. Output goes to stdout only.

    Raises:
        TypeError: if *content* is not JSON-serializable.
    """
    print json_dumps(content,indent=1)
def cmdparameter(argv):
    """Parse command-line parameters for this script.

    Args:
        argv (list): argument vector, normally ``sys.argv``.

    Returns:
        tuple: ``(options, args)`` as produced by optparse.

    Side effects:
        When called with no arguments, prints the module description to
        stderr, shells out to show ``-h`` help, and exits with status 1.

    Raises:
        AssertionError: if the mandatory ``-f`` option is missing.
    """
    if len(argv) == 1:
        # sys.stderr.write works under both Python 2 and 3 (the original
        # `print >>sys.stderr` is Python-2-only syntax).
        sys.stderr.write(desc + '\n')
        os.system('python ' + argv[0] + ' -h')
        sys.exit(1)
    usages = "%prog -f file"
    parser = OP(usage=usages)
    parser.add_option("-f", "--files", dest="filein",
        metavar="FILEIN", help="`,` or ` ` separated a list of files. *.Log.final.out generated by `STAR` during mapping")
    parser.add_option("-l", "--labels", dest="label",
        metavar="LABEL", help="`,` or ` ` separated a list of labels to label each file. It must have same order as files.")
    parser.add_option("-o", "--output-prefix", dest="out_prefix",
        help="The prefix of output files. UNUSED")
    parser.add_option("-c", "--map-summary-cnt-file", dest="cnt",
        help="A matrix file with first column as sample name and last column as \
counts of mapped reads. Normally this file is output by <STAR_read_mapSummary.py>.")
    parser.add_option("-r", "--report-dir", dest="report_dir",
        default='report', help="Directory for report files. Default 'report'.")
    parser.add_option("-R", "--report-sub-dir", dest="report_sub_dir",
        default='2_mapping_quality', help="Directory for saving report figures and tables. This dir will put under <report_dir>, so only dir name is needed. Default '2_mapping_quality'.")
    parser.add_option("-d", "--doc-only", dest="doc_only",
        default=False, action="store_true", help="Specify to only generate doc.")
    parser.add_option("-n", "--number", dest="number", type="int",
        default=40, help="Set the maximum allowed samples for barplot. Default 40.\
 If more than this number of samples are given, heatmap will be used instead. UNUSED.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", help="Show process information")
    parser.add_option("-D", "--debug", dest="debug",
        default=False, action="store_true", help="Debug the program")
    (options, args) = parser.parse_args(argv[1:])
    # Fixed: the message previously referenced "-i", but the option is "-f".
    assert options.filein is not None, "A filename needed for -f"
    return (options, args)
#--------------------------------------------------------------------
def readTwoColumnFile(fileL, labelL, header=0, index_col=0):
    """Read one table per file and merge them column-wise.

    Each file is read with pandas (tab-separated), its single data column
    is renamed to the matching label, and all frames are concatenated
    along axis=1 (aligned on the index column).

    Args:
        fileL (list): paths of input tables.
        labelL (list): column labels, same order as fileL.
        header (int): header row passed to pandas. Default 0.
        index_col (int): index column passed to pandas. Default 0.

    Returns:
        pandas.DataFrame: merged matrix, one column per input file.
    """
    def _load(path, name):
        # Read a single table and relabel its value column after the sample.
        frame = pd.read_table(path, header=header, index_col=index_col)
        frame.columns = [name]
        return frame

    return pd.concat([_load(path, name) for path, name in zip(fileL, labelL)],
                     axis=1)
#--------------------------------------
def plot(fileL):
    """Draw one barplot per input file via the external `s-plot` tool.

    Args:
        fileL (list): data files to plot; each is passed to
            ``s-plot barPlot -f <file>`` through the shell.

    Returns:
        None. Figures are produced as side effects of the shell calls.
    """
    for path in fileL:
        # Same command string as before, built with %-formatting.
        os.system("s-plot barPlot -f %s" % path)
#--------------------------------------
def plot_melt(total_melt, nameL):
    """Barplot a melted matrix with `s-plot`, keeping x-axis sample order.

    Args:
        total_melt (str): path of the melted data file.
        nameL (list): sample names in the desired x-axis order; they are
            single-quoted and joined into the `-L` level string.

    Returns:
        None. The plot is produced by an external shell command.
    """
    # Build the quoted level list, e.g. "'s1','s2','s3'" wrapped in
    # double quotes for the shell.
    quoted = ','.join("'" + name + "'" for name in nameL)
    x_level = '"' + quoted + '"'
    pieces = ["s-plot barPlot -m TRUE -a Sample -R 90 -B set -O 1 -w 20 -u 25 -f ",
              total_melt, ' -k free_y -L', x_level,
              ' -y \'Reads count or relative percent\' -x \'Samples\' ']
    os.system(' '.join(pieces))
#--------------------------------------
def plot_heatmap(totalTable):
    """Render a clustered heatmap of *totalTable* via `s-plot heatmapS`.

    Args:
        totalTable (str): path of the matrix file to plot.

    Returns:
        None. The heatmap is produced by an external shell command.
    """
    parts = ["s-plot heatmapS -a TRUE -b TRUE -R TRUE",
             "-x white -y blue -u 18 -v 30 -F 12 ",
             "-f ", totalTable, "-I RPM"]
    os.system(' '.join(parts))
#---------------------------------------
def generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD):
    """Emit the markdown/knitr section for sequencing-saturation figures.

    Copies each sample's ``<file>.saturation.pdf`` into the report
    directory and prints (to stdout) the bookdown text and R chunks that
    embed the corresponding PNGs, three samples per figure.

    Args:
        report_dir (str): top-level report directory.
        report_sub_dir (str): figure sub-directory created under report_dir.
        fileL (list): input file stems; ``<stem>.saturation.pdf`` is
            expected to exist for each one.
        labelL (list): sample labels, same order as fileL.
        cntD (dict): sample label -> mapped-read count (string), used to
            annotate the figure captions.

    Returns:
        None. All output is printed to stdout or written via os.system.
    """
    dest_dir = report_dir+'/'+report_sub_dir+'/'
    os.system('mkdir -p '+dest_dir)
    # Section heading (Chinese: "Sequencing saturation estimation").
    print "\n## 测序饱和度评估 {#Sequencing-saturation-estimation}\n"
    curation_label = "Sequencing_saturation_estimation"
    # knitr_read_txt comes from `from tools import *`; presumably injects
    # curated text for this section — confirm in the tools module.
    knitr_read_txt(report_dir, curation_label)
    # Fixed explanatory text (Chinese) describing the saturation analysis
    # and the Q1-Q4 expression quartiles.
    print """样品测序饱和度评估。
首先把基因按其表达量分为4组,Q1(表达量最低的25%)、Q2(表达量在25%-50%之间)、Q3(表达量在50%-75%之间)、Q4(表达量最高的25%);然后从全部的比对reads中随机抽取5%、10%、15%、...、85%、90%、100%,利用随机抽取的reads计算基因的表达量,并与采用全部序列计算的表达量相比较,得到随机抽取条件(也就是在不同测序量条件)下的表达值偏差,并以箱线图的形式展示。
$$
Percent Relative Error = \\frac{RPKM_{abs}-RPKM_{real}}{RPKM_{real}} * 100
$$
当我们从总测序READs中抽取80%或90%的序列,计算出的表达量与使用全部测序READs计算出的表达量的偏差为0时,就说明我们测序的READs已达到饱和了,而且没有浪费太多的测序READs。
当我们从总测序READs中抽取30%或50%的序列,计算出的表达量与使用全部测序READs计算出的表达量的偏差为0时,也说明我们测序的READs已达到饱和了,但是浪费了一半的测序READs。以后再设计实验时就可以降低测序量,减少费用。
通常情况下,Q4区间的基因因为表达量最高,更容易达到饱和;而Q1区间的基因因为表达量最低,则需要测序很深才能达到饱和。
* Q1 (0-25%): Transcripts with expression level ranked below 25 percentile.
* Q2 (25-50%): Transcripts with expression level ranked between 25 percentile and 50 percentile.
* Q3 (50-75%): Transcripts with expression level ranked between 50 percentile and 75 percentile.
* Q4 (75-100%): Transcripts with expression level ranked above 75 percentile.
"""
    len_fileL = len(fileL)
    # NOTE(review): `group` looks like the intended chunk size, but the
    # loop below hard-codes 3 instead of using it.
    group = 3
    for i in range(0, len_fileL, 3):
        # Handle three samples per emitted figure block.
        subF = fileL[i:i+3]
        subL = labelL[i:i+3]
        sub_cntL = [h+" ("+cntD[h]+" usable reads)" for h in subL]
        pdfL = [j+'.saturation.pdf' for j in subF]
        # copypdf comes from `from tools import *`; presumably copies the
        # PDFs into dest_dir — confirm in the tools module.
        copypdf(dest_dir, *pdfL)
        len_subF = len(subF)
        # Re-point paths at the copied files, relative to the report dir.
        pdfL = [report_sub_dir+'/'+os.path.split(j)[-1] for j in pdfL]
        # Matching PNG names; note this replaces the FIRST 'pdf' occurrence
        # anywhere in the path, not only the extension.
        pngL = [j.replace('pdf', 'png') for j in pdfL]
        pdf_link = []  # markdown links: [label_pdf](pdf) ...
        for pdf, label in zip(pdfL, subL):
            tmp_155 = '['+label+'_pdf]'+'('+pdf+')'
            pdf_link.append(tmp_155)
        pdf_link = ' '.join(pdf_link)
        # Bookdown text-reference caption consumed by the R chunk below.
        print "(ref:read-saturation-fig-{}) Summary sequencing saturation of each samples. From left to right, the samples are **{}**。 纵轴表示Percent Relative Error,值越低越好,也就是说箱线图的中位数或整个箱线图越接近0越好,说明测序越饱和。横轴表示抽取的READs数占总READs数的比例,模拟的是低测序通量情况下计算的基因表达的偏差。 {}\n".format(i, ', '.join(sub_cntL), pdf_link)
        pngFileL = []  # single-quoted png paths for knitr::include_graphics
        for png in pngL:
            tmp_164 = "'"+png+"'"
            pngFileL.append(tmp_164)
        pngFileL = ', '.join(pngFileL)
        # R chunk embedding the PNGs; width splits 100% across the group
        # (true division is active via `from __future__ import division`).
        print '''```{{r read-saturation-fig-{label}, out.width="{width}%", fig.cap="(ref:read-saturation-fig-{label})"}}
knitr::include_graphics(c({png}))
```
'''.format(label=i, png=pngFileL, width=int(100/len_subF))
#--------------------------------
def read_cnt_file(cnt_file):
    """Read a mapping-summary matrix into a sample -> read-count dict.

    The file is tab-separated with one header line; the first column is
    the sample name and the LAST column is the mapped-read count. Counts
    are kept as strings (they are only interpolated into report text).

    Args:
        cnt_file (str): path of the count matrix file.

    Returns:
        dict: sample name -> read count (str).

    Raises:
        AssertionError: if a sample name appears more than once.
        IOError: if the file cannot be opened.
    """
    cntD = {}
    # Fixed: the original left the file handle open; `with` guarantees
    # it is closed even on error.
    with open(cnt_file) as fh:
        header = 1
        for line in fh:
            if header:
                # Skip the single header line.
                header -= 1
                continue
            lineL = line.strip().split('\t')
            sample = lineL[0]
            reads_cnt = lineL[-1]
            assert sample not in cntD, "Duplicate "+sample
            cntD[sample] = reads_cnt
    return cntD
#----------------------------
def main():
    """Entry point: parse options, read counts, and emit the report doc.

    Reads the command line via cmdparameter(), loads the mapped-read
    counts, then prints the saturation-report section via generateDoc().
    """
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    fileL = re.split(r'[, ]*', options.filein.strip())
    sample_readin = len(fileL)
    labelL = re.split(r'[, ]*', options.label.strip())
    verbose = options.verbose
    op = options.out_prefix
    cntD = read_cnt_file(options.cnt)
    report_dir = options.report_dir
    report_sub_dir = options.report_sub_dir
    global debug
    debug = options.debug
    doc_only = options.doc_only
    num_samples_each_grp = options.number
    # Flag whether the sample count is small enough for a melted barplot.
    melt = 1 if sample_readin <= num_samples_each_grp else 0
    #-----------------------------------
    aDict = {}
    if doc_only:
        generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD)
        return 0
    # Non-doc-only path currently also only generates the document.
    generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD)
###--------multi-process------------------
if __name__ == '__main__':
    # Record wall-clock start/end times and append them to python.log.
    startTime = strftime(timeformat, localtime())
    main()
    endTime = strftime(timeformat, localtime())
    with open('python.log', 'a') as log_fh:
        log_fh.write("%s\n\tRun time : %s - %s \n" %
                     (' '.join(sys.argv), startTime, endTime))
    # To profile, wrap main() with cProfile and inspect with pstats.