-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathWessim1.py
221 lines (204 loc) · 7.45 KB
/
Wessim1.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import sys
import random
import bisect
#import pysam
import gzip
import cPickle
import numpy
from time import time, localtime, strftime
import argparse
from multiprocessing import Process
import os
import math
#import pysam
# Nucleotide letter -> integer code; upper- and lower-case letters share a code.
inds = {
    'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4,
    'a': 0, 't': 1, 'g': 2, 'c': 3, 'n': 4,
}
def subprogram(command, name):
    """Run *command* through ``os.system`` and announce completion on stdout.

    command -- shell command line to execute.
    name    -- label of this worker; only used in the completion message.

    The status returned by ``os.system`` is not inspected.
    """
    os.system(command)
    farewell = "exiting subprocess " + str(name)
    print(farewell)
def _open_fastq(path, mode, compress):
    """Open *path* via gzip in binary mode when *compress* is set, else as a plain file."""
    if compress:
        return gzip.open(path, mode + 'b')
    return open(path, mode)


def _copy_lines(source, sink):
    """Append every remaining line of the open file *source* to *sink*."""
    line = source.readline()
    while line:
        sink.write(line)
        line = source.readline()


def _copy_line_pairs(source1, source2, sink1, sink2):
    """Copy two files in lockstep, stopping as soon as either one runs out."""
    line1 = source1.readline()
    line2 = source2.readline()
    while line1 and line2:
        sink1.write(line1)
        sink2.write(line2)
        line1 = source1.readline()
        line2 = source2.readline()


def _merge_subresults(outfile, threadnumber, paired, compress):
    """Concatenate the per-worker FASTQ files into the final output and delete them.

    Worker files are named ``<outfile>-<i>`` (plus ``_1``/``_2`` in paired
    mode) followed by ``.fastq`` or ``.fastq.gz``; the merged files use the
    same suffixes on ``<outfile>`` itself.
    """
    suffix = ".fastq.gz" if compress else ".fastq"
    if not paired:
        with _open_fastq(outfile + suffix, 'w', compress) as wread:
            for t in range(0, threadnumber):
                suboutfile = outfile + "-" + str(t + 1) + suffix
                with _open_fastq(suboutfile, 'r', compress) as fread:
                    _copy_lines(fread, wread)
                # Only reached when the copy succeeded.
                os.remove(suboutfile)
    else:
        with _open_fastq(outfile + "_1" + suffix, 'w', compress) as wread, \
                _open_fastq(outfile + "_2" + suffix, 'w', compress) as wread2:
            for t in range(0, threadnumber):
                suboutfile1 = outfile + "-" + str(t + 1) + "_1" + suffix
                suboutfile2 = outfile + "-" + str(t + 1) + "_2" + suffix
                with _open_fastq(suboutfile1, 'r', compress) as fread1, \
                        _open_fastq(suboutfile2, 'r', compress) as fread2:
                    _copy_line_pairs(fread1, fread2, wread, wread2)
                os.remove(suboutfile1)
                os.remove(suboutfile2)


def main(argv):
    """Command-line driver for Wessim1.

    argv -- list of command-line arguments without the program name.

    Steps performed:
      1. Parse *argv*.
      2. Call getRegionVector() to write the region FASTA / .abd files.
      3. Start ``threadnumber`` processes, each running
         ``python __sub_wessim1.py`` with the original arguments plus
         ``-1 <first read> -2 <last read> -i <worker index>``.
      4. Merge the per-worker FASTQ files into the final output and remove them.

    Always terminates the interpreter through ``sys.exit(0)``, including when
    the mean fragment size is rejected as too small.
    """
    t0 = time()
    arguline = " ".join(argv)
    parser = argparse.ArgumentParser(description='Wessim1: Whole Exome Sequencing SIMulator 1 (Ideal target region-based version)', prog='Wessim1', formatter_class=argparse.RawTextHelpFormatter)

    group1 = parser.add_argument_group('Mandatory input files')
    group1.add_argument('-R', metavar = 'FILE', dest='reference', required=True, help='faidx-indexed (R)eference genome FASTA file')
    group1.add_argument('-B', metavar = 'FILE', dest='region', required=True, help='Target region .(B)ED file')

    group2 = parser.add_argument_group('Parameters for exome capture')
    group2.add_argument('-f', metavar = 'INT', type=int, dest='fragsize', required=False, help='mean (f)ragment size. this corresponds to insert size when sequencing in paired-end mode. [200]', default=200)
    group2.add_argument('-d', metavar = 'INT', type=int, dest='fragsd', required=False, help='standard (d)eviation of fragment size [50]', default=50)
    group2.add_argument('-m', metavar = 'INT', type=int, dest='fragmin', required=False, help='(m)inimum fragment length [read_length + 20]')
    group2.add_argument('-x', metavar = 'INT',type=int, dest='slack', required=False, help='slack margin of the given boundaries [0]', default=0)

    group3 = parser.add_argument_group('Parameters for sequencing')
    group3.add_argument('-p', action='store_true', help='generate paired-end reads [single]')
    group3.add_argument('-n', metavar = 'INT', type=int, dest='readnumber', required=True, help='total (n)umber of reads')
    group3.add_argument('-l', metavar = 'INT', type=int, dest='readlength', required=True, help='read (l)ength (bp)')
    group3.add_argument('-M', metavar = 'FILE', dest='model', required=True, help='GemSim (M)odel file (.gzip)')
    group3.add_argument('-t', metavar = 'INT', type=int, dest='threadnumber', required=False, help='number of (t)hreaded subprocesses [1]', default=1)

    group4 = parser.add_argument_group('Output options')
    group4.add_argument('-o', metavar = 'FILE', dest='outfile', help='(o)utput file header. ".fastq.gz" or ".fastq" will be attached automatically. Output will be splitted into two files in paired-end mode', required=True)
    group4.add_argument('-z', action='store_true', help='compress output with g(z)ip [false]')
    group4.add_argument('-q', metavar = 'INT', type=int, dest='qualbase', required=False, help='(q)uality score offset [33]', default=33)
    group4.add_argument('-v', action='store_true', help='(v)erbose; print out intermediate messages.')

    # Parse the list we were handed (the same one forwarded to the workers
    # through `arguline`) instead of implicitly re-reading sys.argv.
    args = parser.parse_args(argv)
    reffile = args.reference
    regionfile = args.region
    isize = args.fragsize
    isd = args.fragsd
    imin = args.fragmin
    slack = args.slack

    # The region FASTA / .abd files are generated before the workers start.
    getRegionVector(reffile, regionfile, slack)

    paired = args.p
    readlength = args.readlength
    readnumber = args.readnumber
    threadnumber = args.threadnumber

    if imin is None:
        # Same default for single- and paired-end mode.
        imin = readlength + 20
    if isize < imin:
        print("too small mean fragment size (" + str(isize) + ") compared to minimum length (" + str(imin) + "). Increase it and try again.")
        sys.exit(0)

    model = args.model
    outfile = args.outfile
    compress = args.z
    qualbase = args.qualbase

    print("")
    print("-------------------------------------------")
    print("Reference: " + str(reffile))
    print("Region file: " + str(regionfile))
    print("Fragment: " + str(isize) + " +- " + str(isd) + " > " + str(imin))
    print("Paired-end mode? " + str(paired))
    print("Sequencing model: " + str(model))
    print("Read length: " + str(readlength) + " Read number: " + str(readnumber))
    print("Output File: " + str(outfile))
    print("Gzip compress? " + str(compress))
    print("Quality base: " + str(qualbase))
    print("Thread number: " + str(threadnumber))
    print("Job started at: " + strftime("%Y-%m-%d %H:%M:%S", localtime()))
    print("-------------------------------------------")
    print("")

    processes = []
    for t in range(0, threadnumber):
        # Worker t handles the 1-based, inclusive read range [readstart, readend].
        readstart = int(float(readnumber) / float(threadnumber) * t) + 1
        readend = int(float(readnumber) / float(threadnumber) * (t+1))
        command = "python __sub_wessim1.py " + arguline + " -1 " + str(readstart) + " -2 " + str(readend) + " -i " + str(t+1)
        p = Process(target=subprogram, args=(command, t+1))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()

    t1 = time()
    print("Done generating " + str(readnumber) + " reads in %f secs" % (t1 - t0))
    print("Merging subresults...")
    _merge_subresults(outfile, threadnumber, paired, compress)
    sys.exit(0)
def read_fasta(fname):
    """Load a FASTA file into a dict mapping sequence name -> sequence string.

    fname -- path of the FASTA file to read.

    A record's name is the header text after '>' up to the first whitespace,
    further cut at the first underscore. The sequence is the concatenation of
    the record's lines with trailing whitespace stripped.

    NOTE(review): because of the underscore cut, headers such as ``chr1`` and
    ``chr1_xyz`` map to the same key and the later record replaces the earlier
    one -- confirm this is intended.
    """
    pieces = {}
    current = None
    with open(fname, "r") as handle:
        for raw in handle:
            if raw[0] == '>':
                header = raw.rstrip()
                current = header[1:].split()[0].split("_")[0]
                # A repeated name starts over with an empty record.
                pieces[current] = []
            else:
                pieces[current].append(raw.rstrip())
    return {key: ''.join(parts) for key, parts in pieces.items()}
def getRegionVector(fastafile, regionfile, slack):
    """Write the sequences of the target regions and their cumulative lengths.

    fastafile  -- reference FASTA file, loaded with read_fasta().
    regionfile -- tab-separated region file; the first three columns are used
                  as chromosome, start and end.
    slack      -- number of bases by which each region is widened on both sides.

    Two files are written next to *regionfile*:
      <regionfile>.fa  -- one record per region, header ">chrom_start_end"
                          (start/end after applying the slack).
      <regionfile>.abd -- one line per region holding the running total of
                          extracted sequence lengths.

    Lines starting with '#' or having fewer than three tab-separated fields
    are skipped. Raises KeyError when a chromosome is absent from the
    reference and ValueError when start/end are not integers; all files are
    closed in either case.
    """
    print("Generating fasta file for given regions...")
    faoutfile = regionfile + ".fa"
    abdoutfile = regionfile + ".abd"
    ref = read_fasta(fastafile)
    abd = 0
    # `with` guarantees the handles are closed (and output flushed) even when
    # a region line triggers an exception.
    with open(regionfile) as f, open(faoutfile, 'w') as wfa, open(abdoutfile, 'w') as wabd:
        for record in f:
            values = record.split("\t")
            if record.startswith("#") or len(values) < 3:
                continue
            chrom = values[0]
            # The widened start is clamped so it never drops below 1.
            start = max(int(values[1]) - slack, 1)
            end = int(values[2]) + slack
            header = ">" + chrom + "_" + str(start) + "_" + str(end)
            # Slice taken as [start-1, end); slicing silently truncates at the
            # end of the chromosome.
            x = ref[chrom][(start - 1):end]
            abd += len(x)
            wfa.write(header + "\n")
            wfa.write(x + "\n")
            wabd.write(str(abd) + "\n")
# Script entry point: hand the command-line arguments (without the program
# name) to main() when this file is executed directly.
if __name__ == "__main__":
    cli_arguments = sys.argv[1:]
    main(cli_arguments)
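# Change notes:
# - Fewer dependencies: the pysam import is commented out and the reference
#   FASTA is loaded by read_fasta().
# - The first line of the BED file can be read.
# - Fix for the "length is one less" issue: started from (start+1).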