forked from bmvdgeijn/WASP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
rmdup_pe.py
289 lines (221 loc) · 10 KB
/
rmdup_pe.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
import random
import pysam
import os
import sys
import argparse
import util
class ReadStats(object):
    """Counters recording why reads were discarded and how many pairs were kept."""

    def __init__(self):
        # number of reads discarded because not mapped
        self.discard_unmapped = 0

        # number of reads discarded because mate unmapped
        self.discard_mate_unmapped = 0

        # number of reads discarded because not proper pair
        self.discard_improper_pair = 0

        # paired reads map to different chromosomes
        self.discard_different_chromosome = 0

        # number of reads discarded because secondary match
        self.discard_secondary = 0

        # reads where we expected to see other pair, but it was missing
        # possibly due to read-pairs with different names
        self.discard_missing_pair = 0

        # reads with only one paired mapped
        self.discard_single = 0

        # reads discarded because duplicated
        self.discard_dup = 0

        # number of read pairs kept
        self.keep_pair = 0

    def write(self, file_handle):
        """Write a human-readable summary of every counter to file_handle.

        file_handle: any object with a write(str) method (e.g. sys.stderr).
        """
        # Each %d below lines up with the counter of the same label; the
        # "mate unmapped" line must report discard_mate_unmapped.
        file_handle.write("DISCARD reads:\n"
                          " unmapped: %d\n"
                          " mate unmapped: %d\n"
                          " improper pair: %d\n"
                          " different chromosome: %d\n"
                          " secondary alignment: %d\n"
                          " missing pairs (e.g. mismatched read names): %d\n"
                          " not paired: %d\n"
                          " duplicate pairs: %d\n"
                          "KEEP reads:\n"
                          " pairs: %d\n" %
                          (self.discard_unmapped,
                           self.discard_mate_unmapped,
                           self.discard_improper_pair,
                           self.discard_different_chromosome,
                           self.discard_secondary,
                           self.discard_missing_pair,
                           self.discard_single,
                           self.discard_dup,
                           self.keep_pair))
def main(input_bam, output_bam):
    """Open the input and output alignment files and run duplicate filtering.

    input_bam: path to the input file; opened as text SAM when the name
        ends with ".sam" or "sam.gz", otherwise as binary BAM.
    output_bam: path to the output file; must end with ".sam" (text) or
        ".bam" (binary).

    Raises ValueError if output_bam does not end with ".bam" or ".sam".
    """
    # Validate the output name before opening anything, so a bad name
    # cannot leave the input file open.
    if output_bam.endswith(".sam"):
        # output in text SAM format
        out_mode = "w"
    elif output_bam.endswith(".bam"):
        # output in binary compressed BAM format
        out_mode = "wb"
    else:
        raise ValueError("name of output file must end with .bam or .sam")

    if input_bam.endswith((".sam", "sam.gz")):
        in_mode = "r"
    else:
        # assume binary BAM file
        in_mode = "rb"

    infile = pysam.Samfile(input_bam, in_mode)
    try:
        outfile = pysam.Samfile(output_bam, out_mode, template=infile)
        try:
            filter_reads(infile, outfile)
        finally:
            # close even when filtering raises (e.g. unsorted input)
            outfile.close()
    finally:
        infile.close()
def update_read_cache(cur_by_mpos, keep_cache, discard_cache,
                      read_stats, outfile):
    """Choose one read per mate-position group to keep; mark the rest as discards.

    cur_by_mpos: dict mapping a mate position to the list of reads that
        share it (the lists are shuffled and shortened in place).
    keep_cache: dict of read name -> read chosen to be kept; updated in place.
    discard_cache: dict of read name -> read chosen to be discarded;
        updated in place.
    read_stats: object whose keep_pair counter is incremented when a
        complete pair is written here.
    outfile: object with a write(read) method.

    Raises ValueError if the chosen read's name is already in keep_cache.
    """
    for candidates in cur_by_mpos.values():
        # randomise the order and take one read as the survivor of this group
        random.shuffle(candidates)
        survivor = candidates.pop()
        survivor_name = survivor.qname

        if survivor_name in keep_cache:
            raise ValueError("read %s is already "
                             "in keep cache" % survivor.qname)
        keep_cache[survivor_name] = survivor

        # every remaining read in the group is a discard candidate
        for other in candidates:
            other_name = other.qname

            if other_name in discard_cache:
                # its mate was already marked for discard: drop both
                del discard_cache[other_name]
                continue

            if other_name != survivor_name:
                # remember the name so the mate is dropped when it is seen
                discard_cache[other_name] = other
                continue

            # same name as the survivor: both ends of the pair are in this
            # group, so write the pair out now
            read_stats.keep_pair += 1
            outfile.write(survivor)
            outfile.write(other)
            del keep_cache[survivor_name]
def filter_reads(infile, outfile):
    """Write read pairs from infile to outfile, dropping duplicate pairs.

    Reads are iterated in file order. Reads that are unmapped, have an
    unmapped mate, are secondary alignments, are unpaired, have a mate on
    a different chromosome, or are not a proper pair are counted and
    skipped. The remaining reads are grouped by (start position, mate
    position); update_read_cache picks one read per group to keep, and a
    pair is written once its second read is encountered.

    infile: iterable of reads that also provides getrname(tid).
    outfile: object with a write(read) method.

    Raises ValueError if a chromosome appears in more than one run of
    reads, or if read positions decrease within a chromosome (i.e. the
    input is not sorted).

    A summary of the counters is written to sys.stderr at the end.
    """
    read_stats = ReadStats()

    cur_tid = None
    cur_chrom = None
    seen_chrom = set()

    # reads chosen to be kept, waiting for their mate; keyed by read name
    keep_cache = {}
    # reads chosen to be discarded, waiting for their mate; keyed by read name
    discard_cache = {}

    # current position on chromosome
    cur_pos = None
    # lists of reads at current position,
    # grouped by the mate pair position
    cur_by_mpos = {}

    for read in infile:
        if read.is_unmapped:
            read_stats.discard_unmapped += 1
            continue

        if (cur_tid is None) or (read.tid != cur_tid):
            # this is a new chromosome
            cur_chrom = infile.getrname(read.tid)

            # Compare against None: position 0 is a valid position and
            # its pending reads must still be flushed.
            if cur_pos is not None:
                update_read_cache(cur_by_mpos, keep_cache, discard_cache,
                                  read_stats, outfile)

            n_unpaired = len(keep_cache) + len(discard_cache)
            if n_unpaired != 0:
                sys.stderr.write("WARNING: failed to find pairs for %d "
                                 "reads on this chromosome\n" %
                                 n_unpaired)
                read_stats.discard_missing_pair += n_unpaired

                sys.stderr.write("keep_cache:\n")
                for r in keep_cache.values():
                    sys.stderr.write(" %s\n" % r.qname)
                sys.stderr.write("discard_cache:\n")
                for r in discard_cache.values():
                    sys.stderr.write(" %s\n" % r.qname)

            # start the new chromosome with empty state
            keep_cache = {}
            discard_cache = {}
            cur_pos = None
            cur_by_mpos = {}

            if cur_chrom in seen_chrom:
                # sanity check that input bam file is sorted
                raise ValueError("expected input BAM file to be sorted "
                                 "but chromosome %s is repeated\n" % cur_chrom)
            seen_chrom.add(cur_chrom)
            cur_tid = read.tid
            sys.stderr.write("starting chromosome %s\n" % cur_chrom)
            sys.stderr.write("processing reads\n")

        if read.mate_is_unmapped:
            read_stats.discard_mate_unmapped += 1
            continue

        if read.is_secondary:
            # this is a secondary alignment (i.e. read was aligned more than
            # once and this has align score that <= best score)
            read_stats.discard_secondary += 1
            continue

        if (not read.is_paired) or (read.next_reference_name is None):
            read_stats.discard_single += 1
            continue

        if (read.next_reference_name != cur_chrom) and \
           (read.next_reference_name != "="):
            # other side of pair mapped to different chromosome
            read_stats.discard_different_chromosome += 1
            continue

        if not read.is_proper_pair:
            read_stats.discard_improper_pair += 1
            continue

        if (cur_pos is not None) and (read.pos < cur_pos):
            raise ValueError("expected input BAM file to be sorted "
                             "but reads are out of order")

        if cur_pos is None or read.pos > cur_pos:
            # we have advanced to a new start position
            # decide which of reads at last position to keep or discard
            update_read_cache(cur_by_mpos, keep_cache, discard_cache,
                              read_stats, outfile)

            # create new list of reads at current position
            cur_pos = read.pos
            cur_by_mpos = {}

        if read.qname in keep_cache:
            # we already saw prev side of pair, retrieve from cache
            read1 = keep_cache[read.qname]
            read2 = read
            del keep_cache[read.qname]

            if read2.next_reference_start != read1.reference_start:
                sys.stderr.write("WARNING: read pair positions "
                                 "do not match for pair %s\n" % read.qname)

            read_stats.keep_pair += 1
            outfile.write(read1)
            outfile.write(read2)

        elif read.qname in discard_cache:
            # we already saw prev side of pair, but decided to discard
            # because read duplicated
            del discard_cache[read.qname]
            read_stats.discard_dup += 1

        else:
            # we have not seen other side of this read yet
            # add read to list of those at current position
            # grouping by mate-pair position
            cur_by_mpos.setdefault(read.mpos, []).append(read)

    # final update of read cache is just to cache strange corner case
    # where final read pair on chromosome were overlapping (same start pos)
    if cur_pos is not None:
        update_read_cache(cur_by_mpos, keep_cache, discard_cache,
                          read_stats, outfile)

    if (len(keep_cache) + len(discard_cache)) != 0:
        sys.stderr.write("WARNING: failed to find pairs for %d "
                         "keep reads and %d discard reads on this "
                         "chromosome\n" % (len(keep_cache), len(discard_cache)))

    read_stats.discard_missing_pair += len(keep_cache) + len(discard_cache)

    read_stats.write(sys.stderr)
if __name__ == "__main__":
    # echo the invocation and the interpreter / pysam versions to stderr
    log = sys.stderr.write
    log("command line: %s\n" % " ".join(sys.argv))
    log("python version: %s\n" % sys.version)
    log("pysam version: %s\n" % pysam.__version__)

    util.check_pysam_version()

    # two positional arguments: input path, then output path
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        "input_bam",
        help="input BAM or SAM file (must be sorted!)")
    arg_parser.add_argument(
        "output_bam",
        help="output BAM or SAM file (not sorted!)")
    cli_args = arg_parser.parse_args()

    main(cli_args.input_bam, cli_args.output_bam)