forked from andreas-wilm/compbio-utils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvcf_read_support.py
executable file
·258 lines (210 loc) · 8.01 KB
/
vcf_read_support.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
#!/usr/bin/env python
"""Extract reads overlapping variant positions and tag them according
to whether they support a variant or the reference.
Output SAM with extra tag key:Z:chr:pos:ref>alt where chr, pos, ref
and alt correspond to the variant of question. For reads supporting
variants key is 'VV', for those supporting the reference it's VR,
otherwise the read will not be written
"""
__author__ = "Andreas Wilm"
__email__ = "wilma@gis.a-star.edu.sg"
__copyright__ = "2014 Genome Institute of Singapore"
__license__ = "GPL2"
#--- standard library imports
#
import sys
import logging
import os
import argparse
import gzip
from collections import namedtuple
#--- third-party imports
#
import pysam
#--- project specific imports
#
# /
#global logger
# http://docs.python.org/library/logging.html
# Script-wide logger (the root logger). Messages at WARNING level and above
# are emitted by default; main() changes the level for --verbose / --debug.
# See http://docs.python.org/library/logging.html
LOG = logging.getLogger("")
logging.basicConfig(
    level=logging.WARN,
    format='%(levelname)s [%(asctime)s]: %(message)s')


# Lightweight record for one variant (the eight fixed VCF columns).
# All fields are strings with the exception of:
#   pos:  int, zero-based (i.e. the VCF POS value minus one)
#   qual: an int if it could be parsed, otherwise "."
#   info: dict
Variant = namedtuple(
    'Variant',
    'chrom pos id ref alt qual filter info')


# SAM flag bits marking reads that should be ignored. Per the SAM
# specification these are: 0x4 unmapped, 0x100 secondary alignment,
# 0x200 failed quality checks, 0x400 PCR/optical duplicate.
SKIP_FLAGS = [0x4, 0x100, 0x200, 0x400]
def cmdline_parser():
    """Build the command-line interface of this script.

    The parser uses the module docstring as its description and defines:
    --verbose, --debug, -b/--bam (required), -i/--vcf, -v/--var,
    --mq-filter (stored as min_mq), --bq-filter (stored as min_bq) and
    -a/--use-orphan.

    Returns:
        argparse.ArgumentParser: the configured (not yet invoked) parser.
    """
    # http://docs.python.org/dev/howto/argparse.html
    mq_default = 0
    bq_default = 5

    parser = argparse.ArgumentParser(description=__doc__)
    add_arg = parser.add_argument

    # logging switches
    add_arg("--verbose", action="store_true", help="Be verbose")
    add_arg("--debug", action="store_true", help="Enable debugging")

    # input alignment file
    add_arg("-b", "--bam", required=True,
            help="Input BAM file matching vcf")

    # variant source; main() rejects the combination of both
    vcf_help = ("Input VCF file containing variants to analyze"
                " (clashes with --var)")
    add_arg("-i", "--vcf", help=vcf_help)
    var_help = ("Report reads for this variant only. Format: chr:pos:ref-alt"
                " (clashes with --vcf)")
    add_arg("-v", "--var", help=var_help)

    # read / base filters
    mq_help = ("Ignore reads with mapping quality below this value"
               " (default=%d)" % mq_default)
    add_arg("--mq-filter", dest="min_mq", type=int,
            default=mq_default, help=mq_help)
    bq_help = ("Ignore reads with bases below this value"
               " (default=%d)" % bq_default)
    add_arg("--bq-filter", dest="min_bq", type=int,
            default=bq_default, help=bq_help)
    add_arg("-a", "--use-orphan", action="store_true",
            help="Don't ignore orphan-reads / anomalous read-pairs")

    return parser
def simple_vcf_reader(fh):
    """Yield one Variant per data line of a VCF stream.

    Header/meta lines (starting with '#') and blank lines are skipped.
    Only the eight fixed VCF columns are used; any further columns
    (FORMAT, samples) are ignored.

    Args:
        fh: iterable of text lines, e.g. an open file handle.

    Yields:
        Variant: with ``pos`` converted to a zero-based int, ``qual`` as an
        int or "." if it cannot be parsed as an integer, and ``info`` as a
        dict. INFO entries of the form key=value map the key to the value
        string; value-less flag entries (e.g. INDEL) map to True; an INFO
        column of "." gives an empty dict.

    Raises:
        AssertionError: if a data line has fewer than eight tab-separated
            fields.
        ValueError: if the POS column is not an integer.
    """
    for line in fh:
        if line.startswith('#'):
            continue
        line = line.rstrip()
        if not line:
            # tolerate blank lines (e.g. a trailing newline at end of file)
            continue

        ls = line.split('\t')
        # 8 fixed fields per record
        assert len(ls) >= 8, (
            "Number of retrieved fields in vcf file too small")
        # ignoring the rest
        (chrom, pos, var_id, ref, alt, qual, var_filter, info_str) = ls[:8]

        pos = int(pos) - 1
        try:
            qual = int(qual)
        except ValueError:
            # missing (".") or non-integer quality
            qual = "."

        info = dict()
        if info_str != '.':
            for field in info_str.split(';'):
                if not field:
                    continue
                # partition (not split) so that flags without '=' and values
                # containing '=' are both handled
                (key, sep, value) = field.partition('=')
                info[key] = value if sep else True

        yield Variant(chrom, pos, var_id, ref, alt, qual, var_filter, info)
def _load_variants(args):
    """Return the list of Variants selected via --vcf or --var.

    Logs a critical message and exits with status 1 if both or neither
    option was given, or if the --var string cannot be parsed.
    """
    if args.vcf and args.var:
        LOG.critical("Please use one: vcf or variant arg, but not both")
        sys.exit(1)

    if args.vcf:
        if args.vcf == '-':
            variants = list(simple_vcf_reader(sys.stdin))
        else:
            if args.vcf[-3:] == '.gz':
                vcf_fh = gzip.open(args.vcf)
            else:
                vcf_fh = open(args.vcf)
            try:
                variants = list(simple_vcf_reader(vcf_fh))
            finally:
                vcf_fh.close()
        LOG.info("Loaded %d variants from %s" % (len(variants), args.vcf))
        return variants

    if args.var:
        try:
            (chrom, pos, ref_alt) = args.var.split(":")
            (ref, alt) = ref_alt.split('-')
            pos = int(pos) - 1
        except ValueError:
            LOG.critical("Couldn't parse variant %s" % args.var)
            sys.exit(1)
        return [Variant(chrom, pos, ".", ref, alt, ".", ".", dict())]

    LOG.critical("Missing vcf or variant argument")
    sys.exit(1)


def _support_tag_key(read, var, args):
    """Return 'VR' or 'VV' if read supports the ref or alt base of var.

    Returns None if the read is filtered (skip flags, orphan, mapping
    quality, base quality), has no base at the variant position
    (deletion), or shows a base that is neither ref nor alt.
    """
    # skip the read if ANY of the unwanted flag bits is set
    if any(read.flag & f for f in SKIP_FLAGS):
        return None

    # paired (0x1) but not mapped in a proper pair (0x2)
    orphan = (read.flag & 0x1) and not (read.flag & 0x2)
    if orphan and not args.use_orphan:
        return None

    if read.mapq < args.min_mq:
        return None

    vpos_on_read = [qpos for (qpos, rpos) in read.aligned_pairs
                    if rpos == var.pos]
    assert len(vpos_on_read) == 1, (
        "Expected exactly one aligned pair for reference position %d"
        " but got %d" % (var.pos, len(vpos_on_read)))
    vpos_on_read = vpos_on_read[0]
    if vpos_on_read is None:  # FIXME no support for deletions
        return None

    # NOTE(review): aligned_pairs query positions may index the full read
    # sequence while query/qqual exclude soft-clipped bases, depending on
    # the pysam version -- confirm for reads with leading soft clips.
    base = read.query[vpos_on_read].upper()
    bq = ord(read.qqual[vpos_on_read]) - 33  # phred+33 encoded
    if bq < args.min_bq:
        return None

    if base == var.ref[0].upper():
        return 'VR'
    if base == var.alt[0].upper():
        return 'VV'
    # ignore non ref non var
    return None


def main():
    """The main function

    Parses the command line, loads the variant(s) of interest and, for
    each single-base variant, writes every read passing the filters and
    showing the ref or alt base to stdout as SAM, tagged with
    VR:Z:chr:pos:ref>alt (reference support) or VV:Z:chr:pos:ref>alt
    (variant support). Indels and multi-base ref/alt entries are skipped
    with a warning.

    Exits with status 1 on invalid arguments or a missing BAM file.
    """
    parser = cmdline_parser()
    args = parser.parse_args()

    if args.verbose:
        LOG.setLevel(logging.INFO)
    if args.debug:
        LOG.setLevel(logging.DEBUG)
        import pdb
        from IPython.core import ultratb
        sys.excepthook = ultratb.FormattedTB(mode='Verbose',
                                             color_scheme='Linux', call_pdb=1)

    if not os.path.exists(args.bam):
        LOG.critical("BAM file %s does not exist" % args.bam)
        sys.exit(1)

    # validate and load variants before opening any output, so that
    # argument errors do not leave partial output on stdout
    variants = _load_variants(args)

    sam_in_fh = pysam.Samfile(args.bam)
    try:
        sam_out_fh = pysam.Samfile("-", "w", template=sam_in_fh)
        try:
            for var in variants:
                if 'INDEL' in var.info:
                    LOG.warning("Skipping unsupported indel variant at %s:%d" % (
                        var.chrom, var.pos+1))
                    continue
                if len(var.ref) > 1 or len(var.alt) > 1:
                    LOG.warning("Skipping ref/alt variant with more than"
                                " 1 base at %s:%d" % (var.chrom, var.pos+1))
                    continue

                reads = list(sam_in_fh.fetch(reference=var.chrom,
                                             start=var.pos, end=var.pos+1))
                LOG.info("%s %d: %d (unfiltered) reads covering position" % (
                    var.chrom, var.pos+1, len(reads)))

                var_tag_value = '%s:%d:%s>%s' % (
                    var.chrom, var.pos+1, var.ref, var.alt)

                for r in reads:
                    var_tag_key = _support_tag_key(r, var, args)
                    if var_tag_key is None:
                        continue

                    assert var_tag_key not in [t[0] for t in r.tags], (
                        "Oops...tag %s already present in read."
                        " Refusing to overwrite" % var_tag_key)

                    # only way I found to add tags. inspired by
                    # http://www.ngcrawford.com/2012/04/17/python-adding-read-group-rg-tags-to-bam-or-sam-files/
                    new_tags = r.tags
                    new_tags.append((var_tag_key, var_tag_value))
                    r.tags = new_tags
                    sam_out_fh.write(r)
        finally:
            # flush and close the SAM output
            sam_out_fh.close()
    finally:
        sam_in_fh.close()

    # FIXME add tests:
    # 1:
    # vcf_read_support.py -b bam -v var | grep -c var
    # should give same as
    # vcf_read_support.py -b bam -i vcf | grep -c var
    # ...
# Script entry point: run main() only when executed directly (not when
# imported) and log a final message once it returns.
if "__main__" == __name__:
    main()
    LOG.info("Successful program exit")