forked from bmvdgeijn/WASP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_as_counts.py
325 lines (248 loc) · 12.5 KB
/
get_as_counts.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
import sys
import argparse
import numpy as np
import pysam
import util
import snptable
import tables
import os
def write_results(out_f, chrom_name, snp_tab, ref_matches,
                  alt_matches, oth_matches, geno_sample):
    """Writes one line of allele counts per SNP for a single chromosome.

    Each output line has the space-delimited fields:
    chromosome, SNP position, allele1, allele2, genotype, ref count,
    alt count, other count.

    Arguments:
      out_f: writable file-like object that receives the lines.
      chrom_name: name of the chromosome the SNPs belong to.
      snp_tab: SNP table providing n_snp, snp_pos, snp_allele1 and
        snp_allele2, plus samples and haplotypes when geno_sample is
        given.
      ref_matches, alt_matches, oth_matches: per-SNP counts, indexed
        in the same order as the SNP table.
      geno_sample: name of the sample whose genotype is reported, or
        None. When it is None (or not found in snp_tab.samples) the
        genotype column is "NA".
    """
    haps = None

    if geno_sample:
        # map sample name => index of sample in the haplotype table
        samp_idx_dict = dict(zip(snp_tab.samples,
                                 range(len(snp_tab.samples))))

        if geno_sample in samp_idx_dict:
            idx = samp_idx_dict[geno_sample]
            # Sample idx is read from haplotype columns 2*idx and
            # 2*idx+1. The builtin int is used as dtype because the
            # np.int alias was removed from NumPy.
            geno_hap_idx = np.array([idx*2, idx*2+1], dtype=int)
            haps = snp_tab.haplotypes[:, geno_hap_idx]
            sys.stderr.write("geno_hap_idx: %s\n" % repr(geno_hap_idx))
        else:
            sys.stderr.write("WARNING: sample %s is not present for "
                             "chromosome %s\n" % (geno_sample, chrom_name))

    for i in range(snp_tab.n_snp):
        if haps is None:
            geno_str = "NA"
        else:
            geno_str = "%d|%d" % (haps[i, 0], haps[i, 1])

        out_f.write("%s %d %s %s %s %d %d %d\n" %
                    (chrom_name, snp_tab.snp_pos[i],
                     snp_tab.snp_allele1[i], snp_tab.snp_allele2[i],
                     geno_str, ref_matches[i], alt_matches[i],
                     oth_matches[i]))
def write_header(out_f):
    """Writes the single space-delimited header row naming the output
    columns to out_f."""
    column_names = ("CHROM", "SNP.POS", "REF.ALLELE", "ALT.ALLELE",
                    "GENOTYPE", "REF.COUNT", "ALT.COUNT", "OTHER.COUNT")
    out_f.write(" ".join(column_names) + "\n")
def parse_samples(samples_str):
    """Gets list of samples from --samples argument. This may be
    a comma-delimited string or a path to a file. If a file is provided
    then the first column of the file is assumed to be the sample name.

    Arguments:
      samples_str: value of the --samples argument, or None.

    Returns:
      None if samples_str is None, otherwise a list of sample names.
      Blank lines in a sample file are skipped.
    """
    if samples_str is None:
        return None

    # first check if this is a path to a file
    if os.path.exists(samples_str) and not os.path.isdir(samples_str):
        if util.is_gzipped(samples_str):
            # imported here because gzip is not imported at file level
            import gzip
            # text mode so that sample names are str rather than bytes
            f = gzip.open(samples_str, "rt")
        else:
            f = open(samples_str)

        samples = []
        # the with-statement closes the file even if reading fails
        with f:
            for line in f:
                tokens = line.split()
                if tokens:
                    # assume first token in line is sample name
                    samples.append(tokens[0])

        sys.stderr.write("read %d sample names from file '%s'\n" %
                         (len(samples), samples_str))
    else:
        # otherwise assume comma-delimited string
        if ("," not in samples_str and len(samples_str) > 15) \
           or ("/" in samples_str):
            sys.stderr.write("WARNING: --samples argument (%s) "
                             "does not look like sample name "
                             "but is not path to valid file. "
                             "Assuming it is a sample name anyway.\n"
                             % samples_str)

        samples = samples_str.split(",")

    sys.stderr.write("SAMPLES: %s\n" % repr(samples))

    return samples
def parse_options():
    """Builds the command-line parser, parses sys.argv and checks that
    exactly one SNP source was given: either --snp_dir, or all three of
    --snp_tab, --snp_index and --haplotype. On an invalid combination
    parser.error() reports the problem and exits.

    Returns the parsed argparse namespace."""
    description = ("This script outputs "
                   "allele-specific counts for SNPs, using "
                   "reads from the provided BAM file. "
                   "Currently indels are not output and "
                   "chromosomes with no mapped reads "
                   "are skipped. Output "
                   "is written to stdout, with a single "
                   "header row and the following "
                   "columns: <chromosome> <snp_position> "
                   "<ref_allele> <alt_allele> <genotype> "
                   "<ref_allele_count> <alt_allele_count> "
                   "<other_count>. Reads that overlap "
                   "multiple SNPs will be counted multiple "
                   "times in the output (this behavior "
                   "differs from the "
                   "extract_haplotype_read_counts.py "
                   "script).")
    parser = argparse.ArgumentParser(description=description)

    # SNPs from a directory of per-chromosome text files
    parser.add_argument(
        "--snp_dir",
        action="store",
        default=None,
        help=("Directory containing SNP text files "
              "This directory should contain one file per "
              "chromosome named like chr<#>.snps.txt.gz. "
              "Each file should contain 3 columns: position "
              "RefAllele AltAllele"))

    # SNPs and haplotypes from HDF5 files
    parser.add_argument(
        "--snp_tab",
        metavar="SNP_TABLE_H5_FILE",
        default=None,
        help="Path to HDF5 file to read SNP information "
        "from. Each row of SNP table contains SNP name "
        "(rs_id), position, allele1, allele2.")

    parser.add_argument(
        "--snp_index",
        metavar="SNP_INDEX_H5_FILE",
        default=None,
        help="Path to HDF5 file containing SNP index. The "
        "SNP index is used to convert the genomic position "
        "of a SNP to its corresponding row in the haplotype "
        "and snp_tab HDF5 files.")

    parser.add_argument(
        "--haplotype",
        metavar="HAPLOTYPE_H5_FILE",
        default=None,
        help="Path to HDF5 file to read phased haplotypes "
        "from. When generating alternative reads "
        "use known haplotypes from this file rather "
        "than all possible allelic combinations.")

    # sample selection
    parser.add_argument(
        "--samples",
        metavar="SAMPLES",
        default=None,
        help="Use only haplotypes and SNPs that are "
        "polymorphic in these samples. "
        "SAMPLES can either be a comma-delimited string "
        "of sample names or a path to a file with one sample "
        "name per line (file is assumed to be "
        "whitespace-delimited and first column is assumed to "
        "be sample name). Sample names should match those "
        "present in the haplotype HDF5 file. Samples are "
        "ignored if no haplotype file is provided.")

    parser.add_argument(
        "--genotype_sample",
        metavar="GENO_SAMPLE",
        default=None,
        help="output genotypes for sample with name "
        "GENO_SAMPLE alongside allele-specific counts. "
        "GENO_SAMPLE must match one "
        "of the names present in the haplotype HDF5 file. "
        "If the --samples argument is provided then "
        "GENO_SAMPLE must also be one of the specified "
        "samples. If --genotype_sample is "
        "not provided or the GENO_SAMPLE does not match any "
        "of the samples in haplotype file then NA is "
        "output for genotype.")

    # positional input
    parser.add_argument(
        "bam_filename",
        action="store",
        help="Coordinate-sorted input BAM file "
        "containing mapped reads.")

    options = parser.parse_args()

    h5_args = (options.snp_tab, options.snp_index, options.haplotype)

    if options.snp_dir:
        # text-file mode excludes every HDF5 argument
        if any(h5_args):
            parser.error("expected --snp_dir OR (--snp_tab, --snp_index and "
                         "--haplotype) arguments but not both")
    elif not all(h5_args):
        # HDF5 mode needs all three files
        parser.error("either --snp_dir OR (--snp_tab, "
                     "--snp_index AND --haplotype) arguments must be "
                     "provided")

    return options
def main(bam_filename, snp_dir=None, snp_tab_filename=None,
         snp_index_filename=None, haplotype_filename=None, samples=None,
         geno_sample=None):
    """Counts, for every SNP, the reads matching each allele and writes
    the counts to stdout one chromosome at a time.

    Reads are taken from the BAM file in order. Whenever the reference
    id of a read differs from the previous one, the counts of the
    finished chromosome are written and the SNPs of the new chromosome
    are loaded. Secondary alignments are skipped. A read overlapping
    several SNPs is counted once for each of them.

    Arguments:
      bam_filename: path to the input BAM file.
      snp_dir: directory with <chrom>.snps.txt.gz files (text mode).
      snp_tab_filename, snp_index_filename, haplotype_filename: paths
        to the HDF5 files (HDF5 mode); all three are required together.
      samples: list of sample names passed on to the SNP table when
        reading from HDF5, or None.
      geno_sample: sample whose genotype is written next to the counts,
        or None. Ignored (with a warning) without a haplotype file.

    Raises:
      ValueError: if neither SNP source is given, if the HDF5 arguments
        are incomplete, or if a chromosome appears twice in the BAM
        file (i.e. the file is not sorted).
    """
    # NOTE(review): the command-line help promises a header row, but
    # write_header() is never called here. Confirm whether it should be
    # written before the first chromosome; the output is left unchanged.
    out_f = sys.stdout

    bam = pysam.Samfile(bam_filename)
    # HDF5 files opened below, closed in the finally clause
    h5_files = []

    try:
        cur_chrom = None
        cur_tid = None
        seen_chrom = set()

        snp_tab = snptable.SNPTable()

        # keep track of number of ref matches, non-ref matches, and other
        # for each SNP
        snp_ref_match = None
        snp_alt_match = None
        snp_oth_match = None

        if geno_sample and not haplotype_filename:
            sys.stderr.write("WARNING: cannot obtain genotypes for sample "
                             "%s without --haplotype argument\n"
                             % geno_sample)
            geno_sample = None

        sys.stderr.write("GENOTYPE_SAMPLE: %s\n" % geno_sample)

        if snp_tab_filename:
            if (not snp_index_filename) or (not haplotype_filename):
                raise ValueError("--snp_index and --haplotype must be "
                                 "provided if --snp_tab is provided")

            # PyTables renamed openFile to open_file; prefer the new
            # name and fall back to the old one for old installations
            open_h5 = getattr(tables, "open_file", None)
            if open_h5 is None:
                open_h5 = tables.openFile

            snp_tab_h5 = open_h5(snp_tab_filename, "r")
            h5_files.append(snp_tab_h5)
            snp_index_h5 = open_h5(snp_index_filename, "r")
            h5_files.append(snp_index_h5)
            hap_h5 = open_h5(haplotype_filename, "r")
            h5_files.append(hap_h5)
        else:
            snp_tab_h5 = None
            snp_index_h5 = None
            hap_h5 = None

        for read in bam:
            if (cur_tid is None) or (read.tid != cur_tid):
                # this is a new chromosome
                if cur_chrom:
                    # write out results from last chromosome
                    write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                                  snp_alt_match, snp_oth_match, geno_sample)

                cur_chrom = bam.getrname(read.tid)

                if cur_chrom in seen_chrom:
                    # sanity check that input bam file is sorted
                    raise ValueError("expected input BAM file to be sorted "
                                     "but chromosome %s is repeated\n"
                                     % cur_chrom)
                seen_chrom.add(cur_chrom)
                cur_tid = read.tid
                sys.stderr.write("starting chromosome %s\n" % cur_chrom)

                # read SNPs for next chromosome
                if snp_tab_h5:
                    # read SNPs from HDF5 files, reduce to set that are
                    # polymorphic in specified samples
                    snp_tab.read_h5(snp_tab_h5, snp_index_h5, hap_h5,
                                    cur_chrom, samples=samples)
                elif snp_dir:
                    # read SNPs from text file
                    snp_filename = "%s/%s.snps.txt.gz" % (snp_dir, cur_chrom)
                    snp_tab.read_file(snp_filename)
                else:
                    raise ValueError("--snp_dir OR (--snp_tab, --snp_index, "
                                     "and --haplotype) must be defined")

                sys.stderr.write("read %d SNPs\n" % snp_tab.n_snp)

                # Reset the per-SNP counters. int32 is used because
                # int16 counters silently wrap above 32767 reads.
                snp_ref_match = np.zeros(snp_tab.n_snp, dtype=np.int32)
                snp_alt_match = np.zeros(snp_tab.n_snp, dtype=np.int32)
                snp_oth_match = np.zeros(snp_tab.n_snp, dtype=np.int32)

            if read.is_secondary:
                # this is a secondary alignment (i.e. read was aligned more
                # than once and this has align score that <= best score)
                continue

            # loop over all SNPs that overlap this read; the indel
            # results are not used (indels are not counted)
            snp_idx, snp_read_pos, \
                indel_idx, indel_read_pos = snp_tab.get_overlapping_snps(read)

            for snp_i, read_pos in zip(snp_idx, snp_read_pos):
                # read_pos - 1 is used as the index into the read
                # sequence (presumably read_pos is 1-based -- confirm
                # against snptable.get_overlapping_snps)
                read_base = read.query_sequence[read_pos-1]

                if snp_tab.snp_allele1[snp_i] == read_base:
                    snp_ref_match[snp_i] += 1
                elif snp_tab.snp_allele2[snp_i] == read_base:
                    snp_alt_match[snp_i] += 1
                else:
                    snp_oth_match[snp_i] += 1

        if cur_chrom:
            # write results for final chromosome
            write_results(out_f, cur_chrom, snp_tab, snp_ref_match,
                          snp_alt_match, snp_oth_match, geno_sample)
    finally:
        for h5_file in h5_files:
            h5_file.close()
        bam.close()
if __name__ == "__main__":
    # log the exact command line before doing any work
    sys.stderr.write("command: %s\n" % " ".join(sys.argv))

    opts = parse_options()
    sample_names = parse_samples(opts.samples)

    # forward the parsed command-line options to main()
    main(
        opts.bam_filename,
        snp_dir=opts.snp_dir,
        snp_tab_filename=opts.snp_tab,
        snp_index_filename=opts.snp_index,
        haplotype_filename=opts.haplotype,
        samples=sample_names,
        geno_sample=opts.genotype_sample,
    )