-
Notifications
You must be signed in to change notification settings - Fork 0
/
prepare_vcf_file.py
278 lines (252 loc) · 13.3 KB
/
prepare_vcf_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/env python3
import argparse
import logging
import os
import shlex
import shutil
import subprocess
import sys

from cyvcf2 import VCF
# tested with:
# bcftools version: 1.9 (using htslib 1.9)
# ---------------------------------------------------------------------------------------------------------------------
# Functions
# ---------------------------------------------------------------------------------------------------------------------
def parse_arguments():
    """
    Build the command-line interface of the script and parse ``sys.argv``.

    Two argument groups are declared: the required input/output VCF paths, and optional settings
    (executable locations, chromosome ID, genotype-rewriting switches and a samples list).

    :return: the ``argparse.Namespace`` produced by ``parser.parse_args()``
    """
    description = (
        "This script is used to re-format an input VCF file for the power calculations pipeline. "
        "Specifically, it will:\nCheck VCF file is a multi-sample VCF format; split multi-allelic sites; "
        "make sure GT genotypes are in haploid format, if not, convert diploid to haploid; add variant ID "
        "made up of CHROM.POS.REF.ALT; and select subset of samples, if chosen.\n"
    )
    parser = argparse.ArgumentParser(description=description)

    def add_value_option(target, short_flag, dest, help_text, **extra):
        # Every value-taking option follows the same pattern: the long flag is the dest name
        # and the metavar is the dest name in upper case.
        target.add_argument(
            short_flag, "--" + dest, action="store", dest=dest,
            help=help_text, metavar=dest.upper(), **extra
        )

    def add_switch(target, short_flag, dest, help_text):
        # Boolean flags: absent -> False, present -> True.
        target.add_argument(
            short_flag, "--" + dest, action="store_true", dest=dest,
            help=help_text, required=False,
        )

    required_group = parser.add_argument_group('Required arguments')
    add_value_option(required_group, "-v", "input_vcf", "multi-sample VCF file", required=True)
    add_value_option(required_group, "-o", "output_vcf", "output VCF file correctly formatted", required=True)

    optional_group = parser.add_argument_group('Optional arguments')
    add_value_option(
        optional_group, "-b", "bcftools_path",
        "Full path to bcftools executable or executable name",
        required=False, default='bcftools',
    )
    add_value_option(
        optional_group, "-g", "bgzip_path",
        "Full path to bgzip executable or executable name",
        required=False, default='bgzip',
    )
    add_value_option(
        optional_group, "-t", "tabix_path",
        "Full path to tabix executable or executable name",
        required=False, default='tabix',
    )
    add_value_option(
        optional_group, "-c", "chr_id",
        "Chromosome ID to be used in VCF.\n",
        required=False,
    )
    add_switch(
        optional_group, "-f", "to_haploid_format",
        "Whether to convert VCF GT fields to haploid format (0/0 or 0:. to 0)\n",
    )
    add_switch(
        optional_group, "-r", "replace_gt",
        "Replace GT value ./. with 0/0. Needed for Snippy multi-sample VCF files.\n",
    )
    add_value_option(
        optional_group, "-s", "samples_list",
        "Samples to keep from VCF file.\nOne sample id per line expected.\n",
        required=False,
    )

    return parser.parse_args()
def check_dependency(executable_name):
    """
    Tell whether an executable can be found.

    The lookup is done with ``shutil.which`` instead of spawning the external ``which`` program,
    so it does not fail on hosts where ``which`` itself is not installed. Both a bare executable
    name (searched on PATH) and a path to an executable are accepted.

    :param executable_name: executable name or path to an executable
    :return: True if the executable was found, False otherwise (an error line is printed)
    """
    if shutil.which(executable_name) is None:
        print('ERROR:', f'executable \'{executable_name}\' not found')
        return False
    return True
def run_command_string(command_line_string):
    """
    This function executes a command line through the shell, checks for execution errors and returns stdout

    If the command exits with a non-zero status, the error and whatever the command wrote to stderr are
    printed and the ``subprocess.CalledProcessError`` is re-raised: there is no stdout to return in that
    case, and falling through to the ``return`` would fail on an unbound local variable.

    :param command_line_string: it must be a string
    :return: stdout decoded as UTF-8
    :raises subprocess.CalledProcessError: if the command returns a non-zero exit status
    """
    print('\tRunning: ' + command_line_string)
    try:
        process_completed = subprocess.run(
            command_line_string,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
            shell=True,
        )
    except subprocess.CalledProcessError as err:
        print('ERROR:', err)
        # stderr is captured above, so it must be printed explicitly or the cause of the failure is lost
        stderr_text = err.stderr.decode('utf-8', errors='replace').strip() if err.stderr else ''
        if stderr_text:
            print('ERROR:', stderr_text)
        raise
    return process_completed.stdout.decode('utf-8')
def run_command_shell_string(command_line_string):
    """
    Run a command line through the shell when its stdout is not needed by the caller.

    The command's stdout/stderr are not captured. A non-zero exit status is reported with an
    'ERROR:' line and execution then continues; nothing is raised to the caller.

    Note: the shell is required for I/O redirection operators (e.g. >) and pipes to keep their
    special meaning; without it they would be passed to the program as ordinary arguments.
    Note: because the shell is used, the command line must be given as one string, not as a list.

    :param command_line_string: it must be a string not a list
    """
    print('\tRunning: ' + command_line_string)
    run_options = {'check': True, 'shell': True}
    try:
        subprocess.run(command_line_string, **run_options)
    except subprocess.CalledProcessError as failure:
        print('ERROR:', failure)
    return None
# ------------------------------------------------------------------------------------
# Main program
# ------------------------------------------------------------------------------------
def _sed_escape(text, special_chars):
    """ Backslash-escape every character of text that appears in special_chars """
    return ''.join('\\' + char if char in special_chars else char for char in text)


def _contig_id(contig_header_line):
    """ Return the value of the ID field of a '##contig=<...>' header line, or None if it has none """
    fields = contig_header_line.strip().partition('<')[2].rstrip('>')
    for field in fields.split(','):
        key, _, value = field.partition('=')
        if key.strip() == 'ID':
            return value.strip()
    return None


def _single_contig_id(bcftools_path, input_vcf):
    """ Return the ID of the only contig declared in the VCF header; exit if there is not exactly one """
    # Only the header is needed, and filtering in Python avoids the non-zero exit status that
    # grep returns when no line matches
    header = run_command_string(' '.join([shlex.quote(bcftools_path), 'view -h', shlex.quote(input_vcf)]))
    contig_lines = [line for line in header.split('\n') if line.startswith('##contig')]
    if len(contig_lines) != 1:
        logging.error(f'Only one contig/chrom supported in input VCF file if option --chr_id is chosen. '
                      f'{len(contig_lines)} contigs found.\n')
        sys.exit(-1)
    contig_id = _contig_id(contig_lines[0])
    if not contig_id:
        logging.error(f'Could not extract the contig ID from header line: {contig_lines[0]}')
        sys.exit(-1)
    return contig_id


def _validate_samples_list(samples_list, vcf_samples):
    """ Exit unless samples_list exists, is not empty and only names samples present in vcf_samples """
    if not os.path.exists(samples_list):
        logging.error(f'Samples list file (--samples_list) {samples_list} not found!')
        sys.exit(-1)
    with open(samples_list) as handle:
        requested = [line.strip() for line in handle if line.strip()]
    if not requested:
        logging.error(f'No sample ids found in {samples_list}')
        sys.exit(-1)
    missing = [sample for sample in requested if sample not in vcf_samples]
    if missing:
        logging.error(f'Samples in {samples_list} not found in input VCF file: {", ".join(missing)}')
        sys.exit(-1)


def _rewrite_gt_columns(bcftools_path, vcf_file, sed_expressions):
    """
    Rewrite the per-sample columns of vcf_file in place, keeping only the (edited) GT values.

    The file is split into header, fixed columns (1-9) and GT columns; the sed expressions are applied,
    in order, to the GT columns only; then the pieces are glued back together over vcf_file.

    :param bcftools_path: bcftools executable
    :param vcf_file: uncompressed VCF file to rewrite
    :param sed_expressions: sed substitution commands applied to the GT columns
    """
    bcftools = shlex.quote(bcftools_path)
    vcf = shlex.quote(vcf_file)
    vcf_header = shlex.quote(vcf_file + '_header')
    vcf_part1 = shlex.quote(vcf_file + '_part1')
    vcf_part2 = shlex.quote(vcf_file + '_part2')
    vcf_part2_tmp = shlex.quote(vcf_file + '_part2_tmp')
    vcf_body = shlex.quote(vcf_file + '_body')
    # 1. extract columns #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT
    run_command_shell_string(f'cat {vcf} | grep -v "^#" | cut -f 1-9 > {vcf_part1}')
    # 2. extract header ##
    run_command_shell_string(f'cat {vcf} | grep "^#" > {vcf_header}')
    # 3. extract GT columns; the \t and \n escapes are passed on for bcftools to expand, so every
    #    line starts with a tab
    gt_format = shlex.quote(r'[\t%GT]\n')
    run_command_shell_string(f'{bcftools} query -f {gt_format} {vcf} > {vcf_part2}')
    # 4. edit GT values; sed works line by line, so one call with several -e expressions gives the
    #    same result as one pass per expression
    sed_script = ' '.join('-e ' + shlex.quote(expression) for expression in sed_expressions)
    run_command_shell_string(f'sed {sed_script} {vcf_part2} > {vcf_part2_tmp}')
    run_command_shell_string(f'mv {vcf_part2_tmp} {vcf_part2}')
    # 5. join new VCF parts back together (empty paste delimiter: the GT lines already start with a tab)
    run_command_shell_string(''.join([r"paste -d'\0' ", vcf_part1, ' ', vcf_part2, ' > ', vcf_body]))
    run_command_shell_string(f'cat {vcf_header} {vcf_body} > {vcf}')
    # 6. remove temporary VCF parts
    for leftover in (vcf_header, vcf_part1, vcf_part2, vcf_body):
        run_command_shell_string(f'rm {leftover}')


def _main():
    """
    Re-format the input VCF file and write it, bgzip-compressed and tabix-indexed, to the output path.

    Steps: validate the inputs and required executables; check that the VCF has at least two samples;
    work on a temporary copy ('<input_vcf>_tmp'); optionally keep only the samples in --samples_list;
    optionally replace the single chromosome ID (--chr_id); split multi-allelic sites; set variant IDs
    to CHROM.POS.REF.ALT; optionally replace ./. with 0/0 (--replace_gt); optionally convert GT values
    to haploid format (--to_haploid_format); move the result to --output_vcf, then bgzip and tabix it.

    Exits with status -1 on invalid input. File paths and IDs are shell-quoted before being placed in
    command lines.
    """
    # Configure logging
    logging.basicConfig(
        format='%(asctime)s %(levelname)s: %(message)s',
        level=logging.INFO
    )
    # Get arguments
    args = parse_arguments()
    # Making sure input VCF file exists
    if not os.path.exists(args.input_vcf):
        logging.error(f'Input VCF file (--input_vcf) {args.input_vcf} not found!')
        sys.exit(-1)
    # Making sure dependencies exist; stop early rather than fail half-way through the pipeline
    for executable in (args.bcftools_path, args.bgzip_path, args.tabix_path):
        if not check_dependency(executable):
            logging.error(f'Required executable {executable} not found!')
            sys.exit(-1)
    bcftools = shlex.quote(args.bcftools_path)
    input_vcf = shlex.quote(args.input_vcf)
    output_vcf = shlex.quote(args.output_vcf)
    output_vcf_gz = shlex.quote(args.output_vcf + '.gz')
    # Check multi-VCF format
    logging.info(f'Extracting samples from input VCF file {args.input_vcf}')
    stout = run_command_string(f'{bcftools} query -l {input_vcf}')
    vcf_samples = {sample for sample in stout.strip().split('\n') if sample != ''}
    if len(vcf_samples) < 2:
        logging.error(f'Only {len(vcf_samples)} samples could be extracted from {args.input_vcf}. '
                      f'Use a multi-sample VCF file.')
        sys.exit(-1)
    logging.info(f'A total of {len(vcf_samples)} samples extracted from {args.input_vcf}')
    # Validate the optional inputs before any temporary file is created, so that an early exit
    # leaves nothing behind
    if args.samples_list is not None:
        _validate_samples_list(args.samples_list, vcf_samples)
    old_chr_id = None
    if args.chr_id is not None:
        old_chr_id = _single_contig_id(args.bcftools_path, args.input_vcf)
    # Create temporary VCF file
    tmp_vcf_file = args.input_vcf + '_tmp'
    tmp_vcf_file2 = tmp_vcf_file + '_tmp'
    tmp_vcf = shlex.quote(tmp_vcf_file)
    tmp_vcf2 = shlex.quote(tmp_vcf_file2)
    logging.info(f'Creating temporary VCF file {tmp_vcf_file}')
    run_command_shell_string(f'cp {input_vcf} {tmp_vcf}')
    # Select subset of samples, if chosen
    if args.samples_list is not None:
        logging.info(f'Keeping only samples listed in {args.samples_list}')
        samples_list = shlex.quote(args.samples_list)
        run_command_shell_string(f'{bcftools} view -Ov -S {samples_list} {tmp_vcf} > {tmp_vcf2}')
        run_command_shell_string(f'mv {tmp_vcf2} {tmp_vcf}')
    # Change CHROM id, if chosen
    if old_chr_id is not None:
        logging.info('Replacing VCF chromosome Id')
        new_chr_id = args.chr_id.strip()
        logging.info(f'Replacing CHROM id \'{old_chr_id}\' with \'{new_chr_id}\'')
        # Escape the IDs so that characters such as '.' or '/' are taken literally by sed
        pattern = _sed_escape(old_chr_id, '\\.*[^$/')
        replacement = _sed_escape(new_chr_id, '\\&/')
        records_expr = shlex.quote(f's/^{pattern}/{replacement}/g')
        header_expr = shlex.quote(f's/contig=<ID={pattern}/contig=<ID={replacement}/g')
        # NOTE: 'sed -i' without a suffix argument is the GNU sed form; BSD/macOS sed expects -i ''
        run_command_shell_string(f'sed -i -e {records_expr} -e {header_expr} {tmp_vcf}')
    # Split multi-allelic alleles
    logging.info('Split multi-allelic sites')
    run_command_shell_string(f'{bcftools} norm -Ov -m -any {tmp_vcf} > {tmp_vcf2}')
    run_command_shell_string(f'mv {tmp_vcf2} {tmp_vcf}')
    # Add variant IDs; the dots are escaped so that bcftools does not read them as part of the tag names
    logging.info('Adding variant IDs')
    id_format = shlex.quote(r'+%CHROM\.%POS\.%REF\.%ALT')
    run_command_shell_string(f'{bcftools} annotate -Ov -x ID -I {id_format} {tmp_vcf} > {tmp_vcf2}')
    run_command_shell_string(f'mv {tmp_vcf2} {tmp_vcf}')
    if args.replace_gt:
        logging.info('Replacing GT value ./. with 0/0.\n')
        _rewrite_gt_columns(args.bcftools_path, tmp_vcf_file, [r's/\.\/\./0\/0/g'])
    # Conversion to haploid format is opt-in (--to_haploid_format); whether the input uses diploid or
    # haploid GT values is not auto-detected
    if args.to_haploid_format:
        logging.info('Converting to haploid format\n')
        _rewrite_gt_columns(args.bcftools_path, tmp_vcf_file, [
            r's/0\/0/0/g',
            r's/0\:\./0/g',
            r's/1\/1/1/g',
            r's/1\:\./1/g',
            r's/\.\/\./\./g',
            r's/\.\:\./\./g',
        ])
    # gzip and tabix
    run_command_shell_string(f'mv {tmp_vcf} {output_vcf}')
    run_command_shell_string(f'{shlex.quote(args.bgzip_path)} -c {output_vcf} > {output_vcf_gz}')
    # -f: overwrite an index left by a previous run, which would otherwise make tabix fail and leave a
    # stale index next to the new file
    run_command_shell_string(f'{shlex.quote(args.tabix_path)} -f -p vcf {output_vcf_gz}')


if __name__ == "__main__":
    _main()