-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathTSVtoVCF.py
executable file
·113 lines (85 loc) · 3.27 KB
/
TSVtoVCF.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import sys
import itertools
def parseArguments():
"""
Parse Arguments, sets default values, type of arguments, and help documentation
:return:
Parse arguments used for file in and file out
"""
# making the parser
parser = argparse.ArgumentParser()
# adding different parse arguments
parser.add_argument("-d","--data",
help = "file to read",
type = argparse.FileType('r'),
default = sys.stdin)
parser.add_argument("-o","--data_out",
help = "file to write",
type = argparse.FileType('w'),
default = sys.stdout)
args = parser.parse_args()
return args
def readTSV(file):
"""
Takes TSV file as input, generates dictionary for each line (each variant) with name, chr, start, end, len as keys
:return:
"""
for tupleLine in enumerate(file):
if tupleLine[0] == 0:
continue
else:
dict={}
line = tupleLine[1]
line = line.rstrip()
values = line.split('\t')# outputs a list called values, filled with strings that correspond to the tab separated text
dict['Name'] = values[0]
dict['Chr'] = values[1]
dict['Start'] = values[2]
dict['End'] = values[3]
dict['Len'] = values[4]
yield dict
def convert(dict):
"""
Input dictionary with TSV headers as keys
Output values from dictionary keys into list
:param dict:
:return:
"""
variantList = []
variantList.append(dict['Chr'])
variantList.append(dict['Start'])
variantList.append(dict['Name'][2:len(dict['Name'])-1]) # remove quotes and @ sign
variantList.append('N')
variantList.append('<INV>')
variantList.append('.')
variantList.append('.')
variantList.append("END={};SVTYPE=INV".format(dict['End']))
return variantList
def vcfWriter(variantGenerator, file):
"""Inputting a generator that produces lists, with strings corresponding to each VCF column as items in the list, and file to write
Outputting file with 1. Header 2. Each list becomes a VCF record with tab separators"""
# vcfFile = open(file, "w")
file.write("""##fileformat=VCFv4.2
##INFO=<ID=END,Number=1,Type=Integer,Description=“End position of the variant described in this record”>
##INFO=<ID=SVTYPE,Number=1,Type=String,Description=“Type of structural variant”>
##ALT=<ID=IV,Description=“Inversion”>
#CHROM POS ID REF ALT QUAL FILTER INFO""")
# for element in variantList:
file.write(str('\n'))
for item in variantGenerator:
# file.write(item)
# file.write(''.join(str(item)))
file.write(item[0]+'\t'+item[1]+'\t'+item[2]+'\t'+item[3]+'\t'+item[4]+'\t'+item[5]+'\t'+item[6]+'\t'+item[7])
file.write(str('\n'))
# now I would like to catch all of those dictionaries in a list....
def main():
args = parseArguments()
variantSpecs = itertools.imap(convert, readTSV(args.data)) # generators can act like lists
vcfWriter(variantSpecs, args.data_out)
# variantList = convert(dict)
# print(variantList)
if __name__ == '__main__':
main()