-
Notifications
You must be signed in to change notification settings - Fork 2
/
finerad_input.py
executable file
·183 lines (147 loc) · 5.9 KB
/
finerad_input.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
'''
The program takes as input the .haplotypes.tsv file from Stacks, the.alleles.loci
from ipyrad, or the .alleles from pyrad to parse the haplotypes (phased SNPs) of
each sample in the dataset and convert it into a haplotype matrix ready for
analysis with fineRADstructure (http://cichlid.gurdon.cam.ac.uk/fineRADstructure.html)
fineRADstructure is a more sensistive method than Structure or Admixture because uses
the entire haplotype information instead of a single randomly sampled SNP
Type python finerad_input.py -h to show the help.
'''
__author__ = "Edgardo M. Ortiz"
__version__ = "1.0"
__email__ = "e.ortiz.v@gmail.com"
__date__ = "2017-10-23"
import sys
import argparse
def stacks_to_finerad(filename, outfile, minsample):
output = open(outfile, "w")
locus_num = 0
with open(filename) as stacks_haps:
for line in stacks_haps:
# Write header of output file with only sample names
if line.startswith("Catalog"):
header = line.strip("\n").split("\t")[2:]
output.write("\t".join(header)+"\n")
# Skip locus if it is invariable
elif "consensus" in line:
continue
# Process if locus is variable
else:
genotypes = line.strip("\n").replace("-","").split("\t")[2:]
# Skip locus if any sample has more than two alleles
if any(g.count("/") > 1 for g in genotypes):
continue
else:
# Write row of haplotypes to output if locus contains enough samples
if len(genotypes) - genotypes.count("") >= minsample:
output.write("\t".join(genotypes)+"\n")
locus_num += 1
output.close()
print(str(locus_num)+" loci for "+str(len(genotypes))+" samples written to "+outfile)
def pyrad_to_finerad(filename, outfile, minsample):
haplotypes = {}
# First pass to the file to get all sample names
with open(filename) as pyrad_alleles:
for line in pyrad_alleles:
if not line.strip() == "":
if not line.startswith("/"):
sample = line.split()[0].strip(">")[:-2] # Sample name does not include last two characters
if sample not in haplotypes:
haplotypes[sample] = []
# Second pass to the file to parse haplotypes
with open(filename) as pyrad_alleles:
locus = []
locus_num = 0
for line in pyrad_alleles:
if not line.strip() == "":
if not line.startswith("/"):
locus.append(line.strip("\n"))
else:
# Skip if locus doesn't contain enough samples
if len(locus)/2 >= minsample: # Every sample has two lines
snp_raw_coords = []
for pos in range(len(line.split("|")[0])):
if line[pos] in ["*","-"]:
snp_raw_coords.append(pos)
# Skip locus if it is invariable
if snp_raw_coords == []:
locus = []
# Process if locus is variable
else:
# But first check that SNPs (columns) don't contain deletions, or more than 50% Ns
snp_coords = []
for pos in snp_raw_coords:
snp = ""
for allele in locus:
snp += allele[pos]
if snp.count("-") == 0:
if snp.count("N") <= len(locus)/4:
snp_coords.append(pos)
# Add an empty cell to each sample in haplotypes
for sample in haplotypes:
haplotypes[sample].append("")
# Fill the last empty cells with the haplotypes
for allele in locus:
sample = allele.split()[0].strip(">")[:-2]
hap = ""
hap = "".join([hap+allele[c] for c in snp_coords])
# Don't add haplotype if it is more than 50% Ns
if hap.count("N") >= len(hap)/2:
hap = ""
# Add haplotypes to samples
if haplotypes[sample][-1] == "":
haplotypes[sample][-1] = hap
else:
if haplotypes[sample][-1] != hap:
haplotypes[sample][-1] = haplotypes[sample][-1]+"/"+hap
locus_num += 1
locus = []
else:
locus = []
# Remove samples without data
for sample in haplotypes.keys():
if haplotypes[sample].count("") == len(haplotypes[sample]):
del haplotypes[sample]
# Write header
output = open(outfile, "w")
output.write("\t".join(sorted(haplotypes.keys()))+"\n")
# Write genotypes
final_locus_num = 0
for l in range(locus_num):
genotypes = []
for sample in sorted(haplotypes.keys()):
genotypes.append(haplotypes[sample][l])
# Final check for minsample
if len(genotypes) - genotypes.count("") >= minsample:
output.write("\t".join(genotypes)+"\n")
final_locus_num += 1
output.close()
print(str(final_locus_num)+" loci for "+str(len(genotypes))+" samples written to "+outfile)
def main():
parser = argparse.ArgumentParser(description="Converts haplotype data from Stacks, pyrad or ipyrad into a haplotype matrix for analysis with fineRADstructure (http://cichlid.gurdon.cam.ac.uk/fineRADstructure.html)")
parser.add_argument("-i", "--input", action="store", dest="filename", required=True,
help="Name of the haplotypes.tsv file from Stacks, the .alleles file from pyrad, or the .alleles.loci file from ipyrad to be converted.")
parser.add_argument("-t", "--type", action="store", dest="data_type", default="",
help="Source of the input file, options: stacks, pyrad, ipyrad, default=guess")
parser.add_argument("-n", "--minsample", action="store", type=int, default=2,
help="Minimum number of samples in a locus, default=2")
parser.add_argument("-o", "--output", action="store", dest="outfile", default="",
help="Name for the output file, default=input name + .minsample + .finerad")
args = parser.parse_args()
filename = args.filename
data_type = args.data_type
minsample = args.minsample
if args.outfile == "":
outfile = filename+".min"+str(minsample)+".finerad"
else:
outfile = args.outfile
if "pyrad" in data_type.lower() or "alleles" in filename:
pyrad_to_finerad(filename, outfile, minsample)
elif "stacks" in data_type.lower() or "haplotypes.tsv" in filename:
stacks_to_finerad(filename, outfile, minsample)
else:
print("Unknown input format, use -h for help")
if __name__ == "__main__":
main()