-
Notifications
You must be signed in to change notification settings - Fork 1
/
paml_parser.py
executable file
·163 lines (147 loc) · 6.68 KB
/
paml_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/python3
'''
Loops through a directory containing multiple paml output files, with a naming scheme
of clusterID.XXXXXX (for example 2356.paml_out). Returns two text files:
1. A working file of Model output. (see below)
Cluster Model np lnL Number_sites_>95 Number_sites_>99
551 Model 0 -3309.989574
551 Model 1 -3294.362074
551 Model 2 -3280.01743 3 1
551 Model 3 -3279.439089
551 Model 7 -3294.376692
551 Model 8 -3280.047899 7 2
552 Model 0 -1038.052331
552 Model 1 -1009.671555
552 Model 2 -1008.532431 0 0
552 Model 3 -1008.532431
552 Model 7 -1009.761659
552 Model 8 -1008.532415 0 0
2. Summarized text file for M3vM0, M2vM1, and M8vM7 showing LRT values, p_values,
and number of BEB significant sites in the M2 and M8 model. (see below)
Cluster M3vM0_lrt M3vM0_p M2vM1_lrt M2vM1_p M2_num_sites_>95 M2_num_sites_>99 M8vM7_lrt M8vM7_p M8_num_sites_>95 M8_num_sites_>99
551 61.1 0.0 28.69 0.0 3 1 28.66 0.0 7 2
552 59.04 0.0 2.28 0.32 0 0 2.46 0.293 0 0
553 -0.0 1.0 -0.0 1.0 0 0 -0.0 1.0 0 0
554 0.0 1.0 0.0 1.0 0 0 -0.0 1.0 0 0
555 -0.0 1.0 0.0 1.0 0 0 -0.0 1.0 0 0
556 0.0 1.0 0.0 1.0 0 0 0.01 0.997 0 0
557 36.73 0.0 9.83 0.007 2 0 10.27 0.006 2 0
558 0.02 1.0 0.0 0.999 0 0 -0.0 1.0 0 0
559 140.39 0.0 31.93 0.0 5 4 34.95 0.0 8 5
560 0.0 1.0 0.0 0.999 0 0 -0.0 1.0 0 0
'''
import os, sys, re, argparse
from collections import defaultdict
from scipy import stats
import numpy as np
# Directory the script was launched from; both output files are written here.
rundir = os.getcwd()
class MyFormatter(argparse.RawTextHelpFormatter):
    '''Raw-text argparse help formatter created with max_help_position=48.'''

    def __init__(self, prog):
        # Only the help-column limit is customised; everything else is
        # inherited from RawTextHelpFormatter.
        super().__init__(prog, max_help_position=48)
# Command-line interface: an input directory and an output base name, both
# mandatory. The description literal spans two lines on purpose; its embedded
# newline is part of the help text shown to the user.
parser = argparse.ArgumentParser(
    formatter_class=MyFormatter,
    usage='./%(prog)s [options] -d directory -o output_basename',
    epilog="""Written by Stephen A. Wyka (2020)""",
    description=''' Loops through a directory of paml output files and extracts
LRT test statistics and number of BEB significant sites for M2 and M8 model.''',
)
# metavar='' keeps the value placeholder out of the help listing.
parser.add_argument('-d', '--directory', metavar='', help='Input directory', required=True)
parser.add_argument('-o', '--output', metavar='', help='Base name of output files', required=True)
# NOTE: arguments are parsed at module level, i.e. as soon as the file is run
# or imported.
args = parser.parse_args()
# Model labels used as dictionary keys for every cluster's results.
model_list = ['Model ' + str(number) for number in (0, 1, 2, 3, 7, 8)]
# Patterns are compiled once; they are applied to every line of every file.
MODEL_HEADER_RE = re.compile(r'(Model) (\d)(:)')
LNL_RE = re.compile(r'(lnL)(.+np:) (\d+)(\):)\s(.+)\s([+-])')
BEB_SITE_RE = re.compile(r'(\d+) (\w) (.+) (.+) (\+\-) (.+)')
BEB_HEADER = 'Bayes Empirical Bayes (BEB) analysis'
NAN = float('nan')

# (alternative model, null model, degrees of freedom, report BEB site counts?)
LRT_COMPARISONS = (
    ('Model 3', 'Model 0', 4, False),
    ('Model 2', 'Model 1', 2, True),
    ('Model 8', 'Model 7', 2, True),
)
LRT_HEADER = 'Cluster\tM3vM0_lrt\tM3vM0_p\tM2vM1_lrt\tM2vM1_p\tM2_num_sites_>95\tM2_num_sites_>99\tM8vM7_lrt\tM8vM7_p\tM8_num_sites_>95\tM8_num_sites_>99'


def cluster_sort_key(filename):
    '''Sort key for input files: numeric cluster IDs first, in numeric order.

    The text before the first '.' is the cluster ID. A prefix that is not an
    integer no longer aborts the run; such files sort after the numeric ones,
    alphabetically.
    '''
    prefix = filename.split('.')[0]
    try:
        return (0, int(prefix), '')
    except ValueError:
        return (1, 0, prefix)


def split_by_model(lines):
    '''Group lines under the most recent 'Model N:' header seen before them.

    Header lines themselves are dropped, as is any text that precedes the
    first header. Returns a dict mapping 'Model N' to its list of lines; a
    header that is not followed by any line produces no entry.
    '''
    sections = defaultdict(list)
    current = None
    for line in lines:
        header = MODEL_HEADER_RE.search(line)
        if header is not None:
            current = 'Model ' + header.group(2)
            continue
        # Skipping the preamble here (rather than collecting it and popping it
        # afterwards) also works for files that have no preamble at all.
        if current is not None:
            sections[current].append(line)
    return sections


def count_beb_sites(lines):
    '''Count BEB site lines flagged with '*' and with '**'.

    Returns (n_95, n_99); a '**' site is counted in both totals.
    '''
    n_95 = 0
    n_99 = 0
    for line in lines:
        site = BEB_SITE_RE.search(line)
        if site is None:
            continue
        flagged = site.group(3)
        if '**' in flagged:
            n_99 += 1
            n_95 += 1
        elif '*' in flagged:
            n_95 += 1
    return n_95, n_99


def parse_model_section(lines):
    '''Extract the values reported for one model.

    Returns a list holding the lnL of every likelihood line found and, for
    every BEB header found, the two site counts computed from that header to
    the end of the section -- normally [lnL] or [lnL, n_95, n_99].
    Lines that contain 'lnL' but do not have the likelihood-line layout are
    ignored instead of raising AttributeError.
    '''
    values = []
    for index, line in enumerate(lines):
        if 'lnL' in line:
            match = LNL_RE.search(line)
            if match is None:
                continue
            values.append(float(match.group(5).strip()))
        elif BEB_HEADER in line:
            values.extend(count_beb_sites(lines[index:]))
    return values


def parse_paml_lines(lines):
    '''Parse the lines of one PAML output into {model label: values}.'''
    return {model: parse_model_section(section)
            for model, section in split_by_model(lines).items()}


def likelihood_ratio_test(lnl_alt, lnl_null, df):
    '''Return (LRT statistic, chi-square p-value) for two log-likelihoods.

    The survival function is used instead of 1 - cdf because it keeps
    precision for very small p-values. NaN inputs give NaN outputs.
    '''
    statistic = 2 * (lnl_alt - lnl_null)
    return statistic, float(stats.chi2.sf(statistic, df))


def beb_counts(values):
    '''Return (n_95, n_99) from a model's value list, or ('nan', 'nan').

    'nan' is reported when the lnL is missing or when the model has no BEB
    counts, which previously raised IndexError.
    '''
    if len(values) >= 3 and not np.isnan(values[0]):
        return values[1], values[2]
    return 'nan', 'nan'


def lrt_row(cluster, results):
    '''Build one row of the LRT summary table for a cluster.

    results maps model labels to value lists; missing or empty entries are
    treated as NaN. The row follows the column order of LRT_HEADER.
    '''
    def first_value(model):
        values = results.get(model)
        return values[0] if values else NAN

    row = [cluster]
    for alt, null, df, with_sites in LRT_COMPARISONS:
        statistic, p_value = likelihood_ratio_test(first_value(alt), first_value(null), df)
        row += [round(statistic, 2), round(p_value, 3)]
        if with_sites:
            row += beb_counts(results.get(alt) or [])
    return row


if __name__ == "__main__":
    input_dir = os.path.abspath(args.directory)
    files = sorted((f for f in os.listdir(input_dir) if '.paml_out' in f),
                   key=cluster_sort_key)

    # {Cluster_ID : {Model 0 : [lnL], Model 1 : [lnL], Model 8 : [lnL, sites>95, sites>99]}}
    codeML_dict = {}
    for f in files:
        cluster = f.split('.')[0]
        with open(os.path.join(input_dir, f), 'r') as f_in:
            parsed = parse_paml_lines(f_in)
        # Start from every expected model so that absent ones are reported too.
        results = {m: [] for m in model_list}
        results.update(parsed)
        codeML_dict[cluster] = {m: (v if v else [NAN]) for m, v in results.items()}

    # NOTE(review): '_parsred_paml.txt' looks like a typo for 'parsed'; the
    # name is kept so existing downstream consumers keep working.
    overall_output = os.path.join(rundir, args.output+'_parsred_paml.txt')
    lrt_output = os.path.join(rundir, args.output+'_lrt_stats.txt')
    with open(overall_output, 'w') as over_out, open(lrt_output, 'w') as lrt_out:
        # NOTE(review): the header names an 'np' column that is never written
        # in the rows below; left unchanged to keep the file format stable.
        over_out.write('Cluster\tModel\tnp\tlnL\tNumber_sites_>95\tNumber_sites_>99\n')
        lrt_out.write(LRT_HEADER + '\n')
        for cluster, results in codeML_dict.items():
            for model, values in results.items():
                over_out.write('\t'.join([cluster, model] + [str(v) for v in values]) + '\n')
            lrt_out.write('\t'.join(str(x) for x in lrt_row(cluster, results)) + '\n')