-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvalidation.py
239 lines (176 loc) · 12.1 KB
/
validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
'''
ALBA MALAGON MARQUEZ
Final Degree Project: 'STRATEGY FOR ACCURATE CNV DETECTION AND ML ALGORITHM FOR CLASSIFYING NGS VARIANTS'
Script to validate CNV calls between ExomeDepth, arrays and the developed method (CNV_predictions.py)
USAGE: python3 validation.py
'''
from CNV_libraries_variables import *
starttime = timeit.default_timer()
# R E A D I N G F I L E S ----------------------------------------------------------------------------------
#these are the CNV calls between Exome Depth and Arrays (exomedepth_and_arrays.py)
truePdata = pd.read_csv('{}/truePdata.csv'.format(os.getcwd()), sep="\t")
falsePdata = pd.read_csv('{}/falsePdata.csv'.format(os.getcwd()), sep="\t")
segduppseudodata = pd.read_csv('{}/segduppseudodata.csv'.format(os.getcwd()), sep="\t")
#these are the CNV calls after implementing our strategy (CNV_validation_method.py)
truePdataM = pd.read_csv('{}/truePdataMETHOD.csv'.format(os.getcwd()), sep="\t")
falsePdataM = pd.read_csv('{}/falsePdataMETHOD.csv'.format(os.getcwd()), sep="\t")
noinfoM = pd.read_csv('{}/NOINFOdataMETHOD.csv'.format(os.getcwd()), sep="\t")
segdup_pseudoM = pd.read_csv('{}/superdups_pseudoMETHOD.csv'.format(os.getcwd()), sep="\t")
segduppseudodata.drop(['Unnamed: 0'], axis=1, inplace=True)
truePdata.drop(['Unnamed: 0'], axis=1, inplace=True)
falsePdata.drop(['Unnamed: 0'], axis=1, inplace=True)
truePdataM.drop(['Unnamed: 0'], axis=1, inplace=True)
falsePdataM.drop(['Unnamed: 0'], axis=1, inplace=True)
noinfoM.drop(['Unnamed: 0'], axis=1, inplace=True)
segdup_pseudoM.drop(['Unnamed: 0'], axis=1, inplace=True)
#only considering deletions
truePdata_indexes = truePdata[(truePdata['type'] != 'deletion')]
falsePdata_indexes = falsePdata[(falsePdata['type'] != 'deletion')]
segduppseudodata_indexes = segduppseudodata[(segduppseudodata['type'] != 'deletion')]
truePdataM_indexes = truePdataM[(truePdataM['type'] != 'deletion')]
falsePdataM_indexes = falsePdataM[(falsePdataM['type'] != 'deletion')]
noinfoM_indexes = noinfoM[(noinfoM['type'] != 'deletion')]
segdup_pseudoM_indexes = segdup_pseudoM[(segdup_pseudoM['type'] != 'deletion')]
# Step 2 : remove deletion indexes
truePdata = truePdata.drop(truePdata_indexes.index, axis=0)
falsePdata = falsePdata.drop(falsePdata_indexes.index, axis=0)
segduppseudodata = segduppseudodata.drop(segduppseudodata_indexes.index, axis=0)
truePdataM = truePdataM.drop(truePdataM_indexes.index, axis=0)
falsePdataM = falsePdataM.drop(falsePdataM_indexes.index, axis=0)
noinfoM = noinfoM.drop(noinfoM_indexes.index, axis=0)
segdup_pseudoM = segdup_pseudoM.drop(segdup_pseudoM_indexes.index, axis=0)
#printing heads of all datasets
color_print('Coincide ED and ARRAYS:', color='magenta', end='')
print(len(truePdata))
print(truePdata.head())
color_print('NO coincide ED and ARRAYS:', color='red', end='')
print(len(falsePdata))
print(falsePdata.head())
color_print('Compatible ED and METHOD:', color='green',end='')
print(len(truePdataM))
print(truePdataM.head())
color_print('NO compatible ED and METHOD:', color='cyan',end='')
print(len(falsePdataM))
print(falsePdataM.head())
color_print('NO info ED and METHOD:', color='yellow',end='')
print(len(noinfoM))
print(noinfoM.head())
color_print('SegDups and pseudogenes arrays:', color='blue',end='')
print(len(segduppseudodata))
print(segduppseudodata.head())
color_print('SegDups and pseudogenes METHOD:', color='white',end='')
print(len(segdup_pseudoM))
print(segdup_pseudoM.head())
#merging the coincident calls between ExomeDepth and arrays with each of the possibilities:
#with compatibility cases between ExomeDepth and the method
COINCIDEIXENtp_OK = truePdata.merge(truePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENtp_OK.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENtp_OK.csv', header=True, sep="\t")
#with INcompatibility cases between ExomeDepth and the method
COINCIDEIXENtp_KO = truePdata.merge(falsePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENtp_KO.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENtp_KO.csv', header=True, sep="\t")
#with no information cases between ExomeDepth and the method
COINCIDEIXENtp_NI = truePdata.merge(noinfoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENtp_NI.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENtp_NI.csv', header=True, sep="\t")
#with segmental duplication (or pseudogenes) cases between ExomeDepth and the method
COINCIDEIXENtp_SD = truePdata.merge(segdup_pseudoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENtp_SD.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENtp_SD.csv', header=True, sep="\t")
#merging the NO coincident calls between ExomeDepth and arrays with each of the possibilities:
#with compatibility cases between ExomeDepth and the method
COINCIDEIXENfp_OK = falsePdata.merge(truePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner',indicator=True)
COINCIDEIXENfp_OK.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENfp_OK.csv', header=True, sep="\t")
#with INcompatibility cases between ExomeDepth and the method
COINCIDEIXENfp_KO = falsePdata.merge(falsePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner',indicator=True)
COINCIDEIXENfp_KO.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENfp_KO.csv', header=True, sep="\t")
#with no information cases between ExomeDepth and the method
COINCIDEIXENfp_NI = falsePdata.merge(noinfoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner',indicator=True)
COINCIDEIXENfp_NI.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENfp_NI.csv', header=True, sep="\t")
#with segmental duplication (or pseudogenes) cases between ExomeDepth and the method
COINCIDEIXENfp_SD = falsePdata.merge(segdup_pseudoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner',indicator=True)
COINCIDEIXENfp_SD.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENfp_SD.csv', header=True, sep="\t")
#merging the segmental duplication calls between ExomeDepth and arrays with each of the possibilities:
#with segmental duplication (or pseudogenes) cases between ExomeDepth and the method
COINCIDEIXENsd_SD = segduppseudodata.merge(segdup_pseudoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENsd_SD.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENsd_SD.csv', header=True, sep="\t")
#with no information cases between ExomeDepth and the method
COINCIDEIXENsd_ni = segduppseudodata.merge(noinfoM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENsd_ni.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENsd_ni.csv', header=True, sep="\t")
#with compatibility cases between ExomeDepth and the method
COINCIDEIXENsd_tp = segduppseudodata.merge(truePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENsd_tp.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENsd_tp.csv', header=True, sep="\t")
#with INcompatibility cases between ExomeDepth and the method
COINCIDEIXENsd_fp = segduppseudodata.merge(falsePdataM, on=['chr', 'start', 'end', 'width', 'type', 'nexons', 'BF',
'reads.expected', 'reads.observed', 'reads.ratio', 'SegDup',
'exons.hg19', 'RefSeqGenes', 'Sample', 'PROBES', 'COVERAGE',
'array_type'], how = 'inner' ,indicator=True)
COINCIDEIXENsd_fp.to_csv('/Users/alba/Desktop/QGEN/cnv/COINCIDEIXENsd_fp.csv', header=True, sep="\t")
color_print('\n \n \n------------------------------------------------------------------------------------------------------------------------------------------------------------\n \n \n'.format(), color='red')
color_print('Coincide ED and ARRAY + compatible METHOD: --> TP: {}'.format(len(COINCIDEIXENtp_OK)), color='magenta')
print(COINCIDEIXENtp_OK.head())
color_print('Coincide ED and ARRAY + NO compatible METHOD: --> FP: {}'.format(len(COINCIDEIXENtp_KO)), color='magenta')
print(COINCIDEIXENtp_KO.head())
color_print('Coincide ED and ARRAY + NO INFO METHOD: {}'.format(len(COINCIDEIXENtp_NI)), color='magenta')
print(COINCIDEIXENtp_NI.head())
color_print('No coincide ED and ARRAY + compatible METHOD: --> FP: {}'.format(len(COINCIDEIXENfp_OK)), color='cyan')
print(COINCIDEIXENfp_OK.head())
color_print('No coincide ED and ARRAY + NO compatible METHOD: --> TP: {}'.format(len(COINCIDEIXENfp_KO)), color='cyan')
print(COINCIDEIXENfp_KO.head())
color_print('No coincide ED and ARRAY + NO INFO METHOD: {}'.format(len(COINCIDEIXENfp_NI)), color='cyan')
print(COINCIDEIXENfp_NI.head())
color_print('SEGDUP or PSEUDOGENE METHOD: {}'.format(len(COINCIDEIXENsd_SD)), color='cyan')
print(COINCIDEIXENsd_SD.head())
print('\n')
print('Total number of CNVs from arrays: 247')
print('Total number of CNVs from Exome Depth covered: 992') #992 o 570
color_print('\n \n \n------------------------------------------------------------------------------------------------------------------------------------------------------------\n \n \n'.format(), color='red')
t = PrettyTable(["call coincident exome depth and d'arrays", 'method OK', 'method KO', 'method NI', 'method SD'])
t.add_row([len(truePdata), len(COINCIDEIXENtp_OK), len(COINCIDEIXENtp_KO), len(COINCIDEIXENtp_NI), len(COINCIDEIXENtp_SD)])
print(t)
print('\n\n')
t2 = PrettyTable(["call NO coincident exome depth and d'arrays", 'method OK', 'method KO', 'method NI', 'method SD'])
t2.add_row([len(falsePdata), len(COINCIDEIXENfp_OK), len(COINCIDEIXENfp_KO), len(COINCIDEIXENfp_NI), len(COINCIDEIXENfp_SD)])
print(t2)
print('\n\n')
t3 = PrettyTable(["total segmental duplications", 'method SD', 'method NI', 'method OK', 'method KO'])
t3.add_row([len(segduppseudodata), len(COINCIDEIXENsd_SD), len(COINCIDEIXENsd_ni), len(COINCIDEIXENsd_tp), len(COINCIDEIXENsd_fp) ])
print(t3)
print('\n')
color_print('\nONLY CONSIDERING DELETIONS:\n'.format(), color='magenta')
color_print('Total TP: {}'.format(len(COINCIDEIXENtp_OK)+len(COINCIDEIXENfp_KO)), color='green')
color_print('Total FP: {}'.format(len(COINCIDEIXENtp_KO)+len(COINCIDEIXENfp_OK)), color='red')
color_print('Total NI: {}'.format(len(COINCIDEIXENtp_NI)+len(COINCIDEIXENfp_NI)), color='yellow')
color_print('Total SD: {}'.format(len(COINCIDEIXENtp_SD)+len(COINCIDEIXENfp_SD)+len(COINCIDEIXENsd_SD)), color='white')
stoptime = timeit.default_timer()
print('\n\nTime:',stoptime - starttime)