-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_parser.py
449 lines (335 loc) · 15.1 KB
/
data_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
#!/usr/bin/env python3
"""
Program: Data Parser
File: data_parser.py
Version: V3.3
Date: 23.10.22
Function: returns csv files from a NetMHCpan 4.1 output file.
Copyright: (c) Joan M. Amaya C., 2022
Author: Joan Manuel Amaya Cuesta
--------------------------------------------------------------------------
Description:
============
Script used and referenced in the methods section of my Bioinformatics
dissertation; it was used in all three parts of the methods.
The Parser class takes the raw data from NetMHCpan 4.1 and filters it by
%Rank because "This measure is not affected by inherent bias of certain
molecules towards higher or lower mean predicted affinities.
Strong binders are defined as having %rank<0.5, and weak binders with %rank<2"
[17]
This script writes csv files with the data parsed for the number of alleles
that the user wants; the only options are 1, 4 and 5 alleles.
All csv files created will be used for downstream analysis.
data = Parser("xxx.xls"): create the instance from the NetMHCpan output file
data.one_allele(): parse one allele and write one csv file
data.four_alleles(): parse four alleles and write four csv files
data.five_alleles(): parse five alleles and write five csv files
--------------------------------------------------------------------------
Usage:
======
excel files created from NetMHCpan 4.1
--------------------------------------------------------------------------
Revision History:
=================
V2.0 july 2022
V3.0 24.08.28
V3.3 23.10.22 Original By: JMAC
"""
#*************************************************************************
# Import libraries
import pandas as pd
import re
#*************************************************************************
class Parser:
    """Parse a NetMHCpan 4.1 xls output file into per-allele csv files.

    This is the general parser used for all three sections of my
    Bioinformatics dissertation. It only works with the output of
    NetMHCpan 4.1. The table is read so that the first three columns are
    shared by every allele and each allele owns the next block of six
    columns. For every requested allele the peptides whose EL %Rank is at
    most 2 (the threshold recommended by the NetMHCpan 4.1 developers) are
    kept, sorted by EL %Rank and written to a csv file that is used for
    downstream analysis.

    The user chooses the method that matches the number of alleles that
    were selected for prediction.

    Object attributes:
        netmhcpan_output: path of the NetMHCpan 4.1 output xls file.
        df (DataFrame): the raw table as read by pandas; its first row
            holds the real column names.
        alleles (list of str): allele names found on the first line of
            the file, with the colon removed (e.g. "HLA-A02:01" becomes
            "HLA-A0201"). They are used as csv file names.

    Object methods:
        __init__(self, netmhcpan_output): reads the NetMHCpan 4.1 output
            into a DataFrame and extracts the allele names.
        one_allele(self): parses the first allele and writes one csv file
            whose name is asked interactively.
        four_alleles(self): parses the first four alleles and writes four
            csv files named after the alleles.
        five_alleles(self): same as before but for five alleles.
    """

    # Number of leading columns shared by every allele.
    _COMMON_COLUMNS = 3
    # Number of consecutive columns that belong to a single allele.
    _COLUMNS_PER_ALLELE = 6
    # Largest EL %Rank that is still reported as a binder.
    _MAX_EL_RANK = 2
    # Allele name as written on the first line of the file; the two groups
    # are glued together to drop the colon, which is awkward in file names.
    _ALLELE_NAME = re.compile(r'(HLA-\w*):(\w*)')

    def __init__(self, netmhcpan_output):
        """Read the NetMHCpan output table and the allele names.

        Args:
            netmhcpan_output: path of the NetMHCpan 4.1 output xls file.
        """
        self.netmhcpan_output = netmhcpan_output
        self.df = pd.read_table(netmhcpan_output, low_memory=False)
        # The allele names are read once, here, and the file is closed
        # straight away. Keeping an open handle and consuming it lazily
        # leaked the handle and made a second call to four_alleles() or
        # five_alleles() read the wrong line.
        with open(netmhcpan_output, encoding="utf-8") as handle:
            first_line = handle.readline()
        self.alleles = [
            match.group(1) + match.group(2)
            for match in self._ALLELE_NAME.finditer(first_line)
        ]

    def _binders(self, allele_index, columns, dtypes):
        """Return the binders of one allele, lowest EL %Rank first.

        Args:
            allele_index (int): zero-based position of the allele in the
                file.
            columns (list of str): names of the columns to keep, in output
                order; must include "EL_Rank".
            dtypes (dict): column name -> type used to convert the text
                values before filtering and sorting.

        Returns:
            DataFrame: rows with EL_Rank <= 2, sorted by EL_Rank, with a
            fresh index.

        Raises:
            KeyError: if one of the requested columns is not in the file.
        """
        first = self._COMMON_COLUMNS + allele_index * self._COLUMNS_PER_ALLELE
        shared = self.df.iloc[:, :self._COMMON_COLUMNS]
        per_allele = self.df.iloc[:, first:first + self._COLUMNS_PER_ALLELE]
        table = shared.join(per_allele)
        # pandas used the allele line as header, so the real column names
        # are in the first row: promote them and drop that row.
        table.columns = table.iloc[0].tolist()
        table = table.iloc[1:]
        # Everything was read as text because of the column-name row;
        # convert what is needed for numeric comparison and sorting.
        table = table[columns].astype(dtypes)
        table = table[table["EL_Rank"] <= self._MAX_EL_RANK]
        table = table.sort_values(by=["EL_Rank"])
        return table.reset_index(drop=True)

    def _write_alleles(self, count):
        """Write one csv file per allele for the first `count` alleles.

        Raises:
            IndexError: if the file names fewer than `count` alleles. The
                check is done before anything is written so that no
                partial set of csv files is produced.
        """
        if len(self.alleles) < count:
            raise IndexError(
                f"{count} alleles requested but only {len(self.alleles)} "
                f"allele names found in {self.netmhcpan_output}"
            )
        for index, allele in enumerate(self.alleles[:count]):
            binders = self._binders(
                index,
                ["Pos", "Peptide", "ID", "EL_Rank", "BA_Rank"],
                {"Pos": int, "EL_Rank": float, "BA_Rank": float},
            )
            binders.to_csv(f"{allele}.csv", index=False)

    def one_allele(self):
        """Parse the first allele and write it to a user-named csv file.

        Only the Peptide, ID and EL_Rank columns are written. The file
        name (without extension) is asked on standard input.
        """
        binders = self._binders(
            0, ["Peptide", "ID", "EL_Rank"], {"EL_Rank": float}
        )
        # final parsed file with customised name
        parsed_file = input("Please enter file name: ")
        binders.to_csv(f"{parsed_file}.csv", index=False)

    def four_alleles(self):
        """Parse the first four alleles; write one csv file per allele.

        Each file is named after its allele and holds the Pos, Peptide,
        ID, EL_Rank and BA_Rank columns.
        """
        self._write_alleles(4)

    def five_alleles(self):
        """Parse the first five alleles; write one csv file per allele.

        Each file is named after its allele and holds the Pos, Peptide,
        ID, EL_Rank and BA_Rank columns.
        """
        self._write_alleles(5)