# data_parser.py

#!/usr/bin/env python3
"""
Program:    Data Parser
File:       data_parser.py

Version:    V3.3
Date:       23.10.22
Function:   returns csv files from a NetMHCpan 4.1 output file.

Copyright:  (c) Joan M. Amaya C., 2022
Author:     Joan Manuel Amaya Cuesta

--------------------------------------------------------------------------
Description:
============
Script used and referenced in the methods section of my Bioinformatics
dissertation. This script was used in all three parts of the methods.
The Parser class takes the raw data from NetMHCpan 4.1 and parses it by
the %Rank, as "This measure is not affected by inherent bias of certain
molecules towards higher or lower mean predicted affinities.
Strong binders are defined as having %rank<0.5, and weak binders with %rank<2"
[17].
This script writes csv files with the data parsed for the number of alleles
that the user wants; the only options are 1, 4 and 5 alleles.
All csv files created will be used for downstream analysis.
data = Parser("xxx.xls"): create the instance with the output from NetMHCpan
data.one_allele(): call to parse and write one csv file
data.four_alleles(): call to parse and write four csv files
data.five_alleles(): call to parse and write five csv files

--------------------------------------------------------------------------
Usage:
======
excel files created from NetMHCpan 4.1

--------------------------------------------------------------------------
Revision History:
=================
V2.0   july 2022
V3.0   24.08.28
V3.3   23.10.22 Original   By: JMAC
"""

#*************************************************************************
# Import libraries

import pandas as pd
import re

#*************************************************************************

class Parser:
    """
    General parser used for all three sections of my Bioinformatics
    dissertation.
    It only works with the xls output from NetMHCpan - 4.1.
    The table is split per allele, filtered by EL_Rank <= 2 (threshold
    recommended by the NetMHCpan-4.1 developers), sorted by EL_Rank and
    written to csv files in the current working directory. Those csv
    files are used for downstream analysis.
    The user chooses the method that matches the number of alleles
    selected for the prediction (1, 4 or 5).

    Object attributes:
    df (DataFrame): the whole NetMHCpan - 4.1 table as read by
    pandas.read_table; its first data row holds the real column names.

    alleles (list of str): allele names found in the first line of the
    file, with the colon removed (e.g. HLA-A02:01 -> HLA-A0201). They are
    used as csv file names by four_alleles() and five_alleles().

    Object methods:
    __init__(self, netmhcpan_output): reads the NetMHCpan - 4.1 output
    into a DataFrame and extracts the allele names.

    one_allele(self, parsed_file=None): parses the first allele and
    writes one csv file; the file name is asked interactively when
    parsed_file is not given.

    four_alleles(self): parses the first four alleles and writes four
    csv files named after the alleles.

    five_alleles(self): same as before but for five alleles.

    """

    # layout of the table as sliced by the parsing methods: three columns
    # shared by every allele followed by six columns for each allele
    _COMMON_COLUMNS = 3
    _COLUMNS_PER_ALLELE = 6

    # with more than one allele, the allele name must be extracted for
    # each one of the alleles (it becomes the csv file name)
    _ALLELE_PATTERN = re.compile(r'(HLA-\w*):(\w*)')

    def __init__(self, netmhcpan_output):
        """
        netmhcpan_output (str): path of the NetMHCpan - 4.1 xls file.
        """
        self.df = pd.read_table(netmhcpan_output, low_memory=False)

        # the allele names are read once here and the file is closed
        # straight away, so no handle is left open and the parsing
        # methods can be called as many times as needed
        with open(netmhcpan_output) as handle:
            first_line = handle.readline()

        self.alleles = [
            match.group(1) + match.group(2)
            for match in self._ALLELE_PATTERN.finditer(first_line)
        ]

    def _parse_allele(self, position, selected, dtypes):
        """
        Returns the parsed DataFrame of one allele.

        position (int): zero-based position of the allele in the table.
        selected (list of str): columns kept in the result, in order.
        dtypes (dict): column name -> type used to convert that column
        before filtering.
        """
        start = self._COMMON_COLUMNS + position * self._COLUMNS_PER_ALLELE
        stop = start + self._COLUMNS_PER_ALLELE

        # common columns joined with the columns unique to this allele
        common = self.df.iloc[:, 0:self._COMMON_COLUMNS]
        table = common.join(self.df.iloc[:, start:stop])

        # here the script selects the row containing the columns name
        table.columns = table.iloc[0]
        table = table.drop(0)

        # as only one peptide length is analysed, i.e. 9aa,
        # those columns are not necessary
        table = table.drop(columns=["core", "icore"])

        # data type object changed in order to help
        # to manipulate the data later on
        for column, dtype in dtypes.items():
            table[column] = table[column].astype(dtype)

        # columns of relevance selected and threshold
        # recommended by NetMHCpan-4.1 developers
        table = table[selected].query("EL_Rank <= 2")
        table = table.sort_values(by=["EL_Rank"])
        return table.reset_index(drop=True)

    def _write_alleles(self, count):
        """
        Parses the first `count` alleles and writes one csv file per
        allele, named after the allele.

        Raises IndexError when fewer than `count` allele names were found
        in the first line of the file; nothing is written in that case.
        """
        if len(self.alleles) < count:
            raise IndexError(
                f"{count} alleles requested but only {len(self.alleles)} "
                "allele name(s) found in the first line of the file"
            )

        for position in range(count):
            table = self._parse_allele(
                position,
                ["Pos", "Peptide", "ID", "EL_Rank", "BA_Rank"],
                {"Pos": int, "EL_Rank": float, "BA_Rank": float},
            )
            # saves the parsed data sorted by EL Rank%
            table.to_csv(f"{self.alleles[position]}.csv", index=False)

    def one_allele(self, parsed_file=None):
        """
        Parses the first allele and writes <parsed_file>.csv with the
        columns Peptide, ID and EL_Rank.

        parsed_file (str, optional): file name without extension; when it
        is not given the user is asked for it on the terminal.
        """
        table = self._parse_allele(
            0, ["Peptide", "ID", "EL_Rank"], {"EL_Rank": float}
        )

        # final parsed file with customised name
        if parsed_file is None:
            parsed_file = input("Please enter file name: ")
        table.to_csv(f"{parsed_file}.csv", index=False)

    def four_alleles(self):
        """
        Parses the first four alleles and writes four csv files.
        """
        self._write_alleles(4)

    def five_alleles(self):
        """
        Parses the first five alleles and writes five csv files.
        """
        self._write_alleles(5)