-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathepitopes_variability.py
134 lines (107 loc) · 5.24 KB
/
epitopes_variability.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env python3
"""
Program: Variability
File: epitopes_variability.py
Version: V1.0
Date: 23.10.22
Function: returns a bar plots in png format with all the information
of the epitope variability of the Zika incursion in Brazil
Copyright: (c) Joan M. Amaya C., 2022
Author: Joan Manuel Amaya Cuesta
--------------------------------------------------------------------------
Description:
============
Script used and referenced in 2.2.4 Epitope prediction and data analysis
(Zika incursion in Brazil) of my Bioinformatics dissertation.
This program uses the csv file created with joiner_mapper but instead of
changes_over_time.csv will use the csv file with all the joint data. Will returns
a png file with barplots showing the total number of unique epitopes, recurrents
and news
data = variability("zikav.csv") call the csv file
--------------------------------------------------------------------------
Usage:
======
csv files created on joiner_mapper
--------------------------------------------------------------------------
Revision History:
=================
V0.1 Ausgust 2022
V1.0 23.10.22 Original By: JMAC
"""
#*************************************************************************
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
#******************************************************************************
class variability:
"""
This script in unique for the Zika incursion.
Will use the csv file generated with joiner_mapper
with all the joint data of the alleles, also, will remove
bad quality data and create a png file with barplots
of all the epitope variability information
Object attributes:
__init__(self, df_zika_incursion): calls the specific data for this case.
Object methods:
__init__(self, df_zika_incursion): reads the csv .
"""
def __init__(self, df_zika_incursion):
self.df = pd.read_csv(df_zika_incursion)
#in order to calculate the intermitent recurring epitopes
#each year will be separated in different dataframe objects
#and then filtered
df_2015 = self.df[self.df["Year"] == 2015]
df_2016 = self.df[self.df["Year"] == 2016]
df_2017 = self.df[self.df["Year"] == 2017]
df_2018 = self.df[self.df["Year"] == 2018]
#2016 did have lots of bad quality artefacts
#many sequences had more than 5 X
#in order to avoid a wrong count these sequences will be removed
df_2016["Peptide"] = df_2016["Peptide"].str.extract(r"([^X]{9})")
#total per year
df_total_2015 = df_2015["Peptide"].drop_duplicates().nunique()
df_total_2016 = df_2016["Peptide"].drop_duplicates().nunique()
df_total_2017 = df_2017["Peptide"].drop_duplicates().nunique()
df_total_2018 = df_2018["Peptide"].drop_duplicates().nunique()
#recurrents
#as we are calculating only the recurrent epitopes
#all duplications must be removed on each year
df_unique_2015 = df_2015["Peptide"].drop_duplicates()
df_unique_2016 = df_2016["Peptide"].drop_duplicates()
df_unique_2017 = df_2017["Peptide"].drop_duplicates()
df_unique_2018 = df_2018["Peptide"].drop_duplicates()
df_recurrent_15_16 = df_unique_2015[df_unique_2015.isin(df_unique_2016)]
df_recurrent_16_17 = df_recurrent_15_16[df_recurrent_15_16.isin(df_unique_2017)]
df_recurrent_17_18 = df_recurrent_16_17[df_recurrent_16_17.isin(df_unique_2018)]
df_recurrent_16 = df_recurrent_15_16.count()
df_recurrent_17 = df_recurrent_16_17.count()
df_recurrent_18 = df_recurrent_17_18.count()
#observed in one year only
df_only_15_16 = df_unique_2015[~df_unique_2015.isin(df_unique_2016)]
df_only_16_17 = df_only_15_16[~df_only_15_16.isin(df_unique_2017)]
df_only_17_18 = df_only_16_17[~df_only_16_17.isin(df_unique_2018)]
df_only_16 = df_only_15_16.count()
df_only_17 = df_only_16_17.count()
df_only_18 = df_only_17_18.count()
#dataframes for visualisation
df_total = pd.DataFrame({"Total" : [df_total_2015, df_total_2016,
df_total_2017, df_total_2018]})
df_recurrents = pd.DataFrame({"Recurrents" : [0, df_recurrent_16, df_recurrent_17,
df_recurrent_18]})
df_news = pd.DataFrame({"New" : [0, df_only_16, df_only_17, df_only_18]})
df_years = pd.DataFrame({"Year" : [2015, 2016, 2017, 2018]})
df_epitope_counts = pd.concat([df_years, df_total, df_recurrents, df_news], axis = 1)
df_epitope_variability = pd.melt(df_epitope_counts, id_vars = ["Year"],
var_name = "Analysis",
value_name = "Epitopes")
x_size = 16
y_size = 7
plt.figure(figsize = (x_size, y_size))
ax = sns.barplot(x="Year", y="Epitopes", data=df_epitope_variability,
palette = "viridis", hue = 'Analysis')
plt.title("Epitopes variability", fontsize = 20)
plt.xlabel("Year", fontsize = 15)
plt.ylabel("Number of epitopes", fontsize = 15)
plt.legend(fontsize=15)
plt.savefig("epitope_variability")