-
Notifications
You must be signed in to change notification settings - Fork 0
/
EDA_DRKG_compounds_names.py
105 lines (89 loc) · 3.99 KB
/
EDA_DRKG_compounds_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# ------------------------------------------------------------------------------------------------------
# Script: EDA_DRKG_compounds_names.py
# Author: Sebastian Ayala Ruano
# Date: 28-10-2021
# Description: This script performs the exploratory data analysis and obtains the IDs
# and data sources of the compounds in the DRKG
# Version: 1.0
# License: MIT License
# Usage: python EDA_DRKG_compounds_names.py
# Dependencies: See the README.md file for details on how to install them
# References: https://github.com/sayalaruano/DengueDrugRep/blob/main/Scripts/EDA_DRKG_compounds_names.py
# ------------------------------------------------------------------------------------------------------
#%%
# Import libraries
import pandas as pd
import csv
import re
import matplotlib.pyplot as plt
import numpy as np
#%%
# Load the DRKG compound table. The first tab-separated field of each line is the
# entity; all remaining fields are collapsed into one space-separated "Source" string.
file_path = "Data/DRKG/compounds_DRKG.tsv"
# newline='' is what the csv module asks for, so that it handles line endings itself
with open(file_path, 'r', newline='') as file:
    # Create a csv.reader object with tab delimiter
    tsv_reader = csv.reader(file, delimiter='\t')
    # csv.reader yields an empty list for a blank line; skip those rows instead of
    # failing with an IndexError on row[0]
    rows = [[row[0], ' '.join(row[1:])] for row in tsv_reader if row]
# Every row of the file is treated as data; the column names are supplied here
compounds_DRKG = pd.DataFrame(rows, columns=["Entity", "Source"])
#%%
# Create a column with the compound id: the part of the entity after the "::" separator
compounds_DRKG['compound_id'] = compounds_DRKG['Entity'].str.split('::').str[1]

# The CHEMBL, DrugBank and nmrshiftdb2 entries have the ID directly after the :: symbol,
# so they are recognised by how the ID starts (checked in this order)
_DIRECT_ID_PREFIXES = (
    ('CHEMBL', "CHEMBL"),
    ('DB', "DrugBank"),
    ('nmrshiftdb2', "nmrshiftdb2"),
)
# The rest of the entries have the name of the database and then the ID separated by a
# colon; the database name is taken as the first run of letters in the ID
_DATABASE_NAME_RE = re.compile(r'[:\s]*([A-Za-z]+)')


def _infer_data_source(compound_id):
    """Return the data source name for one DRKG compound id.

    Args:
        compound_id: The text that follows "::" in the entity string.

    Returns:
        "CHEMBL", "DrugBank" or "nmrshiftdb2" for ids starting with the matching
        prefix, otherwise the first run of ASCII letters found in the id.

    Raises:
        ValueError: If compound_id is not a string (for example, the entity had no
            "::" separator) or contains no letters to take a database name from.
    """
    if not isinstance(compound_id, str):
        raise ValueError(f"Entity without a compound id after '::': {compound_id!r}")
    for prefix, source in _DIRECT_ID_PREFIXES:
        if compound_id.startswith(prefix):
            return source
    match = _DATABASE_NAME_RE.search(compound_id)
    if match is None:
        raise ValueError(f"Cannot infer the data source of compound id {compound_id!r}")
    return match.group(1)


# Build the whole column in one pass. This avoids writing to the DataFrame while
# iterating over it with iterrows(), and creates the column even for an empty table.
compounds_DRKG['data_source'] = compounds_DRKG['compound_id'].map(_infer_data_source)
# Unify the CHEBI identifiers
compounds_DRKG['data_source'] = compounds_DRKG['data_source'].replace('chebi', 'CHEBI')
#%%
# Add a column with IDs without the data source: the text between the first and the
# second ":" of compound_id. IDs that contain no ":" get a missing value (NaN).
# The vectorized string accessor replaces a row-by-row loop that wrote to the DataFrame
# while iterating over it, and it creates the column even when no ID contains a ":".
compounds_DRKG['compound_id_short'] = compounds_DRKG['compound_id'].str.split(':').str[1]
#%%
# Write the annotated compounds table to disk as a CSV file, leaving out the index
compounds_DRKG.to_csv(
    'Data/DRKG/Compounds_DRKG_datasource.csv',
    index=False,
)
# %%
# Collect the distinct data source names found in the table
unique_sources = compounds_DRKG['data_source'].unique().tolist()
# Append the "_id" suffix to every name and hold the result as a one-column DataFrame
db_names = pd.DataFrame([source + "_id" for source in unique_sources])
# Save the names one per line, with neither index nor header
db_names.to_csv(
    'Data/DRKG/Names_datasources_compounds_DRKG.csv',
    index=False,
    header=False,
)
# %%
# Count how many compounds belong to each data source
database_counts = compounds_DRKG['data_source'].value_counts()
# Pick one viridis colour per database, sampled evenly between 0.2 and 0.8 of the map
num_databases = len(database_counts)
colors = plt.cm.viridis(np.linspace(0.2, 0.8, num_databases))
# Draw one bar per database on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(database_counts.index, database_counts.values, color=colors, edgecolor='black')
# Use a logarithmic scale for the y-axis
ax.set_yscale('log')
# Title and axis labels
ax.set_title('Distribution of Compounds per Database')
ax.set_xlabel('Database')
ax.set_ylabel('Log(Number of Compounds)')
# Rotate x-axis labels for better readability
plt.xticks(rotation=45, ha='right')
# Fit the layout, export the plot as a png file and display it
fig.tight_layout()
plt.savefig('img/Compounds_DRKG_distribution.png', dpi=300)
plt.show()
# %%