Skip to content

Commit

Permalink
update algorithm __init__
Browse files Browse the repository at this point in the history
  • Loading branch information
xywawawa authored Jun 4, 2024
1 parent 604e849 commit e83a903
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 79 deletions.
27 changes: 6 additions & 21 deletions cenproteo/JDC.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"""

class JDC:
def __init__(self, ppi_file, gene_expression_file,essential_protein_file):
def __init__(self, ppi_file, gene_expression_file):
# Load PPI network data
self.ppi_file = ppi_file
try:
Expand All @@ -35,17 +35,15 @@ def __init__(self, ppi_file, gene_expression_file,essential_protein_file):
self.ecc = None
self.jaccard = None

self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

self.sorted_score = self.calculate_jdc()

def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro

def edge_clustering_coefficient(self):
if self.ecc is not None:
return self.ecc
Expand Down Expand Up @@ -129,13 +127,9 @@ def export_result_to_csv(self,save_path):
result_df = pd.DataFrame(self.sorted_jdc, columns=['Protein', 'JDC Centrality Score'])
result_df.to_csv(save_path, index=False)

def first_n_comparison(self, n):
"""
Compare the first n elements of the result list.
Args:
result (list): The list of results.
"""
def first_n_comparison(self, n, real_essential_protein_file):
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
count = 0

for protein_tuple in self.sorted_jdc[:n]:
Expand All @@ -145,12 +139,3 @@ def first_n_comparison(self, n):
print(f"There're {count} essential proteins in the top {n} predicted by algorism.")
return count

# Example usage
# ppi_file = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\DIP_data_with_combined_scores.csv"
# gene_expression = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\filtered_GE_matrix.csv"
# essential_protein = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\extracted_essential_protein.csv"
# JDC_test = JDC(ppi_file,gene_expression,essential_protein)
# sorted_score = JDC_test.calculate_jdc(5) # to calculate the top n essential protein
# JDC_test.first_n_comparison(sorted_score) # compare the essential protein between the result and the ground truth


20 changes: 4 additions & 16 deletions cenproteo/TEO.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
os.chdir(script_dir)

class TEO:
def __init__(self, ppi_file, gene_expression_file, essential_protein_file):
def __init__(self, ppi_file, gene_expression_file):
# load ppi network data
self.ppi_file = ppi_file
df_ppi = pd.read_csv(ppi_file)
Expand All @@ -25,11 +25,6 @@ def __init__(self, ppi_file, gene_expression_file, essential_protein_file):
df_GO = pd.read_csv(ppi_file, index_col = [0, 1])
self.GO_similarity_dict = self._create_GO_dict(df_GO)

# load essential protein data
self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

# the result dict using three kinds of GO term
self.TEO_BP = self.TEO('BP')
self.TEO_MF = self.TEO('MF')
Expand Down Expand Up @@ -140,7 +135,9 @@ def TEO(self, GO_term):
sorted_TEO_score = dict(sorted(TEO_score.items(), key=lambda item: item[1], reverse=True))
return sorted_TEO_score

def first_n_comparison(self, n, GO_term):
def first_n_comparison(self, n, GO_term, real_essential_protein_file):
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
# Evaluate the efficiency of the algorism by counting how many proteins with high teo score (top n) exist in the essential protein list
count = 0
if GO_term == 'BP':
Expand Down Expand Up @@ -172,12 +169,3 @@ def export_results_to_csv(self, GO_term, result_path):
for key, value in score_list:
writer.writerow([key, value])



# Example Usage
# ppi_file = r'SC_Data/processed_data/combined_data.csv'
# gene_expression_file = r'SC_Data/processed_data/filtered_GE_matrix.csv'
# essential_protein_file = r'SC_Data/processed_data/extracted_essential_protein.csv'
# teo = TEO(ppi_file, gene_expression_file, essential_protein_file)
# # teo.export_results_to_csv('BP', r'TEO_BP_result.csv')
# teo.first_n_comparison(200, 'BP')
24 changes: 6 additions & 18 deletions cenproteo/TGSO.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
os.chdir(script_dir)

class TGSO:
def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, essential_protein_file, alpha=0.3, max_iter=100, tol=10e-6):
def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, alpha=0.3, max_iter=100, tol=10e-6):
# load ppi network data
self.ppi_file = ppi_file
df_ppi = pd.read_csv(ppi_file)
Expand All @@ -30,11 +30,6 @@ def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file
self.i_score_file = i_score_file
self.i_score_dict = self._load_i_score(i_score_file)

# load essential protein data
self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

self.ADN = self.ADN()
self.CEN = self.CEN()
self.colo_sub = self.CLN()
Expand Down Expand Up @@ -288,7 +283,10 @@ def calculate_P(self):
sorted_protein_score = dict(sorted(P.items(), key=lambda item: item[1], reverse=True))
return sorted_protein_score, iter_time

def first_n_comparison(self, n):
def first_n_comparison(self, n, real_essential_protein_file):
# load essential protein data
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
count = 0
top_TGSO_score = list(self.protein_score.keys())[:n]
for ess_pro in top_TGSO_score:
Expand All @@ -309,14 +307,4 @@ def export_results_to_csv(self, file_path):
writer.writerow(['Protein', 'Score']) # Writing header
for protein, score in sorted(self.protein_score.items(), key=lambda item: item[1], reverse=True):
writer.writerow([protein, score]) # Writing each protein and its score


# Example Usage
# ppi_file = r'SC_Data/processed_data/combined_data.csv'
# gene_expression_file = r'SC_Data/processed_data/filtered_GE_matrix.csv'
# subcellular_localization_file = r'SC_Data/processed_data/yeast_compartment_knowledge_full.csv'
# essential_protein_file = r'SC_Data/processed_data/extracted_essential_protein.csv'
# i_score_file = r'SC_Data/processed_data/I_score.csv'
# tgso = TGSO(ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, essential_protein_file)
# # tgso.first_n_comparison(200)
# tgso.export_results_to_csv('TGSO_result.csv')

41 changes: 17 additions & 24 deletions cenproteo/classical_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd

class classical_algorithms:
def __init__(self, ppi_file,essential_protein_file):
def __init__(self, ppi_file):
"""
Initialize the class with a file path to a protein-protein interaction data CSV.
This method reads the CSV file and constructs a graph using NetworkX.
Expand All @@ -17,18 +17,6 @@ def __init__(self, ppi_file,essential_protein_file):
edges = df.apply(lambda row: (row['Protein A'], row['Protein B']), axis=1).tolist()
G.add_edges_from(edges)
self.G = G

self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro


#find essential protein by computing degree centrality
def DC(self):
Expand Down Expand Up @@ -113,30 +101,35 @@ def export_result_to_csv(self, sorted_result, file_name):
result_df = pd.DataFrame(sorted_result, columns=['Protein', 'Centrality Score'])
result_df.to_csv(file_name, index=False)

def first_n_comparison(self, n,result):
def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro

def first_n_comparison(self, n, result, real_essential_protein_file):
    """Count how many of the top-n predicted proteins are truly essential.

    Args:
        n (int): Number of top-ranked predictions to check.
        result (list): ``(protein_name, score)`` tuples sorted by
            centrality score, highest first.
        real_essential_protein_file (str): Path to the ground-truth
            essential-protein CSV; the protein name is expected in the
            second column.

    Returns:
        int: How many of the first ``n`` predicted proteins appear in the
        ground-truth essential-protein list.
    """
    df_essential = pd.read_csv(real_essential_protein_file)
    self.real_essential_protein_list = self._get_essential_protein(df_essential)
    # Membership tests against a set are O(1) per lookup; scanning the
    # list would be O(m) for each of the n candidates.
    essential_set = set(self.real_essential_protein_list)

    count = sum(1 for protein_name, _score in result[:n] if protein_name in essential_set)
    # "algorithm" fixes the original message's typo ("algorism").
    print(f"There are {count} essential proteins in the top {n} predicted by the algorithm.")
    return count



# Example usage
# ppi_file = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\DIP_data_with_combined_scores.csv"
# essential_protein = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\extracted_essential_protein.csv"
# class_test = classical_algorithms(ppi_file,essential_protein)
# sorted_score = class_test.DC(5) # Change different method by using different function -- class_test.NC(N)
# class_test.first_n_comparison(sorted_score) # compare the essential protein between the result and the ground truth




Expand Down

0 comments on commit e83a903

Please sign in to comment.