Skip to content

Commit

Permalink
update algorithm __init__
Browse files Browse the repository at this point in the history
  • Loading branch information
xywawawa authored Jun 4, 2024
1 parent 604e849 commit e83a903
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 79 deletions.
27 changes: 6 additions & 21 deletions cenproteo/JDC.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
"""

class JDC:
def __init__(self, ppi_file, gene_expression_file,essential_protein_file):
def __init__(self, ppi_file, gene_expression_file):
# Load PPI network data
self.ppi_file = ppi_file
try:
Expand All @@ -35,17 +35,15 @@ def __init__(self, ppi_file, gene_expression_file,essential_protein_file):
self.ecc = None
self.jaccard = None

self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

self.sorted_score = self.calculate_jdc()

def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro

def edge_clustering_coefficient(self):
if self.ecc is not None:
return self.ecc
Expand Down Expand Up @@ -129,13 +127,9 @@ def export_result_to_csv(self,save_path):
result_df = pd.DataFrame(self.sorted_jdc, columns=['Protein', 'JDC Centrality Score'])
result_df.to_csv(save_path, index=False)

def first_n_comparison(self, n):
"""
Compare the first n elements of the result list.
Args:
result (list): The list of results.
"""
def first_n_comparison(self, n, real_essential_protein_file):
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
count = 0

for protein_tuple in self.sorted_jdc[:n]:
Expand All @@ -145,12 +139,3 @@ def first_n_comparison(self, n):
print(f"There're {count} essential proteins in the top {n} predicted by algorism.")
return count

# Example usage
# ppi_file = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\DIP_data_with_combined_scores.csv"
# gene_expression = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\filtered_GE_matrix.csv"
# essential_protein = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\extracted_essential_protein.csv"
# JDC_test = JDC(ppi_file,gene_expression,essential_protein)
# sorted_score = JDC_test.calculate_jdc(5) # to calculate the top n essential protein
# JDC_test.first_n_comparison(sorted_score) # compare the essential protein between the result and the ground truth


20 changes: 4 additions & 16 deletions cenproteo/TEO.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
os.chdir(script_dir)

class TEO:
def __init__(self, ppi_file, gene_expression_file, essential_protein_file):
def __init__(self, ppi_file, gene_expression_file):
# load ppi network data
self.ppi_file = ppi_file
df_ppi = pd.read_csv(ppi_file)
Expand All @@ -25,11 +25,6 @@ def __init__(self, ppi_file, gene_expression_file, essential_protein_file):
df_GO = pd.read_csv(ppi_file, index_col = [0, 1])
self.GO_similarity_dict = self._create_GO_dict(df_GO)

# load essential protein data
self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

# the result dict using three kinds of GO term
self.TEO_BP = self.TEO('BP')
self.TEO_MF = self.TEO('MF')
Expand Down Expand Up @@ -140,7 +135,9 @@ def TEO(self, GO_term):
sorted_TEO_score = dict(sorted(TEO_score.items(), key=lambda item: item[1], reverse=True))
return sorted_TEO_score

def first_n_comparison(self, n, GO_term):
def first_n_comparison(self, n, GO_term, real_essential_protein_file):
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
# Evaluate the efficiency of the algorism by counting how many proteins with high teo score (top n) exist in the essential protein list
count = 0
if GO_term == 'BP':
Expand Down Expand Up @@ -172,12 +169,3 @@ def export_results_to_csv(self, GO_term, result_path):
for key, value in score_list:
writer.writerow([key, value])



# Example Usage
# ppi_file = r'SC_Data/processed_data/combined_data.csv'
# gene_expression_file = r'SC_Data/processed_data/filtered_GE_matrix.csv'
# essential_protein_file = r'SC_Data/processed_data/extracted_essential_protein.csv'
# teo = TEO(ppi_file, gene_expression_file, essential_protein_file)
# # teo.export_results_to_csv('BP', r'TEO_BP_result.csv')
# teo.first_n_comparison(200, 'BP')
24 changes: 6 additions & 18 deletions cenproteo/TGSO.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
os.chdir(script_dir)

class TGSO:
def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, essential_protein_file, alpha=0.3, max_iter=100, tol=10e-6):
def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, alpha=0.3, max_iter=100, tol=10e-6):
# load ppi network data
self.ppi_file = ppi_file
df_ppi = pd.read_csv(ppi_file)
Expand All @@ -30,11 +30,6 @@ def __init__(self, ppi_file, gene_expression_file, subcellular_localization_file
self.i_score_file = i_score_file
self.i_score_dict = self._load_i_score(i_score_file)

# load essential protein data
self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

self.ADN = self.ADN()
self.CEN = self.CEN()
self.colo_sub = self.CLN()
Expand Down Expand Up @@ -288,7 +283,10 @@ def calculate_P(self):
sorted_protein_score = dict(sorted(P.items(), key=lambda item: item[1], reverse=True))
return sorted_protein_score, iter_time

def first_n_comparison(self, n):
def first_n_comparison(self, n, real_essential_protein_file):
# load essential protein data
df_essential = pd.read_csv(real_essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)
count = 0
top_TGSO_score = list(self.protein_score.keys())[:n]
for ess_pro in top_TGSO_score:
Expand All @@ -309,14 +307,4 @@ def export_results_to_csv(self, file_path):
writer.writerow(['Protein', 'Score']) # Writing header
for protein, score in sorted(self.protein_score.items(), key=lambda item: item[1], reverse=True):
writer.writerow([protein, score]) # Writing each protein and its score


# Example Usage
# ppi_file = r'SC_Data/processed_data/combined_data.csv'
# gene_expression_file = r'SC_Data/processed_data/filtered_GE_matrix.csv'
# subcellular_localization_file = r'SC_Data/processed_data/yeast_compartment_knowledge_full.csv'
# essential_protein_file = r'SC_Data/processed_data/extracted_essential_protein.csv'
# i_score_file = r'SC_Data/processed_data/I_score.csv'
# tgso = TGSO(ppi_file, gene_expression_file, subcellular_localization_file, i_score_file, essential_protein_file)
# # tgso.first_n_comparison(200)
# tgso.export_results_to_csv('TGSO_result.csv')

41 changes: 17 additions & 24 deletions cenproteo/classical_algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import pandas as pd

class classical_algorithms:
def __init__(self, ppi_file,essential_protein_file):
def __init__(self, ppi_file):
"""
Initialize the class with a file path to a protein-protein interaction data CSV.
This method reads the CSV file and constructs a graph using NetworkX.
Expand All @@ -17,18 +17,6 @@ def __init__(self, ppi_file,essential_protein_file):
edges = df.apply(lambda row: (row['Protein A'], row['Protein B']), axis=1).tolist()
G.add_edges_from(edges)
self.G = G

self.essential_protein_file = essential_protein_file
df_essential = pd.read_csv(essential_protein_file)
self.essential_protein_list = self._get_essential_protein(df_essential)

def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro


#find essential protein by computing degree centrality
def DC(self):
Expand Down Expand Up @@ -113,30 +101,35 @@ def export_result_to_csv(self, sorted_result, file_name):
result_df = pd.DataFrame(sorted_result, columns=['Protein', 'Centrality Score'])
result_df.to_csv(file_name, index=False)

def first_n_comparison(self, n,result):
def _get_essential_protein(self, df):
essential_pro = []
for _, row in df.iterrows():
pro = row.iloc[1]
essential_pro.append(pro)
return essential_pro

def first_n_comparison(self, n, result, real_essential_protein_file):
    """Count how many of the top-n predicted proteins are truly essential.

    Args:
        n (int): Number of top-ranked predictions to check.
        result (list): ``(protein_name, score)`` tuples sorted by
            centrality score, highest first.
        real_essential_protein_file (str): Path to the ground-truth
            essential-protein CSV; the protein name is expected in the
            second column.

    Returns:
        int: How many of the first ``n`` predicted proteins appear in the
        ground-truth essential-protein list.
    """
    df_essential = pd.read_csv(real_essential_protein_file)
    self.real_essential_protein_list = self._get_essential_protein(df_essential)
    # Membership tests against a set are O(1) per lookup; scanning the
    # list would be O(m) for each of the n candidates.
    essential_set = set(self.real_essential_protein_list)

    count = sum(1 for protein_name, _score in result[:n] if protein_name in essential_set)
    # "algorithm" fixes the original message's typo ("algorism").
    print(f"There are {count} essential proteins in the top {n} predicted by the algorithm.")
    return count



# Example usage
# ppi_file = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\DIP_data_with_combined_scores.csv"
# essential_protein = r"C:\Users\Administrator\Desktop\CenProteo-main\SC_Data\processed_data\extracted_essential_protein.csv"
# class_test = classical_algorithms(ppi_file,essential_protein)
# sorted_score = class_test.DC(5) # Change different method by using different function -- class_test.NC(N)
# class_test.first_n_comparison(sorted_score) # compare the essential protein between the result and the ground truth




Expand Down

0 comments on commit e83a903

Please sign in to comment.