Skip to content

Commit

Permalink
#1165 #1158 Modify the namespace of gene_regulatory_network and…
Browse files Browse the repository at this point in the history
… protein_protein_interactions to include _new, fix the populate-data scripts to add timestamp and source to the gene and protein tables, and add an annotation type to both the GRN and PPI network and interactions tables
  • Loading branch information
ntran18 committed Feb 10, 2025
1 parent 8f856be commit 2abe82e
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 134 deletions.
4 changes: 2 additions & 2 deletions database2/network-database/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
class Constants:
# database namespace
GRN_DATABASE_NAMESPACE = "gene_regulatory_network"
PPI_DATABASE_NAMESPACE = "protein_protein_interactions"
GRN_DATABASE_NAMESPACE = "gene_regulatory_network_new"
PPI_DATABASE_NAMESPACE = "protein_protein_interactions_new"

# network types
GRN_NETWORK_MODE = "grn"
Expand Down
14 changes: 11 additions & 3 deletions database2/network-database/data_services/data_fetcher_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,14 @@ def fetch_data(self):
query.add_constraint("regulatoryRegions.strainBackground", "=", "S288c", code="C")

rows_data = []
print("Query length: ", len(query.rows()))
networks = set()
for row in query.rows():
network = (row["secondaryIdentifier"], row["regulatoryRegions.regulator.secondaryIdentifier"], row["regulatoryRegions.annotationType"])
if network in networks:
continue
else:
networks.add(network)
rows_data.append({
"regulatorStandardName": row["regulatoryRegions.regulator.symbol"],
"regulatorSystematicName": row["regulatoryRegions.regulator.secondaryIdentifier"],
Expand All @@ -93,10 +100,11 @@ def fetch_data(self):
"pubMedId": row["regulatoryRegions.publications.pubMedId"],
"datasource": row["regulatoryRegions.datasource"],
"annotationType": row["regulatoryRegions.annotationType"]
})

})
df = pd.DataFrame(rows_data)
print("Data fetched successfully")
print("Number of duplicated networks: ", len(query.rows()) - len(networks))
print("====================================================================")
return df

Expand Down Expand Up @@ -130,7 +138,7 @@ def fetch_data(self):
count = 0
print("Query length: ", len(query.rows()))
for row in query.rows():
interaction = (row["secondaryIdentifier"], row["interactions.participant2.secondaryIdentifier"])
interaction = (row["secondaryIdentifier"], row["interactions.participant2.secondaryIdentifier"], row["interactions.details.annotationType"])
if interaction in interactions:
count += 1
continue
Expand Down
92 changes: 36 additions & 56 deletions database2/network-database/data_services/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
import pandas as pd

class Processor(ABC):
def __init__(self):
def __init__(self, formatted_time_stamp=None):
self.species = "Saccharomyces cerevisiae"
self.taxon_id = "559292"
self.source = "AllianceMine - Saccharomyces Genome Database"
self.source_display_name = "AllianceMine - SGD"
self.formatted_time_stamp = formatted_time_stamp

@abstractmethod
def process_data(self, data):
pass

class GeneProcessor(Processor):
def __init__(self):
super().__init__()
def __init__(self, formatted_time_stamp):
super().__init__(formatted_time_stamp)

def process_data(self, data, regulators, proteins):
print("Processing data from GeneProcessor")
Expand All @@ -28,19 +29,17 @@ def process_data(self, data, regulators, proteins):
processed_data = []
for _, row in combine_genes_df.iterrows():
gene_id = row['systematicName']
display_gene_id = row['standardName']
species = self.species
taxon_id = self.taxon_id

# Check if the gene_id (systematicName) matches any of the regulators
regulator = gene_id in regulators["regulator_gene_id"].values

processed_data.append({
"gene_id": gene_id,
"display_gene_id": display_gene_id,
"species": species,
"taxon_id": taxon_id,
"regulator": regulator
"display_gene_id": row['standardName'],
"species": self.species,
"taxon_id": self.taxon_id,
"regulator": regulator,
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand Down Expand Up @@ -71,27 +70,21 @@ def _combine_with_protein_genes(self, genes, proteins):

class GeneRegulatoryNetworkProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from GeneRegulatoryNetworkProcessor")

processed_data = []

for _, row in data.iterrows():
regulator_gene_id = row['regulatorSystematicName']
target_gene_id = row['targetSystematicName']
taxon_id = self.taxon_id
time_stamp = self.formatted_time_stamp
source = self.source

processed_data.append({
"regulator_gene_id": regulator_gene_id,
"target_gene_id": target_gene_id,
"taxon_id": taxon_id,
"time_stamp": time_stamp,
"source": source
"regulator_gene_id": row['regulatorSystematicName'],
"target_gene_id": row['targetSystematicName'],
"taxon_id": self.taxon_id,
"annotation_type": row['annotationType'],
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -100,28 +93,23 @@ def process_data(self, data):
return processed_df

class ProteinProcessor(Processor):
def __init__(self):
super().__init__()
def __init__(self, formatted_time_stamp):
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from ProteinProcessor")

processed_data = []
for _, row in data.iterrows():
standard_name = row['proteinStandardName']
gene_systematic_name = row['proteinSystematicName']
length = row['length']
molecular_weight = row['molecularWeight']
pi = row['pI']
taxon_id = self.taxon_id

processed_data.append({
"standard_name": standard_name,
"gene_systematic_name": gene_systematic_name,
"length": length,
"molecular_weight": molecular_weight,
"pi": pi,
"taxon_id": taxon_id
"standard_name": row['proteinStandardName'],
"gene_systematic_name": row['proteinSystematicName'],
"length": row['length'],
"molecular_weight": row['molecularWeight'],
"pi": row['pI'],
"taxon_id": self.taxon_id,
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -131,27 +119,20 @@ def process_data(self, data):

class ProteinProteinInteractionsProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from ProteinProteinInteractionsProcessor")
processed_data = []
for _, row in data.iterrows():
protein_1 = row['protein1StandardName']
protein_2 = row['protein2StandardName']
interaction_detection_methods_identifier = row['interactionDetectionMethodsIdentifier']
experiment_name = row['experimentName']
time_stamp = self.formatted_time_stamp
source = self.source

processed_data.append({
"protein1": protein_1,
"protein2": protein_2,
"interaction_detection_methods_identifier": interaction_detection_methods_identifier,
"experiment_name": experiment_name,
"time_stamp": time_stamp,
"source": source
"protein1": row['protein1StandardName'],
"protein2": row['protein2StandardName'],
"interaction_detection_methods_identifier": row['interactionDetectionMethodsIdentifier'],
"annotation_type": row['annotationType'],
"experiment_name": row['experimentName'],
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -161,8 +142,7 @@ def process_data(self, data):

class SourceProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self):
print("Processing data from SourceProcessor")
Expand Down
14 changes: 7 additions & 7 deletions database2/network-database/database_services/populator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def process_file(self, conn, cursor, data_filepath, copy_statement):

# Determine if we need to drop the last column (PPI network type)
if self.network_mode == Constants.PPI_NETWORK_MODE and data_filepath == Constants.MISSING_PPI_GENE_DATA_FILEPATH:
print("Dropping the last column from the input data...")
print("Dropping the regulator column from the input data...")
processed_rows = []

with open(data_filepath, 'r') as f:
for line in f:
columns = line.strip().split('\t')
processed_row = columns[:-1]
processed_row = columns[:4] + columns[5:]
processed_rows.append('\t'.join(processed_row))

from io import StringIO
Expand Down Expand Up @@ -83,9 +83,9 @@ def __init__(self, db_url, network_mode):

def get_copy_statement(self):
if self.network_mode == Constants.GRN_NETWORK_MODE:
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, regulator) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, regulator, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
elif self.network_mode == Constants.PPI_NETWORK_MODE:
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
else:
raise ValueError(f"Unknown network type: {self.network_mode}")

Expand All @@ -95,23 +95,23 @@ def __init__(self, db_url):
self.filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class GeneRegulatoryNetworkDataPopulator(DataPopulator):
def __init__(self, db_url):
super().__init__(db_url)
self.filepath = Constants.GENE_REGULATORY_NETWORK_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.GRN_DATABASE_NAMESPACE}.network (regulator_gene_id, target_gene_id, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.GRN_DATABASE_NAMESPACE}.network (regulator_gene_id, target_gene_id, taxon_id, annotation_type, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class ProteinProteinInteractionsDataPopulator(DataPopulator):
def __init__(self, db_url):
super().__init__(db_url)
self.filepath = Constants.PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.physical_interactions (protein1, protein2, interaction_detection_methods_identifier, experiment_name, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.physical_interactions (protein1, protein2, interaction_detection_methods_identifier, annotation_type, experiment_name, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class SourceDataPopulator(DataPopulator):
def __init__(self, db_url, network_mode):
Expand Down
18 changes: 6 additions & 12 deletions database2/network-database/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ def load_data(network_option):
grnDataGenerator = GeneRegulatoryNetworkDataGenerator(GeneRegulatoryNetworkFetcherService(), GeneRegulatoryNetworkProcessor(formatted_time_stamp), save_service)

if network_option in ['all', Constants.PPI_NETWORK_MODE]:
proteinDataGenerator = ProteinDataGenerator(ProteinFetcherService(), ProteinProcessor(), save_service)
proteinDataGenerator = ProteinDataGenerator(ProteinFetcherService(), ProteinProcessor(formatted_time_stamp), save_service)
ProteinProteinInteractionsDataGenerator(ProteinProteinInteractionsFetcherService(), ProteinProteinInteractionsProcessor(formatted_time_stamp), save_service)

if network_option == Constants.GRN_NETWORK_MODE:
GeneDataGenerator(GeneFetcherService(), GeneProcessor(), save_service, grnDataGenerator.data)
GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data)
else:
GeneDataGenerator(GeneFetcherService(), GeneProcessor(), save_service, grnDataGenerator.data if grnDataGenerator else None, proteinDataGenerator.data)
GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data if grnDataGenerator else None, proteinDataGenerator.data)

SourceDataGenerator(SourceProcessor(formatted_time_stamp), save_service)

Expand All @@ -40,30 +40,24 @@ def adding_data_to_databse(network_option, db_url):
print("Adding data to database.................................................")
if network_option in ['all', Constants.GRN_NETWORK_MODE]:
network_mode = Constants.GRN_NETWORK_MODE
SourceDataPopulator(db_url, network_mode).populate_data()
GeneDataPopulator(db_url, network_mode).populate_data()
GeneUpdater(db_url, network_mode).update_data()

SourceDataPopulator(db_url, network_mode).populate_data()

GeneRegulatoryNetworkDataPopulator(db_url).populate_data()



if network_option in ['all', Constants.PPI_NETWORK_MODE]:
network_mode = Constants.PPI_NETWORK_MODE
SourceDataPopulator(db_url, network_mode).populate_data()

GeneDataPopulator(db_url, network_mode).populate_data()
GeneUpdater(db_url, network_mode).update_data()

ProteinDataPopulator(db_url).populate_data()
ProteinProteinInteractionsUpdater(db_url).update_data()
ProteinUpdater(db_url).update_data()

SourceDataPopulator(db_url, network_mode).populate_data()

ProteinProteinInteractionsDataPopulator(db_url).populate_data()



def main(network_option, db_url):
load_data(network_option)
filter_data(network_option, db_url)
Expand Down
Loading

0 comments on commit 2abe82e

Please sign in to comment.