Skip to content

Commit

Permalink
#1165 #1158 Modify the namespace of gene_regulatory_network and…
Browse files Browse the repository at this point in the history
… protein_protein_interactions to include _new, fix the populate-data scripts to add timestamp and source to the gene and protein tables, and add an annotation type to both the GRN and PPI network and interactions tables
  • Loading branch information
ntran18 committed Feb 10, 2025
1 parent 8f856be commit 2abe82e
Show file tree
Hide file tree
Showing 10 changed files with 103 additions and 134 deletions.
4 changes: 2 additions & 2 deletions database2/network-database/constants.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
class Constants:
# database namespace
GRN_DATABASE_NAMESPACE = "gene_regulatory_network"
PPI_DATABASE_NAMESPACE = "protein_protein_interactions"
GRN_DATABASE_NAMESPACE = "gene_regulatory_network_new"
PPI_DATABASE_NAMESPACE = "protein_protein_interactions_new"

# network types
GRN_NETWORK_MODE = "grn"
Expand Down
14 changes: 11 additions & 3 deletions database2/network-database/data_services/data_fetcher_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,14 @@ def fetch_data(self):
query.add_constraint("regulatoryRegions.strainBackground", "=", "S288c", code="C")

rows_data = []
print("Query length: ", len(query.rows()))
networks = set()
for row in query.rows():
network = (row["secondaryIdentifier"], row["regulatoryRegions.regulator.secondaryIdentifier"], row["regulatoryRegions.annotationType"])
if network in networks:
continue
else:
networks.add(network)
rows_data.append({
"regulatorStandardName": row["regulatoryRegions.regulator.symbol"],
"regulatorSystematicName": row["regulatoryRegions.regulator.secondaryIdentifier"],
Expand All @@ -93,10 +100,11 @@ def fetch_data(self):
"pubMedId": row["regulatoryRegions.publications.pubMedId"],
"datasource": row["regulatoryRegions.datasource"],
"annotationType": row["regulatoryRegions.annotationType"]
})

})
df = pd.DataFrame(rows_data)
print("Data fetched successfully")
print("Number of duplicated networks: ", len(query.rows()) - len(networks))
print("====================================================================")
return df

Expand Down Expand Up @@ -130,7 +138,7 @@ def fetch_data(self):
count = 0
print("Query length: ", len(query.rows()))
for row in query.rows():
interaction = (row["secondaryIdentifier"], row["interactions.participant2.secondaryIdentifier"])
interaction = (row["secondaryIdentifier"], row["interactions.participant2.secondaryIdentifier"], row["interactions.details.annotationType"])
if interaction in interactions:
count += 1
continue
Expand Down
92 changes: 36 additions & 56 deletions database2/network-database/data_services/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,20 @@
import pandas as pd

class Processor(ABC):
def __init__(self):
def __init__(self, formatted_time_stamp=None):
self.species = "Saccharomyces cerevisiae"
self.taxon_id = "559292"
self.source = "AllianceMine - Saccharomyces Genome Database"
self.source_display_name = "AllianceMine - SGD"
self.formatted_time_stamp = formatted_time_stamp

@abstractmethod
def process_data(self, data):
pass

class GeneProcessor(Processor):
def __init__(self):
super().__init__()
def __init__(self, formatted_time_stamp):
super().__init__(formatted_time_stamp)

def process_data(self, data, regulators, proteins):
print("Processing data from GeneProcessor")
Expand All @@ -28,19 +29,17 @@ def process_data(self, data, regulators, proteins):
processed_data = []
for _, row in combine_genes_df.iterrows():
gene_id = row['systematicName']
display_gene_id = row['standardName']
species = self.species
taxon_id = self.taxon_id

# Check if the gene_id (systematicName) matches any of the regulators
regulator = gene_id in regulators["regulator_gene_id"].values

processed_data.append({
"gene_id": gene_id,
"display_gene_id": display_gene_id,
"species": species,
"taxon_id": taxon_id,
"regulator": regulator
"display_gene_id": row['standardName'],
"species": self.species,
"taxon_id": self.taxon_id,
"regulator": regulator,
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand Down Expand Up @@ -71,27 +70,21 @@ def _combine_with_protein_genes(self, genes, proteins):

class GeneRegulatoryNetworkProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from GeneRegulatoryNetworkProcessor")

processed_data = []

for _, row in data.iterrows():
regulator_gene_id = row['regulatorSystematicName']
target_gene_id = row['targetSystematicName']
taxon_id = self.taxon_id
time_stamp = self.formatted_time_stamp
source = self.source

processed_data.append({
"regulator_gene_id": regulator_gene_id,
"target_gene_id": target_gene_id,
"taxon_id": taxon_id,
"time_stamp": time_stamp,
"source": source
"regulator_gene_id": row['regulatorSystematicName'],
"target_gene_id": row['targetSystematicName'],
"taxon_id": self.taxon_id,
"annotation_type": row['annotationType'],
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -100,28 +93,23 @@ def process_data(self, data):
return processed_df

class ProteinProcessor(Processor):
def __init__(self):
super().__init__()
def __init__(self, formatted_time_stamp):
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from ProteinProcessor")

processed_data = []
for _, row in data.iterrows():
standard_name = row['proteinStandardName']
gene_systematic_name = row['proteinSystematicName']
length = row['length']
molecular_weight = row['molecularWeight']
pi = row['pI']
taxon_id = self.taxon_id

processed_data.append({
"standard_name": standard_name,
"gene_systematic_name": gene_systematic_name,
"length": length,
"molecular_weight": molecular_weight,
"pi": pi,
"taxon_id": taxon_id
"standard_name": row['proteinStandardName'],
"gene_systematic_name": row['proteinSystematicName'],
"length": row['length'],
"molecular_weight": row['molecularWeight'],
"pi": row['pI'],
"taxon_id": self.taxon_id,
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -131,27 +119,20 @@ def process_data(self, data):

class ProteinProteinInteractionsProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self, data):
print("Processing data from ProteinProteinInteractionsProcessor")
processed_data = []
for _, row in data.iterrows():
protein_1 = row['protein1StandardName']
protein_2 = row['protein2StandardName']
interaction_detection_methods_identifier = row['interactionDetectionMethodsIdentifier']
experiment_name = row['experimentName']
time_stamp = self.formatted_time_stamp
source = self.source

processed_data.append({
"protein1": protein_1,
"protein2": protein_2,
"interaction_detection_methods_identifier": interaction_detection_methods_identifier,
"experiment_name": experiment_name,
"time_stamp": time_stamp,
"source": source
"protein1": row['protein1StandardName'],
"protein2": row['protein2StandardName'],
"interaction_detection_methods_identifier": row['interactionDetectionMethodsIdentifier'],
"annotation_type": row['annotationType'],
"experiment_name": row['experimentName'],
"time_stamp": self.formatted_time_stamp,
"source": self.source
})

processed_df = pd.DataFrame(processed_data)
Expand All @@ -161,8 +142,7 @@ def process_data(self, data):

class SourceProcessor(Processor):
def __init__(self, formatted_time_stamp):
self.formatted_time_stamp = formatted_time_stamp
super().__init__()
super().__init__(formatted_time_stamp)

def process_data(self):
print("Processing data from SourceProcessor")
Expand Down
14 changes: 7 additions & 7 deletions database2/network-database/database_services/populator.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,13 +33,13 @@ def process_file(self, conn, cursor, data_filepath, copy_statement):

# Determine if we need to drop the last column (PPI network type)
if self.network_mode == Constants.PPI_NETWORK_MODE and data_filepath == Constants.MISSING_PPI_GENE_DATA_FILEPATH:
print("Dropping the last column from the input data...")
print("Dropping the regulator column from the input data...")
processed_rows = []

with open(data_filepath, 'r') as f:
for line in f:
columns = line.strip().split('\t')
processed_row = columns[:-1]
processed_row = columns[:4] + columns[5:]
processed_rows.append('\t'.join(processed_row))

from io import StringIO
Expand Down Expand Up @@ -83,9 +83,9 @@ def __init__(self, db_url, network_mode):

def get_copy_statement(self):
if self.network_mode == Constants.GRN_NETWORK_MODE:
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, regulator) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, regulator, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
elif self.network_mode == Constants.PPI_NETWORK_MODE:
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {self.database_namespace}.gene (gene_id, display_gene_id, species, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
else:
raise ValueError(f"Unknown network type: {self.network_mode}")

Expand All @@ -95,23 +95,23 @@ def __init__(self, db_url):
self.filepath = Constants.MISSING_PROTEIN_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.protein (standard_name, gene_systematic_name, length, molecular_weight, PI, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class GeneRegulatoryNetworkDataPopulator(DataPopulator):
def __init__(self, db_url):
super().__init__(db_url)
self.filepath = Constants.GENE_REGULATORY_NETWORK_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.GRN_DATABASE_NAMESPACE}.network (regulator_gene_id, target_gene_id, taxon_id, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.GRN_DATABASE_NAMESPACE}.network (regulator_gene_id, target_gene_id, taxon_id, annotation_type, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class ProteinProteinInteractionsDataPopulator(DataPopulator):
def __init__(self, db_url):
super().__init__(db_url)
self.filepath = Constants.PROTEIN_PROTEIN_INTERACTIONS_DATA_FILEPATH

def get_copy_statement(self):
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.physical_interactions (protein1, protein2, interaction_detection_methods_identifier, experiment_name, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"
return f"COPY {Constants.PPI_DATABASE_NAMESPACE}.physical_interactions (protein1, protein2, interaction_detection_methods_identifier, annotation_type, experiment_name, time_stamp, source) FROM stdin WITH CSV DELIMITER E'\\t' HEADER;"

class SourceDataPopulator(DataPopulator):
def __init__(self, db_url, network_mode):
Expand Down
18 changes: 6 additions & 12 deletions database2/network-database/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,13 +17,13 @@ def load_data(network_option):
grnDataGenerator = GeneRegulatoryNetworkDataGenerator(GeneRegulatoryNetworkFetcherService(), GeneRegulatoryNetworkProcessor(formatted_time_stamp), save_service)

if network_option in ['all', Constants.PPI_NETWORK_MODE]:
proteinDataGenerator = ProteinDataGenerator(ProteinFetcherService(), ProteinProcessor(), save_service)
proteinDataGenerator = ProteinDataGenerator(ProteinFetcherService(), ProteinProcessor(formatted_time_stamp), save_service)
ProteinProteinInteractionsDataGenerator(ProteinProteinInteractionsFetcherService(), ProteinProteinInteractionsProcessor(formatted_time_stamp), save_service)

if network_option == Constants.GRN_NETWORK_MODE:
GeneDataGenerator(GeneFetcherService(), GeneProcessor(), save_service, grnDataGenerator.data)
GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data)
else:
GeneDataGenerator(GeneFetcherService(), GeneProcessor(), save_service, grnDataGenerator.data if grnDataGenerator else None, proteinDataGenerator.data)
GeneDataGenerator(GeneFetcherService(), GeneProcessor(formatted_time_stamp), save_service, grnDataGenerator.data if grnDataGenerator else None, proteinDataGenerator.data)

SourceDataGenerator(SourceProcessor(formatted_time_stamp), save_service)

Expand All @@ -40,30 +40,24 @@ def adding_data_to_databse(network_option, db_url):
print("Adding data to database.................................................")
if network_option in ['all', Constants.GRN_NETWORK_MODE]:
network_mode = Constants.GRN_NETWORK_MODE
SourceDataPopulator(db_url, network_mode).populate_data()
GeneDataPopulator(db_url, network_mode).populate_data()
GeneUpdater(db_url, network_mode).update_data()

SourceDataPopulator(db_url, network_mode).populate_data()

GeneRegulatoryNetworkDataPopulator(db_url).populate_data()



if network_option in ['all', Constants.PPI_NETWORK_MODE]:
network_mode = Constants.PPI_NETWORK_MODE
SourceDataPopulator(db_url, network_mode).populate_data()

GeneDataPopulator(db_url, network_mode).populate_data()
GeneUpdater(db_url, network_mode).update_data()

ProteinDataPopulator(db_url).populate_data()
ProteinProteinInteractionsUpdater(db_url).update_data()
ProteinUpdater(db_url).update_data()

SourceDataPopulator(db_url, network_mode).populate_data()

ProteinProteinInteractionsDataPopulator(db_url).populate_data()



def main(network_option, db_url):
load_data(network_option)
filter_data(network_option, db_url)
Expand Down
Loading

0 comments on commit 2abe82e

Please sign in to comment.