Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create GENIE tables #13

Merged
merged 11 commits into from
Apr 16, 2024
204 changes: 204 additions & 0 deletions genie/genie_bpc_elt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
"""GENIE ELT pipeline"""
import os

from dotenv import dotenv_values
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import synapseclient


def create_snowflake_resources(
ctx: snowflake.connector.connect,
syn: synapseclient.Synapse,
cohort: str,
version: str,
clinical_synid: str,
cbioportal_synid: str
):
cs = ctx.cursor()
clinical_files = syn.getChildren(clinical_synid)
cbioportal_files = syn.getChildren(cbioportal_synid)
schema_name = f"{cohort}_{version}"
cs.execute(
f"CREATE SCHEMA IF NOT EXISTS {schema_name} WITH MANAGED ACCESS;"
)
cs.execute(f"USE SCHEMA {schema_name}")

for clinical_file in clinical_files:
print(clinical_file['name'])
table_name = clinical_file['name'].replace(".csv", "")
clinical_entity = syn.get(clinical_file['id'])
clin_df = pd.read_csv(
clinical_entity.path,
comment="#",
low_memory=False
)
write_pandas(
ctx,
clin_df,
table_name,
auto_create_table=True,
quote_identifiers=False,
overwrite=True
)

for cbioportal_file in cbioportal_files:
# print(cbioportal_file['name'])
table_name = (cbioportal_file['name']
.replace("data_", "")
.replace(".txt", "")
.replace(f"genie_{cohort}_", "")
.replace("cna_hg19.seg", "seg")
)
# TODO: error when uploading SEG file and CNA file
exclude = ("gene_panel", "meta", "CNA", "case_lists", "seg", "tmb")
if not table_name.startswith(exclude):
print(table_name)
table_ent = syn.get(cbioportal_file['id'])
table_df = pd.read_csv(
table_ent.path,
sep="\t",
comment="#",
low_memory=False
)
write_pandas(
ctx,
table_df,
table_name,
auto_create_table=True,
quote_identifiers=False,
overwrite=True
)

def main():
"""GENIE BPC ELT pipeline"""
syn = synapseclient.login()

config = dotenv_values("../.env")

ctx = snowflake.connector.connect(
user=config['user'],
password=config['password'],
account=config['snowflake_account'],
database="genie",
role="SYSADMIN",
warehouse="compute_xsmall"
)
cohorts = [
{
"cohort": "nsclc",
"version": "public_02_2",
"clinical_synid": "syn30358089",
"cbioportal_synid": "syn30358098"
},
{
"cohort": "crc",
"version": "public_02_2",
"clinical_synid": "syn39802567",
"cbioportal_synid": "syn39802595"
},
{
"cohort": "bladder",
"version": "consortium_01_1",
"clinical_synid": "syn28495599",
"cbioportal_synid": "syn26958249"
},
{
"cohort": "bladder",
"version": "consortium_01_2",
"clinical_synid": "syn53018574",
"cbioportal_synid": "syn53018728"
},
{
"cohort": "brca",
"version": "consortium_01_1",
"clinical_synid": "syn26253353",
"cbioportal_synid": "syn24981909"
},
{
"cohort": "brca",
"version": "consortium_01_2",
"clinical_synid": "syn39802381",
"cbioportal_synid": "syn32299078"
},
{
"cohort": "crc",
"version": "consortium_01_1",
"clinical_synid": "syn24166685",
"cbioportal_synid": "syn23561876"
},
{
"cohort": "crc",
"version": "consortium_01_2",
"clinical_synid": "syn26046784",
"cbioportal_synid": "syn25998993"
},
{
"cohort": "crc",
"version": "public_preview_02_0",
"clinical_synid": "syn39802279",
"cbioportal_synid": "syn30381296"
},
{
"cohort": "nsclc",
"version": "consortium_01_1",
"clinical_synid": "syn22418966",
"cbioportal_synid": "syn22679734"
},
{
"cohort": "nsclc",
"version": "public_preview_02_0",
"clinical_synid": "syn27245047",
"cbioportal_synid": "syn27199149"
},
{
"cohort": "nsclc",
"version": "consortium_02_1",
"clinical_synid": "syn25982471",
"cbioportal_synid": "syn25471745"
},
{
"cohort": "panc",
"version": "consortium_01_1",
"clinical_synid": "syn27244194",
"cbioportal_synid": "syn26288998"
},
{
"cohort": "panc",
"version": "consortium_01_2",
"clinical_synid": "syn50612197",
"cbioportal_synid": "syn50697830"
},
{
"cohort": "prostate",
"version": "consortium_01_1",
"clinical_synid": "syn28495574",
"cbioportal_synid": "syn26471041"
},
{
"cohort": "prostate",
"version": "consortium_01_2",
"clinical_synid": "syn50612196",
"cbioportal_synid": "syn50697637"
}
]

for cohort_info in cohorts:
cohort = cohort_info['cohort']
version = cohort_info['version']
clinical_synid = cohort_info['clinical_synid']
cbioportal_synid = cohort_info['cbioportal_synid']
print(cohort, version)
create_snowflake_resources(
ctx=ctx,
syn=syn,
cohort=cohort,
version=version,
clinical_synid=clinical_synid,
cbioportal_synid=cbioportal_synid
)
ctx.close()

if __name__ == "__main__":
main()
96 changes: 96 additions & 0 deletions genie/genie_elt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
"""GENIE ELT pipeline"""
import os

from dotenv import dotenv_values
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import synapseclient
import synapseutils as synu


def push_cbio_files_to_snowflake(syn: synapseclient.Synapse, ctx: snowflake.connector.connect, synid: str, structured_data: list):
"""_summary_

Args:
ctx (snowflake.connector.connect): _description_
synid (str): _description_
structured_data (list): _description_
"""
folder_ent = syn.get(synid)
release_name = folder_ent.name.split("-")[0].replace(".", "_")

release_files = syn.getChildren(synid, includeTypes=["file", "folder", "link"])
release_file_map = {
release_file['name']: syn.get(release_file['id'], followLink=True)
for release_file in release_files
if release_file['name'].startswith(structured_data) and
release_file['name'].endswith(("txt", "bed"))
}

ctx.execute(
f"CREATE SCHEMA IF NOT EXISTS consortium_{release_name} WITH MANAGED ACCESS;"
)
ctx.execute(f"USE SCHEMA consortium_{release_name}")
for release_file_key, release_file_ent in release_file_map.items():
tbl_name = (release_file_key
.replace("data_", "")
.replace(".txt", "")
.replace(".seg", "")
)
if tbl_name == "genie_combined.bed":
tbl_name = "genomic_information"
elif tbl_name == "genie_cna_hg19.seg":
tbl_name = "cna_hg19"
print(tbl_name)

table_df = pd.read_csv(
release_file_ent.path,
sep="\t",
comment="#",
low_memory=False
)
write_pandas(
ctx,
table_df,
tbl_name,
auto_create_table=True,
quote_identifiers=False,
overwrite=True
)

def main():
"""GENIE ELT pipeline"""
syn = synapseclient.login()

config = dotenv_values("../.env")

ctx = snowflake.connector.connect(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do you need to handle any context management here to close the connection? Not sure how this piece works internally.

user=config['user'],
password=config['password'],
account=config['snowflake_account'],
database="genie",
role="SYSADMIN",
warehouse="compute_xsmall"
)
cs = ctx.cursor()
# Exclude data_cna for now
structured_data = (
"data_clinical", "data_mutations", "data_fusions"
"assay_information", "data_cna_hg19", "data_gene_matrix",
"data_sv", "genomic_information", "genie_combined", "genie_cna_hg19"
)
directory_info = synu.walk(syn, "syn7492881")
for dirpath, dirnames, filenames in directory_info:
if len(dirpath[0].split("/")) != 2:
continue
for dirname, dir_synid in dirnames:
# if dirname.endswith("-public") or folder_ent.name.startswith(("1.0", "2.0", "3.0")):
if not dirname.endswith('-consortium'):
continue
print(dirname)
push_cbio_files_to_snowflake(syn=syn, ctx=cs, synid=dir_synid, structured_data=structured_data)
ctx.close()

if __name__ == "__main__":
main()
3 changes: 2 additions & 1 deletion sage/audit_elt.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
cs = ctx.cursor()

files = syn.getChildren("syn53180811")

# files = [{"id": "syn55198186", "name": "IT-3518-sage-org-buckets.csv"}]
for file_info in files:
synapse_id = file_info['id']
table_name = file_info['name'].replace("-", "_").replace(".csv", "")
Expand All @@ -42,3 +42,4 @@
overwrite=True,
quote_identifiers=False
)
cs.close()