-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Create GENIE tables #13
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
9d5a977
Create GENIE tables
thomasyu888 35077a7
Add in other file formats to ELT into snowflake
thomasyu888 177ef12
Add tbl name
thomasyu888 c7b1cb0
add in bpc elt
thomasyu888 f2a4b1e
Add 16.1-consortium
thomasyu888 61283d4
Add in all bpc cohorts
thomasyu888 57131c0
Merge branch 'dev' into create-genie
thomasyu888 3751ba6
Make sure to close connection
thomasyu888 ec6e414
Push up S3 bucket list
thomasyu888 9ca5e61
Clean up code by pushing things to a function
thomasyu888 e4e16a4
Update genie/genie_elt.py
thomasyu888 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,204 @@ | ||
"""GENIE ELT pipeline""" | ||
import os | ||
|
||
from dotenv import dotenv_values | ||
import pandas as pd | ||
import snowflake.connector | ||
from snowflake.connector.pandas_tools import write_pandas | ||
import synapseclient | ||
|
||
|
||
def create_snowflake_resources( | ||
ctx: snowflake.connector.connect, | ||
syn: synapseclient.Synapse, | ||
cohort: str, | ||
version: str, | ||
clinical_synid: str, | ||
cbioportal_synid: str | ||
): | ||
cs = ctx.cursor() | ||
clinical_files = syn.getChildren(clinical_synid) | ||
cbioportal_files = syn.getChildren(cbioportal_synid) | ||
schema_name = f"{cohort}_{version}" | ||
cs.execute( | ||
f"CREATE SCHEMA IF NOT EXISTS {schema_name} WITH MANAGED ACCESS;" | ||
) | ||
cs.execute(f"USE SCHEMA {schema_name}") | ||
|
||
for clinical_file in clinical_files: | ||
print(clinical_file['name']) | ||
table_name = clinical_file['name'].replace(".csv", "") | ||
clinical_entity = syn.get(clinical_file['id']) | ||
clin_df = pd.read_csv( | ||
clinical_entity.path, | ||
comment="#", | ||
low_memory=False | ||
) | ||
write_pandas( | ||
ctx, | ||
clin_df, | ||
table_name, | ||
auto_create_table=True, | ||
quote_identifiers=False, | ||
overwrite=True | ||
) | ||
|
||
for cbioportal_file in cbioportal_files: | ||
# print(cbioportal_file['name']) | ||
table_name = (cbioportal_file['name'] | ||
.replace("data_", "") | ||
.replace(".txt", "") | ||
.replace(f"genie_{cohort}_", "") | ||
.replace("cna_hg19.seg", "seg") | ||
) | ||
# TODO: error when uploading SEG file and CNA file | ||
exclude = ("gene_panel", "meta", "CNA", "case_lists", "seg", "tmb") | ||
if not table_name.startswith(exclude): | ||
print(table_name) | ||
table_ent = syn.get(cbioportal_file['id']) | ||
table_df = pd.read_csv( | ||
table_ent.path, | ||
sep="\t", | ||
comment="#", | ||
low_memory=False | ||
) | ||
write_pandas( | ||
ctx, | ||
table_df, | ||
table_name, | ||
auto_create_table=True, | ||
quote_identifiers=False, | ||
overwrite=True | ||
) | ||
|
||
def main(): | ||
"""GENIE BPC ELT pipeline""" | ||
syn = synapseclient.login() | ||
|
||
config = dotenv_values("../.env") | ||
|
||
ctx = snowflake.connector.connect( | ||
user=config['user'], | ||
password=config['password'], | ||
account=config['snowflake_account'], | ||
database="genie", | ||
role="SYSADMIN", | ||
warehouse="compute_xsmall" | ||
) | ||
cohorts = [ | ||
{ | ||
"cohort": "nsclc", | ||
"version": "public_02_2", | ||
"clinical_synid": "syn30358089", | ||
"cbioportal_synid": "syn30358098" | ||
}, | ||
{ | ||
"cohort": "crc", | ||
"version": "public_02_2", | ||
"clinical_synid": "syn39802567", | ||
"cbioportal_synid": "syn39802595" | ||
}, | ||
{ | ||
"cohort": "bladder", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn28495599", | ||
"cbioportal_synid": "syn26958249" | ||
}, | ||
{ | ||
"cohort": "bladder", | ||
"version": "consortium_01_2", | ||
"clinical_synid": "syn53018574", | ||
"cbioportal_synid": "syn53018728" | ||
}, | ||
{ | ||
"cohort": "brca", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn26253353", | ||
"cbioportal_synid": "syn24981909" | ||
}, | ||
{ | ||
"cohort": "brca", | ||
"version": "consortium_01_2", | ||
"clinical_synid": "syn39802381", | ||
"cbioportal_synid": "syn32299078" | ||
}, | ||
{ | ||
"cohort": "crc", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn24166685", | ||
"cbioportal_synid": "syn23561876" | ||
}, | ||
{ | ||
"cohort": "crc", | ||
"version": "consortium_01_2", | ||
"clinical_synid": "syn26046784", | ||
"cbioportal_synid": "syn25998993" | ||
}, | ||
{ | ||
"cohort": "crc", | ||
"version": "public_preview_02_0", | ||
"clinical_synid": "syn39802279", | ||
"cbioportal_synid": "syn30381296" | ||
}, | ||
{ | ||
"cohort": "nsclc", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn22418966", | ||
"cbioportal_synid": "syn22679734" | ||
}, | ||
{ | ||
"cohort": "nsclc", | ||
"version": "public_preview_02_0", | ||
"clinical_synid": "syn27245047", | ||
"cbioportal_synid": "syn27199149" | ||
}, | ||
{ | ||
"cohort": "nsclc", | ||
"version": "consortium_02_1", | ||
"clinical_synid": "syn25982471", | ||
"cbioportal_synid": "syn25471745" | ||
}, | ||
{ | ||
"cohort": "panc", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn27244194", | ||
"cbioportal_synid": "syn26288998" | ||
}, | ||
{ | ||
"cohort": "panc", | ||
"version": "consortium_01_2", | ||
"clinical_synid": "syn50612197", | ||
"cbioportal_synid": "syn50697830" | ||
}, | ||
{ | ||
"cohort": "prostate", | ||
"version": "consortium_01_1", | ||
"clinical_synid": "syn28495574", | ||
"cbioportal_synid": "syn26471041" | ||
}, | ||
{ | ||
"cohort": "prostate", | ||
"version": "consortium_01_2", | ||
"clinical_synid": "syn50612196", | ||
"cbioportal_synid": "syn50697637" | ||
} | ||
] | ||
|
||
for cohort_info in cohorts: | ||
cohort = cohort_info['cohort'] | ||
version = cohort_info['version'] | ||
clinical_synid = cohort_info['clinical_synid'] | ||
cbioportal_synid = cohort_info['cbioportal_synid'] | ||
print(cohort, version) | ||
create_snowflake_resources( | ||
ctx=ctx, | ||
syn=syn, | ||
cohort=cohort, | ||
version=version, | ||
clinical_synid=clinical_synid, | ||
cbioportal_synid=cbioportal_synid | ||
) | ||
ctx.close() | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
"""GENIE ELT pipeline""" | ||
import os | ||
|
||
from dotenv import dotenv_values | ||
import pandas as pd | ||
import snowflake.connector | ||
from snowflake.connector.pandas_tools import write_pandas | ||
import synapseclient | ||
import synapseutils as synu | ||
|
||
|
||
def push_cbio_files_to_snowflake(syn: synapseclient.Synapse, ctx: snowflake.connector.connect, synid: str, structured_data: list): | ||
"""_summary_ | ||
|
||
Args: | ||
ctx (snowflake.connector.connect): _description_ | ||
synid (str): _description_ | ||
structured_data (list): _description_ | ||
""" | ||
folder_ent = syn.get(synid) | ||
release_name = folder_ent.name.split("-")[0].replace(".", "_") | ||
|
||
release_files = syn.getChildren(synid, includeTypes=["file", "folder", "link"]) | ||
release_file_map = { | ||
release_file['name']: syn.get(release_file['id'], followLink=True) | ||
for release_file in release_files | ||
if release_file['name'].startswith(structured_data) and | ||
release_file['name'].endswith(("txt", "bed")) | ||
} | ||
|
||
ctx.execute( | ||
f"CREATE SCHEMA IF NOT EXISTS consortium_{release_name} WITH MANAGED ACCESS;" | ||
) | ||
ctx.execute(f"USE SCHEMA consortium_{release_name}") | ||
for release_file_key, release_file_ent in release_file_map.items(): | ||
tbl_name = (release_file_key | ||
.replace("data_", "") | ||
.replace(".txt", "") | ||
.replace(".seg", "") | ||
) | ||
if tbl_name == "genie_combined.bed": | ||
tbl_name = "genomic_information" | ||
elif tbl_name == "genie_cna_hg19.seg": | ||
tbl_name = "cna_hg19" | ||
print(tbl_name) | ||
|
||
table_df = pd.read_csv( | ||
release_file_ent.path, | ||
sep="\t", | ||
comment="#", | ||
low_memory=False | ||
) | ||
write_pandas( | ||
ctx, | ||
table_df, | ||
tbl_name, | ||
auto_create_table=True, | ||
quote_identifiers=False, | ||
overwrite=True | ||
) | ||
|
||
def main(): | ||
"""GENIE ELT pipeline""" | ||
syn = synapseclient.login() | ||
|
||
config = dotenv_values("../.env") | ||
|
||
ctx = snowflake.connector.connect( | ||
user=config['user'], | ||
password=config['password'], | ||
account=config['snowflake_account'], | ||
database="genie", | ||
role="SYSADMIN", | ||
warehouse="compute_xsmall" | ||
) | ||
cs = ctx.cursor() | ||
# Exclude data_cna for now | ||
structured_data = ( | ||
"data_clinical", "data_mutations", "data_fusions" | ||
"assay_information", "data_cna_hg19", "data_gene_matrix", | ||
"data_sv", "genomic_information", "genie_combined", "genie_cna_hg19" | ||
) | ||
directory_info = synu.walk(syn, "syn7492881") | ||
for dirpath, dirnames, filenames in directory_info: | ||
if len(dirpath[0].split("/")) != 2: | ||
continue | ||
for dirname, dir_synid in dirnames: | ||
# if dirname.endswith("-public") or folder_ent.name.startswith(("1.0", "2.0", "3.0")): | ||
if not dirname.endswith('-consortium'): | ||
continue | ||
print(dirname) | ||
push_cbio_files_to_snowflake(syn=syn, ctx=cs, synid=dir_synid, structured_data=structured_data) | ||
ctx.close() | ||
|
||
if __name__ == "__main__": | ||
main() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need to handle any context management here to close the connection? Not sure how this piece works internally.