-
Notifications
You must be signed in to change notification settings - Fork 4
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Create GENIE tables #13
Changes from 3 commits
9d5a977
35077a7
177ef12
c7b1cb0
f2a4b1e
61283d4
57131c0
3751ba6
ec6e414
9ca5e61
e4e16a4
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
"""GENIE ELT pipeline""" | ||
import os | ||
|
||
from dotenv import dotenv_values | ||
import pandas as pd | ||
import snowflake.connector | ||
from snowflake.connector.pandas_tools import write_pandas | ||
import synapseclient | ||
|
||
|
||
def _load_release(syn, ctx, cs, release, structured_data):
    """Load one release's structured text files into its own Snowflake schema.

    Args:
        syn: Logged-in Synapse client used to list and download the files.
        ctx: Open Snowflake connection handed to ``write_pandas``.
        cs: Cursor on ``ctx`` used for the schema statements.
        release: Child record of the release folder as yielded by
            ``syn.getChildren``; its ``name`` and ``id`` keys are read.
        structured_data: Tuple of file-name prefixes that should be loaded.
    """
    print(release['name'])
    # "Release 15.0-public" -> "15_0", used as the schema-name suffix.
    release_name = (
        release['name']
        .replace("Release ", "")
        .replace(".", "_")
        .replace("-public", "")
    )

    # getChildren returns every entity type by default (folders, tables, ...).
    # Only files, or links to files (resolved below via followLink=True), are
    # wanted here, so restrict the listing instead of relying on names alone.
    release_files = syn.getChildren(
        release['id'], includeTypes=["file", "link"]
    )
    # Download every matching file up front, keyed by its Synapse name.
    release_file_map = {
        release_file['name']: syn.get(release_file['id'], followLink=True)
        for release_file in release_files
        if release_file['name'].startswith(structured_data)
        and release_file['name'].endswith("txt")
    }

    cs.execute(
        f"CREATE SCHEMA IF NOT EXISTS public_{release_name} WITH MANAGED ACCESS;"
    )
    # The target schema is the same for every file of the release, so select
    # it once instead of once per table.
    cs.execute(f"USE SCHEMA public_{release_name}")

    for release_file_key, release_file_ent in release_file_map.items():
        # "data_clinical_patient.txt" -> "clinical_patient"
        tbl_name = (
            release_file_key
            .replace("data_", "")
            .replace(".txt", "")
            .replace(".seg", "")
        )
        print(tbl_name)
        table_df = pd.read_csv(
            release_file_ent.path,
            sep="\t",
            comment="#",  # skip the "#"-prefixed header/comment lines
            low_memory=False,
        )
        # NOTE(review): no explicit commit is issued after the write; this
        # relies on the connector's autocommit setting being left at its
        # default - confirm if autocommit is ever disabled for this session.
        write_pandas(
            ctx,
            table_df,
            tbl_name,
            auto_create_table=True,
            quote_identifiers=False,
            overwrite=True,
        )


def main():
    """GENIE ELT pipeline.

    Logs in to Synapse, opens a Snowflake connection with the credentials
    read from ``../.env`` and loads the structured text files of
    "Release 15.0-public" (children of ``syn7844529``) into a
    ``public_<release>`` schema of the ``genie`` database, one table per
    file. Existing tables of the same name are overwritten.

    The Snowflake cursor and connection are always closed on exit, including
    when a download or load step raises.
    """
    syn = synapseclient.login()

    config = dotenv_values("../.env")

    ctx = snowflake.connector.connect(
        user=config['user'],
        password=config['password'],
        account=config['snowflake_account'],
        database="genie",
        role="SYSADMIN",
        warehouse="compute_xsmall"
    )
    try:
        cs = ctx.cursor()
        try:
            # File-name prefixes of the structured files to load.
            # (Original author's marker here read "data_CNA"; that prefix is
            # not part of the tuple.)
            structured_data = (
                "data_clinical", "data_mutations",
                "assay_information", "data_cna_hg19", "data_gene_matrix",
                "data_sv", "genomic_information"
            )
            releases = syn.getChildren("syn7844529")
            for release in releases:
                # Only this one public release is loaded for now.
                if release['name'] != "Release 15.0-public":
                    continue
                _load_release(syn, ctx, cs, release, structured_data)
        finally:
            cs.close()
    finally:
        # Release the Snowflake session even if a load step failed.
        ctx.close()
|
||
|
||
if __name__ == "__main__":
    # Run the pipeline only when executed as a script, not when imported.
    main()
This file was deleted.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you need to handle any context management here to close the connection? Not sure how this piece works internally.