Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create GENIE tables #13

Merged
merged 11 commits into from
Apr 16, 2024
80 changes: 80 additions & 0 deletions genie/genie_elt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
"""GENIE ELT pipeline"""
import os

from dotenv import dotenv_values
import pandas as pd
import snowflake.connector
from snowflake.connector.pandas_tools import write_pandas
import synapseclient


def _release_schema_suffix(release_name):
    """Turn a release folder name into a schema-name suffix.

    For example ``"Release 15.0-public"`` becomes ``"15_0"``.
    """
    return (
        release_name
        .replace("Release ", "")
        .replace(".", "_")
        .replace("-public", "")
    )


def _table_name(file_name):
    """Derive the target table name from a release file name.

    For example ``"data_clinical_patient.txt"`` becomes
    ``"clinical_patient"``.
    """
    return (
        file_name
        .replace("data_", "")
        .replace(".txt", "")
        .replace(".seg", "")
    )


def main():
    """GENIE ELT pipeline.

    Logs in to Synapse, reads Snowflake credentials from ``../.env``
    (keys ``user``, ``password`` and ``snowflake_account``), and for the
    ``Release 15.0-public`` child of ``syn7844529`` loads every
    structured tab-separated ``txt`` file into its own table inside the
    ``public_<release>`` schema of the ``genie`` database.  Existing
    tables with the same name are overwritten.
    """
    syn = synapseclient.login()

    # NOTE: the path is resolved against the current working directory.
    config = dotenv_values("../.env")

    # Only files whose names start with one of these prefixes are loaded.
    structured_data = (
        "data_clinical", "data_mutations",
        "assay_information", "data_cna_hg19", "data_gene_matrix",
        "data_sv", "genomic_information"
    )

    # Both the connection and the cursor are context-managed so that they
    # are closed even when a download or load step raises.
    with snowflake.connector.connect(
        user=config['user'],
        password=config['password'],
        account=config['snowflake_account'],
        database="genie",
        role="SYSADMIN",
        warehouse="compute_xsmall"
    ) as ctx:
        with ctx.cursor() as cs:
            releases = syn.getChildren("syn7844529")
            for release in releases:
                # Only this one public release is processed for now.
                if release['name'] != "Release 15.0-public":
                    continue
                print(release['name'])
                release_name = _release_schema_suffix(release['name'])
                release_id = release['id']

                # getChildren returns every entity type by default;
                # restrict it to files and links (links are resolved by
                # followLink below) so that a folder or table with a
                # matching name is never handed to read_csv.
                release_files = syn.getChildren(
                    release_id, includeTypes=["file", "link"]
                )
                release_file_map = {
                    release_file['name']: syn.get(
                        release_file['id'], followLink=True
                    )
                    for release_file in release_files
                    if release_file['name'].startswith(structured_data) and
                    release_file['name'].endswith("txt")
                }

                cs.execute(
                    f"CREATE SCHEMA IF NOT EXISTS public_{release_name} WITH MANAGED ACCESS;"
                )
                # The schema is the same for every file of a release, so
                # it is selected once instead of once per table.
                cs.execute(f"USE SCHEMA public_{release_name}")

                for release_file_key, release_file_ent in release_file_map.items():
                    tbl_name = _table_name(release_file_key)
                    print(tbl_name)
                    table_df = pd.read_csv(
                        release_file_ent.path,
                        sep="\t",
                        comment="#",
                        low_memory=False
                    )
                    # No explicit commit is issued after the write.
                    # NOTE(review): this relies on the connector's default
                    # autocommit behaviour -- confirm if autocommit is
                    # ever disabled for this connection.
                    write_pandas(
                        ctx,
                        table_df,
                        tbl_name,
                        auto_create_table=True,
                        quote_identifiers=False,
                        overwrite=True
                    )


# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()
120 changes: 0 additions & 120 deletions transforms/genie_elt.py

This file was deleted.