Skip to content

Commit 62e428e

Browse files
authored
Merge pull request #13 from Sage-Bionetworks/create-genie
Create GENIE tables
2 parents c57ce06 + e4e16a4 commit 62e428e

File tree

3 files changed

+302
-1
lines changed

3 files changed

+302
-1
lines changed

genie/genie_bpc_elt.py

+204
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,204 @@
1+
"""GENIE ELT pipeline"""
2+
import os
3+
4+
from dotenv import dotenv_values
5+
import pandas as pd
6+
import snowflake.connector
7+
from snowflake.connector.pandas_tools import write_pandas
8+
import synapseclient
9+
10+
11+
def create_snowflake_resources(
12+
ctx: snowflake.connector.connect,
13+
syn: synapseclient.Synapse,
14+
cohort: str,
15+
version: str,
16+
clinical_synid: str,
17+
cbioportal_synid: str
18+
):
19+
cs = ctx.cursor()
20+
clinical_files = syn.getChildren(clinical_synid)
21+
cbioportal_files = syn.getChildren(cbioportal_synid)
22+
schema_name = f"{cohort}_{version}"
23+
cs.execute(
24+
f"CREATE SCHEMA IF NOT EXISTS {schema_name} WITH MANAGED ACCESS;"
25+
)
26+
cs.execute(f"USE SCHEMA {schema_name}")
27+
28+
for clinical_file in clinical_files:
29+
print(clinical_file['name'])
30+
table_name = clinical_file['name'].replace(".csv", "")
31+
clinical_entity = syn.get(clinical_file['id'])
32+
clin_df = pd.read_csv(
33+
clinical_entity.path,
34+
comment="#",
35+
low_memory=False
36+
)
37+
write_pandas(
38+
ctx,
39+
clin_df,
40+
table_name,
41+
auto_create_table=True,
42+
quote_identifiers=False,
43+
overwrite=True
44+
)
45+
46+
for cbioportal_file in cbioportal_files:
47+
# print(cbioportal_file['name'])
48+
table_name = (cbioportal_file['name']
49+
.replace("data_", "")
50+
.replace(".txt", "")
51+
.replace(f"genie_{cohort}_", "")
52+
.replace("cna_hg19.seg", "seg")
53+
)
54+
# TODO: error when uploading SEG file and CNA file
55+
exclude = ("gene_panel", "meta", "CNA", "case_lists", "seg", "tmb")
56+
if not table_name.startswith(exclude):
57+
print(table_name)
58+
table_ent = syn.get(cbioportal_file['id'])
59+
table_df = pd.read_csv(
60+
table_ent.path,
61+
sep="\t",
62+
comment="#",
63+
low_memory=False
64+
)
65+
write_pandas(
66+
ctx,
67+
table_df,
68+
table_name,
69+
auto_create_table=True,
70+
quote_identifiers=False,
71+
overwrite=True
72+
)
73+
74+
def main():
75+
"""GENIE BPC ELT pipeline"""
76+
syn = synapseclient.login()
77+
78+
config = dotenv_values("../.env")
79+
80+
ctx = snowflake.connector.connect(
81+
user=config['user'],
82+
password=config['password'],
83+
account=config['snowflake_account'],
84+
database="genie",
85+
role="SYSADMIN",
86+
warehouse="compute_xsmall"
87+
)
88+
cohorts = [
89+
{
90+
"cohort": "nsclc",
91+
"version": "public_02_2",
92+
"clinical_synid": "syn30358089",
93+
"cbioportal_synid": "syn30358098"
94+
},
95+
{
96+
"cohort": "crc",
97+
"version": "public_02_2",
98+
"clinical_synid": "syn39802567",
99+
"cbioportal_synid": "syn39802595"
100+
},
101+
{
102+
"cohort": "bladder",
103+
"version": "consortium_01_1",
104+
"clinical_synid": "syn28495599",
105+
"cbioportal_synid": "syn26958249"
106+
},
107+
{
108+
"cohort": "bladder",
109+
"version": "consortium_01_2",
110+
"clinical_synid": "syn53018574",
111+
"cbioportal_synid": "syn53018728"
112+
},
113+
{
114+
"cohort": "brca",
115+
"version": "consortium_01_1",
116+
"clinical_synid": "syn26253353",
117+
"cbioportal_synid": "syn24981909"
118+
},
119+
{
120+
"cohort": "brca",
121+
"version": "consortium_01_2",
122+
"clinical_synid": "syn39802381",
123+
"cbioportal_synid": "syn32299078"
124+
},
125+
{
126+
"cohort": "crc",
127+
"version": "consortium_01_1",
128+
"clinical_synid": "syn24166685",
129+
"cbioportal_synid": "syn23561876"
130+
},
131+
{
132+
"cohort": "crc",
133+
"version": "consortium_01_2",
134+
"clinical_synid": "syn26046784",
135+
"cbioportal_synid": "syn25998993"
136+
},
137+
{
138+
"cohort": "crc",
139+
"version": "public_preview_02_0",
140+
"clinical_synid": "syn39802279",
141+
"cbioportal_synid": "syn30381296"
142+
},
143+
{
144+
"cohort": "nsclc",
145+
"version": "consortium_01_1",
146+
"clinical_synid": "syn22418966",
147+
"cbioportal_synid": "syn22679734"
148+
},
149+
{
150+
"cohort": "nsclc",
151+
"version": "public_preview_02_0",
152+
"clinical_synid": "syn27245047",
153+
"cbioportal_synid": "syn27199149"
154+
},
155+
{
156+
"cohort": "nsclc",
157+
"version": "consortium_02_1",
158+
"clinical_synid": "syn25982471",
159+
"cbioportal_synid": "syn25471745"
160+
},
161+
{
162+
"cohort": "panc",
163+
"version": "consortium_01_1",
164+
"clinical_synid": "syn27244194",
165+
"cbioportal_synid": "syn26288998"
166+
},
167+
{
168+
"cohort": "panc",
169+
"version": "consortium_01_2",
170+
"clinical_synid": "syn50612197",
171+
"cbioportal_synid": "syn50697830"
172+
},
173+
{
174+
"cohort": "prostate",
175+
"version": "consortium_01_1",
176+
"clinical_synid": "syn28495574",
177+
"cbioportal_synid": "syn26471041"
178+
},
179+
{
180+
"cohort": "prostate",
181+
"version": "consortium_01_2",
182+
"clinical_synid": "syn50612196",
183+
"cbioportal_synid": "syn50697637"
184+
}
185+
]
186+
187+
for cohort_info in cohorts:
188+
cohort = cohort_info['cohort']
189+
version = cohort_info['version']
190+
clinical_synid = cohort_info['clinical_synid']
191+
cbioportal_synid = cohort_info['cbioportal_synid']
192+
print(cohort, version)
193+
create_snowflake_resources(
194+
ctx=ctx,
195+
syn=syn,
196+
cohort=cohort,
197+
version=version,
198+
clinical_synid=clinical_synid,
199+
cbioportal_synid=cbioportal_synid
200+
)
201+
ctx.close()
202+
203+
if __name__ == "__main__":
204+
main()

genie/genie_elt.py

+96
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,96 @@
1+
"""GENIE ELT pipeline"""
2+
import os
3+
4+
from dotenv import dotenv_values
5+
import pandas as pd
6+
import snowflake.connector
7+
from snowflake.connector.pandas_tools import write_pandas
8+
import synapseclient
9+
import synapseutils as synu
10+
11+
12+
def push_cbio_files_to_snowflake(syn: synapseclient.Synapse, ctx: snowflake.connector.connect, synid: str, structured_data: list):
13+
"""_summary_
14+
15+
Args:
16+
ctx (snowflake.connector.connect): _description_
17+
synid (str): _description_
18+
structured_data (list): _description_
19+
"""
20+
folder_ent = syn.get(synid)
21+
release_name = folder_ent.name.split("-")[0].replace(".", "_")
22+
23+
release_files = syn.getChildren(synid, includeTypes=["file", "folder", "link"])
24+
release_file_map = {
25+
release_file['name']: syn.get(release_file['id'], followLink=True)
26+
for release_file in release_files
27+
if release_file['name'].startswith(structured_data) and
28+
release_file['name'].endswith(("txt", "bed"))
29+
}
30+
31+
ctx.execute(
32+
f"CREATE SCHEMA IF NOT EXISTS consortium_{release_name} WITH MANAGED ACCESS;"
33+
)
34+
ctx.execute(f"USE SCHEMA consortium_{release_name}")
35+
for release_file_key, release_file_ent in release_file_map.items():
36+
tbl_name = (release_file_key
37+
.replace("data_", "")
38+
.replace(".txt", "")
39+
.replace(".seg", "")
40+
)
41+
if tbl_name == "genie_combined.bed":
42+
tbl_name = "genomic_information"
43+
elif tbl_name == "genie_cna_hg19.seg":
44+
tbl_name = "cna_hg19"
45+
print(tbl_name)
46+
47+
table_df = pd.read_csv(
48+
release_file_ent.path,
49+
sep="\t",
50+
comment="#",
51+
low_memory=False
52+
)
53+
write_pandas(
54+
ctx,
55+
table_df,
56+
tbl_name,
57+
auto_create_table=True,
58+
quote_identifiers=False,
59+
overwrite=True
60+
)
61+
62+
def main():
63+
"""GENIE ELT pipeline"""
64+
syn = synapseclient.login()
65+
66+
config = dotenv_values("../.env")
67+
68+
ctx = snowflake.connector.connect(
69+
user=config['user'],
70+
password=config['password'],
71+
account=config['snowflake_account'],
72+
database="genie",
73+
role="SYSADMIN",
74+
warehouse="compute_xsmall"
75+
)
76+
cs = ctx.cursor()
77+
# Exclude data_cna for now
78+
structured_data = (
79+
"data_clinical", "data_mutations", "data_fusions"
80+
"assay_information", "data_cna_hg19", "data_gene_matrix",
81+
"data_sv", "genomic_information", "genie_combined", "genie_cna_hg19"
82+
)
83+
directory_info = synu.walk(syn, "syn7492881")
84+
for dirpath, dirnames, filenames in directory_info:
85+
if len(dirpath[0].split("/")) != 2:
86+
continue
87+
for dirname, dir_synid in dirnames:
88+
# if dirname.endswith("-public") or folder_ent.name.startswith(("1.0", "2.0", "3.0")):
89+
if not dirname.endswith('-consortium'):
90+
continue
91+
print(dirname)
92+
push_cbio_files_to_snowflake(syn=syn, ctx=cs, synid=dir_synid, structured_data=structured_data)
93+
ctx.close()
94+
95+
if __name__ == "__main__":
96+
main()

sage/audit_elt.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
cs = ctx.cursor()
2323

2424
files = syn.getChildren("syn53180811")
25-
25+
# files = [{"id": "syn55198186", "name": "IT-3518-sage-org-buckets.csv"}]
2626
for file_info in files:
2727
synapse_id = file_info['id']
2828
table_name = file_info['name'].replace("-", "_").replace(".csv", "")
@@ -42,3 +42,4 @@
4242
overwrite=True,
4343
quote_identifiers=False
4444
)
45+
cs.close()

0 commit comments

Comments
 (0)