Skip to content

Commit

Permalink
added gwas scripts
Browse files Browse the repository at this point in the history
  • Loading branch information
guhanrv committed Oct 2, 2019
1 parent 081b64b commit f6be4c3
Show file tree
Hide file tree
Showing 6 changed files with 395 additions and 113 deletions.
40 changes: 26 additions & 14 deletions analyses/GWAS/fet.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

hl.init(log="/home/fet.log")

# pop_strings = ["AJ", "FIN", "LIT", "NFE"]
pop_strings = ["NFE"]
pop_strings = ["AJ", "FIN", "LIT", "NFE"]
# pop_strings = ["NFE"]

for pop_string in pop_strings:
# for disease_string in ["cd", "ibd", "uc"]:
Expand Down Expand Up @@ -42,26 +42,38 @@
)
pop_ca_co_mt = pop_ca_co_mt.annotate_rows(fet_p_value=pop_ca_co_mt.fet.p_value)

pop_ca_co_mt = pop_ca_co_mt.annotate(
HGVSp=pop_ca_co_mt.vep.transcript_consequences.map(lambda x: x.hgvsp),
HGVSc=pop_ca_co_mt.vep.transcript_consequences.map(lambda x: x.hgvsc),
consequence=pop_ca_co_mt.vep.most_severe_consequence,
gene_symbol=pop_ca_co_mt.vep.transcript_consequences.map(lambda x: x.gene_symbol),
)

print("Exporting FET results to table...")
rows = pop_ca_co_mt.rows()
rows = rows.key_by()
rows.select(
V=rows.V,
P=rows.fet_p_value,
P=rows.P,
OR=rows.OR,
SE=rows.se,
CaAC=rows.caac,
CaNAC=rows.canac,
CoAC=rows.coac,
CoNAC=rows.conac,
maf=rows.variant_qc.AF,
call_rate=rows.variant_qc.call_rate,
mean_dp=rows.variant_qc.dp_stats.mean,
SE=rows.SE,
CaAC=rows.CaAC,
CaNAC=rows.CaNAC,
CoAC=rows.CoAC,
CoNAC=rows.CoNAC,
maf=rows.maf,
call_rate=rows.call_rate,
mean_dp=rows.mean_dp,
case_phwe=rows.case_phwe,
control_phwe=rows.control_phwe,
HGVSp=rows.HGVSp,
HGVSc=rows.HGVSc,
consequence=rows.consequence,
gene_symbol=rows.gene_symbol,
).export(
"gs://ibd-exomes/v36meta/"
+ pop_string
+ pop
+ "_"
+ disease_string
+ disease
+ "_FET_results.tsv.gz"
)
)
215 changes: 215 additions & 0 deletions analyses/GWAS/logreg.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,215 @@
import hail as hl

# hailctl dataproc start guhancluster --vep GRCh38 --zone us-west1-b --num-preemptible-workers 100 --worker-machine-type n1-standard-16 --master-machine-type n1-highmem-32

# Initialise Hail for this script; the Hail log file is written to /home/gwas.log.
hl.init(log="/home/gwas.log")

def compute_pcs(mt, pc_num):
    """Compute principal components from a filtered, LD-pruned variant subset.

    Variants are kept only if they pass, in order:
      1. call rate > 0.95 (from ``hl.variant_qc``),
      2. ``nfe.AF`` > 0.01 in the reference table read from
         ``gs://ibd-exomes/v36meta/fin_enriched_exomes_38.ht``,
      3. VEP most severe consequence == ``"synonymous_variant"``,
      4. membership in the ``hl.ld_prune`` output.

    The matrix-table dimensions are printed after steps 2-4.

    Args:
        mt: MatrixTable with a ``GT`` entry field and a ``vep`` row annotation.
        pc_num: Number of principal components to compute (passed as ``k``).

    Returns:
        A ``(eigenvalues, pcs)`` pair: the first two values returned by
        ``hl.hwe_normalized_pca``.
    """
    print("Performing variant QC and filtering on call rate...")
    filtered = hl.variant_qc(mt, name="variant_qc")
    filtered = filtered.filter_rows(filtered.variant_qc.call_rate > 0.95)

    # Reference allele-frequency table (lifted-over version).
    reference_af = hl.read_table("gs://ibd-exomes/v36meta/fin_enriched_exomes_38.ht")
    filtered = filtered.annotate_rows(nfe_ht=reference_af[filtered.row_key])

    print("Filtering gnomAD NFE AF > 0.01...")
    filtered = filtered.filter_rows(filtered.nfe_ht.nfe.AF > 0.01)
    print(filtered.count())

    print("Filtering for synonymous variants...")
    is_synonymous = filtered.vep.most_severe_consequence == "synonymous_variant"
    filtered = filtered.filter_rows(is_synonymous)
    print(filtered.count())

    # LD pruning: keep only the variants that survive hl.ld_prune.
    kept_variants = hl.ld_prune(filtered.GT)
    print("Pruning...")
    survives_pruning = hl.is_defined(kept_variants[filtered.row_key])
    filtered = filtered.filter_rows(survives_pruning)
    print(filtered.count())

    print("Computing and annotating PCs...")
    eigenvalues, pcs, _loadings = hl.hwe_normalized_pca(filtered.GT, k=pc_num)
    return eigenvalues, pcs

def run_logistic_regression(test, mt, pop, disease):
result = hl.logistic_regression_rows(
test=test,
y=mt.is_case,
x=mt.GT.n_alt_alleles(),
covariates=[
1,
mt.scores[0],
mt.scores[1],
mt.scores[2],
mt.scores[3],
mt.scores[4],
mt.scores[5],
mt.scores[6],
mt.scores[7],
mt.scores[8],
mt.scores[9],
],
)
result = result.checkpoint(
"gs://ibd-exomes/v36meta/" + pop + "_" + disease + "_" + test + "_results.ht",
overwrite=True,
)
return result


def export_tsv(mt, pop, disease):
    """Export per-variant logistic-regression results to a gzipped TSV.

    Takes the row table of ``mt``, drops its key, and writes one line per
    variant to
    ``gs://ibd-exomes/v36meta/<pop>_<disease>_logreg_results.tsv.gz``.

    For each variant the Wald statistics are reported when
    ``wald_fit.converged`` is true; otherwise the Firth statistics are used.
    The ``TEST`` column records which of the two was reported.

    Args:
        mt: MatrixTable whose rows carry ``V``, ``variant_qc``, ``vep``,
            ``wald_p``, ``wald_beta``, ``wald_se``, ``wald_fit``,
            ``firth_p``, ``firth_beta``, ``caac``, ``canac``, ``coac``,
            ``conac``, ``case_phwe`` and ``control_phwe``.
        pop: Population label used in the output file name.
        disease: Disease label used in the output file name.
    """
    rows = mt.rows().key_by()

    print("Annotating specific fields...")
    transcripts = rows.vep.transcript_consequences
    rows = rows.annotate(
        maf=rows.variant_qc.AF,
        call_rate=rows.variant_qc.call_rate,
        HGVSp=transcripts.map(lambda x: x.hgvsp),
        HGVSc=transcripts.map(lambda x: x.hgvsc),
        consequence=rows.vep.most_severe_consequence,
        mean_dp=rows.variant_qc.dp_stats.mean,
        gene_symbol=transcripts.map(lambda x: x.gene_symbol),
    )

    # Boolean expression; no need to compare it against True.
    converged = rows.wald_fit.converged

    # No Firth standard error is read from the rows, so one is back-calculated
    # from the two-sided p-value: |z| = |qnorm(p / 2)|, SE = |beta| / |z|.
    # The previous expression, beta / (1 - qnorm(p)), is not a valid
    # inversion and produced negative "standard errors" for negative betas.
    firth_se = hl.abs(rows.firth_beta) / hl.abs(hl.qnorm(rows.firth_p / 2))

    rows.select(
        V=rows.V,
        P=hl.cond(converged, rows.wald_p, rows.firth_p),
        OR=hl.cond(converged, hl.exp(rows.wald_beta), hl.exp(rows.firth_beta)),
        SE=hl.cond(converged, rows.wald_se, firth_se),
        CaAC=rows.caac,
        CaNAC=rows.canac,
        CoAC=rows.coac,
        CoNAC=rows.conac,
        maf=rows.maf,
        call_rate=rows.call_rate,
        mean_dp=rows.mean_dp,
        case_phwe=rows.case_phwe,
        control_phwe=rows.control_phwe,
        TEST=hl.cond(converged, "WALD", "FIRTH"),
        HGVSp=rows.HGVSp,
        HGVSc=rows.HGVSc,
        consequence=rows.consequence,
        gene_symbol=rows.gene_symbol,
    ).export(f"gs://ibd-exomes/v36meta/{pop}_{disease}_logreg_results.tsv.gz")



def main():
    """Run the Wald GWAS for each population/disease pair and export a TSV.

    For every pair this reads ``<pop>_<disease>.mt`` from the v36meta bucket,
    annotates case status and ten PCs (checkpointing after each), runs the
    Wald logistic regression, merges in the Wald results and the Firth
    results read from ``<pop>_<disease>_firth_results.ht``, and calls
    ``export_tsv``.

    The input matrix table must already carry a ``vep`` row annotation; VEP
    is not run here.
    """
    bucket = "gs://ibd-exomes/v36meta/"
    populations = ["NFE"]  # full set: ["AJ", "FIN", "LIT", "NFE"]
    diseases = ["cd", "ibd", "uc"]

    for pop in populations:
        print("Running " + pop + " Wald...")
        for disease in diseases:
            prefix = bucket + pop + "_" + disease

            # Load the cohort and reduce it to 500 partitions.
            cohort_mt = hl.read_matrix_table(prefix + ".mt")
            cohort_mt = cohort_mt.naive_coalesce(500)

            # A sample is a case when its DIAGNOSIS is not in `control`.
            print("Annotating case/control status...")
            in_controls = cohort_mt.control.contains(cohort_mt.DIAGNOSIS)
            cohort_mt = cohort_mt.annotate_cols(is_case=~in_controls)
            cohort_mt = cohort_mt.checkpoint(
                prefix + "_diagnosis.mt", overwrite=True
            )

            # Attach the PC scores to each sample, keyed by sample id `s`.
            _eigenvalues, pcs = compute_pcs(cohort_mt, 10)
            cohort_mt = cohort_mt.annotate_cols(scores=pcs[cohort_mt.s].scores)

            pcs_path = prefix + "_pcs.mt"
            cohort_mt = cohort_mt.checkpoint(pcs_path, overwrite=True)
            cohort_mt = hl.read_matrix_table(pcs_path)

            print(f"Conducting Wald test for {pop} population, {disease}...")

            # Run the Wald test, then reload its checkpointed table.
            wald_result = run_logistic_regression("wald", cohort_mt, pop, disease)
            print(wald_result.count())
            wald_result = hl.read_table(prefix + "_wald_results.ht")

            wald = wald_result[cohort_mt.row_key]
            cohort_mt = cohort_mt.annotate_rows(
                wald_beta=wald.beta,
                wald_se=wald.standard_error,
                wald_z_stat=wald.z_stat,
                wald_p=wald.p_value,
                wald_fit=wald.fit,
            )

            # The Firth regression is not run here: its results are read from
            # a table that must already exist. Running
            # run_logistic_regression("firth", ...) on the rows where
            # wald_fit.converged is false would write this path.
            firth_result = hl.read_table(prefix + "_firth_results.ht")

            firth = firth_result[cohort_mt.row_key]
            cohort_mt = cohort_mt.annotate_rows(
                firth_beta=firth.beta,
                firth_chi_sq_stat=firth.chi_sq_stat,
                firth_p=firth.p_value,
                firth_fit=firth.fit,
            )

            print("Exporting logreg results to table...")
            export_tsv(cohort_mt, pop, disease)


if __name__ == "__main__":
    main()
12 changes: 12 additions & 0 deletions analyses/GWAS/prep_for_gwas.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,20 @@

pop_strings = ["AJ", "FIN", "LIT", "NFE"]

print("Reading in Full MT...")
sex_mt = hl.read_matrix_table("gs://ibd-exomes/v36meta/v36+ccdg_082119.mt/")

imputed_sex = hl.impute_sex(sex_mt.GT)
imputed_sex = imputed_sex.checkpoint("gs://ibd-exomes/v36meta/imputed_sex.ht/", overwrite=True)

# WHAT?

print("Reading in QC'ed MT...")
mt = hl.read_matrix_table("gs://ibd-exomes/v36meta/v36+ccdg_qc.mt/")
print(mt.count())

mt = mt.annotate_cols(is_female=imputed_sex[mt.col_key].is_female)

print("Reading in diagnoses...")
diagnosis_info = hl.import_table("gs://ibd-exomes/v36meta/v36+ccdg_pop+diagnosis.tsv")
diagnosis_info = diagnosis_info.key_by("SAMPLE_ID")
Expand Down Expand Up @@ -149,6 +159,8 @@
)
)

pop_ca_co_mt = hl.vep(pop_ca_co_mt, "gs://hail-common/vep/vep/vep95-GRCh38-loftee-gcloud.json")

print("Writing to file...")
pop_ca_co_mt.write(
"gs://ibd-exomes/v36meta/" + pop_string + "_" + disease_string + ".mt",
Expand Down
62 changes: 0 additions & 62 deletions analyses/GWAS/wald.py

This file was deleted.

Loading

0 comments on commit f6be4c3

Please sign in to comment.