forked from annie-novak9/Coloc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
main_wrapper.py
250 lines (215 loc) · 8.64 KB
/
main_wrapper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import argparse
import os
import os.path
import re
import shlex
import sys
# ---------------------------------------------------------------------------
# Command-line interface
# ---------------------------------------------------------------------------
parser = argparse.ArgumentParser(description = 'Run GWAS-eQTL colocalization pipeline.')

# Required options that each take exactly one string value.
# They are registered in this order so the --help listing keeps its layout.
_REQUIRED_TEXT_OPTIONS = (
    ('--gwas', 'GWAS summary statistics directory path'),
    ('--eqtl', 'Exact eQTL txt file path'),
    ('--vcf', 'Directory containing the vcf files from the eQTL population'),
    ('--snp_annot', 'SNP annotation files directory path'),
    ('--gene_annot', 'Exact gene annotation file path'),
    ('--geno', 'Genotype files directory path'),
    ('--frq', 'Exact frq file path'),
    ('--out', 'Specify main output directory for all output file'),
    ('--pop1', 'Population used for vcf files'),
    ('--pop4', 'Population used for frq and eQTL files'),
)
for _flag, _help_text in _REQUIRED_TEXT_OPTIONS:
    parser.add_argument(_flag, required=True, help=_help_text)

# Required options that need a type conversion and/or accept several values.
parser.add_argument(
    '--pop_size',
    type=int,
    required=True,
    help='Size of population',
)
parser.add_argument(
    '--phenotypes',
    nargs='*',
    required=True,
    help='Phenotypes to test',
)
parser.add_argument(
    '--chrs',
    type=int,
    nargs='*',
    required=True,
    help='Indicate what chromosomes to run',
)

# Optional gene subset. args.gene_id is False when the flag is absent and a
# list of strings when it is given; later code branches on that distinction.
parser.add_argument(
    '-gene_id',
    type=str,
    nargs='*',
    default=False,
    help='Input specific genes ids to run colocalization on, default is set to False',
)

args = parser.parse_args()
# ---------------------------------------------------------------------------
# Unpack the parsed options into the module-level names used further down
# ---------------------------------------------------------------------------
gwas, eqtl = args.gwas, args.eqtl

# Drop a single trailing slash from the vcf directory; paths below are built
# as vcf + "/" + filename.
vcf = args.vcf[:-1] if args.vcf[-1] == "/" else args.vcf

snp_annot, gene_annot = args.snp_annot, args.gene_annot
geno, frq, out = args.geno, args.frq, args.out
pop1, pop4 = args.pop1, args.pop4
phenos = args.phenotypes

# Chromosomes and population size are parsed as int but are only ever used
# as text when the external commands are assembled.
chrs = [str(number) for number in args.chrs]
pop_size = str(args.pop_size)

# Directory the wrapper was launched from. The helper scripts are invoked by
# relative name, so this is presumably the Coloc repo root (verify).
wd = os.getcwd()
# ---------------------------------------------------------------------------
# Check/create necessary output directories
# ---------------------------------------------------------------------------
# Layout created under the main output directory:
#   LD_matrix/<pop1>/<pop1>_1Mb_coords_LDMatrix
#   gene_lists/<pop1>
#   GWAS_TOPMED/<pop4>
#   eQTL/<pop4>
#   Coloc_output
path = args.out + '/'

# Save LD output dir (prefix used later to locate the per-population matrices)
ld = path + 'LD_matrix/'

_output_subdirs = (
    'LD_matrix/' + pop1 + '/' + pop1 + '_1Mb_coords_LDMatrix',
    'gene_lists/' + pop1,
    'GWAS_TOPMED/' + pop4,
    'eQTL/' + pop4,
    'Coloc_output',
)
for _subdir in _output_subdirs:
    # makedirs creates every missing parent (including the output root itself)
    # and exist_ok=True makes a re-run on an existing tree a no-op, so no
    # separate isdir() check is needed before each call.
    os.makedirs(path + _subdir, exist_ok=True)

print('Directories created.', flush=True)
# ---------------------------------------------------------------------------
# Run Scripts
# ---------------------------------------------------------------------------
print('Run ' + pop1 + '.', flush=True)

## Script 2: run 02_make_bed.sh once per chromosome with that chromosome's vcf
#maybe implement subprocess for parallelization
os.system("chmod u+x 02_make_bed.sh") #Make the script executable (once is enough)
vcf_files = os.listdir(vcf)
for chrom in chrs:
    # A vcf belongs to this chromosome when its name contains "<chrom>." and
    # the number is not preceded by another digit. Without that guard, "1."
    # would also match "chr11." or "chr21.".
    # NOTE(review): a name with another "<number>." elsewhere (e.g. a version
    # tag) can still match; tighten once the naming scheme is confirmed.
    chrom_pattern = re.compile(r'(?<!\d)' + re.escape(chrom + '.'))

    # Reset for every chromosome so a missing file is never silently replaced
    # by the one found for the previous chromosome.
    chr_vcf = None
    for fname in vcf_files:
        if chrom_pattern.search(fname):
            chr_vcf = vcf + "/" + fname  # last match wins
    if chr_vcf is None:
        print('WARNING: no vcf file found for chromosome ' + chrom + ' in ' + vcf
              + '; skipping 02_make_bed.sh for it.', file=sys.stderr, flush=True)
        continue

    # Quote every argument so paths containing spaces or shell metacharacters
    # reach the script as single, literal arguments.
    cmd = ' '.join(shlex.quote(part) for part in
                   ["./02_make_bed.sh", pop1, chrom, chr_vcf, out])
    status = os.system(cmd)
    if status != 0:
        # Keep going (best effort) but make the failure visible.
        print('WARNING: 02_make_bed.sh returned status ' + str(status)
              + ' for chromosome ' + chrom, file=sys.stderr, flush=True)
print('Bfiles made.', flush=True)
## Scripts 1: run 01b_run_pull_snps_driving.R once per chromosome
#Get files in dirs
geno_files = os.listdir(geno)
snp_annot_files = os.listdir(snp_annot)
for chrom in chrs:
    # Files for a chromosome are recognised by "chr<chrom>." in their name.
    chrom_tag = "chr" + chrom + "."

    # Reset per chromosome: otherwise a chromosome with no matching file
    # would silently be run with the previous chromosome's file.
    chr_geno = ""
    chr_snp = ""
    for fname in geno_files:
        if chrom_tag in fname:
            chr_geno = geno + "/" + fname  # last match wins
    for fname in snp_annot_files:
        if chrom_tag in fname:
            chr_snp = snp_annot + "/" + fname  # last match wins

    if not chr_geno or not chr_snp:
        print('WARNING: missing genotype or SNP annotation file for chromosome '
              + chrom + '; skipping 01b_run_pull_snps_driving.R for it.',
              file=sys.stderr, flush=True)
        continue

    #Run command (arguments quoted so paths with spaces stay single arguments)
    script1cmd = ' '.join(shlex.quote(part) for part in
                          ['Rscript', '01b_run_pull_snps_driving.R', chrom,
                           chr_snp, gene_annot, chr_geno, out, pop1])
    status = os.system(script1cmd)
    if status != 0:
        # Keep going (best effort) but make the failure visible.
        print('WARNING: 01b_run_pull_snps_driving.R returned status ' + str(status)
              + ' for chromosome ' + chrom, file=sys.stderr, flush=True)
print('Pulling SNPs completed.', flush=True)
## Script 3: run 03_make_LD_matrix.sh once per chromosome
os.system("chmod u+x 03_make_LD_matrix.sh")  # make the script executable (once is enough)
for chrom in chrs:
    # Build the command once, with every argument quoted, and actually use it.
    cmd = ' '.join(shlex.quote(part) for part in
                   ["./03_make_LD_matrix.sh", pop1, chrom, out])
    status = os.system(cmd)
    if status != 0:
        # Keep going (best effort) but make the failure visible.
        print('WARNING: 03_make_LD_matrix.sh returned status ' + str(status)
              + ' for chromosome ' + chrom, file=sys.stderr, flush=True)
print('LD matrices created.', flush=True)
## Scripts 4 and 5: format the inputs and run coloc for every phenotype.
## The same steps run with or without a gene subset; the only difference is
## that the gene ids, when given, are appended to the script 5 command.


def _pick_file(names, *needles):
    """Return the last entry of *names* containing every needle, or None.

    Matching is a plain substring test, so a phenotype whose name is a
    substring of another phenotype's file name will also match that file.
    """
    chosen = None
    for name in names:
        if all(needle in name for needle in needles):
            chosen = name
    return chosen


def _run_quoted(argv, step):
    """Run *argv* through the shell with every argument quoted.

    Quoting keeps paths with spaces or shell metacharacters intact. A
    non-zero status is reported on stderr and returned; nothing is raised,
    so the pipeline keeps its best-effort behaviour.
    """
    status = os.system(' '.join(shlex.quote(str(arg)) for arg in argv))
    if status != 0:
        print('WARNING: ' + step + ' returned status ' + str(status),
              file=sys.stderr, flush=True)
    return status


# args.gene_id is False when -gene_id was not given, otherwise a list of ids.
gene_ids = []
if args.gene_id is not False:
    #Get genes
    gene_ids = args.gene_id
    #Convert list of genes to string
    genes_unlist = ' '.join(gene_ids)
    print('Gene id subset to run: '+genes_unlist, flush=True)

# Locations that do not depend on the phenotype.
out_gwas = out + '/GWAS_TOPMED/' + pop4
out_eqtl = out + '/eQTL/' + pop4
#Get LD dir for this pop
pop_ld = ld + pop1 + '/' + pop1 + '_1Mb_coords_LDMatrix'

for pheno in phenos:
    print('Running '+ pheno + ' for '+pop1, flush=True)

    ## Script 4
    #Get gwas file specific to phenotype. A phenotype without a file is
    #skipped instead of crashing or reusing the previous phenotype's file.
    pheno_gwas_file = _pick_file(os.listdir(gwas), pheno)
    if pheno_gwas_file is None:
        print('WARNING: no GWAS file found for phenotype ' + pheno + ' in ' + gwas
              + '; skipping it.', file=sys.stderr, flush=True)
        continue
    #Run script 4 command; each chromosome is passed as its own argument
    _run_quoted(['Rscript', '04_prep_files_coloc.R', gwas + '/' + pheno_gwas_file,
                 eqtl, frq, out, pop4, pop_size, pheno] + list(chrs),
                '04_prep_files_coloc.R (' + pheno + ')')
    print('Input files formatted.', flush=True)

    ## Scripts 5
    #Get formatted gwas file specific to pop and phenotype
    pheno_pop_gwas = _pick_file(os.listdir(out_gwas), pheno, pop4)
    if pheno_pop_gwas is None:
        print('WARNING: no formatted GWAS file for ' + pheno + ' in ' + out_gwas
              + '; skipping coloc for it.', file=sys.stderr, flush=True)
        continue
    print('Formatted gwas file: '+pheno_pop_gwas, flush=True)

    #Get formatted eqtl file specific to pop & phenotype
    pheno_pop_eqtl = _pick_file(os.listdir(out_eqtl), pheno, pop4)
    if pheno_pop_eqtl is None:
        print('WARNING: no formatted eQTL file for ' + pheno + ' in ' + out_eqtl
              + '; skipping coloc for it.', file=sys.stderr, flush=True)
        continue
    print('Formatted eqtl file: '+pheno_pop_eqtl, flush=True)

    #Run script 5 command; gene ids (if any) follow the phenotype
    _run_quoted(['Rscript', '05b_run_coloc.R', out_gwas + '/' + pheno_pop_gwas,
                 out_eqtl + '/' + pheno_pop_eqtl, pop_ld, out, pop1, pop_size,
                 pheno] + list(gene_ids),
                '05b_run_coloc.R (' + pheno + ')')
    # NOTE(review): printed once per phenotype; confirm this matches the
    # intended nesting of the original statement.
    print('Coloc analysis finished', flush=True)

print('Pipeline completed running.', flush=True)