-
Notifications
You must be signed in to change notification settings - Fork 106
/
install_data.py
executable file
·122 lines (93 loc) · 4.26 KB
/
install_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env python
from optparse import OptionParser
from distutils import spawn
import glob
import os
import subprocess
import sys
################################################################################
# install_data.py
#
# Download and arrange pre-trained models and data.
################################################################################
################################################################################
# main
################################################################################
def _find_downloader():
    """Return 'wget' or 'curl', whichever is found first on the PATH, else None."""
    # find_executable returns None (not an empty string) for a missing
    # program, so it must be compared against None rather than searched.
    for tool in ('wget', 'curl'):
        if spawn.find_executable(tool) is not None:
            return tool
    return None


def _download_cmd(tool, url, out_file=None):
    """Build the shell command that downloads url with the given tool.

    tool is 'wget' or 'curl'. When out_file is given the download is written
    to that name; otherwise the tool's remote-name behavior is used.
    """
    if tool == 'wget':
        if out_file is None:
            return 'wget %s' % url
        return 'wget %s -O %s' % (url, out_file)

    # curl: -L follows redirects; -O keeps the remote name, whereas an explicit
    # output name needs lowercase -o (wget's -O means something else here).
    if out_file is None:
        return 'curl -L -O %s' % url
    return 'curl -L -o %s %s' % (out_file, url)


def _run(cmd):
    """Run cmd through the shell, warn on stderr if it fails, return its status."""
    status = subprocess.call(cmd, shell=True)
    if status != 0:
        # Keep going (the installation is best-effort), but do not hide the failure.
        sys.stderr.write('Warning: command exited with status %d: %s\n' % (status, cmd))
    return status


def main():
    """Download and arrange the pre-trained model, hg19 genome, and public data.

    Must be started from a directory that contains 'data/models' and
    'data/genomes'; the function changes the working directory into 'data'
    and leaves it there. Besides wget or curl, it shells out to gunzip, tar,
    cat, samtools, bedtools and seq_hdf5.py, which must be on the PATH.

    Command-line options:
        -r  Skip any step whose output file already exists.

    Exits with status 1 if neither wget nor curl can be found.
    """
    usage = 'usage: %prog [options] arg'
    parser = OptionParser(usage)
    parser.add_option('-r', dest='restart', default=False, action='store_true', help='Do not overwrite existing files, as if restarting an aborted installation [Default: %default]')
    options, _ = parser.parse_args()

    dl_tool = _find_downloader()
    if dl_tool is None:
        sys.stderr.write('Cannot find wget or curl to download files\n')
        sys.exit(1)

    os.chdir('data')

    ############################################################
    # download pre-trained model
    ############################################################
    os.chdir('models')

    if not options.restart or not os.path.isfile('pretrained_model.th'):
        sys.stderr.write('Downloading pre-trained model.\n')

        _run(_download_cmd(dl_tool, 'https://www.dropbox.com/s/rguytuztemctkf8/pretrained_model.th.gz'))
        _run('gunzip pretrained_model.th.gz')

    os.chdir('..')

    ############################################################
    # download human genome
    ############################################################
    os.chdir('genomes')

    if not options.restart or not os.path.isfile('hg19.fa'):
        sys.stderr.write('Downloading hg19 FASTA from UCSC. If you already have it, CTL-C to place a sym link in the genomes directory named hg19.fa\n')

        # download hg19
        _run(_download_cmd(dl_tool, 'ftp://hgdownload.cse.ucsc.edu/goldenPath/hg19/bigZips/chromFa.tar.gz', 'chromFa.tar.gz'))

        # un-tar
        _run('tar -xzvf chromFa.tar.gz')

        # cat: the two globs only match 'chr' followed by one or two
        # characters, so longer-named FASTA files are left out of hg19.fa
        _run('cat chr?.fa chr??.fa > hg19.fa')

        # clean up (the archive may be absent if the download failed)
        if os.path.isfile('chromFa.tar.gz'):
            os.remove('chromFa.tar.gz')
        for chrom_fa in glob.glob('chr*.fa'):
            os.remove(chrom_fa)

    if not options.restart or not os.path.isfile('hg19.fa.fai'):
        _run('samtools faidx hg19.fa')

    os.chdir('..')

    ############################################################
    # download and prepare public data
    ############################################################
    if not options.restart or not os.path.isfile('encode_roadmap.h5'):
        _run(_download_cmd(dl_tool, 'https://www.dropbox.com/s/h1cqokbr8vjj5wc/encode_roadmap.bed.gz'))
        _run('gunzip encode_roadmap.bed.gz')

        _run(_download_cmd(dl_tool, 'https://www.dropbox.com/s/8g3kc0ai9ir5d15/encode_roadmap_act.txt.gz'))
        _run('gunzip encode_roadmap_act.txt.gz')

        # Disabled steps, kept for reference (presumably they rebuild the two
        # files above from raw data -- confirm before re-enabling):
        #   ./get_dnase.sh
        #   preprocess_features.py -y -m 200 -s 600 -o encode_roadmap -c human.hg19.genome sample_beds.txt

        # make a FASTA file
        _run('bedtools getfasta -fi genomes/hg19.fa -bed encode_roadmap.bed -s -fo encode_roadmap.fa')

        # make an HDF5 file
        _run('seq_hdf5.py -c -t 71886 -v 70000 encode_roadmap.fa encode_roadmap_act.txt encode_roadmap.h5')
################################################################################
# __main__
################################################################################
# Run the installer only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()