-
Notifications
You must be signed in to change notification settings - Fork 21
/
_submit_merqury.sh
executable file
·223 lines (173 loc) · 6.98 KB
/
_submit_merqury.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/env bash
if [[ "$#" -lt 3 ]]; then
echo
echo "Usage: _submit_merqury.sh [-c] <read-db.meryl> [<mat.meryl> <pat.meryl>] <asm1.fasta> [asm2.fasta] <out>"
echo
echo "***Submitter script to run each steps in parallele on slurm. Modify according to your cluster environment.***"
echo
echo -e "\t-c\t\t: [OPTIONAL] input meryl databases are homopolymer compressed, while asm.fasta isn't."
echo -e "\t\t\t This option is not supported for trio-based analysis. Use pre-compressed assemblies with no -c option."
echo -e "\t<read-db.meryl>\t: k-mer counts of the read set"
echo -e "\t<mat.meryl>\t: k-mer counts of the maternal haplotype (ex. mat.hapmer.meryl)"
echo -e "\t<pat.meryl>\t: k-mer counts of the paternal haplotype (ex. pat.hapmer.meryl)"
echo -e "\t<asm1.fasta>\t: Assembly fasta file (ex. pri.fasta, hap1.fasta or maternal.fasta)"
echo -e "\t[asm2.fasta]\t: Additional fasta file (ex. alt.fasta, hap2.fasta or paternal.fasta)"
echo -e "\t\t\t*asm1.meryl and asm2.meryl will be generated. Avoid using the same names as the hap-mer dbs"
echo -e "\t<out>\t\t: Output prefix"
echo -e "Arang Rhie, 2020-01-29. arrhie@gmail.com"
exit 0
fi
source $MERQURY/util/util.sh
compress=""
if [ "x$1" = "x-c" ]; then
compress="-c"
shift
fi
readdb=`link $1`
echo "read: $readdb"
echo
if [[ "$#" -gt 4 ]]; then
echo "Haplotype dbs provided."
echo "Running Merqury in trio mode..."
echo
hap1=`link $2`
hap2=`link $3`
asm1=`link $4`
echo "hap1: $hap1"
echo "hap2: $hap2"
echo "asm1: $asm1"
if [[ "$#" -eq 5 ]]; then
out=$5
else
asm2=`link $5`
out=$6
echo "asm2: $asm2"
fi
elif [[ "$#" -gt 2 ]]; then
echo "No haplotype dbs provided."
echo "Running Merqury in non-trio mode..."
echo
asm1=`link $2`
echo "asm1: $asm1"
if [[ "$#" -eq 3 ]]; then
out=$3
else
asm2=`link $3`
out=$4
echo "asm2: $asm2"
fi
fi
echo "out : $out"
echo
if [ -e $out ]; then
echo "$out already exists. Provide a different name. (Are we missing the <out>?)"
exit -1
fi
mkdir -p logs
# All jobs are expected to finish within 4 hours, giving more time for large genomes (>5GB)
partition=norm
walltime=8:00:00
path=`pwd`
extra=""
#### Get spectra-cn plots and QV stats
cpus=24
mem=24g
name=$out.spectra-cn
script="$MERQURY/eval/spectra-cn.sh"
args="$compress $readdb $asm1 $asm2 $out"
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > cn.jid
jid=`cat cn.jid`
if [ -z $hap1 ]; then
exit 0
fi
if [ ! -z $compress ]; then
echo "[ WARNING ] :: -c option is not supported for trio-based analysis."
echo "[ WARNING ] :: Use pre-compressed assemblies and run with no -c option."
exit 0
fi
# All below jobs are expected to finish within 4 hours
partition=quick
walltime=4:00:00
#### Get blob plots
cpus=12
mem=24g
script="$MERQURY/trio/hap_blob.sh"
# ./hap_blob.sh <hap1.meryl> <hap2.meryl> <asm1.fasta> [asm2.fasta] <out>
args="$hap1 $hap2 $asm1 $asm2 $out"
name=$out.blob
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > blob.jid
#### Get haplotype specfic spectra-cn plots
cpus=24
mem=16g
extra="--dependency=afterok:$jid" # Re-uses asm.meryl dbs in spectra-cn.sh.
name=$out.spectra-hap
script="$MERQURY/trio/spectra-hap.sh"
args="$readdb $hap1 $hap2 $asm1 $asm2 $out"
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > hap.jid
#### Get phase blocks
# This may take longer
partition=norm
walltime=12:00:00
cpus=24
mem=32g
extra=""
script="$MERQURY/trio/phase_block.sh"
# ./phase_block.sh <asm.fasta> <mat.meryl> <pat.meryl> <out>
# Only one assembly given.
asm1_name=`echo $asm1 | sed 's/\.gz$//g' | sed 's/\.fa$//g' | sed 's/\.fasta//g'`
args="$asm1 $hap1 $hap2 $out.$asm1_name"
name=$out.phase-block.$asm1_name
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > block1.jid
if [[ "$asm2" == "" ]] ; then
# Compute block stats
partition=quick
walltime=1:00:00
cpus=4
mem=8g
name="$out.block_N1"
log=logs/$name.%A.log
extra="--dependency=afterok:`cat block1.jid`"
# ./block_n_stats.sh <asm1.fasta> <asm1.*.phased_block.bed> [<asm2.fasta> <asm2.*.phased_block.bed>] <out> [genome_size]
script="$MERQURY/trio/block_n_stats.sh"
args="$asm1 $out.$asm1_name.*.phased_block.bed $out"
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > block1_N.jid
exit 0
fi
cpus=24
mem=32g
extra=""
asm2_name=`echo $asm2 | sed 's/\.gz//g' | sed 's/\.fa$//g' | sed 's/\.fasta//g'`
args="$asm2 $hap1 $hap2 $out.$asm2_name"
name=$out.phase-block.$asm2_name
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > block2.jid
# Compute block stats
partition=quick
walltime=1:00:00
cpus=4
mem=8g
extra="--dependency=afterok:`cat block1.jid`,afterok:`cat block2.jid`"
# ./block_n_stats.sh <asm1.fasta> <asm1.*.phased_block.bed> [<asm2.fasta> <asm2.*.phased_block.bed>] <out> [genome_size]
script="$MERQURY/trio/block_n_stats.sh"
args="$asm1 $out.$asm1_name.*.phased_block.bed $asm2 $out.$asm2_name.*.phased_block.bed $out"
name=$out.block_N
log=logs/$name.%A.log
echo "\
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args"
sbatch -J $name --mem=$mem --partition=$partition --cpus-per-task=$cpus -D $path $extra --time=$walltime --error=$log --output=$log $script $args > block2_N.jid