-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathmummer_biplot.sh
executable file
·344 lines (315 loc) · 13.2 KB
/
mummer_biplot.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
#!/bin/bash
# mummer_biplot.sh: produce a pairwise plot from two fasta sequences
#
# Requirements:
# run on a unix computer installed with mummer3+ (mummer apps in $PATH)
# two related fasta references to be compared
#
# NOTE: asm2dotplot.sh + pafCoordsDotPlotly_SP.R is a much faster alternative to this script
#
# Stephane Plaisance (VIB-NC+BITS) 2017/09/22; v1.0
#
# visit our Git: https://github.com/Nucleomics-VIB
# check parameters for your system
# Script release identifier; it is interpolated into the help text below.
version="1.01, 2018_04_13"

# Help text: printed for -h and when a mandatory assembly argument is missing.
# A single double-quoted string, so ${version} expands in place.
usage="# Usage: mummer_biplot.sh -x <reference assembly> -y <query assembly>
# script version ${version}
# [optional: -o <result folder>]
# [optional: -c <min-cluster|100>]
# [optional: -t <data type (nucmer|promer; default nucmer)>]
# [optional: -I <min-identity to include in show-coords|95>]
# [optional: -L <min-align length to include in show-coords|100>]
# [optional: -f <output format (png,postscript,x11)|png>]
# [optional: -T <threads for nucmer job>|1]
# [optional: -h <this help text>]"
#######################################
# Parse the command-line options into global variables.
# Globals (written): assembly1, assembly2, outpathopt, format, clust,
#                    datatype, minidentityopt, minalignopt, threads
# Globals (read):    usage
# Arguments: the script's command-line arguments ("$@")
# Outputs:   help text or an error message on stderr
# Exits:     0 after -h; 1 on an unknown option or a missing option argument
#######################################
parse_args() {
  local opt
  local OPTIND=1
  # The leading ':' selects getopts' silent mode: OPTARG then holds the
  # offending option letter, so the error messages below can name it.
  # Only options that have a handler are listed in the option string.
  while getopts ":x:y:o:c:t:I:L:f:T:h" opt; do
    case "${opt}" in
      x) assembly1=${OPTARG} ;;
      y) assembly2=${OPTARG} ;;
      o) outpathopt=${OPTARG} ;;
      f) format=${OPTARG} ;;
      c) clust=${OPTARG} ;;
      t) datatype=${OPTARG} ;;
      I) minidentityopt=${OPTARG} ;;
      L) minalignopt=${OPTARG} ;;
      T) threads=${OPTARG} ;;
      h) echo "${usage}" >&2; exit 0 ;;
      :) echo "Option -${OPTARG} requires an argument, try -h" >&2; exit 1 ;;
      \?) echo "Invalid option: -${OPTARG}" >&2; exit 1 ;;
    esac
  done
}

parse_args "$@"
# defaults (each one is overridden by the matching command-line option)
cluster=${clust:-100}
outformat=${format:-png}

# filtering options for show-coords; both thresholds are also encoded in the
# suffix used to name the filtered coordinates file
minidentity=${minidentityopt:-95}
minalign=${minalignopt:-100}
coordfilter="_filtered-I.${minidentity}_filtered-L.${minalign}"

# choose protein or nucleic alignment method (nucmer unless -t was given)
prog=${datatype:-nucmer}

# threads => for nucmer only; any other aligner gets no thread option
thr=${threads:-1}
multi=""
# quoted comparison: a -t value containing whitespace cannot break the test
if [[ "${prog}" == "nucmer" ]]; then
  multi="-t ${thr}"
fi
#######################################
# Abort unless an assembly argument was supplied and names a regular file.
# Globals:   usage (read)
# Arguments: $1 - ordinal used in the message ("first" or "second")
#            $2 - path given on the command line (may be empty)
# Outputs:   diagnostic (plus the usage text when the argument is missing)
#            on stderr
# Exits:     1 when the path is empty or is not a regular file
#######################################
require_assembly() {
  local ordinal=$1
  local path=$2
  if [[ -z "${path}" ]]; then
    echo "# no ${ordinal} assembly provided!" >&2
    echo "${usage}" >&2
    exit 1
  fi
  if [[ ! -f "${path}" ]]; then
    echo "${path} file not found!" >&2
    exit 1
  fi
}

# test if minimal arguments were provided
require_assembly "first" "${assembly1}"
require_assembly "second" "${assembly2}"
#######################################
# Abort unless a command can be resolved by the shell.
# Arguments: $1 - command name to look up
#            $2 - message printed when the lookup fails
# Outputs:   the message on stderr when the command is missing
# Exits:     1 when the command is not found
#######################################
require_tool() {
  if ! command -v "$1" >/dev/null 2>&1; then
    echo "$2" >&2
    # exit runs in the main shell (not inside a "( ... )" subshell), so a
    # missing tool really stops the script here
    exit 1
  fi
}

# check if mummer requirements are present
require_tool "${prog}" "# ${prog} not found in PATH (nucmer or promer?)"
require_tool show-coords "# show-coords not found in PATH"
require_tool mummerplot "# mummerplot not found in PATH"
#######################################
# Derive a plot label from a fasta path: drop the directory part first, then
# the shortest trailing ".f*" suffix (e.g. .fa, .fasta, .fna).
# Stripping the directory first keeps a "." inside a directory name from
# being mistaken for the start of the file extension.
# Arguments: $1 - path to a fasta file
# Outputs:   the label on stdout
#######################################
label_from_path() {
  local name=${1##*/}
  printf '%s\n' "${name%.f*}"
}

# labels from filenames
xlabel=$(label_from_path "${assembly1}")
ylabel=$(label_from_path "${assembly2}")

# other parameters or defaults
outpath=${outpathopt:-"mummer_results"}
mkdir -p "${outpath}"

# prefix shared by the aligner, show-coords and mummerplot output files
result="${outpath}/${prog}-plot-${ylabel%.f*}_vs_${xlabel%.f*}"
# build the command
stamp=$(date +%s)

# logfile (the epoch time stamp keeps logs of repeated runs apart)
logfile="${outpath}/${ylabel}_vs_${xlabel}_mummer3-log_${stamp}.txt"

# ${multi} is either empty or a "-t <threads>" string; split it into words
read -r -a multi_args <<< "${multi}"

# The aligner call is held in an array and executed directly (no eval), so
# paths containing spaces or shell metacharacters reach the tool unchanged.
align_cmd=("${prog}" --maxmatch -c "${cluster}" -p "${result}")
align_cmd+=("${multi_args[@]}")
align_cmd+=("${assembly1}" "${assembly2}")

# show and execute
echo "# ${align_cmd[*]} > ${logfile} 2>&1"
"${align_cmd[@]}" > "${logfile}" 2>&1
align_status=$?

if [[ ${align_status} -ne 0 ]]; then
  echo "Mummer analysis seems to have failed, please check ${logfile}!" >&2
  # report the failure to the caller instead of exiting 0
  exit 1
fi

# after success create alignment files and plot
coords_all="${result}_all_coords.txt"
coords_filtered="${result}${coordfilter}_coords.txt"

echo "# (show-coords -r -c -l ${result}.delta > ${coords_all} &&" \
  "show-coords -r -c -l -I ${minidentity} -L ${minalign} ${result}.delta" \
  "> ${coords_filtered} &&" \
  "mummerplot --fat --filter --layout --${outformat} --large -p ${result}" \
  "${result}.delta) >> ${logfile} 2>&1"

# Each step runs only if the previous one succeeded; everything not captured
# in a coordinates file is appended to the log.
{
  show-coords -r -c -l "${result}.delta" > "${coords_all}" \
    && show-coords -r -c -l -I "${minidentity}" -L "${minalign}" \
      "${result}.delta" > "${coords_filtered}" \
    && mummerplot --fat --filter --layout "--${outformat}" --large \
      -p "${result}" "${result}.delta"
} >> "${logfile}" 2>&1
post_status=$?

if [[ ${post_status} -ne 0 ]]; then
  echo "show-coords or mummerplot seems to have failed, please check ${logfile}!" >&2
  exit 1
fi

exit 0
########################################################################################
# man pages for the main executables used above
# USAGE: nucmer [options] <Reference> <Query>
#
# DESCRIPTION:
# nucmer generates nucleotide alignments between two multi-FASTA input
# files. The out.delta output file lists the distance between insertions
# and deletions that produce maximal scoring alignments between each
# sequence. The show-* utilities know how to read this format.
#
# MANDATORY:
# Reference Set the input reference multi-FASTA filename
# Query Set the input query multi-FASTA filename
#
# OPTIONS:
# --mum Use anchor matches that are unique in both the reference
# and query
# --mumcand Same as --mumreference
# --mumreference Use anchor matches that are unique in the reference
# but not necessarily unique in the query (default behavior)
# --maxmatch Use all anchor matches regardless of their uniqueness
#
# -b|breaklen Set the distance an alignment extension will attempt to
# extend poor scoring regions before giving up (default 200)
# --[no]banded Enforce absolute banding of dynamic programming matrix
# based on diagdiff parameter EXPERIMENTAL (default no)
# -c|mincluster Sets the minimum length of a cluster of matches (default 65)
# --[no]delta Toggle the creation of the delta file (default --delta)
# --depend Print the dependency information and exit
# -D|diagdiff Set the maximum diagonal difference between two adjacent
# anchors in a cluster (default 5)
# -d|diagfactor Set the maximum diagonal difference between two adjacent
# anchors in a cluster as a differential fraction of the gap
# length (default 0.12)
# --[no]extend Toggle the cluster extension step (default --extend)
# -f
# --forward Use only the forward strand of the Query sequences
# -g|maxgap Set the maximum gap between two adjacent matches in a
# cluster (default 90)
# -h
# --help Display help information and exit
# -l|minmatch Set the minimum length of a single match (default 20)
# -o
# --coords Automatically generate the original NUCmer1.1 coords
# output file using the 'show-coords' program
# --[no]optimize Toggle alignment score optimization, i.e. if an alignment
# extension reaches the end of a sequence, it will backtrack
# to optimize the alignment score instead of terminating the
# alignment at the end of the sequence (default --optimize)
# -p|prefix Set the prefix of the output files (default "out")
# -r
# --reverse Use only the reverse complement of the Query sequences
# --[no]simplify Simplify alignments by removing shadowed clusters. Turn
# this option off if aligning a sequence to itself to look
# for repeats (default --simplify)
# -V
# --version Display the version information and exit
# USAGE: promer [options] <Reference> <Query>
#
# DESCRIPTION:
# promer generates amino acid alignments between two multi-FASTA DNA input
# files. The out.delta output file lists the distance between insertions
# and deletions that produce maximal scoring alignments between each
# sequence. The show-* utilities know how to read this format. The DNA
# input is translated into all 6 reading frames in order to generate the
# output, but the output coordinates reference the original DNA input.
#
# MANDATORY:
# Reference Set the input reference multi-FASTA DNA file
# Query Set the input query multi-FASTA DNA file
#
# OPTIONS:
# --mum Use anchor matches that are unique in both the reference
# and query
# --mumcand Same as --mumreference
# --mumreference Use anchor matches that are unique in the reference
# but not necessarily unique in the query (default behavior)
# --maxmatch Use all anchor matches regardless of their uniqueness
#
# -b|breaklen Set the distance an alignment extension will attempt to
# extend poor scoring regions before giving up, measured in
# amino acids (default 60)
# -c|mincluster Sets the minimum length of a cluster of matches, measured in
# amino acids (default 20)
# --[no]delta Toggle the creation of the delta file (default --delta)
# --depend Print the dependency information and exit
# -d|diagfactor Set the clustering diagonal difference separation factor
# (default .11)
# --[no]extend Toggle the cluster extension step (default --extend)
# -g|maxgap Set the maximum gap between two adjacent matches in a
# cluster, measured in amino acids (default 30)
# -h
# --help Display help information and exit.
# -l|minmatch Set the minimum length of a single match, measured in amino
# acids (default 6)
# -m|masklen Set the maximum bookend masking length, measured in amino
# acids (default 8)
# -o
# --coords Automatically generate the original PROmer1.1 ".coords"
# output file using the "show-coords" program
# --[no]optimize Toggle alignment score optimization, i.e. if an alignment
# extension reaches the end of a sequence, it will backtrack
# to optimize the alignment score instead of terminating the
# alignment at the end of the sequence (default --optimize)
#
# -p|prefix Set the prefix of the output files (default "out")
# -V
# --version Display the version information and exit
# -x|matrix Set the alignment matrix number to 1 [BLOSUM 45], 2 [BLOSUM
# 62] or 3 [BLOSUM 80] (default 2)
# USAGE: show-coords [options] <deltafile>
#
# -b Merges overlapping alignments regardless of match dir
# or frame and does not display any identity information.
# -B Switch output to btab format
# -c Include percent coverage information in the output
# -d Display the alignment direction in the additional
# FRM columns (default for promer)
# -g Deprecated option. Please use 'delta-filter' instead
# -h Display help information
# -H Do not print the output header
# -I float Set minimum percent identity to display
# -k Knockout (do not display) alignments that overlap
# another alignment in a different frame by more than 50%
# of their length, AND have a smaller percent similarity
# or are less than 75% of the size of the other alignment
# (promer only)
# -l Include the sequence length information in the output
# -L long Set minimum alignment length to display
# -o Annotate maximal alignments between two sequences, i.e.
# overlaps between reference and query sequences
# -q Sort output lines by query IDs and coordinates
# -r Sort output lines by reference IDs and coordinates
# -T Switch output to tab-delimited format
# USAGE: mummerplot [options] <match file>
#
# DESCRIPTION:
# mummerplot generates plots of alignment data produced by mummer, nucmer,
# promer or show-tiling by using the GNU gnuplot utility. After generating
# the appropriate scripts and datafiles, mummerplot will attempt to run
# gnuplot to generate the plot. If this attempt fails, a warning will be
# output and the resulting .gp and .[frh]plot files will remain so that the
# user may run gnuplot independently. If the attempt succeeds, either an x11
# window will be spawned or an additional output file will be generated
# (.ps or .png depending on the selected terminal). Feel free to edit the
# resulting gnuplot script (.gp) and rerun gnuplot to change line thickness,
# labels, colors, plot size etc.
#
# MANDATORY:
# match file Set the alignment input to 'match file'
# Valid inputs are from mummer, nucmer, promer and
# show-tiling (.out, .cluster, .delta and .tiling)
#
# OPTIONS:
# -b|breaklen Highlight alignments with breakpoints further than
# breaklen nucleotides from the nearest sequence end
# --[no]color Color plot lines with a percent similarity gradient or
# turn off all plot color (default color by match dir)
# If the plot is very sparse, edit the .gp script to plot
# with 'linespoints' instead of 'lines'
# -c
# --[no]coverage Generate a reference coverage plot (default for .tiling)
# --depend Print the dependency information and exit
# -f
# --filter Only display .delta alignments which represent the "best"
# hit to any particular spot on either sequence, i.e. a
# one-to-one mapping of reference and query subsequences
# -h
# --help Display help information and exit
# -l
# --layout Layout a .delta multiplot in an intelligible fashion,
# this option requires the -R -Q options
# --fat Layout sequences using fattest alignment only
# -p|prefix Set the prefix of the output files (default 'out')
# -rv Reverse video for x11 plots
# -r|IdR Plot a particular reference sequence ID on the X-axis
# -q|IdQ Plot a particular query sequence ID on the Y-axis
# -R|Rfile Plot an ordered set of reference sequences from Rfile
# -Q|Qfile Plot an ordered set of query sequences from Qfile
# Rfile/Qfile Can either be the original DNA multi-FastA
# files or lists of sequence IDs, lens and dirs [ /+/-]
# -r|rport Specify the port to send reference ID and position on
# mouse double click in X11 plot window
# -q|qport Specify the port to send query IDs and position on mouse
# double click in X11 plot window
# -s|size Set the output size to small, medium or large
# --small --medium --large (default 'small')
# -S
# --SNP Highlight SNP locations in each alignment
# -t|terminal Set the output terminal to x11, postscript or png
# --x11 --postscript --png (default 'x11')
# -t|title Specify the gnuplot plot title (default none)
# -x|xrange Set the xrange for the plot '[min:max]'
# -y|yrange Set the yrange for the plot '[min:max]'
# -V
# --version Display the version information and exit