-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathRUN.sh
executable file
·310 lines (310 loc) · 13.8 KB
/
RUN.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
#!/bin/bash
function printVersion {
printf "Spliceogen 2.0 1-October-2019\n"
}
function printHelp {
cat <<-END
Usage:
------
3 required args:
1) -input path/to/VCF/input/file(s).VCF
Note: multiple input files are accepted "eg. -input *.vcf"
Note: deprecated v1.0 tags "inputVCF" and "inputBED" are still accepted
2) -gtf path/to/annotation.gtf
3) -fasta path/to/genome.fa
optional arg:
4) -branchpointer hgXX
OR
-branchpointerIndels hgXX
Note: user must specify hg19 or hg38
END
}
#set default parameters
POSITIONAL=()
INPUTFILES=""
USEBP=""
USEBPINDELS=""
ANNOTATION=""
FASTAPATH=""
GENOMEBUILD=""
#parse command line args
while [[ $# -gt 0 ]]
do
key="$1"
case $key in
-h|--help)
printHelp
exit 0
;;
-v|--version)
printVersion
exit 0
;;
-input)
INPUTFILES="$2"
shift
shift
while [ "$1" ] && [[ ! $1 == *-* ]]; do
INPUTFILES="$INPUTFILES $1"
shift
done
;;
-inputVCF)
INPUTFILES="$2"
shift
shift
while [ "$1" ] && [[ ! $1 == *-* ]]; do
INPUTFILES="$INPUTFILES $1"
shift
done
;;
-inputBED)
INPUTFILES="$2"
shift
shift
while [ "$1" ] && [[ ! $1 == *-* ]]; do
INPUTFILES="$INPUTFILES $1"
shift
done
;;
-fasta)
FASTAPATH="$2"
shift
shift ;;
-gtf)
ANNOTATION="$2"
shift
shift ;;
-branchpointer)
GENOMEBUILD="$2"
USEBP="TRUE"
USEBPINDELS="FALSE"
shift
shift ;;
-branchpointerIndels)
GENOMEBUILD="$2"
USEBP="TRUE"
USEBPINDELS="TRUE"
shift
shift ;;
*)
POSITIONAL+=("$1")
shift ;;
esac
done
set - "${POSITIONAL[@]}"
#check input files exist and are not gzipped
if [ ! -f $FASTAPATH ]; then
echo -e "Fasta file not found: use -fasta ./path/to/hgXX.fa\nExiting..."
exit 1
elif [ ! -f "$ANNOTATION" ]; then
echo "GTF annotation file not found: use -gtf path/to/gencodeXX.gtf\nExiting..."
exit 1
fi
#correct mismatches in "chr" nomenclature between gtf and fasta
gtfChr=$(zcat -f "$ANNOTATION" | grep -v '^GL000' | tail -1 | awk '{print $1}' | grep chr)
fastaChr=$(cat "$FASTAPATH" | head -1 | awk '{print $1}' | grep chr)
gtfChrAdd=""
gtfChrRemove="UnmatchedString"
if [ "$fastaChr" != "" ]; then
if [ "$gtfChr" == "" ]; then
gtfChrAdd="chr"
fi
elif [ "$fastaChr" == "" ]; then
if [ "$gtfChr" != "" ]; then
gtfChrRemove="chr"
fi
fi
#prepare splice site intervals from annotation.gtf
gtfBasename=$(basename $ANNOTATION)
if [ ! -f data/"$gtfBasename"_SpliceSiteIntervals.txt ] || [[ "$ANNOTATION" -nt data/"$gtfBasename"_SpliceSiteIntervals.txt ]] ; then
echo "Preparing splice site annotation..."
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]\|[[:blank:]]transcript[[:blank:]]\|[[:blank:]]exon[[:blank:]]' | grep -v '^GL000' |
sed "s/$gtfChrRemove//" | awk -v var="$gtfChrAdd" '{print var$0}' | java -cp bin getSpliceSiteIntervalsFromGTF > data/"$gtfBasename"_SpliceSiteIntervals.txt
fi
#for each input VCF/BED file
for FILE in $INPUTFILES; do
fileID=$(echo "$FILE" | xargs -n 1 basename)
#check current file exists
if [ ! -f "$FILE" ]; then
echo "Error: variant input file not found: $FILE \n Exiting..."
exit 1
fi
echo "Input file: $fileID"
#remove temp files from any previous run
rm temp/"$fileID"* 2> /dev/null
#check input file type
FILETYPE=""
nFields=$(zcat -f $FILE | tail -1 | wc -w)
vcfHeader=$(zcat -f $FILE | head -1 | grep VCF)
if [ "$nFields" -eq 4 ]; then
FILETYPE="TSV"
elif [ ! -z "$vcfHeader" ]; then
FILETYPE="VCF"
else
FILETYPE="BED"
fi
#correct mismatches in "chr" nomenclature between variant input and provided gtf/fasta
inputChr=$(zcat -f "$FILE" | tail -1 | awk '{print $1}' | grep chr)
inputChrAdd=""
inputChrRemove="UnmatchedString"
if [ "$fastaChr" != "" ]; then
if [ "$inputChr" == "" ]; then
inputChrAdd="chr"
fi
elif [ "$fastaChr" == "" ]; then
if [ "$inputChr" != "" ]; then
inputChrRemove="chr"
fi
fi
#sort body of input file
zcat -f "$FILE" | grep "^#" > temp/"$fileID"_sorted
if [ "$FILETYPE" == "TSV" ]; then
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$1, $2, $2, "x", "1", ".", $3, $4}' >> temp/"$fileID"_sorted
else
zcat -f "$FILE" | grep -v "^#" | sort -k1,1 -k2,2n | sed "s/$inputChrRemove//" | awk -v OFS="\\t" -v var=$inputChrAdd '{print var$0}' >> temp/"$fileID"_sorted
fi
#check bedtools is installed
bedtoolsLocation=$(which bedtools);
if [ "$bedtoolsLocation" == "" ]; then
printf -- 'Warning: Bedtools does not appear to be installed.\n';
printf -- 'Get it here: https://bedtools.readthedocs.io/en/latest/content/installation.html\n';
fi;
#note: the bedtools getfasta "-name" behaviour is not backwards compatible. Between v2.26.0 and v2.27.0 the previous function of "name" was given to "name+". So need to alter command for different versions
recentBedtools=false
bedtoolsVersion=$(bedtools -version)
versionSort=$(echo -e "bedtools v2.26.0\nbedtools v2.27.0\n$bedtoolsVersion" | sort -V | head -2 | tail -1)
if [ "$versionSort" == "bedtools v2.27.0" ]; then
recentBedtools=true
fi
#bedtools intersect to get strand info
echo "Retrieving strand info..."
zcat -f "$ANNOTATION" | grep '[[:blank:]]gene[[:blank:]]' | sort -k1,1 -k4,4n | grep -v '^GL000' | awk -v var="$gtfChrAdd" -v OFS="\\t" '{print var$0}' | sed "s/$gtfChrRemove//" | bedtools intersect -a temp/"$fileID"_sorted -b stdin -wa -wb -sorted > temp/"$fileID"unstrandedInput.txt
if [ $? -ne 0 ]; then
echo "Warning. Bedtools intersect returned non-zero exit status. Intersection failed between provided variant VCF/BED file and provided GTF. See above error message for more details"
fi
if [ ! -s temp/"$fileID"unstrandedInput.txt ]; then
echo "Error: no variants were returned following bedtools intersect between input file \""$fileID"\" and gtf. \n Exiting..."
exit 1
fi
#generate flanking intervals.bed for bedtools getfasta and branchpointer input
if [ "$FILETYPE" = "VCF" ]; then
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $4, $5}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $4, $5}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
elif [ "$FILETYPE" = "TSV" ] || [ "$FILETYPE" = "BED" ] ; then
grep '[[:blank:]]+[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "+", $7, $8}' | ( [[ "$USEBP" ]] && tee temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals > temp/"$fileID"fastaIntervals.bed
grep '[[:blank:]]-[[:blank:]]' temp/"$fileID"unstrandedInput.txt | awk -v OFS="\\t" '{print ".", $1, $2, "-", $7, $8}' | ( [[ "$USEBP" ]] && tee -a temp/"$fileID"bpInput.txt || cat ) | java -cp bin getFastaIntervals >> temp/"$fileID"fastaIntervals.bed
fi
echo "Retrieving flanking FASTA sequence..."
if [ "$recentBedtools" == true ]; then
bedtools getfasta -fi $FASTAPATH -bed temp/"$fileID"fastaIntervals.bed -name+ -s > temp/"$fileID"seqToScan.FASTA
else
bedtools getfasta -fi $FASTAPATH -bed temp/"$fileID"fastaIntervals.bed -name -s > temp/"$fileID"seqToScan.FASTA
fi
if [ ! -s temp/"$fileID"seqToScan.FASTA ]; then
echo "Error: no variants were returned following bedtools getfasta command. \n Exiting..."
exit 1
fi
#seqScan: generates input strings for maxentscan and genesplicer as well as ESRseq scores
echo "Scanning for motifs..."
rm output/"$fileID"mesOmmitted.txt 2> /dev/null
rm output/"$fileID"refMismatch.txt 2> /dev/null
java -cp bin seqScan temp/"$fileID"seqToScan.FASTA -useESR $fileID 1>&2
if [ -s output/"$fileID"refMismatch.txt ]; then
refMismatchCount=$(wc -l output/"$fileID"refMismatch.txt | awk '{print $1}')
echo "Note: $refMismatchCount variants were excluded because the provided Reference allele does not match the nucleotide(s) in the provided FASTA. IDs of excluded variant(s) are outputted here: Spliceogen/output/""$fileID""refMismatch.txt"
fi
if [ -s output/"$fileID"mesOmmitted.txt ]; then
mesOmmittedCount=$(wc -l output/"$fileID"mesOmmitted.txt | awk '{print $1}')
echo "Note: $mesOmmittedCount variants were excluded from MaxEntScan because their flanking FASTA sequence contains invalid characters (most commonly \"n\"), which cannot be processed by MaxEntScan. IDs of ommitted variant(s) are listed in: Spliceogen/output/""$fileID""mesOmmitted.txt"
fi
#run maxEntScan and confirm non-zero exit, since invalid inputs cause it to exit early
if [ -s temp/"$fileID"mesDonorInput.txt ] || [ -s temp/"$fileID"mesAcceptorInput.txt ] ; then
echo "Running MaxEntScan..."
perl score5.pl temp/"$fileID"mesDonorInput.txt | sed 's/;-;(-)/;/' | sed 's/;+;/;/' | sort -t '>' -k2 | java -cp bin processScoresMES > temp/"$fileID"mesDonorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
fi
perl score3.pl temp/"$fileID"mesAcceptorInput.txt | sed 's/;-;(-)/;/' | sed 's/;+;/;/' | sort -t '>' -k2 | java -cp bin processScoresMES > temp/"$fileID"mesAcceptorScores.txt
retVal=( ${PIPESTATUS[0]} )
if [ $retVal -ne 0 ]; then
echo "MaxEntScan returned non-zero exit status. It is likely not all variants were processed. Exiting..."
exit $retVal
fi
else
echo "No input for MaxEntScan"
fi
#run genesplicer
if [ -s temp/"$fileID"gsInput.FASTA ] ; then
echo "Running GeneSplicer..."
bin/linux/genesplicerAdapted temp/"$fileID"gsInput.FASTA human > temp/"$fileID"gsScores.txt
else
echo "No input for GeneSplicer"
fi
#run branchpointer SNPs
if [ "$USEBP" = "TRUE" -a "$USEBPINDELS" = "FALSE" ]; then
echo "Running Branchpointer..."
Rscript --slave --vanilla bin/bpProcessing.R "$fileID" $(pwd) "$ANNOTATION" "$GENOMEBUILD" &> output/"$fileID"bpLog.txt
#awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $21, $22, $23, $24}' output/bpOutputSNPs.txt > output/bpSNPsSummarised.txt
fi
#run branchpointer indels
if [ "$USEBPINDELS" = "TRUE" ] ; then
echo "Running Branchpointer..."
while read -r na chr start strand ref alt; do
refLength=${#ref}
altLength=${#ref}
end=$((refLength+start))
if [ $altLength -gt 1 ] || [ $refLength -gt 1 ]; then
echo -e ".\t$chr\t$start\t$end\t$strand\t$ref\t$alt" >> temp/"$fileID"bpInputIndels.txt
fi
done < temp/"$fileID"bpInput.txt
Rscript --slave --vanilla bin/bpProcessingINDELS.R "$fileID" $(pwd) "$ANNOTATION" "$GENOMEBUILD" &> output/"$fileID"bpIndelLog.txt
#awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $16, $17, $23, $24, $25, $26}' output/"$fileID"bpOutputIndels.txt > output/bpIndelsSummarised.txt
#awk -v OFS=\\t '{print $2, $3, $4, $8, $9, $15, $16, $22, $23, $24, $25}' output/"$fileID"bpOutputSNPs.txt" > output/bpSNPsSummarised.txt
fi
#merge scores into one line
scoresToMerge=""
if [ -s temp/"$fileID"gsScores.txt ] ; then
scoresToMerge="temp/"$fileID"gsScores.txt"
fi
if [ -s temp/"$fileID"mesDonorScores.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"mesDonorScores.txt"
fi
if [ -s temp/"$fileID"mesAcceptorScores.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"mesAcceptorScores.txt"
fi
if [ -s temp/"$fileID"ESRoutput.txt ] ; then
scoresToMerge="$scoresToMerge temp/"$fileID"ESRoutput.txt"
fi
#edit splice site intervals file in event of changed input/fasta "chr" nomenclature
intervalsFileChr=$(cat data/"$gtfBasename"_SpliceSiteIntervals.txt | head -1 | awk '{print $1}' | grep chr)
if [ "$fastaChr" != "" ]; then
if [ "$intervalsFileChr" == "" ]; then
sed -i 's/^/chr/' data/"$gtfBasename"_SpliceSiteIntervals.txt
fi
elif [ "$fastaChr" == "" ]; then
if [ "$gtfChr" != "" ]; then
sed -i "s/^chr//" data/"$gtfBasename"_SpliceSiteIntervals.txt
fi
fi
checkScoresExist=$(echo "$scoresToMerge" | grep "temp")
if [ -z "$checkScoresExist" ]; then
echo "No MaxEntScan/GeneSplicer/ESRseq scores to process"
else
echo "Processing scores..."
cat $(echo "$scoresToMerge") data/"$gtfBasename"_SpliceSiteIntervals.txt sources/terminatingMergeLine.txt | sort -k1,1 -V -k 2,2n -k 3 -k 4 -s | java -cp bin mergeOutput "$fileID" inputAdd="$inputChrRemove" inputRemove="$inputChrAdd"
fi
#sort predictions
if [ -s temp/"$fileID"_gain_unsorted.txt ]; then
echo -e "#CHR\tSTART\tREF\tALT\tGENE\tdonGainP\taccGainP" > output/"$fileID"_ssGain.txt
sort -gr -k8,8 temp/"$fileID"_gain_unsorted.txt | cut -f1-7 >> output/"$fileID"_ssGain.txt
fi
if [ -s temp/"$fileID"_loss_unsorted.txt ]; then
echo -e "#CHR\tSTART\tREF\tALT\tGENE\twithinSS\tdonLossP\taccLossP" > output/"$fileID"_withinSS.txt
sort -gr -k9,9 temp/"$fileID"_loss_unsorted.txt | cut -f1-8 >> output/"$fileID"_withinSS.txt
fi
#clean up temp files
rm temp/"$fileID"* 2> /dev/null
done