-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathHybPhyloMaker0b_preparereference.sh
201 lines (187 loc) · 7.85 KB
/
HybPhyloMaker0b_preparereference.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=2:0:0
#PBS -l select=1:ncpus=1:mem=1gb:scratch_local=1gb
#PBS -j oe
#PBS -N HybPhyloMaker0b_preparereference
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -q sThC.q
#$ -l mres=1G
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker0b_preparereference
#$ -o HybPhyloMaker0b_preparereference.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * https://github.com/tomas-fer/HybPhyloMaker *
# * Script 0b - Prepare pseudoreference *
# * v.1.6.5 *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2018 *
# * tomas.fer@natur.cuni.cz *
# ********************************************************************************
# Make two files:
# (1) pseudo-reference from multifasta file (e.g. Hyb-Seq probes),
# i.e. make single fasta (one line) with all original sequences separated by number of Ns specified in $nrns
# $nrns Ns are also added to the beginning and the end of the reference
# Pseudo-reference (*_with${nrns}Ns_beginend.fas) is saved to HybSeqSource directory
# (2) BED file with starting and ending positions of all exons in pseudoreference file
# This file is later used for per exon coverage calculation
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker0b is running on MetaCentrum...\n"
#settings for MetaCentrum
#Move to scratch
cd $SCRATCHDIR
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
. /packages/run/modules-2.0/init/bash
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker0b is running on Hydra...\n"
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir00_ref
cd workdir00_ref
else
echo -e "\nHybPhyloMaker0b is running locally...\n"
#settings for local run
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir00_ref
cd workdir00_ref
fi
#Check necessary file
echo -ne "Testing if input data are available..."
if [[ $cp =~ "yes" ]]; then
if [ ! -f "$source/$cpDNACDS" ]; then
echo -e "'$cpDNACDS' is missing in 'HybSeqSource'. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit 3
else
cp $source/$cpDNACDS .
# Cut filename before first '.', i.e. remove suffix - does not work if there are other dots in reference file name
name=`ls $cpDNACDS | cut -d'.' -f 1`
rm $cpDNACDS
if [ -f "$source/${name}_with${nrns}Ns_beginend.fas" ]; then
echo -e "File '${name}_with${nrns}Ns_beginend.fas' already exists in '$source'. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit
elif [ -f "$source/${name}_with${nrns}Ns_beginend_exon_positions.bed" ]; then
echo -e "File '${name}_with${nrns}Ns_beginend_exon_positions.bed' already exists in '$source'. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit
else
echo -e "OK\n"
fi
fi
else
if [ ! -f "$source/$probes" ]; then
echo -e "'$probes' is missing in 'HybSeqSource'. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit 3
else
cp $source/$probes .
# Cut filename before first '.', i.e. remove suffix - does not work if there are other dots in reference file name
name=`ls $probes | cut -d'.' -f 1`
rm $probes
if [ -f "$source/${name}_with${nrns}Ns_beginend.fas" ]; then
echo -e "File '${name}_with${nrns}Ns_beginend.fas' already exists in '$source'. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit
elif [ -f "$source/${name}_with${nrns}Ns_beginend_exon_positions.bed" ]; then
echo -e "File '${name}_with${nrns}Ns_beginend_exon_positions.bed' already exists in '$source'. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit
else
echo -e "OK\n"
fi
fi
fi
if [[ ! $location == "1" ]]; then
if [ "$(ls -A ../workdir00_ref)" ]; then
echo -e "Directory 'workdir00_ref' already exists and is not empty. Delete it or rename before running this script again. Exiting...\n"
rm -d ../workdir00_ref/ 2>/dev/null
exit 3
fi
fi
#Copy probe sequence file and remove 'CR' characters
if [[ $cp =~ "yes" ]]; then
cp $source/$cpDNACDS .
sed -i 's/\x0D$//' $cpDNACDS
else
cp $source/$probes .
sed -i 's/\x0D$//' $probes
fi
# 1. Prepare pseudoreference
# Cut filename before first '.', i.e. remove suffix - does not work if there are other dots in reference file name
if [[ $cp =~ "yes" ]]; then
name=`ls $cpDNACDS | cut -d'.' -f 1`
else
name=`ls $probes | cut -d'.' -f 1`
fi
# Print headers in fasta file
echo ">${name}_with${nrns}Ns_beginend" > ${name}_with${nrns}Ns_beginend.fas
# 0. print N $nrns times to variable $a
#a=$(printf "%0.sN" {1..400})
a=$(printf "%0.sN" $(seq 1 $nrns))
# 1. awk command to remove EOLs from lines not beginning with '>', i.e. all lines containing sequence for particular record are merged, i.e. each record on two lines only
# 2. sed command to delete all lines starting with '>', i.e. only sequences remain
# 3. tr command to replace all EOLs ("\n") by Q
# 4. sed command to replace all Qs by a sequence of $nrns Ns (saved in variable $a)
# 5. awk command to print $nrns Ns to the beginning and the end of the reference
if [[ $cp =~ "yes" ]]; then
cat $cpDNACDS | awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' | sed '/>/d' | tr "\n" "Q" | sed "s/Q/$a/g" | awk -v val=$a '{ print val $0 val }' >> ${name}_with${nrns}Ns_beginend.fas
else
cat $probes | awk '!/^>/ { printf "%s", $0; n = "\n" } /^>/ { print n $0; n = "" } END { printf "%s", n }' | sed '/>/d' | tr "\n" "Q" | sed "s/Q/$a/g" | awk -v val=$a '{ print val $0 val }' >> ${name}_with${nrns}Ns_beginend.fas
fi
#Copy pseudoreference to home
cp ${name}_with${nrns}Ns_beginend.fas $source
echo -e "Pseudoreference named '${name}_with${nrns}Ns_beginend.fas' was prepared and saved to '$source'..."
# 2. Prepare BED file
# Remove fasta header (if any), i.e., delete all line beginning with '>'
header=$(grep ">" ${name}_with${nrns}Ns_beginend.fas | sed 's/>//')
if [[ $cp =~ "yes" ]]; then
sed '/^>/ d' $cpDNACDS > reflines.txt
else
sed '/^>/ d' $probes > reflines.txt
fi
# cat $probes | sed 's/N\{400\}/Q/g' | tr 'Q' '\n' > reflines.txt
# Delete empty lines
sed -i.bak '/^$/d' reflines.txt
# Count length of each line
awk '{print length($0)}' reflines.txt > lengths.txt
# Set $linesum to $nrns + 1 (i.e., starting position of the first exon)
linesum=`expr $nrns + 1`
cat lengths.txt | while read line
do
# Print BED file (name and two numbers: 1. start ($linesum) 2. end of the exon ($linesum + length of the exon - 1)
echo $header $linesum `expr $linesum + $line - 1` >> ${header}_exon_positions.bed
# Increaese $linesum by length of the region and $nrns (so now it is start position of the next exon)
linesum=`expr $linesum + $line + $nrns`
done
# Replace spaces by tabs
cat ${header}_exon_positions.bed | tr ' ' '\t' > tmp && mv tmp ${header}_exon_positions.bed
#Copy BED file to home
cp ${header}_exon_positions.bed $source
echo -e "\nBED file '${header}_exon_positions.bed' was prepared and saved to '$source'..."
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r workdir00_ref
fi
echo -e "\nScript HybPhyloMaker0b finished...\n"