-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun_StarAlign2pass.sh
executable file
·184 lines (144 loc) · 4.98 KB
/
run_StarAlign2pass.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
#!/usr/bin/env bash
# run_StarAlign2pass.sh
# takes all fastq_files from the local 'reads' folder
# align reads to a reference
# perform two passes:
# first pass to add existing junctions to the reference index
# second pass to align to the extended reference index
#
# Usage: run_StarAlign2pass.sh [workdir]
# Requires: the STAR executable on the PATH and two environment variables,
#   STAR_INDEXES (parent folder of the STAR index) and
#   BIODATA      (parent folder of the 'references' folder)
#
# Stéphane Plaisance - VIB-BITS - Feb-27-2020 v1.1
# including NC stringency options
# but keeping other defaults as on STAR forum
version="1.1, 2020_03_04"

# Without these two variables every reference path below would silently
# resolve under '/', so stop early with a clear message instead.
: "${STAR_INDEXES:?STAR_INDEXES must be set to the folder holding the STAR indexes}"
: "${BIODATA:?BIODATA must be set to the folder holding the references}"

# project folder: first argument, else the built-in default
workdir=${1:-"/data2/NC_projects/DNBSEQG400_validation/DNBSEQG400_eval3"}
# the rest of the script runs from inside the project folder,
# so carrying on after a failed cd would act on the wrong directory
cd "${workdir}" || { echo "# cannot enter workdir: ${workdir}" >&2; exit 1; }

# value passed to --runThreadN
thr=84
# read length; the index overhang passed to --sjdbOverhang is readlen-1
readlen=100
idxlen=$((readlen - 1))

# reference index and annotation files
refindex=${STAR_INDEXES}/GRCh38.99
reffolder=${BIODATA}/references
refgtf=${reffolder}/Homo_sapiens.GRCh38.99/Homo_sapiens.GRCh38.99.chr.gtf
reffasta=${reffolder}/Homo_sapiens.GRCh38.99/Homo_sapiens.GRCh38.dna.primary_assembly.fa
#####################################
# Nucleomics STAR optional settings #
#####################################
# Extra STAR parameters, stored as literal text.
# Every line but the last ends with ' \' so the value reads as one
# continued command line when a command string embedding it is eval'ed.
STAR_OPTIONS='--outFilterMismatchNmax 10 \
--outFilterMismatchNoverLmax 0.3 \
--alignSJDBoverhangMin 3 \
--alignSJoverhangMin 5 \
--alignIntronMin 21 \
--alignIntronMax 500000 \
--outFilterMultimapNmax 10 \
--outSJfilterOverhangMin 12 30 30 30 \
--outWigType None \
--outSAMprimaryFlag OneBestScore'
######################################
# build STAR index if does not exist #
######################################
# The index folder doubles as the "already built" marker: generation is
# only attempted when that folder is absent.
if [[ ! -d "${refindex}" ]]; then
  mkdir -p ${refindex}
  # assemble the genomeGenerate call one option at a time
  cmd="STAR"
  cmd+=" --runMode genomeGenerate"
  cmd+=" --runThreadN ${thr}"
  cmd+=" --genomeDir ${refindex}"
  cmd+=" --genomeFastaFiles ${reffasta}"
  cmd+=" --sjdbGTFfile ${refgtf}"
  cmd+=" --sjdbOverhang ${idxlen}"
  # log the exact command, then run it
  echo "# ${cmd}"
  eval ${cmd}
else
  echo "# STAR index already present, passing"
fi
##############################################################
# first pass on all samples to collect all possible junctions
##############################################################
# Each '<sample>_1.fq.gz' in ${workdir}/reads is aligned together with its
# '<sample>_2.fq.gz' mate; no SAM/BAM is written (--outSAMtype None).
mkdir -p "${workdir}/STAR_mappings_PASS-1"
for reads1 in "${workdir}"/reads/*_1.fq.gz; do
  # an unmatched glob is handed over as the literal pattern: skip it
  [[ -e "${reads1}" ]] || continue

  # the sample name deliberately keeps its trailing '_' ("<sample>_"):
  # it is used as output prefix, and the junction-merging step later
  # collects files matching '*_SJ.out.tab'
  samplename=$(basename "${reads1%1.fq.gz}")
  reads2=${reads1%_1.fq.gz}_2.fq.gz
  outpfx=${workdir}/STAR_mappings_PASS-1/${samplename}

  # every option is appended with its own leading blank, so the output
  # prefix can no longer run into the first option of STAR_OPTIONS
  cmd="STAR"
  cmd+=" --runMode alignReads"
  cmd+=" --genomeDir ${refindex}"
  cmd+=" --genomeLoad LoadAndKeep"
  cmd+=" --readFilesCommand zcat"
  cmd+=" --readFilesIn ${reads1} ${reads2}"
  cmd+=" --outSAMtype None"
  cmd+=" --runThreadN ${thr}"
  cmd+=" --outFileNamePrefix ${outpfx}"
  cmd+=" ${STAR_OPTIONS}"

  echo "# first PASS alignment for $(basename "${reads1}") $(basename "${reads2}")"
  echo "# ${cmd}"
  # quoted on purpose: STAR_OPTIONS holds backslash-newline continuations
  # that eval must see intact; unquoted, word splitting would turn each of
  # them into an escaped blank glued to the following option
  eval "${cmd}"
done
# unload genome
# (the first-pass alignments ran with --genomeLoad LoadAndKeep)
printf -v cmd '%s %s %s %s %s' \
  "STAR" \
  "--genomeDir ${refindex}" \
  "--genomeLoad Remove" \
  "--outSAMtype None" \
  "--outFileNamePrefix /dev/null/"
echo "# unloading the reference index"
echo "# ${cmd}"
eval ${cmd}
#############################
# merge and filter junctions
#############################
# 1. Filter out the junctions on the mitochondrial chromosome, those are most
#    likely to be false. The name is matched as M, MT, chrM or chrMT because
#    Ensembl references call it 'MT' while other sources use 'chrM' or 'M'.
# 2. Filter out non-canonical junctions (column5 == 0).
# 3. Filter out junctions supported by multimappers only (column7==0)
# 4. Filter out junctions supported by too few reads (e.g. column7<=2)
# Kept rows are reduced to chromosome, start, end and strand, the strand code
# of column 4 (0/1/2) being rewritten as './+/-'.
#
# The awk program lives in its own single-quoted variable so that it needs no
# backslash escaping when embedded in the logged command string.
sj_filter='BEGIN {OFS="\t"; strChar[0]="."; strChar[1]="+"; strChar[2]="-"}
($1 !~ /^(chr)?MT?$/) && ($5>0) && ($7>2) {print $1,$2,$3,strChar[$4]}'
cmd="cat ${workdir}/STAR_mappings_PASS-1/*_SJ.out.tab \
| awk '${sj_filter}' \
| sort -k 1V,1 -k 2n,2 -k 3n,3 \
| uniq > ${workdir}/SJ.out.all.tab"
echo "# merging and filtering all junction files"
echo "# ${cmd}"
# quoted so the command reaches eval exactly as logged (no extra word
# splitting or globbing of the awk program text)
eval "${cmd}"
##################################
# create new index with junctions
##################################
# The new index is written to ${workdir}/SJ_index; it is generated from the
# same fasta and GTF as the base index plus the merged junction table.
outpfx=${workdir}/SJ_index/
mkdir -p ${workdir}/SJ_index
cmd="STAR"
cmd+=" --runMode genomeGenerate"
cmd+=" --genomeDir ${workdir}/SJ_index"
cmd+=" --genomeFastaFiles ${reffasta}"
cmd+=" --sjdbGTFfile ${refgtf}"
cmd+=" --sjdbFileChrStartEnd ${workdir}/SJ.out.all.tab"
cmd+=" --runThreadN ${thr}"
cmd+=" --sjdbOverhang ${idxlen}"
cmd+=" --outFileNamePrefix ${outpfx}"
echo "# creating junction-aware reference index"
# log the exact command, then run it
echo "# ${cmd}"
eval ${cmd}
################################################
# second pass on all samples and counting genes
################################################
# Each read pair is re-aligned against the junction-aware index in
# ${workdir}/SJ_index; output is a coordinate-sorted BAM, with
# --quantMode GeneCounts requested.
mkdir -p "${workdir}/STAR_mappings_PASS-2"
# same absolute glob as the first pass, so both passes see the same files
for reads1 in "${workdir}"/reads/*_1.fq.gz; do
  # an unmatched glob is handed over as the literal pattern: skip it
  [[ -e "${reads1}" ]] || continue

  # the sample name keeps its trailing '_' and is used as output prefix
  samplename=$(basename "${reads1%1.fq.gz}")
  reads2=${reads1%_1.fq.gz}_2.fq.gz
  outpfx=${workdir}/STAR_mappings_PASS-2/${samplename}

  cmd="STAR"
  cmd+=" --runMode alignReads"
  cmd+=" --runThreadN ${thr}"
  cmd+=" --genomeDir ${workdir}/SJ_index"
  cmd+=" --readFilesCommand zcat"
  cmd+=" --readFilesIn ${reads1} ${reads2}"
  cmd+=" --outFileNamePrefix ${outpfx}"
  cmd+=" --sjdbFileChrStartEnd ${workdir}/SJ.out.all.tab"
  cmd+=" --outFilterType BySJout"
  cmd+=" --outSAMtype BAM SortedByCoordinate"
  cmd+=" --outSAMattributes Standard"
  cmd+=" --outSAMunmapped Within"
  cmd+=" --quantMode GeneCounts"
  cmd+=" ${STAR_OPTIONS}"

  echo "# second PASS alignment for $(basename "${reads1}") $(basename "${reads2}")"
  echo "# ${cmd}"
  # quoted on purpose: STAR_OPTIONS holds backslash-newline continuations
  # that eval must see intact; unquoted, word splitting would turn each of
  # them into an escaped blank glued to the following option
  eval "${cmd}"
done
# cleanup: remove the _STARtmp folder from the current directory
rm -rf -- _STARtmp