-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathHybPhyloMaker1a2_rawprocessParallel_summary.sh
166 lines (152 loc) · 7.01 KB
/
HybPhyloMaker1a2_rawprocessParallel_summary.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
#!/bin/bash
#----------------MetaCentrum----------------
#PBS -l walltime=1:0:0
#PBS -l select=1:ncpus=1:mem=1gb:scratch_local=1gb
#PBS -j oe
#PBS -N HybPhyloMaker1a2_rawprocessParallel_summary
#PBS -m abe
#-------------------HYDRA-------------------
#$ -S /bin/bash
#$ -pe mthread 2
#$ -q sThC.q
#$ -l mres=1G
#$ -cwd
#$ -j y
#$ -N HybPhyloMaker1a2_rawprocessParallel_summary
#$ -o HybPhyloMaker1a2_rawprocessParallel_summary.log
# ********************************************************************************
# * HybPhyloMaker - Pipeline for Hyb-Seq data processing and tree building *
# * https://github.com/tomas-fer/HybPhyloMaker *
# * Script 01a2 - Raw data processing in parallel summary *
# * v.1.8.0a *
# * Tomas Fer, Dept. of Botany, Charles University, Prague, Czech Republic, 2021 *
# * tomas.fer@natur.cuni.cz *
# * based on Weitemier et al. (2014), Applications in Plant Science 2(9): 1400042*
# ********************************************************************************
#This makes a summary table of raw data filtering after HybPhyloMaker1a_rawprocessParallel.sh
#Complete path and set configuration for selected location
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "\nHybPhyloMaker1a2 is running on MetaCentrum...\n"
#settings for MetaCentrum
#Copy file with settings from home and set variables from settings.cfg
cp $PBS_O_WORKDIR/settings.cfg .
. settings.cfg
path=/storage/$server/home/$LOGNAME/$data
source=/storage/$server/home/$LOGNAME/HybSeqSource
#Move to scratch
cd $SCRATCHDIR
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "\nHybPhyloMaker1a2 is running on Hydra...\n"
#settings for Hydra
#set variables from settings.cfg
. settings.cfg
path=../$data
source=../HybSeqSource
#Make and enter work directory
mkdir -p workdir01a2
cd workdir01a2
else
echo -e "\nHybPhyloMaker1a2 is running locally..."
echo -e "This summary is only for cluster environment. Exiting...\n"
exit 3
fi
#Write log
logname=HPM1a2
echo -e "HybPhyloMaker1a2: summary of data filtering in parallel" > ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
echo -e "Job run on MetaCentrum: $PBS_JOBID" >> ${logname}.log
echo -e "From: $PBS_O_HOST" >> ${logname}.log
echo -e "Host: $HOSTNAME" >> ${logname}.log
echo -e "$PBS_NUM_NODES node(s) with $PBS_NCPUS core(s)" >> ${logname}.log
memM=$(bc <<< "scale=2; $(echo $PBS_RESC_MEM) / 1024 / 1024 ")
memG=$(bc <<< "scale=2; $(echo $PBS_RESC_MEM) / 1024 / 1024 / 1024 ")
if (( $(echo $memG 1 | awk '{if ($1 < $2) print 1;}') )); then
echo -e "Memory: $memM Mb" >> ${logname}.log
else
echo -e "Memory: $memG Gb" >> ${logname}.log
fi
elif [[ $HOSTNAME == compute-*-*.local ]]; then
echo -e "run on Hydra: $HOSTNAME" >> ${logname}.log
else
echo -e "local run: "`hostname`"/"`whoami` >> ${logname}.log
fi
echo -e "\nBegin:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
echo -e "\nSettings" >> ${logname}.log
if [[ $PBS_O_HOST == *".cz" ]]; then
printf "%-25s %s\n" `echo -e "\nServer:\t$server"` >> ${logname}.log
fi
for set in data adapterfile; do
printf "%-25s %s\n" `echo -e "${set}:\t" ${!set}` >> ${logname}.log
done
# echo -e "\nGenerating file for download and for import to Geneious...\n"
# #Copy all -all-no-dups.fas from all subfolders to folder 'for_Geneious' (for easy import to Geneious)
# mkdir $path/20filtered/for_Geneious
# find $path/20filtered/ -name '*no-dups.fastq.gz' -exec cp -t $path/20filtered/for_Geneious/ {} +
# #Pack and combine all *all-no-dups.fas files to one archive for easy download
# tar cfz $path/20filtered/for_Geneious/$data-no-dups.tar.gz -C $path/20filtered/for_Geneious/ . 2>/dev/null
# #Delete copies of fastq files in 'for Geneious' folder
# rm $path/20filtered/for_Geneious/*.fastq.gz
#Copy file list and summary logs
echo -e "Copying logs...\n"
cp $path/10rawreads/SamplesFileNames.txt .
cat SamplesFileNames.txt | while read line; do
mkdir $line
cp $path/20filtered/$line/*.log $line
done
#Summary of basic reads processing (nr. reads, PhiX filtering, quality trimming, duplicate removal)
#Produce tab-separated table
echo -e "Creating summary table..."
#Write headers (number of pairs, number of reads, number of reads after PhiX removal, percentage od PhiX reads,
#quality trimming - both surviving, forward only surviving, reverse only surviving, reads after quality trimming,
#percentage of trimmed reads, reads after duplicate removal, percentage of duplicated reads)
echo -e "Sample no.\tGenus\tSpecies\tNr. of pairs\tNr. of reads\tNr. of reads without PhiX\t% PhiX reads\tBoth surviving\tForward only surviving\tReverse only surviving\tNr. reads after quality trimming\t% quality trimmed reads\tNr. reads without duplicates\t% duplicates" > reads_summary.txt
cat SamplesFileNames.txt | while read line
do
echo "Processing $line"
number=$(cut -d'_' -f2 <<< $line)
genus=$(cut -d'-' -f1 <<< $line)
species=$(cut -d'_' -f1 <<< $line | cut -d'-' -f2)
cd $line
#Number of pairs: first number on first line in bowtie2.log
npairs=`cat bowtie2.log | head -n 1 | cut -d' ' -f1`
#Number of reads: first number on third line in PhiX-removal.log
nreads=`cat PhiX-removal.log | head -n 3 | tail -n 1 | cut -d' ' -f1`
#Number of reads after removal of PhiX reads: first number on fourth line in PhiX-removal.log
withoutPhiX=`cat PhiX-removal.log | head -n 4 | tail -n 1 | cut -d' ' -f1`
#Percentage of PhiX reads: first number on last line in bowtie2.log
percPhiX=`cat bowtie2.log | tail -n 1 | cut -d'%' -f1`
#Number of both surviving reads after Trimmomatic
bs=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f7`
#Number of forward only surviving reads after Trimmomatic
fos=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f12`
#Number of reverse only surviving reads after Trimmomatic
ros=`cat Trimmomatic.log | grep 'Input Read Pairs:' | cut -d' ' -f17`
# Counting number of reads after quality trimming
let aftertrimming="(2 * $bs) + $fos + $ros"
#Calculate percentage of quality trimmed reads
perctrimm=$(echo -e "scale=4;100 - 100 * ($aftertrimming / $withoutPhiX)" | bc)
#Number of reads after duplicate removal
withoutdups=`cat duplicate-removal-fastuniq.log | tail -n 1`
#withoutdups=`cat duplicate-removal.log | head -n 2 | tail -n 1 | cut -d' ' -f2`
#Calculate percentage of duplicate reads removed
percdups=$(echo -e "scale=4;100 - 100 * ($withoutdups / $aftertrimming)" | bc)
cd ..
echo -e "$number\t$genus\t$species\t$npairs\t$nreads\t$withoutPhiX\t$percPhiX\t$bs\t$fos\t$ros\t$aftertrimming\t$perctrimm\t$withoutdups\t$percdups" >> reads_summary.txt
rm -r $line
done
#Copy table to storage
cp reads_summary.txt $path/20filtered/
#Copy log to home
echo -e "\nEnd:" `date '+%A %d-%m-%Y %X'` >> ${logname}.log
cp ${logname}.log $path/20filtered/
#Clean scratch/work directory
if [[ $PBS_O_HOST == *".cz" ]]; then
#delete scratch
if [[ ! $SCRATCHDIR == "" ]]; then
rm -rf $SCRATCHDIR/*
fi
else
cd ..
rm -r workdir01a2
fi
echo -e "\nScript HybPhyloMaker1a2 finished...\n"