-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPF_P1.sh
executable file
·64 lines (56 loc) · 2.69 KB
/
PF_P1.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
#!/usr/bin/env bash
# All rights reserved by Liuyu
# Author: Liuyu
#
# PF Phase I: collect the listed RefSeq genomes into <OUTDIR>/DB/Genome and
# build a bowtie2 index named csmdSpecies from them.
#
# Usage: PF_P1.sh <SEQDIR> <SEQLST> <OUTDIR>
#   SEQDIR  directory holding SEQ/ (gzipped genomes) and assembly_summary.txt
#   SEQLST  file listing the RefSeq accessions to include
#   OUTDIR  directory under which DB/ is created
#
# The interpreter line has to be the very first line of the script to take
# effect, and it names bash because `source` is not a POSIX sh builtin.

# Settings are loaded from the directory named by the CSMD_HOME variable.
source "${CSMD_HOME}/configs.sh"

# Refuse to run with missing or empty arguments: an empty OUTDIR would make
# the later steps create and fill /DB at the filesystem root.
if [ "$#" -lt 3 ] || [ -z "$1" ] || [ -z "$2" ] || [ -z "$3" ]; then
  printf 'Usage: %s <SEQDIR> <SEQLST> <OUTDIR>\n' "${0##*/}" >&2
  exit 1
fi

PF_WORKDIR_P1_SEQDIR=$1
PF_WORKDIR_P1_SEQLST=$2
PF_WORKDIR_P1_OUTDIR=$3
echo "#--------------------------------------------------------------#"
echo "Per-sample microbiome profiling"

# Validate the reference directory before any work is done: it must contain a
# SEQ/ sub-directory and an assembly_summary.txt file.
# Errors go to stderr and terminate with status 1; a bare `exit` would return
# the status of the preceding echo (0) and report success to the caller.
# printf with %s is used so that backslashes in the path are printed verbatim.
if [ ! -d "${PF_WORKDIR_P1_SEQDIR}/SEQ/" ]; then
  printf '\033[44;37;5m ERROR:\033[0m %s not exist\n' \
    "${PF_WORKDIR_P1_SEQDIR}/SEQ/" >&2
  printf '%s\n' "Please make sure all RefSeq bacteria genomes can be found in ${PF_WORKDIR_P1_SEQDIR}/SEQ/" >&2
  exit 1
elif [ ! -f "${PF_WORKDIR_P1_SEQDIR}/assembly_summary.txt" ]; then
  printf '\033[44;37;5m ERROR: \033[0m %s not exit\n' \
    "${PF_WORKDIR_P1_SEQDIR}/assembly_summary.txt" >&2
  exit 1
else
  printf '\033[44;37;5m TIPS:\033[0m The SEQDIR is ready.\n'
  printf '%s\n' "NOTE: All RefSeq bacteria genomes are expected in ${PF_WORKDIR_P1_SEQDIR}/SEQ/"
  printf '%s\n' "NOTE: RefSeq bacteria summary information is expected in ${PF_WORKDIR_P1_SEQDIR}/assembly_summary.txt"
fi
# Validate the accession list. A missing list is fatal: without it no genome
# can be collected, so continuing would only build an index from nothing.
# The error is written to stderr and the script stops with status 1.
if [ ! -f "${PF_WORKDIR_P1_SEQLST}" ]; then
  printf '\033[44;37;5m ERROR:\033[0m the sequence list %s not exist\n' \
    "${PF_WORKDIR_P1_SEQLST}" >&2
  printf '%s\n' "Please make sure the sequence list is ready for database update, each line with a RefSeq accssion no. " >&2
  exit 1
else
  printf '\033[44;37;5m TIPS:\033[0m The SEQLST is ready.\n'
  printf '%s\n' "NOTE: Sequences for database update are expected in ${PF_WORKDIR_P1_SEQLST}, each line with a RefSeq accssion no."
fi
echo "PF Phase I: Genome colection and index"

# Prepare <OUTDIR>/DB/Genome: create it when absent, otherwise insist that it
# is empty so genomes left over from an earlier run are never mixed into the
# new database. Every failure ends the script with status 1.

# An empty OUTDIR would resolve to /DB/Genome at the filesystem root.
if [ -z "${PF_WORKDIR_P1_OUTDIR}" ]; then
  printf '\033[44;37;5m ERROR:\033[0m the output directory is not specified\n' >&2
  exit 1
fi

if [ ! -d "${PF_WORKDIR_P1_OUTDIR}/DB/Genome/" ]; then
  if ! mkdir -p -- "${PF_WORKDIR_P1_OUTDIR}/DB/Genome"; then
    printf '\033[44;37;5m ERROR:\033[0m cannot create %s\n' \
      "${PF_WORKDIR_P1_OUTDIR}/DB/Genome/" >&2
    exit 1
  fi
elif [ -n "$(ls -A -- "${PF_WORKDIR_P1_OUTDIR}/DB/Genome/")" ]; then
  # `ls -A` prints every entry except . and .., so any output means non-empty.
  printf '\033[44;37;5m ERROR:\033[0m %s not empty\n' \
    "${PF_WORKDIR_P1_OUTDIR}/DB/Genome/" >&2
  exit 1
fi
# Build one FASTA file per accession in <OUTDIR>/DB/Genome.
# For every whitespace-separated entry of the sequence list:
#   1. look up columns 6 and 8 of assembly_summary.txt (used as taxid and
#      organism name) in the first row whose first column equals the entry;
#   2. decompress the matching SEQ/<accession>*.gz file(s);
#   3. write ">accession|taxid|organism" as the first line and turn every
#      other FASTA header line into a run of N, so that all sequences of a
#      genome are joined into a single record.
# Any failure stops the script with status 1.
pf_p1_summary="${PF_WORKDIR_P1_SEQDIR}/assembly_summary.txt"
pf_p1_genome_dir="${PF_WORKDIR_P1_OUTDIR}/DB/Genome"
pf_p1_spacer=NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN

# One entry per line, split on any whitespace. Unlike an unquoted $(cat ...),
# this never glob-expands an entry against the current directory.
if ! pf_p1_accessions=$(tr -s '[:space:]' '\n' < "${PF_WORKDIR_P1_SEQLST}"); then
  printf '\033[44;37;5m ERROR:\033[0m cannot read the sequence list %s\n' \
    "${PF_WORKDIR_P1_SEQLST}" >&2
  exit 1
fi

# The list is fed through a here-document rather than a pipe so the loop runs
# in the current shell and `exit` really terminates the script.
while IFS= read -r aa; do
  [ -n "${aa}" ] || continue

  # The accession is handed to awk with -v instead of being spliced into the
  # program text; appending "" forces a string comparison.
  taxid=$(awk -F '\t' -v acc="${aa}" \
    '($1 "") == (acc "") { print $6; exit }' "${pf_p1_summary}")
  # Spaces become "_" and slashes become "!" in the organism name.
  organism=$(awk -F '\t' -v acc="${aa}" \
    '($1 "") == (acc "") { print $8; exit }' "${pf_p1_summary}" | tr ' /' '_!')
  header=">${aa}|${taxid}|${organism}"
  printf '%s\n' "${header}"

  # Let the shell expand the glob into the positional parameters (the script
  # arguments were already copied into PF_WORKDIR_P1_* variables). When
  # nothing matches, $1 is the unexpanded pattern and does not exist.
  set -- "${PF_WORKDIR_P1_SEQDIR}/SEQ/${aa}"*.gz
  if [ ! -e "$1" ]; then
    printf '\033[44;37;5m ERROR:\033[0m the fasta genome of %s not exist\n' \
      "${aa}" >&2
    exit 1
  fi

  pf_p1_fna="${pf_p1_genome_dir}/${aa}.fna"
  pf_p1_raw="${pf_p1_fna}.raw"

  # Decompress first so that a corrupt or empty archive is detected.
  if ! gzip -dc -- "$@" > "${pf_p1_raw}" || [ ! -s "${pf_p1_raw}" ]; then
    rm -f -- "${pf_p1_raw}"
    printf '\033[44;37;5m ERROR:\033[0m failed to decompress the fasta genome of %s\n' \
      "${aa}" >&2
    exit 1
  fi

  # The header is emitted with printf, not spliced into a sed replacement,
  # so characters such as "&" or "\" in the organism name stay literal.
  if ! {
    printf '%s\n' "${header}"
    sed -e '1d' -e "s/^>.*\$/${pf_p1_spacer}/" "${pf_p1_raw}"
  } > "${pf_p1_fna}"; then
    rm -f -- "${pf_p1_raw}"
    printf '\033[44;37;5m ERROR:\033[0m failed to write %s\n' "${pf_p1_fna}" >&2
    exit 1
  fi
  rm -f -- "${pf_p1_raw}"
done <<EOF
${pf_p1_accessions}
EOF
# Concatenate every collected genome into DB/csmdSpecies and build the bowtie2
# index next to it. Each step is checked so that bowtie2-build never runs on a
# missing file or from an unexpected working directory.
if ! cat -- "${PF_WORKDIR_P1_OUTDIR}/DB/Genome/"*.fna \
  > "${PF_WORKDIR_P1_OUTDIR}/DB/csmdSpecies"; then
  printf '\033[44;37;5m ERROR:\033[0m cannot assemble %s\n' \
    "${PF_WORKDIR_P1_OUTDIR}/DB/csmdSpecies" >&2
  exit 1
fi
if ! cd -- "${PF_WORKDIR_P1_OUTDIR}/DB"; then
  printf '\033[44;37;5m ERROR:\033[0m cannot enter %s\n' \
    "${PF_WORKDIR_P1_OUTDIR}/DB" >&2
  exit 1
fi
# Last command: its exit status becomes the exit status of the script.
bowtie2-build csmdSpecies csmdSpecies