-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcuration_2.0_pipe.sh
92 lines (70 loc) · 3.37 KB
/
curation_2.0_pipe.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/sh
Help()
{
    # Print the usage summary: a blank line, the command synopsis, a blank
    # line, one line per supported flag, and a closing blank line.
    printf '\n%s\n\n' "Command: sh curation_2.0_pipe.sh -f <original fasta> -a <agp> <options>"
    printf '%s\n' \
        "-h Prints help." \
        "-f Pass original fasta file with combined haplotypes." \
        "-a Pass the agp generated by PretextView."
    # Disabled help entry, kept for reference:
    # "-p Pass the primary assembly you curated (1 for haplotype 1 (default), 2 for haplotype 2)."
    printf '\n'
}
# Inputs collected from the command line.
fasta=""      # -f: original fasta file with combined haplotypes
agpfile=""    # -a: AGP generated by PretextView for the curated assembly
hap=""        # -p: accepted and stored; nothing visible in this script reads it yet

# Parse command-line flags into the globals fasta, agpfile and hap.
#
# Arguments: the script's positional parameters ("$@").
# Exits:     with Help's status after -h;
#            2 on an unknown option or an option missing its argument
#            (both were previously ignored silently).
parse_options()
{
    OPTIND=1    # restart option scanning for this argument list
    # The leading ':' selects getopts' silent mode, so errors are reported
    # through the ':' and '?' arms below rather than by the shell itself.
    while getopts ":hf:a:p:" option; do
        case $option in
            h)  # Display help
                Help
                exit;;
            f)  # Original fasta file
                fasta=$OPTARG;;
            a)  # PretextView-generated AGP file of the curated assembly
                agpfile=$OPTARG;;
            p)  # Haplotype selector declared in the optstring
                hap=$OPTARG;;
            :)  # A flag that needs a value was given without one
                printf 'Error: option -%s requires an argument (see -h).\n' "$OPTARG" >&2
                exit 2;;
            \?) # Flag not present in the optstring
                printf 'Error: unknown option -%s (see -h).\n' "$OPTARG" >&2
                exit 2;;
        esac
    done
}
parse_options "$@"
# Redirect stdout to a per-run log file, logs/std.<count>.out.
#
# count is the number of entries already present in logs/ (0 when the
# directory is new or empty). It is always assigned here because later
# commands in this script also expand logs/std.${count}.out; previously it
# was left unset on a first run, which produced the path "logs/std..out".
count=0
if [ -d logs ]
then
    # Count with a glob rather than parsing `ls | wc -l`: no stderr noise on
    # an empty directory, and no padding from wc ending up in the file name.
    for log_entry in logs/*
    do
        if [ -e "$log_entry" ]
        then
            count=$((count + 1))
        fi
    done
else
    mkdir -p logs
fi
# '<>' below does not truncate, so skip any index that is already taken
# (possible when older logs were removed) to avoid mixing two runs in one file.
while [ -e "logs/std.${count}.out" ]
do
    count=$((count + 1))
done
exec 1<> "logs/std.${count}.out"
# ---------------------------------------------------------------------------
# Main pipeline. Runs, in order: AGPcorrect.py, hap_split.py, unloc.py,
# gfastats (--agp-to-path, then --sort largest) and chromosome_assignment.py,
# with the per-haplotype steps applied to Hap_1 and Hap_2.
# stdout is expected to have been redirected to the run log already.
# ---------------------------------------------------------------------------

# Print a command line to stdout, run it, and warn on stderr if it fails.
# Logging through this helper guarantees the logged command is the one run.
# Arguments: the command and its arguments.
# Returns:   the command's exit status.
log_and_run()
{
    printf '%s\n' "$*"
    "$@"
    _run_status=$?
    if [ "$_run_status" -ne 0 ]
    then
        printf 'Warning: command exited with status %s: %s\n' "$_run_status" "$*" >&2
    fi
    return "$_run_status"
}

# Stop before doing any work when the two required inputs are missing or
# unreadable; every later step would otherwise run with empty arguments.
if [ -z "$fasta" ] || [ -z "$agpfile" ]
then
    printf 'Error: both -f <original fasta> and -a <agp> are required (see -h).\n' >&2
    exit 1
fi
for input_file in "$fasta" "$agpfile"
do
    if [ ! -r "$input_file" ]
    then
        printf 'Error: cannot read input file: %s\n' "$input_file" >&2
        exit 1
    fi
done

# Directory holding this script and its helper Python scripts. "$0" and
# ">/dev/null 2>&1" replace the bash-only ${BASH_SOURCE[0]} and "&>", which
# are not valid under the #!/bin/sh this script declares.
pth=$(cd -- "$(dirname -- "$0")" > /dev/null 2>&1 && pwd)
if [ -z "$pth" ]
then
    printf 'Error: could not determine the directory containing %s\n' "$0" >&2
    exit 1
fi
printf 'Rapid-curation-2.0 scripts located in: %s\n' "$pth"

## Programs/tools
# use_gfastats=/vggpfs/fs3/vgl/store/nbrajuka/gfastats/build/bin/gfastats
# use_seqkit=/vggpfs/fs3/vgl/store/nbrajuka/conda/envs/statistics/bin/seqkit

printf "Dependecies:\nBiopython v1.81\npandas\ngfastats v1.3.6\n"

mkdir -p Hap_1 Hap_2

# User-supplied paths are passed as printf arguments, never as part of the
# format string, so '%' or '\' in a path cannot corrupt the output.
printf '\nOriginal assembly: %s \nPretextView generated AGP: %s\n\n' "$fasta" "$agpfile"

printf 'Running AGPcorrect on the PretextView generated agp to correct for sequence lengths.\n'
log_and_run python3 "$pth/AGPcorrect.py" "$fasta" "$agpfile"
printf '\n'

printf 'Splitting the haplotypes from the corrected AGP. Outputs sent to respective directories.\n'
log_and_run python3 "$pth/hap_split.py"
printf '\n'

printf 'Assigning unlocs before the agp is imposed on the fasta.\n'
## If the --agp-to-path in the next block is run first the unlocs will get
## assimilated into their main assigned scaffolds - they need to be
## differentiated first.
for hap_dir in Hap_1 Hap_2
do
    log_and_run python3 "$pth/unloc.py" "$hap_dir"
done
printf '\n'

printf 'Imposing the haplotypic agp on the original fasta to generate a curated fasta.\n'
# gfastats stderr is sent to the log with 2>&1 so it shares stdout's file
# offset. The previous "2>> logs/std.${count}.out" opened a second, append-mode
# descriptor on the same file, so later stdout writes overwrote the appended
# diagnostics (and the path was wrong whenever count was unset).
for hap_dir in Hap_1 Hap_2
do
    log_and_run gfastats "$fasta" --agp-to-path "$hap_dir/hap.unlocs.no_hapdups.agp" -o "$hap_dir/hap.unlocs.no_hapdups.fa" 2>&1
done
printf '\n'

# Sort each curated fasta with gfastats' "--sort largest".
for hap_dir in Hap_1 Hap_2
do
    log_and_run gfastats "$hap_dir/hap.unlocs.no_hapdups.fa" --sort largest -o "$hap_dir/hap.sorted.fa" 2>&1
done

printf '\nSubstituting scaffold for chromosome assignments.\n'
for hap_dir in Hap_1 Hap_2
do
    log_and_run python3 "$pth/chromosome_assignment.py" "$hap_dir"
done
printf '\n'

# Close the log file descriptor now that the run is complete.
exec 1>&-