-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_len_gc.sh
executable file
·110 lines (95 loc) · 4.96 KB
/
get_len_gc.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/bin/bash
# This script computes summary stats (average GC content, average length, count) for a user-specified feature of a GTF file, using the matching FASTA file
# There are 2 mandatory arguments :
# arg1: the path to the directory containing the species gtf and fasta files (uncompressed) with the following hierarchy: raw/species/ e.g. (data/raw/Homo_Sapiens)
# arg2: the feature that will be filtered for stats. this feature should exist in column 3 gtf file. e.g (exon, gene, transcript)
# example: to calculate the stats of all genes of Homo Sapiens use the following
# bash get_len_gc.sh data/raw/Homo_Sapiens gene
# There are 2 optional arguments to be used when you want to filter certain tag in column 9 gtf:
# arg3: a flag to include or exclude the following (tag=value) argument. to include use -i , to exclude use -e
# arg4: the name of the tag you want to filter on, together with the filtering value, in the form (tag=value). e.g. (-i gene_biotype=protein_coding)
# example:
# to calculate the stats of exons coming from non-protein coding transcript of Homo Sapiens use the following
# bash get_len_gc.sh data/raw/Homo_Sapiens exon -e transcript_biotype=protein_coding
# reading and parsing input args
# Fail a pipeline when any of its stages fails, so a broken filter is never
# mistaken for an empty result.
set -o pipefail

# setting color vars
RED='\033[0;31m'
GREEN='\e[92m'
NC='\033[0m'
# exit status used for every fatal error
EXIT_ERR=128

#######################################
# Prints an error message in red on stderr and aborts the script.
# Globals:   RED, NC, EXIT_ERR (read)
# Arguments: $* - message text
# Returns:   never; exits with EXIT_ERR
#######################################
die() {
  # %b renders the escape sequences stored in the color vars; the message
  # itself goes through %s so backslashes in user input stay literal.
  printf '%b %s %b\n' "${RED}" "$*" "${NC}" >&2
  exit "${EXIT_ERR}"
}

#######################################
# Prints a progress message in green on stdout.
# Globals:   GREEN, NC (read)
# Arguments: $* - message text
#######################################
info() {
  printf '%b %s %b\n' "${GREEN}" "$*" "${NC}"
}

#######################################
# Looks recursively under a directory for regular files matching a name
# pattern and stores the match in the global found_file.
# Globals:   found_file (written)
# Arguments: $1 - directory to search, $2 - find -name pattern
# Returns:   0 if exactly one file matched, 1 if none, 2 if several
#######################################
find_one() {
  local root=$1 pattern=$2
  local path
  local count=0
  found_file=''
  # NUL-delimited so file names containing spaces or newlines survive
  while IFS= read -r -d '' path; do
    found_file=${path}
    count=$((count + 1))
  done < <(find "${root}" -type f -name "${pattern}" -print0)
  if ((count == 1)); then
    return 0
  elif ((count == 0)); then
    return 1
  fi
  return 2
}

#######################################
# Prints the records of a gtf file whose column 3 equals the given feature.
# Records whose first column starts with M, m or # are skipped (mitochondrial
# chromosomes and comment lines). When a record carries a <feature>_id
# attribute only the first record per id is printed; records without that
# attribute are always printed.
# Arguments: $1 - gtf path, $2 - feature name (e.g. exon, gene, transcript)
# Outputs:   matching gtf records on stdout
#######################################
filter_feature() {
  local gtf=$1 feature=$2
  awk -v fet="${feature}" '
    # fields are whitespace separated, so attributes start at field 9
    $1 ~ /^[^Mm#]/ && $3 == fet && NF >= 9 {
      id_key = fet "_id"
      has_id = 0
      for (i = 9; i <= NF; i++) {
        if ($i == id_key) {
          has_id = 1
          id = $(i + 1)
          if (seen[id] != 1) {
            seen[id] = 1
            print
            break
          }
        }
      }
      if (!has_id) {
        print
      }
    }
  ' "${gtf}"
}

#######################################
# Filters gtf records read from stdin on one attribute.
# The value test is a substring test (awk index) against the field following
# the tag. A record is printed at the first occurrence of the tag that
# satisfies the test; records that do not carry the tag at all are never
# printed, in both include and exclude mode.
# Arguments: $1 - 1 to keep records whose tag value contains the value,
#                 0 to keep records whose tag value does not contain it
#            $2 - tag name, $3 - value
# Outputs:   surviving gtf records on stdout
#######################################
filter_tag() {
  local include=$1 tag_name=$2 tag_val=$3
  awk -v tag="${tag_name}" -v val="${tag_val}" -v include="${include}" '
    {
      for (i = 9; i <= NF; i++) {
        if ($i == tag) {
          found = (index($(i + 1), val) != 0)
          if (found == include) {
            print
            break
          }
        }
      }
    }
  '
}

#######################################
# Aggregates a bedtools nuc table into a one-record csv.
# Reads column 11 as GC fraction and column 18 as sequence length, which is
# where bedtools nuc puts them after a 9-column gtf record. Lines starting
# with # (the bedtools nuc header) are not counted as records.
# Arguments: $1 - bedtools nuc output path, $2 - species name,
#            $3 - label used as prefix of the csv column names
# Outputs:   csv (header + one record) on stdout
# Returns:   non-zero if the table holds no data rows
#######################################
aggregate_stats() {
  local stats=$1 species=$2 label=$3
  awk -v sp="${species}" -v fet="${label}" '
    BEGIN {
      FS = "\t"
      OFS = ","
      print "species," fet "_avg_gc," fet "_avg_len," fet "_count"
    }
    /^#/ { next }
    { total_gc += $11; total_len += $18; n++ }
    END {
      # guard against dividing by zero on an empty table
      if (n == 0) {
        exit 1
      }
      print sp, total_gc / n, total_len / n, n
    }
  ' "${stats}"
}

#######################################
# Entry point: filters the species gtf, runs bedtools nuc on the result and
# writes the aggregated csv.
# Arguments: $1 - species directory containing "raw" in its path
#            $2 - feature (gtf column 3)
#            $3 - optional -i (include) or -e (exclude) flag
#            $4 - tag=value pair, mandatory when $3 is given
# Outputs:   <stage dir>/<gtf name>.<feature...>.gtf
#            <stage dir>/<gtf name>.<feature...>.stats.gtf
#            <output dir>/<gtf name>.<feature...>.stats.csv
#######################################
main() {
  # reading and parsing input args
  local dir=${1:-}
  local fet=${2:-}
  local tagieflag=${3:-}
  local tagvalue=${4:-}
  # tag is the text before the first '=', val everything after it, so a value
  # that itself contains '=' is kept whole
  local tag="${tagvalue%%=*}"
  local val="${tagvalue#*=}"

  if [[ -z "${dir}" || -z "${fet}" ]]; then
    die "usage: bash ${0##*/} <raw/species dir> <feature> [-i|-e tag=value]"
  fi
  [[ -d "${dir}" ]] || die "${dir} is not a directory"

  # validating the optional tag filter before touching the file system
  local include=0
  if [[ -n "${tagieflag}" ]]; then
    if [[ -z "${tagvalue}" ]]; then
      die "arg3 (tag -i -e flag) together with arg4 (tag=value) must be set"
    fi
    case "${tagieflag}" in
      -i) include=1 ;;
      -e) include=0 ;;
      *)
        die "unknown arg3. arg3 is a flag to include -i or exclude -e the following tag=value pair"
        ;;
    esac
    [[ "${tagvalue}" == *=* ]] || die "arg4 must have the form tag=value"
  fi

  command -v bedtools > /dev/null || die "bedtools is not available in PATH"

  # getting the fa and gtf files
  local sp_fa sp_gtf
  find_one "${dir}" '*.fa'
  case $? in
    0) sp_fa=${found_file} ;;
    1) die "no gtf or fa files in ${dir}" ;;
    *) die "more than one .fa file in ${dir}, exactly one is expected" ;;
  esac
  find_one "${dir}" '*.gtf'
  case $? in
    0) sp_gtf=${found_file} ;;
    1) die "no gtf or fa files in ${dir}" ;;
    *) die "more than one .gtf file in ${dir}, exactly one is expected" ;;
  esac

  ## Initial Preparation
  # stage and output dirs mirror the input path with "raw" swapped out
  local stg_dir=${dir/raw/stage}
  local out_dir=${dir/raw/output}
  if [[ "${stg_dir}" == "${dir}" ]]; then
    # without this guard the generated gtf files would be written next to
    # the input gtf and be picked up as input on the next run
    die "${dir} must contain 'raw' in its path (e.g. data/raw/Homo_Sapiens)"
  fi
  # drop a trailing slash so the explicit separator below is never doubled
  stg_dir=${stg_dir%/}
  out_dir=${out_dir%/}
  mkdir -p -- "${stg_dir}" "${out_dir}" \
    || die "cannot create ${stg_dir} or ${out_dir}"

  # extracting species name: gtf file name up to its first dot
  local gtf_base="${sp_gtf##*/}"
  local sp="${gtf_base%%.*}"

  ## filtering feature from gtf
  local stem="${gtf_base%.gtf}.${fet}${tagieflag}${tag}${val}"
  local filt_gtf_name="${stem}.gtf"
  local filt_gtf_path="${stg_dir}/${filt_gtf_name}"
  if [[ -z "${tagieflag}" ]]; then
    filter_feature "${sp_gtf}" "${fet}" > "${filt_gtf_path}"
  else
    filter_feature "${sp_gtf}" "${fet}" \
      | filter_tag "${include}" "${tag}" "${val}" > "${filt_gtf_path}"
  fi || die "failed to filter ${sp_gtf}"

  # checking if filter output contains 0 records and removing if so
  local lc
  lc=$(awk 'END { print NR }' "${filt_gtf_path}")
  if ((lc == 0)); then
    rm -f -- "${filt_gtf_path}"
    die "your input feature (${fet}${tagieflag}${tagvalue}) doesnot exist in ${sp} gtf file"
  fi
  info "filtered record for ${fet} ${tagvalue} in ${filt_gtf_name} is ${lc}"

  ## calculate the len and gc content per feature
  local stats_gtf_name="${stem}.stats.gtf"
  local stats_gtf_path="${stg_dir}/${stats_gtf_name}"
  bedtools nuc -fi "${sp_fa}" -bed "${filt_gtf_path}" > "${stats_gtf_path}" \
    || die "bedtools nuc failed on ${filt_gtf_path}"
  # count data rows only; the header line written by bedtools nuc starts with #
  lc=$(awk '!/^#/ { n++ } END { print n + 0 }' "${stats_gtf_path}")
  info "filtered record for ${fet} in ${stats_gtf_name} is ${lc}"

  # Aggregate the avg GC and len
  local agg_csv_path="${out_dir}/${stem}.stats.csv"
  aggregate_stats "${stats_gtf_path}" "${sp}" \
    "${fet}_${tagieflag}_${tagvalue}" > "${agg_csv_path}" \
    || die "no records to aggregate in ${stats_gtf_path}"
}

main "$@"