-
Notifications
You must be signed in to change notification settings - Fork 2
/
bed2nfrdip
executable file
·155 lines (138 loc) · 4.69 KB
/
bed2nfrdip
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/bin/bash
#PBS -l nodes=1:ppn=4
## Built-in defaults, overridable on the command line with -l, -m and -g
## respectively (see the getopts loop below).
## Note: unless -x is given, the two flank windows are replaced for every
## region by columns 7 and 8 of the input line in the main loop.
FLANK_WIN_UP=500
FLANK_WIN_DOWN=500
GENOME="mm9"
#### usage ####
usage() {
  # Print the help text on stdout and terminate the script with status 0.
  # Each printf argument is one output line; the two empty strings are the
  # blank lines framing the text.
  printf '%s\n' \
    "" \
    "Program: bed2nfrdip (compute nfr dip corresponding to input coordinates in BED format)" \
    "Author: BRIC, University of Copenhagen, Denmark" \
    "Version: 1.0" \
    "Contact: pundhir@binf.ku.dk" \
    "Usage: bed2nfrdip -i <file> -j <file(s)> [OPTIONS]" \
    "Options:" \
    " -i <file> [input genomic coordinates in BED format (can be stdin)]" \
    " -j <file> [input mapped reads in BAM format]" \
    " [if multiple separate by a comma]" \
    "[OPTIONS]" \
    " -k <float> [size factor to normalize the expression of read block]" \
    " [if multiple separate by a comma]" \
    " -t [normalize expression by counts per million mapped reads]" \
    " -l <int> [length of the upstream flanking window (default: 500)]" \
    " -m <int> [length of the downstream flanking window (default: 500)]" \
    " -e <int> [extend 3' end of reads by input number of bases]" \
    " [if multiple separate by a comma]" \
    " -g <string> [genome (default: mm9)]" \
    " -x [input file is not in pare output format]" \
    " -h [help]" \
    ""
  exit 0
}
#### parse options ####
while getopts i:j:k:tl:m:e:g:xh flag; do
  case "$flag" in
    i) BEDFILE=$OPTARG ;;
    j) BAMFILE=$OPTARG ;;
    k) SIZEFACTOR=$OPTARG ;;
    t) CPM=1 ;;
    l) FLANK_WIN_UP=$OPTARG ;;
    m) FLANK_WIN_DOWN=$OPTARG ;;
    e) EXTEND=$OPTARG ;;
    g) GENOME=$OPTARG ;;
    x) NOPAREFORMAT=1 ;;
    h) HELP=1 ;;
  esac
done

## show the usage text (which exits) when help was requested or when one of
## the two mandatory inputs (-i, -j) is missing
if [[ -n "$HELP" || -z "$BEDFILE" || -z "$BAMFILE" ]]; then
  usage
fi
###################
# helper function
function wait_for_jobs_to_finish {
  # Wait for every background job of the current shell, printing each job's
  # process ID before waiting on it, then print the caller's message.
  # Arguments: $1 - text printed once all jobs have been waited for
  # Outputs:   one PID per line followed by $1, all on stdout
  local job pids
  pids=$(jobs -p)
  # Split the PID list on newlines with a function-local IFS: this script
  # sets the global IFS to the empty string, under which an unquoted
  # expansion would hand all PIDs to `wait` as a single word.
  local IFS=$'\n'
  for job in $pids; do
    printf '%s\n' "$job"
    wait "$job"
  done
  # Quoted so the message is printed verbatim (no whitespace collapsing, no
  # pathname expansion).
  printf '%s\n' "$1"
}
###############
## check if input genome option is suitable
case "$GENOME" in
  mm9|hg19)
    # supported assembly, nothing to do
    ;;
  *)
    # anything else: report the restriction, then show the usage text
    echo
    echo "Presently the program only support analysis for mm9 or hg19"
    usage
    ;;
esac
## parse input bam files in an array
IFS=","
BAMFILES=($BAMFILE)
BAMFILES_COUNT=${#BAMFILES[@]}
IFS=""
## initialize size factors, if size factors not provided
if [ -z "$SIZEFACTOR" ]; then
SIZEFACTOR=""
for(( i=0; i<$BAMFILES_COUNT; i++ )); do
SIZEFACTOR="$SIZEFACTOR,1"
done
SIZEFACTOR=`echo $SIZEFACTOR | perl -ane '$_=~s/^\,//g; print $_;'`;
fi
## parse input size factors in an array
IFS=","
SIZEFACTORS=($SIZEFACTOR)
SIZEFACTORS_COUNT=${#SIZEFACTORS[@]}
IFS=""
## initialize extend parameter, if not provided
if [ -z "$EXTEND" ]; then
EXTEND=""
for(( i=0; i<$BAMFILES_COUNT; i++ )); do
EXTEND="$EXTEND,0"
done
EXTEND=`echo $EXTEND | perl -ane '$_=~s/^\,//g; print $_;'`;
fi
## parse input size factors in an array
IFS=","
EXTENDS=($EXTEND)
EXTENDS_COUNT=${#EXTENDS[@]}
IFS=""
if [ "$BAMFILES_COUNT" -ne "$SIZEFACTORS_COUNT" -o "$BAMFILES_COUNT" -ne "$EXTENDS_COUNT" ]; then
echo -n "Please provide size factor and extend parameter for each input bam file";
usage
fi
## create temporary BED file if input is from stdin
if [[ "$BEDFILE" == "stdin" ]]; then
  # mktemp creates the file atomically under $TMPDIR (or /tmp) instead of a
  # hand-built random name in the current working directory.
  TMP=$(mktemp "${TMPDIR:-/tmp}/bed2nfrdip.XXXXXX") || {
    echo "bed2nfrdip: cannot create temporary file" >&2
    exit 1
  }
  # Remove the temporary file on every exit path, including early failures.
  trap 'rm -f -- "$TMP"' EXIT

  # Normalise every input line to tab-separated fields.
  #  - explicit IFS: the script may have set the global IFS to empty
  #  - read -r: keep backslashes in the data untouched
  #  - the `|| (( ... ))` part keeps a final line lacking a trailing newline
  BED_TAB=$'\t'
  while IFS=$' \t\n' read -r -a BED_FIELDS || (( ${#BED_FIELDS[@]} > 0 )); do
    # Append a tab after each field, then drop the one after the last field;
    # an empty line yields an empty output line.
    printf -v BED_ROW '%s\t' "${BED_FIELDS[@]}"
    printf '%s\n' "${BED_ROW%"$BED_TAB"}"
  done > "$TMP"
  BEDFILE=$TMP
fi
#echo -n "compute nfr dip for input regions.. "
## For every region of the BED file, run coor2nfrdip on its coordinates and
## print one line: chr <TAB> start <TAB> end <TAB> score <TAB> name
## (name is taken from the 5th input column).
readarray -t NFRREGION < "$BEDFILE"
for (( i = 0; i < ${#NFRREGION[@]}; i++ )); do
  # Split the line once into whitespace-separated fields; IFS is given
  # explicitly because the script may have set the global IFS to empty.
  IFS=$' \t\n' read -r -a FIELDS <<< "${NFRREGION[$i]}"

  # A blank line carries no coordinates to query.
  (( ${#FIELDS[@]} > 0 )) || continue

  CHR=${FIELDS[0]}
  START=${FIELDS[1]}
  END=${FIELDS[2]}
  NAME=${FIELDS[4]}
  COOR="$CHR:$START-$END"

  # Unless -x was given, the per-region flank windows come from columns 7
  # and 8 of the input line and replace the -l / -m values.
  if [[ -z "$NOPAREFORMAT" ]]; then
    FLANK_WIN_UP=${FIELDS[6]}
    FLANK_WIN_DOWN=${FIELDS[7]}
  fi

  # Build the argument list once; -t is appended only when CPM
  # normalisation was requested.
  NFRDIP_ARGS=(-i "$COOR" -j "$BAMFILE" -k "$SIZEFACTOR"
               -l "$FLANK_WIN_UP" -m "$FLANK_WIN_DOWN"
               -e "$EXTEND" -g "$GENOME")
  if [[ -n "$CPM" ]]; then
    NFRDIP_ARGS+=(-t)
  fi
  SCORE=$(coor2nfrdip "${NFRDIP_ARGS[@]}")

  # printf '%s' emits the values verbatim; backslashes in the name or score
  # are not reinterpreted as escape sequences.
  printf '%s\t%s\t%s\t%s\t%s\n' "$CHR" "$START" "$END" "$SCORE" "$NAME"
done
#done > $OUTFILE
## remove the temporary BED file that was created for stdin input, if any
if [[ -n "$TMP" ]]; then
  # quoted and preceded by `--` so the path is passed to rm as one literal
  # operand whatever characters it contains
  rm -- "$TMP"
fi