-
Notifications
You must be signed in to change notification settings - Fork 2
/
bed2coverage
executable file
·167 lines (150 loc) · 5.92 KB
/
bed2coverage
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/bash
#PBS -l nodes=1:ppn=4

## built-in defaults; both can be overridden on the command line
## (AVG by -v, GENOME by -g)
AVG=0
GENOME='mm9'
#### usage ####
## usage: print the help text to stdout and terminate the script.
## Arguments: none.
## Returns:   never returns to the caller; always exits with status 0.
usage() {
  ## quoted delimiter: the text is printed literally, nothing is expanded
  cat <<'EOF'
Program: bed2coverage (compute read coverage corresponding to input BED file)
Author: BRIC, University of Copenhagen, Denmark
Version: 1.0
Contact: pundhir@binf.ku.dk
Usage: bed2coverage -i <file | stdin> -j <file(s)>
Options:
 -i <file> [input file containing genomic coordinate in BED format]
 -j <file> [input mapped reads in BAM format]
 [if multiple separate by a comma]
[OPTIONS]
 -s <float> [normalize expression by input size factor]
 [if multiple, separate them by a comma]
 -m [normalize expression by counts per million mapped reads]
 -d [remove duplicate reads]
 -e <int> [extend 3' end of reads by input number of bases (useful for ChIP-seq data)]
 -v <int> [instead of giving expression for each replicate, output only the mean (default: 0)]
 [assumes that consecutive input files are replicates 1 and 2, respectively]
 -r [require same strandedness]
 -g <string> [genome (default: mm9)]
 -h [help]

EOF
  exit 0
}
#### parse options ####
## parse_options <args...>: read the command-line flags into the global
## option variables of the script.
##   -i INPUTBEDFILE      -j INPUTBAMFILES   -s INPUTSIZEFACTORS
##   -m CPM=1             -d REMOVE_DUPLICATE=1
##   -e INPUTEXTENDS      -g GENOME          -v AVG
##   -r SAME_STRAND=1     -h HELP=1
## An unknown flag, or a flag that lacks its required argument, makes getopts
## report the problem on stderr; the help is then shown through usage instead
## of silently carrying on with the remaining flags.
parse_options() {
  local OPTIND ARG
  ## start scanning at the first argument on every call
  OPTIND=1
  while getopts i:j:s:mde:g:v:rh ARG; do
    case "$ARG" in
      i) INPUTBEDFILE=$OPTARG ;;
      j) INPUTBAMFILES=$OPTARG ;;
      s) INPUTSIZEFACTORS=$OPTARG ;;
      m) CPM=1 ;;
      d) REMOVE_DUPLICATE=1 ;;
      e) INPUTEXTENDS=$OPTARG ;;
      g) GENOME=$OPTARG ;;
      v) AVG=$OPTARG ;;
      r) SAME_STRAND=1 ;;
      h) HELP=1 ;;
      ## getopts sets ARG to "?" for invalid flags and missing arguments
      *) usage ;;
    esac
  done
}
parse_options "$@"
## show the help when it was asked for (-h) or when one of the two
## mandatory arguments (-i, -j) is missing
if [[ -n "$HELP" || -z "$INPUTBEDFILE" || -z "$INPUTBAMFILES" ]]; then
  usage
fi
## pick the genome file that belongs to the requested genome; any value
## other than mm9 or hg19 is reported and ends in the help text
## NOTE(review): GENOME_FILE is not read by any other code visible in this file
case "$GENOME" in
  mm9)
    GENOME_FILE="/home/pundhir/project/genome_annotations/mouse.mm9.genome"
    ;;
  hg19)
    GENOME_FILE="/home/pundhir/project/genome_annotations/human.hg19.genome"
    ;;
  *)
    echo "Presently the program only support analysis for mm9 or hg19"
    echo
    usage
    ;;
esac
## split the comma-separated -j value into the BAMFILES array
## oIFS keeps the original field separator; later sections restore IFS from it
oIFS=$IFS
IFS=","
## deliberately unquoted: the expansion is split on commas
BAMFILES=($INPUTBAMFILES)
IFS=$oIFS
BAMFILES_COUNT=${#BAMFILES[@]}
## when neither size factors (-s) nor CPM normalization (-m) was requested,
## fall back to a size factor of 1 for every BAM file, e.g. "1,1,1"
if [[ -z "$INPUTSIZEFACTORS" && -z "$CPM" ]]; then
  INPUTSIZEFACTORS=""
  for (( i = 0; i < BAMFILES_COUNT; i++ )); do
    ## the comma is only added once the list already holds an element
    INPUTSIZEFACTORS="${INPUTSIZEFACTORS:+${INPUTSIZEFACTORS},}1"
  done
fi
## when no extend values (-e) were given, use 0 for every BAM file,
## e.g. "0,0,0"
if [[ -z "$INPUTEXTENDS" ]]; then
  INPUTEXTENDS=""
  for (( i = 0; i < BAMFILES_COUNT; i++ )); do
    ## the comma is only added once the list already holds an element
    INPUTEXTENDS="${INPUTEXTENDS:+${INPUTEXTENDS},}0"
  done
fi
## split the comma-separated extend values into the EXTENDS array
IFS=","
## deliberately unquoted: the expansion is split on commas
EXTENDS=($INPUTEXTENDS)
## put back the field separator saved in oIFS
IFS=$oIFS
EXTENDS_COUNT=${#EXTENDS[@]}
## turn the comma-separated -j value into a space-separated list
## (a plain string, not an array)
IFS=","
## deliberately unquoted: split on commas, echo joins the pieces with spaces
INPUTBAMFILES=$(echo $INPUTBAMFILES)
IFS=$oIFS
## replace any comma that is still left with a space
INPUTBAMFILES=${INPUTBAMFILES//,/ }
## create temporary BED file if input is from stdin
if [ "$INPUTBEDFILE" == "stdin" ]; then
TMP=$(cat /dev/urandom | tr -dc 'a-zA-Z0-9' | fold -w 32 | head -n 1)
while read LINE; do
echo ${LINE}
done | perl -ane '$line=""; foreach(@F) { $line.="$_\t"; } $line=~s/\t$//g; print "$line\n";' > $TMP
INPUTBEDFILE=$TMP
fi
if [ ! -z "$INPUTSIZEFACTORS" ]; then
## parse input size factors in an array
IFS=","
SIZEFACTORS=($INPUTSIZEFACTORS)
SIZEFACTORS_COUNT=${#SIZEFACTORS[@]}
IFS=$oIFS
if [ "$BAMFILES_COUNT" -ne "$SIZEFACTORS_COUNT" -o "$BAMFILES_COUNT" -ne "$EXTENDS_COUNT" ]; then
echo -n "Please provide size factor and extend parameter for each input bam file";
usage
fi
if [ -z "$REMOVE_DUPLICATE" ]; then
if [ ! -z "$SAME_STRAND" ]; then
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
else
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
fi
else
if [ ! -z "$SAME_STRAND" ]; then
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
else
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
fi
fi | perl -ane '@sizeFactors=split(/\,/, "'$INPUTSIZEFACTORS'"); $field_count=scalar(@F)-'$BAMFILES_COUNT'; foreach(@F[0..$field_count-1]) { print "$_\t"; } $i=0; foreach(@F[$field_count..scalar(@F)-1]) { printf("%0.5f\t", $_/$sizeFactors[$i]); $i++; } print "\n";'
else
MAPPEDREADS=""
for(( i=0; i<$BAMFILES_COUNT; i++ )); do
## create index of input BAM file, if does not exist
if [ ! -f "${BAMFILES[$i]}.bai" ]; then
samtools index ${BAMFILES[$i]}
fi
COUNT=$(samtools idxstats ${BAMFILES[$i]} | grep -wE "^[0-9a-zA-Z]+" | perl -ane '$sum+=$F[2]; END { print "$sum"; }');
MAPPEDREADS="$MAPPEDREADS,$COUNT"
done
MAPPEDREADS=$(echo $MAPPEDREADS | perl -ane '$_=~s/^\,//g; print $_;')
#echo $MAPPEDREADS
if [ -z "$REMOVE_DUPLICATE" ]; then
if [ ! -z "$SAME_STRAND" ]; then
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
else
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
fi
else
if [ ! -z "$SAME_STRAND" ]; then
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
else
samtools bedcov $INPUTBEDFILE $INPUTBAMFILES
fi
fi | perl -ane '@mappedReads=split(/\,/, "'$MAPPEDREADS'"); $field_count=scalar(@F)-'$BAMFILES_COUNT'; foreach(@F[0..$field_count-1]) { print "$_\t"; } $i=0; foreach(@F[$field_count..scalar(@F)-1]) { printf("%0.5f\t", ($_*1000000)/$mappedReads[$i]); $i++; } print "\n";'
fi | perl -ane 'if('$AVG') { $field_count=scalar(@F)-'$BAMFILES_COUNT'; foreach(@F[0..$field_count-1]) { print "$_\t"; } $i=0; for($i=$field_count; $i<scalar(@F); $i+=2) { if(!defined($F[$i+1])) { $F[$i+1]=$F[$i]; } printf("%0.5f\t", ($F[$i]+$F[$i+1])/2); } print "\n"; } else { print $_; }'
## remove the temporary BED file that was written for stdin input, if any
if [[ -n "$TMP" ]]; then
  rm $TMP
fi
## no explicit code: the script ends with the status of the last command
exit