bed2nfrdip

#!/bin/bash
#PBS -l nodes=1:ppn=4

## built-in defaults; the -l, -m and -g options replace them when given
GENOME='mm9'
FLANK_WIN_DOWN=500
FLANK_WIN_UP=500

#### usage ####
usage() {
    # Print the help text (one argument per output line, framed by an empty
    # line before and after) on stdout and terminate the script with status 0.
    printf '%s\n' \
        "" \
        "Program: bed2nfrdip (compute nfr dip corresponding to input coordinates in BED format)" \
        "Author: BRIC, University of Copenhagen, Denmark" \
        "Version: 1.0" \
        "Contact: pundhir@binf.ku.dk" \
        "Usage: bed2nfrdip -i <file> -j <file(s)> [OPTIONS]" \
        "Options:" \
        " -i <file>   [input genomic coordinates in BED format (can be stdin)]" \
        " -j <file>   [input mapped reads in BAM format]" \
        "             [if multiple separate by a comma]" \
        "[OPTIONS]" \
        " -k <float>  [size factor to normalize the expression of read block]" \
        "             [if multiple separate by a comma]" \
        " -t          [normalize expression by counts per million mapped reads]" \
        " -l <int>    [length of the upstream flanking window (default: 500)]" \
        " -m <int>    [length of the downstream flanking window (default: 500)]" \
        " -e <int>    [extend 3' end of reads by input number of bases]" \
        "             [if multiple separate by a comma]" \
        " -g <string> [genome (default: mm9)]" \
        " -x          [input file is not in pare output format]" \
        " -h          [help]" \
        ""
    exit 0
}

#### parse options ####
while getopts "i:j:k:tl:m:e:g:xh" ARG; do
    case "$ARG" in
        i) BEDFILE="$OPTARG" ;;
        j) BAMFILE="$OPTARG" ;;
        k) SIZEFACTOR="$OPTARG" ;;
        t) CPM=1 ;;
        l) FLANK_WIN_UP="$OPTARG" ;;
        m) FLANK_WIN_DOWN="$OPTARG" ;;
        e) EXTEND="$OPTARG" ;;
        g) GENOME="$OPTARG" ;;
        x) NOPAREFORMAT=1 ;;
        h) HELP=1 ;;
    esac
done

## print the usage text (which ends the script) when one of the two
## mandatory inputs is missing or help was requested
if [[ -z "$BEDFILE" || -z "$BAMFILE" || -n "$HELP" ]]; then
    usage
fi

###################
#helperfunction
function wait_for_jobs_to_finish {
    # Wait for every background job of the current shell, printing each job's
    # PID before waiting on it, then print the caller-supplied message.
    #   $1 - message printed after all jobs have been waited for
    #
    # IFS is reset for the duration of the function: this script sets IFS=""
    # globally, and with an empty IFS the multi-line output of `jobs -p` is
    # not split, so `wait` would receive all PIDs as one invalid word.
    local IFS=$' \t\n'
    local job
    for job in $(jobs -p); do
        echo "$job"
        wait "$job"
    done
    printf '%s\n' "$1"
}
###############

## check if input genome option is suitable
if [[ "$GENOME" != "mm9" && "$GENOME" != "hg19" ]]; then
    echo
    echo "Presently the program only support analysis for mm9 or hg19"
    usage
fi

## print VALUE repeated COUNT times as a comma separated list
##   $1 - VALUE, $2 - COUNT (nothing is printed when COUNT is 0)
_csv_repeat() {
    local value=$1 count=$2 list="" n
    for (( n = 0; n < count; n++ )); do
        list+=",$value"
    done
    # drop the separator in front of the first element
    printf '%s' "${list#,}"
}

## parse input bam files in an array
## (read -a splits on the comma without the pathname expansion that an
## unquoted array assignment would apply to the user supplied list)
IFS=',' read -r -a BAMFILES <<< "$BAMFILE"
BAMFILES_COUNT=${#BAMFILES[@]}

## initialize size factors to 1 per bam file, if size factors not provided
if [[ -z "$SIZEFACTOR" ]]; then
    SIZEFACTOR=$(_csv_repeat 1 "$BAMFILES_COUNT")
fi

## parse input size factors in an array
IFS=',' read -r -a SIZEFACTORS <<< "$SIZEFACTOR"
SIZEFACTORS_COUNT=${#SIZEFACTORS[@]}

## initialize extend parameter to 0 per bam file, if not provided
if [[ -z "$EXTEND" ]]; then
    EXTEND=$(_csv_repeat 0 "$BAMFILES_COUNT")
fi

## parse input extend values in an array
IFS=',' read -r -a EXTENDS <<< "$EXTEND"
EXTENDS_COUNT=${#EXTENDS[@]}

## the statements that follow in this script run with an empty IFS (unquoted
## expansions are then not word-split); keep that state unchanged
IFS=""

## every bam file needs exactly one size factor and one extend value
if (( BAMFILES_COUNT != SIZEFACTORS_COUNT || BAMFILES_COUNT != EXTENDS_COUNT )); then
    echo -n "Please provide size factor and extend parameter for each input bam file";
    usage
fi

## create temporary BED file if input is from stdin
## re-print the whitespace separated columns read from stdin as tab separated
## columns, one output line per input line (a final line without a trailing
## newline is kept, and backslashes are passed through untouched)
stdin_to_tabbed_bed() {
    perl -ane 'print join("\t", @F), "\n";'
}

## TMP is only meaningful when set below; reset it so that a value inherited
## from the environment is never taken for a temporary file of this script
TMP=""
if [[ "$BEDFILE" == "stdin" ]]; then
    # mktemp creates the file atomically under a unique name
    TMP=$(mktemp) || { echo "bed2nfrdip: cannot create a temporary file" >&2; exit 1; }
    # remove the temporary file on every exit path, including interruption
    trap 'rm -f -- "$TMP"' EXIT
    stdin_to_tabbed_bed > "$TMP"
    BEDFILE=$TMP
fi

#echo -n "compute nfr dip for input regions.. "
## compute the nfr dip of one input region and print the result line
##   $1 - one line of the input BED file (whitespace separated columns)
## globals read: BAMFILE SIZEFACTOR EXTEND GENOME CPM NOPAREFORMAT
##               FLANK_WIN_UP FLANK_WIN_DOWN
## output: <col1> TAB <col2> TAB <col3> TAB <coor2nfrdip output> TAB <col5>
nfrdip_for_region() {
    local cr=$'\r'
    local line=${1%"$cr"}
    local -a col=()
    local -a cmd=()
    local flank_up=$FLANK_WIN_UP
    local flank_down=$FLANK_WIN_DOWN
    local score

    # split the line once in the shell; IFS is given explicitly because the
    # script runs with an empty global IFS
    IFS=$' \t' read -r -a col <<< "$line"

    # nothing to compute for a blank line
    if (( ${#col[@]} == 0 )); then
        return 0
    fi

    # unless -x was given, columns 7 and 8 supply the flanking windows of the
    # region; when a column is absent the configured window is kept, so that
    # coor2nfrdip never receives an option without its value
    if [[ -z "$NOPAREFORMAT" ]]; then
        flank_up=${col[6]:-$FLANK_WIN_UP}
        flank_down=${col[7]:-$FLANK_WIN_DOWN}
    fi

    cmd=(coor2nfrdip -i "${col[0]}:${col[1]}-${col[2]}" -j "$BAMFILE" -k "$SIZEFACTOR"
         -l "$flank_up" -m "$flank_down" -e "$EXTEND" -g "$GENOME")
    if [[ -n "$CPM" ]]; then
        cmd+=(-t)
    fi
    # to debug, print the command instead of running it: printf '%q ' "${cmd[@]}"
    score=$("${cmd[@]}")

    # printf '%s' prints the fields literally (no backslash interpretation)
    printf '%s\t%s\t%s\t%s\t%s\n' "${col[0]}" "${col[1]}" "${col[2]}" "$score" "${col[4]}"
}

readarray -t NFRREGION < "$BEDFILE"

for region in "${NFRREGION[@]}"; do
    nfrdip_for_region "$region"
done
#done > $OUTFILE

## remove the temporary BED file, if one exists
## TMP is only assigned by this script when the input came from stdin, but a
## value may also be inherited from the environment; therefore only an
## existing regular file is removed, and the path is quoted and protected
## from being parsed as an option
cleanup_tmp_bed() {
    if [[ -n "$TMP" && -f "$TMP" ]]; then
        rm -f -- "$TMP"
    fi
}
cleanup_tmp_bed