bisulfite_conversion_rateSummary.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 (chentong_biology@163.com).  
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = 'chentong_biology@163.com'
#=========================================================
desc = '''
Program description:
    This is designed to summarize reads distribution output by `STAR`.
'''

import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime 
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
import re
from tools import *
#from multiprocessing.dummy import Pool as ThreadPool

#from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')

debug = 0

def fprint(content):
    """ 
    This is a Google style docs.

    Args:
        param1(str): this is the first param
        param2(int, optional): this is a second param
            
    Returns:
        bool: This is a description of what is returned
            
    Raises:
        KeyError: raises an exception))
    """
    print json_dumps(content,indent=1)

def cmdparameter(argv):
    if len(argv) == 1:
        global desc
        print >>sys.stderr, desc
        cmd = 'python ' + argv[0] + ' -h'
        os.system(cmd)
        sys.exit(1)
    usages = "%prog -f file"
    parser = OP(usage=usages)
    parser.add_option("-f", "--files", dest="filein",
        metavar="FILEIN", help="`,` or ` ` separated a list of files. *.Log.final.out generated by `STAR` during mapping")
    parser.add_option("-l", "--labels", dest="label",
        metavar="LABEL", help="`,` or ` ` separated a list of labels to label each file. It must have same order as files.")
    parser.add_option("-o", "--output-prefix", dest="out_prefix",
        help="The prefix of output files.")
    parser.add_option("-r", "--report-dir", dest="report_dir",
        default='report', help="Directory for report files. Default 'report'.")
    parser.add_option("-R", "--report-sub-dir", dest="report_sub_dir",
        default='2_mapping_quality', help="Directory for saving report figures and tables. This dir will put under <report_dir>,  so only dir name is needed. Default '2_mapping_quality'.")
    parser.add_option("-d", "--doc-only", dest="doc_only",
        default=False, action="store_true", help="Specify to only generate doc.")
    parser.add_option("-n", "--number", dest="number", type="int", 
        default=60, help="Set the maximum allowed samples for horizontal barplot. Default 60.\
 If more than this number of samples are given, vertical barplot will be used instead.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", help="Show process information")
    parser.add_option("-D", "--debug", dest="debug",
        default=False, action="store_true", help="Debug the program")
    (options, args) = parser.parse_args(argv[1:])
    assert options.filein != None, "A filename needed for -i"
    return (options, args)
#--------------------------------------------------------------------

#--------------------------------------
def plot(summary_op, num_samples_each_grp, sample_readin, nameL):
    x_level = ["'"+i+"'" for i in nameL]
    x_level = '"'+','.join(x_level)+'"'
    if sample_readin<=num_samples_each_grp:
        cmd = ["s-plot barPlot -m TRUE -a Sample -R 45 -w 30 -d dodge -f ", 
                summary_op, '-P none -L', x_level, 
                ' -y  \'C2T conversion rate (%)\' -x \'Samples\' ']
    else:
        height = sample_readin // 5
        if height < 10:
            height = 10
        cmdL = ["s-plot barPlot -m TRUE -f", summary_op, "-a Sample", 
                "-L", x_level, 
                "-d dodge -y 'C2T conversion rate (%)' -x 'Samples' -P none -w 20",
                "-F TRUE -u", str(height)
               ]
    #print ' '.join(cmd)
    os.system(' '.join(cmd))
#--------------------------------------

def generateDoc(report_dir, report_sub_dir, op, sample_readin):
    dest_dir = report_dir+'/'+report_sub_dir+'/'
    os.system('mkdir -p '+dest_dir)
    pdf = op+'.dodgeBars.pdf'
    copypdf(dest_dir, pdf)


    print "\n## Bisulfite conversion rate\n"
    curation_label = "Bisulfite_conversion_rate"
    knitr_read_txt(report_dir,  curation_label)
    
    print """当用亚硫酸盐处理单链DNA时，未被修饰的胞嘧啶会脱氨基变为尿嘧啶，而甲基化和羟甲基化修饰的胞嘧啶则不受影响。这是亚硫酸盐测序DNA甲基化的基本原理。在样品处理过程中，会人为加入不包含甲基化胞嘧啶的外源DNA序列，又称lambda DNA。通过计算lambda DNA中被转换的胞嘧啶相对于所有检测到的胞嘧啶的比例可以评估亚硫酸盐处理的效率。一般转换率在99%以上。Bisulfite conversion involves the deamination of unmodified cytosines to uracil, leaving the modified bases 5-mC and 5-hmC as cytosines.

    样品亚硫酸盐处理效率如 Figure \@ref(fig:bisulfite-conversion-rate-fig)所示。

"""
    
    pdf = report_sub_dir+'/'+os.path.split(pdf)[-1]
    print "(ref:bisulfite-conversion-rate-fig) Summary of bisulfite conversion rate. \
[PDF]({})\n".format(pdf)

    png = pdf.replace('pdf', 'png')
    print '''```{{r bisulfite-conversion-rate-fig, fig.cap="(ref:bisulfite-conversion-rate-fig)"}}
knitr::include_graphics("{png}")
```
'''.format(png=png)

#--------------------------------


def main():
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    file = options.filein
    fileL = re.split(r'[, ]*', file.strip())
    sample_readin = len(fileL)
    label = options.label
    labelL = re.split(r'[, ]*', label.strip())
    verbose = options.verbose
    op = options.out_prefix
    report_dir = options.report_dir
    report_sub_dir = options.report_sub_dir
    global debug
    debug = options.debug
    doc_only = options.doc_only
    num_samples_each_grp = options.number
    #-----------------------------------
    #curation_label = os.path.split(sys.argv[0])[-1].replace('.', '_')
    summary_op = op + '.summary.xls'
    if doc_only:
        generateDoc(report_dir, report_sub_dir, summary_op, sample_readin)
        return 0
    summary_op_fh = open(summary_op, 'w')
    print >>summary_op_fh, "variable\tvalue\tSample"
    for file in fileL:
        header = 1
        for line in open(file):
            if header:
                header -= 1 
                continue
            cov_rate, sample = line.split()
            print >>summary_op_fh, "\t".join(["C2T rate", cov_rate, sample])
    summary_op_fh.close()

    plot(summary_op, num_samples_each_grp, sample_readin, labelL)

    generateDoc(report_dir, report_sub_dir, summary_op, sample_readin)
    ###--------multi-process------------------

if __name__ == '__main__':
    startTime = strftime(timeformat, localtime())
    main()
    endTime = strftime(timeformat, localtime())
    fh = open('python.log', 'a')
    print >>fh, "%s\n\tRun time : %s - %s " % \
        (' '.join(sys.argv), startTime, endTime)
    fh.close()
    ###---------profile the program---------
    #import profile
    #profile_output = sys.argv[0]+".prof.txt")
    #profile.run("main()", profile_output)
    #import pstats
    #p = pstats.Stats(profile_output)
    #p.sort_stats("time").print_stats()
    ###---------profile the program---------