-
Notifications
You must be signed in to change notification settings - Fork 42
/
RPKM_saturationSummary.py
executable file
·243 lines (209 loc) · 9.88 KB
/
RPKM_saturationSummary.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import division, with_statement
'''
Copyright 2015, 陈同 (chentong_biology@163.com).
===========================================================
'''
__author__ = 'chentong & ct586[9]'
__author_email__ = 'chentong_biology@163.com'
#=========================================================
desc = '''
Program description:
This is designed to summarize results output by `RPKM_saturation.py`.
'''
import sys
import os
from json import dumps as json_dumps
from time import localtime, strftime
timeformat = "%Y-%m-%d %H:%M:%S"
from optparse import OptionParser as OP
import re
from tools import *
#from multiprocessing.dummy import Pool as ThreadPool
#from bs4 import BeautifulSoup
reload(sys)
sys.setdefaultencoding('utf8')
debug = 0
def fprint(content):
    """Pretty-print *content* to stdout as indented JSON.

    Args:
        content: any object serializable by ``json.dumps``
            (dict, list, str, numbers, ...).

    Returns:
        None. Output goes to stdout only.

    Raises:
        TypeError: if *content* is not JSON-serializable.
    """
    print json_dumps(content,indent=1)
def cmdparameter(argv):
    """Parse command-line parameters for this script.

    Args:
        argv (list): argument vector, normally ``sys.argv``.

    Returns:
        tuple: ``(options, args)`` as produced by optparse.

    Side effects:
        When called with no arguments, prints the module description to
        stderr, shells out to show ``-h`` help, and exits with status 1.

    Raises:
        AssertionError: if the mandatory ``-f`` option is missing.
    """
    if len(argv) == 1:
        # sys.stderr.write works under both Python 2 and 3 (the original
        # `print >>sys.stderr` is Python-2-only syntax).
        sys.stderr.write(desc + '\n')
        os.system('python ' + argv[0] + ' -h')
        sys.exit(1)
    usages = "%prog -f file"
    parser = OP(usage=usages)
    parser.add_option("-f", "--files", dest="filein",
        metavar="FILEIN", help="`,` or ` ` separated a list of files. *.Log.final.out generated by `STAR` during mapping")
    parser.add_option("-l", "--labels", dest="label",
        metavar="LABEL", help="`,` or ` ` separated a list of labels to label each file. It must have same order as files.")
    parser.add_option("-o", "--output-prefix", dest="out_prefix",
        help="The prefix of output files. UNUSED")
    parser.add_option("-c", "--map-summary-cnt-file", dest="cnt",
        help="A matrix file with first column as sample name and last column as \
counts of mapped reads. Normally this file is output by <STAR_read_mapSummary.py>.")
    parser.add_option("-r", "--report-dir", dest="report_dir",
        default='report', help="Directory for report files. Default 'report'.")
    parser.add_option("-R", "--report-sub-dir", dest="report_sub_dir",
        default='2_mapping_quality', help="Directory for saving report figures and tables. This dir will put under <report_dir>, so only dir name is needed. Default '2_mapping_quality'.")
    parser.add_option("-d", "--doc-only", dest="doc_only",
        default=False, action="store_true", help="Specify to only generate doc.")
    parser.add_option("-n", "--number", dest="number", type="int",
        default=40, help="Set the maximum allowed samples for barplot. Default 40.\
 If more than this number of samples are given, heatmap will be used instead. UNUSED.")
    parser.add_option("-v", "--verbose", dest="verbose",
        action="store_true", help="Show process information")
    parser.add_option("-D", "--debug", dest="debug",
        default=False, action="store_true", help="Debug the program")
    (options, args) = parser.parse_args(argv[1:])
    # Fixed: the message previously referenced "-i", but the option is "-f".
    assert options.filein is not None, "A filename needed for -f"
    return (options, args)
#--------------------------------------------------------------------
def readTwoColumnFile(fileL, labelL, header=0, index_col=0):
    """Read one table per file and merge them column-wise.

    Each file is read with pandas (tab-separated), its single data column
    is renamed to the matching label, and all frames are concatenated
    along axis=1 (aligned on the index column).

    Args:
        fileL (list): paths of input tables.
        labelL (list): column labels, same order as fileL.
        header (int): header row passed to pandas. Default 0.
        index_col (int): index column passed to pandas. Default 0.

    Returns:
        pandas.DataFrame: merged matrix, one column per input file.
    """
    def _load(path, name):
        # Read a single table and relabel its value column after the sample.
        frame = pd.read_table(path, header=header, index_col=index_col)
        frame.columns = [name]
        return frame

    return pd.concat([_load(path, name) for path, name in zip(fileL, labelL)],
                     axis=1)
#--------------------------------------
def plot(fileL):
    """Draw one barplot per input file via the external `s-plot` tool.

    Args:
        fileL (list): data files to plot; each is passed to
            ``s-plot barPlot -f <file>`` through the shell.

    Returns:
        None. Figures are produced as side effects of the shell calls.
    """
    for path in fileL:
        # Same command string as before, built with %-formatting.
        os.system("s-plot barPlot -f %s" % path)
#--------------------------------------
def plot_melt(total_melt, nameL):
    """Barplot a melted matrix with `s-plot`, keeping x-axis sample order.

    Args:
        total_melt (str): path of the melted data file.
        nameL (list): sample names in the desired x-axis order; they are
            single-quoted and joined into the `-L` level string.

    Returns:
        None. The plot is produced by an external shell command.
    """
    # Build the quoted level list, e.g. "'s1','s2','s3'" wrapped in
    # double quotes for the shell.
    quoted = ','.join("'" + name + "'" for name in nameL)
    x_level = '"' + quoted + '"'
    pieces = ["s-plot barPlot -m TRUE -a Sample -R 90 -B set -O 1 -w 20 -u 25 -f ",
              total_melt, ' -k free_y -L', x_level,
              ' -y \'Reads count or relative percent\' -x \'Samples\' ']
    os.system(' '.join(pieces))
#--------------------------------------
def plot_heatmap(totalTable):
    """Render a clustered heatmap of *totalTable* via `s-plot heatmapS`.

    Args:
        totalTable (str): path of the matrix file to plot.

    Returns:
        None. The heatmap is produced by an external shell command.
    """
    parts = ["s-plot heatmapS -a TRUE -b TRUE -R TRUE",
             "-x white -y blue -u 18 -v 30 -F 12 ",
             "-f ", totalTable, "-I RPM"]
    os.system(' '.join(parts))
#---------------------------------------
def generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD):
    """Emit the markdown/knitr section for sequencing-saturation figures.

    Copies each sample's ``<file>.saturation.pdf`` into the report
    directory and prints (to stdout) the bookdown text and R chunks that
    embed the corresponding PNGs, three samples per figure.

    Args:
        report_dir (str): top-level report directory.
        report_sub_dir (str): figure sub-directory created under report_dir.
        fileL (list): input file stems; ``<stem>.saturation.pdf`` is
            expected to exist for each one.
        labelL (list): sample labels, same order as fileL.
        cntD (dict): sample label -> mapped-read count (string), used to
            annotate the figure captions.

    Returns:
        None. All output is printed to stdout or written via os.system.
    """
    dest_dir = report_dir+'/'+report_sub_dir+'/'
    os.system('mkdir -p '+dest_dir)
    # Section heading (Chinese: "Sequencing saturation estimation").
    print "\n## 测序饱和度评估 {#Sequencing-saturation-estimation}\n"
    curation_label = "Sequencing_saturation_estimation"
    # knitr_read_txt comes from `from tools import *`; presumably injects
    # curated text for this section — confirm in the tools module.
    knitr_read_txt(report_dir, curation_label)
    # Fixed explanatory text (Chinese) describing the saturation analysis
    # and the Q1-Q4 expression quartiles.
    print """样品测序饱和度评估。
首先把基因按其表达量分为4组,Q1(表达量最低的25%)、Q2(表达量在25%-50%之间)、Q3(表达量在50%-75%之间)、Q4(表达量最高的25%);然后从全部的比对reads中随机抽取5%、10%、15%、...、85%、90%、100%,利用随机抽取的reads计算基因的表达量,并与采用全部序列计算的表达量相比较,得到随机抽取条件(也就是在不同测序量条件)下的表达值偏差,并以箱线图的形式展示。
$$
Percent Relative Error = \\frac{RPKM_{abs}-RPKM_{real}}{RPKM_{real}} * 100
$$
当我们从总测序READs中抽取80%或90%的序列,计算出的表达量与使用全部测序READs计算出的表达量的偏差为0时,就说明我们测序的READs已达到饱和了,而且没有浪费太多的测序READs。
当我们从总测序READs中抽取30%或50%的序列,计算出的表达量与使用全部测序READs计算出的表达量的偏差为0时,也说明我们测序的READs已达到饱和了,但是浪费了一半的测序READs。以后再设计实验时就可以降低测序量,减少费用。
通常情况下,Q4区间的基因因为表达量最高,更容易达到饱和;而Q1区间的基因因为表达量最低,则需要测序很深才能达到饱和。
* Q1 (0-25%): Transcripts with expression level ranked below 25 percentile.
* Q2 (25-50%): Transcripts with expression level ranked between 25 percentile and 50 percentile.
* Q3 (50-75%): Transcripts with expression level ranked between 50 percentile and 75 percentile.
* Q4 (75-100%): Transcripts with expression level ranked above 75 percentile.
"""
    len_fileL = len(fileL)
    # NOTE(review): `group` looks like the intended chunk size, but the
    # loop below hard-codes 3 instead of using it.
    group = 3
    for i in range(0, len_fileL, 3):
        # Handle three samples per emitted figure block.
        subF = fileL[i:i+3]
        subL = labelL[i:i+3]
        sub_cntL = [h+" ("+cntD[h]+" usable reads)" for h in subL]
        pdfL = [j+'.saturation.pdf' for j in subF]
        # copypdf comes from `from tools import *`; presumably copies the
        # PDFs into dest_dir — confirm in the tools module.
        copypdf(dest_dir, *pdfL)
        len_subF = len(subF)
        # Re-point paths at the copied files, relative to the report dir.
        pdfL = [report_sub_dir+'/'+os.path.split(j)[-1] for j in pdfL]
        # Matching PNG names; note this replaces the FIRST 'pdf' occurrence
        # anywhere in the path, not only the extension.
        pngL = [j.replace('pdf', 'png') for j in pdfL]
        pdf_link = []  # markdown links: [label_pdf](pdf) ...
        for pdf, label in zip(pdfL, subL):
            tmp_155 = '['+label+'_pdf]'+'('+pdf+')'
            pdf_link.append(tmp_155)
        pdf_link = ' '.join(pdf_link)
        # Bookdown text-reference caption consumed by the R chunk below.
        print "(ref:read-saturation-fig-{}) Summary sequencing saturation of each samples. From left to right, the samples are **{}**。 纵轴表示Percent Relative Error,值越低越好,也就是说箱线图的中位数或整个箱线图越接近0越好,说明测序越饱和。横轴表示抽取的READs数占总READs数的比例,模拟的是低测序通量情况下计算的基因表达的偏差。 {}\n".format(i, ', '.join(sub_cntL), pdf_link)
        pngFileL = []  # single-quoted png paths for knitr::include_graphics
        for png in pngL:
            tmp_164 = "'"+png+"'"
            pngFileL.append(tmp_164)
        pngFileL = ', '.join(pngFileL)
        # R chunk embedding the PNGs; width splits 100% across the group
        # (true division is active via `from __future__ import division`).
        print '''```{{r read-saturation-fig-{label}, out.width="{width}%", fig.cap="(ref:read-saturation-fig-{label})"}}
knitr::include_graphics(c({png}))
```
'''.format(label=i, png=pngFileL, width=int(100/len_subF))
#--------------------------------
def read_cnt_file(cnt_file):
    """Read a mapping-summary matrix into a sample -> read-count dict.

    The file is tab-separated with one header line; the first column is
    the sample name and the LAST column is the mapped-read count. Counts
    are kept as strings (they are only interpolated into report text).

    Args:
        cnt_file (str): path of the count matrix file.

    Returns:
        dict: sample name -> read count (str).

    Raises:
        AssertionError: if a sample name appears more than once.
        IOError: if the file cannot be opened.
    """
    cntD = {}
    # Fixed: the original left the file handle open; `with` guarantees
    # it is closed even on error.
    with open(cnt_file) as fh:
        header = 1
        for line in fh:
            if header:
                # Skip the single header line.
                header -= 1
                continue
            lineL = line.strip().split('\t')
            sample = lineL[0]
            reads_cnt = lineL[-1]
            assert sample not in cntD, "Duplicate "+sample
            cntD[sample] = reads_cnt
    return cntD
#----------------------------
def main():
    """Entry point: parse options, read counts, and emit the report doc.

    Reads the command line via cmdparameter(), loads the mapped-read
    counts, then prints the saturation-report section via generateDoc().
    """
    options, args = cmdparameter(sys.argv)
    #-----------------------------------
    fileL = re.split(r'[, ]*', options.filein.strip())
    sample_readin = len(fileL)
    labelL = re.split(r'[, ]*', options.label.strip())
    verbose = options.verbose
    op = options.out_prefix
    cntD = read_cnt_file(options.cnt)
    report_dir = options.report_dir
    report_sub_dir = options.report_sub_dir
    global debug
    debug = options.debug
    doc_only = options.doc_only
    num_samples_each_grp = options.number
    # Flag whether the sample count is small enough for a melted barplot.
    melt = 1 if sample_readin <= num_samples_each_grp else 0
    #-----------------------------------
    aDict = {}
    if doc_only:
        generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD)
        return 0
    # Non-doc-only path currently also only generates the document.
    generateDoc(report_dir, report_sub_dir, fileL, labelL, cntD)
###--------multi-process------------------
if __name__ == '__main__':
    # Record wall-clock start/end times and append them to python.log.
    startTime = strftime(timeformat, localtime())
    main()
    endTime = strftime(timeformat, localtime())
    with open('python.log', 'a') as log_fh:
        log_fh.write("%s\n\tRun time : %s - %s \n" %
                     (' '.join(sys.argv), startTime, endTime))
    # To profile, wrap main() with cProfile and inspect with pstats.