chrometools

#!/usr/bin/env python3

# standard
import os
import argparse
import sys
import subprocess
import gzip
import io # For io.StringIO
import shlex
# custom
from select_random_subset import process_fastq

def bx(args):
    """Placeholder for a barcode ("bx") step; does nothing and returns None.

    Args:
        args: Parsed command-line namespace (currently unused).
    """
    return

def _open_binary(path):
    """Open *path* for binary reading, through gzip when the name ends in 'gz'."""
    opener = gzip.open if path.endswith('gz') else open
    return opener(path, 'rb')


def subsample(args):
    """Subsample barcoded reads from one or more FASTQ inputs.

    Builds a ``process_fastq.ProcessFastQBarCodes`` worker from the barcode
    whitelist and feeds it every input FASTQ in turn, writing everything to
    a single output stream.

    Args:
        args: Parsed command-line namespace providing:
            fastq: one path, a list of paths, or ``-`` (first entry) to
                read from standard input.
            whitelist: path to the barcode whitelist (gzip if it ends
                in ``gz``).
            output_dir: passed to the worker as its ``stats_path``.
            output: ``stdout`` or ``-`` to stream to standard output;
                otherwise gzip text is written to ``<output>.gz``.
            subsample_size, seed, max_read_pairs, verbose: forwarded to
                the worker unchanged.

    Raises:
        ValueError: if no input FASTQ was supplied.
    """
    fastq_paths = args.fastq
    # argparse hands over a plain string unless the positional declares
    # nargs; iterating that string would visit it character by character.
    if isinstance(fastq_paths, str):
        fastq_paths = [fastq_paths]
    if not fastq_paths:
        raise ValueError("no input FASTQ given")

    to_stdout = args.output in {'stdout', '-'}

    # Keep the whitelist open for the whole run (the worker may read it
    # lazily) and guarantee it is closed afterwards.
    with _open_binary(args.whitelist) as whitelist_file:
        worker = process_fastq.ProcessFastQBarCodes(
            barcode_whitelist_file=whitelist_file,
            stats_path=args.output_dir,
            subset_size=args.subsample_size,
            random_seed=args.seed,
            max_read_pairs=args.max_read_pairs,
            verbose=args.verbose)

        if to_stdout:
            fastq_out = sys.stdout
        else:
            fastq_out = gzip.open(args.output + '.gz', 'wt+')
        try:
            if fastq_paths[0] == '-':
                worker.process_fastq(file_in=sys.stdin, file_out=fastq_out)
            else:
                for input_path in fastq_paths:
                    with _open_binary(input_path) as fastq_in:
                        worker.process_fastq(file_in=fastq_in,
                                             file_out=fastq_out)
        finally:
            # Never close the interpreter's stdout: later prints would fail
            # with "I/O operation on closed file".
            if to_stdout:
                fastq_out.flush()
            else:
                fastq_out.close()

    # When the FASTQ itself went to stdout, keep the status line out of it.
    print("Subsampling done.", file=sys.stderr if to_stdout else sys.stdout)
    
def mapping(args):
    """Align reads to a reference with minimap2 and save the SAM output.

    Runs ``minimap2 -ax sr -t <threads> <ref> <fastq>`` and writes whatever
    it prints on standard output to ``args.sam``.

    Args:
        args: Parsed command-line namespace providing ``threads``, ``ref``,
            ``fastq`` and ``sam`` (the output path).

    Raises:
        subprocess.CalledProcessError: if minimap2 exits with a non-zero
            status.
    """
    # Build the argument vector directly instead of interpolating into a
    # string and re-splitting it: paths containing spaces or quotes would
    # otherwise be broken into several arguments.
    command = [
        "minimap2", "-ax", "sr",
        "-t", str(args.threads),  # the parser default is the int 1
        args.ref,
        args.fastq,
    ]
    sam_text = subprocess.check_output(command).decode("utf-8")
    with open(args.sam, 'w') as sam:
        sam.write(sam_text)
    print("Mapping done.")

def filtering(args):
    """Placeholder for the alignment-quality filter step.

    Only announces itself on standard output; ``args`` is not used yet.
    """
    step_label = "filter"
    print(step_label)

def sorting(args):
    """Placeholder for the alignment sorting step.

    Only announces itself on standard output; ``args`` is not used yet.
    """
    step_label = "sorting"
    print(step_label)

def molecule(args):
    """Run the bundled MolecLenEst script and save its output.

    Executes ``MolecEst/MolecLenEst.py -b <input>`` from the directory that
    holds this file and writes the script's standard output to
    ``args.output``.

    Args:
        args: Parsed command-line namespace providing ``input`` (passed to
            the script's ``-b`` option) and ``output`` (destination path).

    Raises:
        subprocess.CalledProcessError: if the script exits with a non-zero
            status.
    """
    here = os.path.dirname(os.path.realpath(__file__))
    script = os.path.join(here, "MolecEst", "MolecLenEst.py")
    # Pass an argument list rather than a re-split string so an install
    # directory or input path containing spaces stays a single argument.
    command = [script, "-b", args.input]
    estimate_text = subprocess.check_output(command).decode("utf-8")
    with open(args.output, 'w') as output:
        output.write(estimate_text)
    print("Molecule length estimation done")

def stats(args):
    """Placeholder for the molecule/barcode statistics step.

    Only announces itself on standard output; ``args`` is not used yet.
    """
    step_label = "stats"
    print(step_label)

def report(args):
    """Placeholder for the report generation step.

    Only announces itself on standard output; ``args`` is not used yet.
    """
    step_label = "report"
    print(step_label)

def multiqc(args):
    """Compile a MultiQC report inside ``args.directory``.

    Changes the process working directory to ``args.directory`` (it is not
    restored afterwards) and runs MultiQC there with the
    ``multiqc_config_chromeqc.yaml`` configuration. The exit status of
    MultiQC is not inspected.

    Args:
        args: Parsed command-line namespace providing ``directory``.
    """
    report_dir = args.directory
    os.chdir(report_dir)
    multiqc_argv = shlex.split('multiqc -f -c multiqc_config_chromeqc.yaml .')
    subprocess.call(multiqc_argv)
    print("Multiqc report compilation complete.")

def chromeqc_pipeline(args):
    """Placeholder for the full chromeqc pipeline.

    Only announces itself on standard output; ``args`` is not used yet.
    """
    # TODO: time each part of the pipeline once the steps are wired in.
    step_label = "all pipeline"
    print(step_label)

def _parse_bool(text):
    """Convert a command-line token such as ``true``, ``0`` or ``no`` to a bool.

    ``type=bool`` cannot be used for this: ``bool("False")`` is True because
    every non-empty string is truthy.
    """
    token = text.strip().lower()
    if token in ('true', 't', 'yes', 'y', '1', 'on'):
        return True
    if token in ('false', 'f', 'no', 'n', '0', 'off', ''):
        return False
    raise argparse.ArgumentTypeError("expected a boolean value, got %r" % text)


def main(argv):
    """Parse the command line and dispatch to the selected subcommand.

    Args:
        argv: Full argument vector including the program name at index 0
            (normally ``sys.argv``). ``None`` falls back to ``sys.argv``.

    Exits with status 2 after printing the help text when no subcommand
    is given.
    """
    if argv is None:
        argv = sys.argv

    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers()

    # chromeqc: whole pipeline
    chromeqc_subparser = subparsers.add_parser('chromeqc', help='run the chromeqc pipeline')
    chromeqc_subparser.add_argument('fastq', nargs='*', help='paths to raw FASTQ files')
    chromeqc_subparser.add_argument('output_dir', help='output directory')
    chromeqc_subparser.add_argument('-t','--threads', type=str, default=1, help='number of threads')
    chromeqc_subparser.add_argument('-m','--memory', type=str, default=-1, help='max memory usage in gb')
    chromeqc_subparser.set_defaults(func=chromeqc_pipeline)

    # subsample
    subsample_subparser = subparsers.add_parser('subsample', help='extracts barcode from read sequences')
    # subsample() indexes and iterates this value as a list of paths, so it
    # must be collected with nargs; a single path still works.
    subsample_subparser.add_argument('fastq', nargs='+', help='paths raw chromium FASTQ files')
    subsample_subparser.add_argument('whitelist', type=str, help='path to whitelisted barcodes')
    subsample_subparser.add_argument('output_dir', type=str, help='output dir')
    subsample_subparser.add_argument('output', type=str, help='output file')
    subsample_subparser.add_argument('-k', '--subsample_size', default=4000, type=int, help='preprocess size')
    subsample_subparser.add_argument('-s', '--seed', default=1334, type=int, help='seed for random sampling')
    subsample_subparser.add_argument('-p', '--max_read_pairs', default=-1, type=int, help='max number of read pairs')
    subsample_subparser.add_argument('-v', '--verbose', default=False, type=_parse_bool, help='verbose output')
    subsample_subparser.set_defaults(func=subsample)

    # map
    map_subparser = subparsers.add_parser('map', help='map reads onto reference genome')
    map_subparser.add_argument('fastq', help='preprocessed chromium FASTQ files')
    map_subparser.add_argument('ref', type=str, help='reference genome')
    map_subparser.add_argument('sam', type=str, help='output SAM file path')
    map_subparser.add_argument('-t','--threads', type=str, default=1, help='number of threads')
    map_subparser.set_defaults(func=mapping)

    # filter
    filter_subparser = subparsers.add_parser('filter', help='filter by alignment quality')
    filter_subparser.add_argument('input', help='input file')
    filter_subparser.add_argument('output_dir', help='output directory')
    filter_subparser.set_defaults(func=filtering)

    # sort
    sort_subparser = subparsers.add_parser('sort', help='group alignments by barcode, chromosome, position')
    sort_subparser.add_argument('input', help='input file')
    sort_subparser.add_argument('output_dir', help='output directory')
    sort_subparser.set_defaults(func=sorting)

    # molecule
    molecule_subparser = subparsers.add_parser('molecule', help='group reads into molecules')
    molecule_subparser.add_argument('input', help='BAM or SAM file')
    molecule_subparser.add_argument('output', help='output TSV')
    molecule_subparser.set_defaults(func=molecule)

    # stats
    stats_subparser = subparsers.add_parser('stats', help='compute molecule and barcode statistics')
    stats_subparser.add_argument('alignment', nargs='*', help='path to chromium reads mapment')
    stats_subparser.add_argument('output_dir', type=str, help='output dir')
    stats_subparser.set_defaults(func=stats)

    # report
    report_subparser = subparsers.add_parser('report', help='generate the report')
    report_subparser.add_argument('input', help='input file')
    report_subparser.add_argument('output_dir', help='output directory')
    report_subparser.set_defaults(func=report)

    # multiqc
    multiqc_subparser = subparsers.add_parser('multiqc', help='aggregate multiple reports')
    multiqc_subparser.add_argument('directory', help='directory containing report files')
    multiqc_subparser.set_defaults(func=multiqc)

    # Honour the argv that was passed in; index 0 is the program name.
    args = parser.parse_args(argv[1:])

    # Subcommands are optional for argparse, so without one there is no
    # handler to call; show the usage instead of failing on args.func.
    handler = getattr(args, 'func', None)
    if handler is None:
        parser.print_help(sys.stderr)
        sys.exit(2)
    handler(args)

# Run the command-line interface only when this file is executed as a
# script, not when it is imported; main() receives the full sys.argv.
if __name__ == "__main__":
    main(sys.argv)