From 79374b48940ef7a0f22e0827f6fd4081c82ec3d5 Mon Sep 17 00:00:00 2001
From: cytham <e0054302@u.nus.edu>
Date: Thu, 19 Mar 2020 14:53:32 +0800
Subject: [PATCH] bump v1.3.4

---
 CHANGELOG.txt      |   4 +
 nanovar/nanovar    | 479 +++++++++++++++++++++++++++++++++++++++++++++
 nanovar/version.py |   2 +-
 3 files changed, 484 insertions(+), 1 deletion(-)
 create mode 100644 nanovar/nanovar

diff --git a/CHANGELOG.txt b/CHANGELOG.txt
index 3139dcd..fb483db 100644
--- a/CHANGELOG.txt
+++ b/CHANGELOG.txt
@@ -4,6 +4,10 @@ NanoVar Changelog
 Release Summary:
 
 
+Version 1.3.4 - Mar 19, 2020
+    * Fixed missing nanovar script
+
+
 Version 1.3.3 - Mar 19, 2020
     * Upgraded model (ONT (Guppy) - v1, pacbio (CLR or CSS) - v1)
     * Modified normal read breakpoint buffer from 400 to 100 to include shorter alignments as breakend-opposing reads
diff --git a/nanovar/nanovar b/nanovar/nanovar
new file mode 100644
index 0000000..17074d1
--- /dev/null
+++ b/nanovar/nanovar
@@ -0,0 +1,479 @@
+#!/usr/bin/env python3
+
+"""
+NanoVar
+
+This is the main executable file of the program NanoVar.
+
+Copyright (C) 2019 Tham Cheng Yong
+
+NanoVar is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or
+(at your option) any later version.
+
+NanoVar is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License
+along with NanoVar.  If not, see <https://www.gnu.org/licenses/>.
+"""
+
+
+__author__ = 'CY Tham'
+import os
+import sys
+import time
+import pysam
+import logging
+import nanovar
+import random
+import threading
+from datetime import datetime
+from nanovar import __version__, input_parser, gzip_check, fastx_valid, bed_valid, check_exe, check_index
+from progress.spinner import Spinner
+
+
+def main():
+    # Parse arguments
+    args = input_parser()
+    file_path = args.input
+    ref_path = args.ref
+    ref_name = os.path.basename(ref_path).rsplit('.', 1)[0]
+    wk_dir = args.dir
+    data_type = args.data_type
+    genome_filter = args.filter_bed
+    minlen = args.minlen
+    splitpct = args.splitpct
+    minalign = args.minalign
+    mincov = args.mincov
+    buff = args.buffer
+    score_threshold = args.score
+    homo_t = args.homo
+    het_t = args.hetero
+    threads = args.threads
+    debug = args.debug
+    quiet = args.quiet
+    force = args.force
+    model_path = args.model
+    filter_bed_dir = os.path.join(os.path.dirname(nanovar.__file__), 'gaps')
+    mm = args.mm
+    st = args.st
+    mdb = args.mdb
+    wmk = args.wmk
+    hsb = args.hsb
+
+    # Check model file
+    if model_path is None:
+        if data_type == 'ont':
+            model_path = os.path.join(os.path.dirname(nanovar.__file__), 'model',
+                                      'ANN.E100B400L3N12-5D0.4-0.2SGDsee11_het_gup_v1.h5')
+        elif data_type == 'pacbio-clr':
+            model_path = os.path.join(os.path.dirname(nanovar.__file__), 'model',
+                                      'ANN.E100B400L3N12-5D0.4-0.2SGDsee11_het_clr_v1.h5')
+        elif data_type == 'pacbio-ccs':
+            model_path = os.path.join(os.path.dirname(nanovar.__file__), 'model',
+                                      'ANN.E100B400L3N12-5D0.4-0.2SGDsee11_het_ccs_v1.h5')
+        else:
+            logging.critical("Error: Invalid data type given '%s'" % data_type)
+            raise Exception("Error: Invalid data type given '%s'" % data_type)
+    else:
+        if data_type not in ['ont', 'pacbio-clr', 'pacbio-ccs']:
+            logging.info("Invalid data type given '%s', but irrelevant due to custom-built model use." % data_type)
+
+    # Check homo_t > het_t
+    if homo_t <= het_t:
+        raise Exception("Error: --homo threshold %s is less than or equal to --hetero threshold %s" % (str(homo_t), str(het_t)))
+
+    # Check for required executables
+    mm = check_exe(mm, 'minimap2')
+    st = check_exe(st, 'samtools')
+    mdb = check_exe(mdb, 'makeblastdb')
+    wmk = check_exe(wmk, 'windowmasker')
+    hsb = check_exe(hsb, 'hs-blastn')
+
+    # Observe verbosity
+    if quiet:
+        sys.stdout = open(os.devnull, 'w')
+
+    # Assign threads
+    spin_switch, threads_mm, threads_bt, threads_index = assign_threads(force, threads, quiet, ref_path, wk_dir, ref_name)
+
+    # spin_switch = False
+
+    # Print initiation message
+    now = datetime.now()
+    now_str = now.strftime("[%d/%m/%Y %H:%M:%S]")
+    print(now_str, "- NanoVar started")
+    if spin_switch:
+        task = TaskProgress()
+        msg = 'Checking integrity of input files'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        print('Checking integrity of input files -')
+        task = ''
+        thread_spin = ''
+
+    # Setup working directory
+    if not os.path.exists(wk_dir):
+        os.makedirs(wk_dir)
+    if not os.path.exists(os.path.join(wk_dir, 'fig')):
+        os.makedirs(os.path.join(wk_dir, 'fig'))
+
+    # Setup up logging
+    log_file = os.path.join(wk_dir, 'NanoVar-{:%d%m%y-%H%M}.log'.format(datetime.now()))
+    logging.basicConfig(filename=log_file, level=logging.DEBUG, format='[%(asctime)s] - %(levelname)s - %(message)s',
+                        datefmt='%d/%m/%Y %H:%M:%S')
+    logging.info('Initialize NanoVar log file')
+    logging.info('Version: NanoVar-%s' % __version__)
+    logging.info('Command: %s' % ' '.join(sys.argv))
+
+    # Detect read or map file
+    filename = os.path.basename(file_path)
+    read_suffix = ['.fa', '.fq', '.fasta', '.fastq', '.fa.gzip', '.fq.gzip', '.fa.gz', '.fq.gz', '.fasta.gz', '.fastq.gz']
+    bam_suffix = '.bam'
+    if any(s in filename.lower() for s in read_suffix):
+        input_name = os.path.basename(file_path).rsplit('.f', 1)[0]
+        input_type = 'raw'
+        # Test gzip compression and validates read file
+        if gzip_check(file_path):
+            # read_para = "<(zcat " + file_path + ")"
+            fastx_check = fastx_valid(file_path, "gz")
+        else:
+            # read_para = "<(cat " + file_path + ")"
+            fastx_check = fastx_valid(file_path, "txt")
+        if fastx_check[0] == "Fail":
+            logging.critical("Error: Input FASTQ/FASTA file is corrupted around line %s +/- 4" % str(fastx_check[1]))
+            raise Exception("Error: Input FASTQ/FASTA file is corrupted around line %s +/- 4" % str(fastx_check[1]))
+        else:
+            logging.debug("Input FASTQ/FASTA file passed")
+    elif bam_suffix in filename.lower():
+        sam = pysam.AlignmentFile(file_path, "rb")
+        try:
+            assert sam.is_bam, "Error: Input BAM file is not a BAM file."
+            input_name = os.path.basename(file_path).rsplit('.bam', 1)[0]
+            input_type = 'bam'
+            fastx_check = []
+        except AssertionError:
+            logging.critical("Error: Input BAM file is not a BAM file.")
+            raise Exception("Error: Input BAM file is not a BAM file.")
+    else:
+        logging.critical("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
+        raise Exception("Error: Input file is not recognised, please ensure file suffix has '.fa' or '.fq' or '.bam'")
+
+    if gzip_check(ref_path):
+        logging.critical("Error: Input reference file is gzipped, please unpack it")
+        raise Exception("Error: Input reference file is gzipped, please unpack it")
+
+    # Logging config info
+    logging.info('Input file: %s' % file_path)
+    logging.info('Read type: %s' % data_type)
+    logging.info('Reference genome: %s' % ref_path)
+    logging.info('Working directory: %s' % wk_dir)
+    logging.info('Model: %s' % model_path)
+    logging.info('Filter file: %s' % genome_filter)
+    logging.info('Minimum number of reads for calling a breakend: %s' % str(mincov))
+    logging.info('Minimum SV len: %s' % str(minlen))
+    logging.info('Mapping percent for split-read: %s' % str(splitpct))
+    logging.info('Length buffer for clustering: %s' % str(buff))
+    logging.info('Score threshold: %s' % str(score_threshold))
+    logging.info('Homozygous read ratio threshold: %s' % str(homo_t))
+    logging.info('Heterozygous read ratio threshold: %s' % str(het_t))
+    logging.info('Number of threads: %s\n' % str(threads))
+    if input_type == 'raw':
+        logging.info('Total number of reads in FASTQ/FASTA: %s\n' % str(fastx_check[1]))
+    elif input_type == 'bam':
+        logging.info('Total number of reads in FASTQ/FASTA: -\n')
+    logging.info('NanoVar started')
+
+    from Bio import SeqIO
+    from collections import OrderedDict
+    # Process reference genome
+    contig_len_dict = OrderedDict()
+    total_gsize = 0
+    for seq_record in SeqIO.parse(ref_path, "fasta"):
+        contig_len_dict[seq_record.id] = len(seq_record)
+        total_gsize += len(seq_record)
+
+    # Check contig id for invalid symbols
+    contig_omit = checkcontignames(contig_len_dict)
+
+    # Validate filter BED file
+    if genome_filter is not None:
+        if genome_filter in ('hg38', 'hg19', 'mm10'):
+            filter_path = os.path.join(filter_bed_dir, genome_filter + '_filter.bed')
+        else:
+            filter_path = genome_filter
+        if os.path.isfile(filter_path):
+            if bed_valid(filter_path, contig_len_dict):
+                logging.debug("Genome filter BED passed")
+        else:
+            logging.critical("Error: Genome filter BED %s is not found" % filter_path)
+            raise Exception("Error: Genome filter BED %s is not found" % filter_path)
+    else:
+        filter_path = genome_filter
+
+    # Update progress
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        print('')
+        if len(contig_omit) > 0:
+            print('Warning: The following contig id(s) contains invalid symbols [ ~ : - ] %s. Reads mapping to these contig(s) '
+                  'will be ignored.' % ', '.join(['>' + x for x in contig_omit]))
+            logging.warning('Warning: The following contig id(s) contains invalid symbols [ ~ : - ] %s. Reads mapping to these '
+                            'contig(s) will be ignored.' % ', '.join(['>' + x for x in contig_omit]))
+        task = TaskProgress()
+        msg = 'Indexing genome and aligning reads'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        if len(contig_omit) > 0:
+            print('Warning: The following contig id(s) contains invalid symbols [ ~ : - ] %s. Reads mapping to these contig(s) '
+                  'will be ignored.' % ', '.join(['>' + x for x in contig_omit]))
+            logging.warning('Warning: The following contig id(s) contains invalid symbols [ ~ : - ] %s. Reads mapping to these '
+                            'contig(s) will be ignored.' % ', '.join(['>' + x for x in contig_omit]))
+        print('Indexing genome and aligning reads -')
+
+    # Pre-indexing
+    from nanovar.nv_align import make_index, align_mm, align_hsb
+    if threads_index == 1:
+        indexing = threading.Thread(target=make_index, args=(force, ref_path, wk_dir, ref_name, mdb, wmk, hsb))
+        indexing.daemon = True
+        indexing.start()
+    else:
+        indexing = ''
+
+    # Aligning using minimap2
+    if input_type == 'raw':
+        logging.info('Read alignment using minimap2')
+        mma = align_mm(ref_path, file_path, wk_dir, input_name, ref_name, threads_mm, mm, data_type, st)
+        bam_path = mma[1]
+    elif input_type == 'bam':
+        logging.info('Input BAM file, skipping minimap2 alignment')
+        mma = ['-', '']
+        bam_path = file_path
+    else:
+        raise Exception("Error: Internal error, input_type: %s unknown" % input_type)
+
+    # Update progress
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        print('')
+        task = TaskProgress()
+        msg = 'Analyzing read alignments and detecting SVs'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        print('Analyzing read alignments and detecting SVs -')
+
+    # Parse and detect SVs
+    logging.getLogger("matplotlib").setLevel(logging.WARNING)
+    from nanovar.nv_characterize import VariantDetect
+    logging.info('Parsing BAM and detecting SVs')
+    run = VariantDetect(wk_dir, bam_path, splitpct, minalign, filter_path, minlen, buff, model_path,
+                        total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, mma[0],
+                        mincov, homo_t, het_t, debug, contig_omit)
+    run.bam_parse_detect()
+    run.coverage_stats()
+
+    logging.info('Total number of mapped reads: %s\n' % str(run.maps))
+
+    # Update progress
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        print('')
+        task = TaskProgress()
+        msg = 'Clustering SV breakends and inferencing'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        print('Clustering SV breakends and inferencing -')
+
+    # SV breakend clustering and extracting INS and INV SVs
+    run.cluster_extract()
+
+    # Update progress
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        print('')
+        task = TaskProgress()
+        msg = 'Re-evaluating INS and INV SVs'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        print('Re-evaluating INS and INV SVs -')
+
+    # Wait for indexing or make index
+    if threads_index == 1:
+        indexing.join()
+    elif threads_index == 0:
+        make_index(force, ref_path, wk_dir, ref_name, mdb, wmk, hsb)
+
+    # Run hsblastn on INS and INV reads
+    hsba = align_hsb(ref_path, wk_dir, ref_name, threads_bt, hsb)
+    sub_run = VariantDetect(wk_dir, hsba[1], splitpct, minalign, filter_path, minlen, buff, model_path,
+                            total_gsize, contig_len_dict, score_threshold, file_path, input_name, ref_path, ref_name, hsba[0],
+                            mincov, homo_t, het_t, debug, contig_omit)
+
+    # Parsing INS and INV SVs and clustering
+    sub_run.rlendict = run.rlendict
+    sub_run.parse_detect_hsb()
+    logging.info('Parsing BAM and detecting INV and INS SVs')
+    run.cluster_nn(add_out=sub_run.total_out)
+
+    # Update progress
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        print('')
+        task = TaskProgress()
+        msg = 'Generating VCF files and report'
+        spinner = Spinner(msg + ' - ')
+        thread_spin = threading.Thread(target=task.run, args=(spinner,))
+        thread_spin.setDaemon(True)
+        thread_spin.start()
+    else:
+        print('Generating VCF files and report -')
+
+    # Write intermediate files
+    run.write2file(add_out=sub_run.total_out)
+
+    # Generating VCF and HTML report
+    run.vcf_report()
+    logging.info('NanoVar ended')
+    now = datetime.now()
+    if spin_switch:
+        task.endspin()
+        thread_spin.join()
+        time.sleep(1)
+        now_str = now.strftime("\n[%d/%m/%Y %H:%M:%S]")
+    else:
+        now_str = now.strftime("[%d/%m/%Y %H:%M:%S]")
+    print(now_str, "- NanoVar ended")
+
+
+# Check contig name
+def checkcontignames(contig_len_dict):
+    contig_omit = {}
+    for contig in contig_len_dict:
+        if "~" in contig or ":" in contig or "-" in contig:
+            contig_omit[contig] = [0, int(contig_len_dict[contig])]
+    return contig_omit
+
+
+# Progress message
+class TaskProgress:
+
+    def __init__(self):
+        self.prog_spin = True
+
+    def endspin(self):
+        self.prog_spin = False
+
+    def run(self, spin):
+        while self.prog_spin:
+            spin.next()
+            sleep()
+
+
+# Randomize sleep timing
+def sleep():
+    t = 0.05
+    t += t * random.uniform(-0.1, 0.1)
+    time.sleep(t)
+
+
+# Assign thread usage according to number of threads
+def assign_threads(force, threads, quiet, ref_path, wk_dir, ref_name):
+    if force:
+        if threads == 1:
+            spin_switch = False
+            threads_mm = 1
+            threads_bt = 1
+            threads_index = 0
+        elif threads == 2:
+            spin_switch = False
+            threads_mm = 1
+            threads_bt = 2
+            threads_index = 1
+        else:  # threads > 2
+            if quiet:
+                spin_switch = False
+                threads_mm = threads - 1
+                threads_bt = min(threads, 53)
+                threads_index = 1
+            else:
+                spin_switch = True
+                threads_mm = threads - 2
+                threads_bt = min(threads - 1, 53)
+                threads_index = 1
+    else:
+        # Check indexes
+        index_present = check_index(ref_path, wk_dir, ref_name)
+        if threads == 1:
+            spin_switch = False
+            threads_mm = 1
+            threads_bt = 1
+            threads_index = 0
+        elif threads == 2:
+            if index_present:
+                if quiet:
+                    spin_switch = False
+                    threads_mm = 2
+                    threads_bt = 2
+                    threads_index = 0
+                else:
+                    spin_switch = True
+                    threads_mm = 1
+                    threads_bt = 1
+                    threads_index = 0
+            else:
+                spin_switch = False
+                threads_mm = 1
+                threads_bt = 2
+                threads_index = 1
+        else:  # threads > 2
+            if index_present:
+                if quiet:
+                    spin_switch = False
+                    threads_mm = threads
+                    threads_bt = min(threads, 53)
+                    threads_index = 0
+                else:
+                    spin_switch = True
+                    threads_mm = threads - 1
+                    threads_bt = min(threads - 1, 53)
+                    threads_index = 0
+            else:
+                if quiet:
+                    spin_switch = False
+                    threads_mm = threads - 1
+                    threads_bt = min(threads, 53)
+                    threads_index = 1
+                else:
+                    spin_switch = True
+                    threads_mm = threads - 2
+                    threads_bt = min(threads - 1, 53)
+                    threads_index = 1
+    return spin_switch, threads_mm, threads_bt, threads_index
+
+
+if __name__ == "__main__":
+    main()
diff --git a/nanovar/version.py b/nanovar/version.py
index 7b1e312..f9e47b6 100644
--- a/nanovar/version.py
+++ b/nanovar/version.py
@@ -1 +1 @@
-__version__ = "1.3.3"
+__version__ = "1.3.4"