
Commit

bump v1.3.7
cytham committed May 23, 2020
1 parent 55acb49 commit 5065d65
Showing 12 changed files with 519 additions and 354 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.txt
@@ -4,6 +4,16 @@ NanoVar Changelog
Release Summary:


Version 1.3.7 - May 23, 2020
* Changed version import approach in setup.py
* All SV classes except deletions now undergo secondary analysis by hsblastn alignment
* Allowed clustering of Nov_Ins to other SV classes
* Added depth of coverage information in VCF header
* Fixed SV index duplications
* Added BND limitation to README.md
* Removed "number_of_maps" from confidence score equation in nv_nn.py to capture SVs consisting of repetitive elements (see the sketch below)
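A hypothetical sketch of that last change, assuming the confidence score once down-weighted the neural-network output by a "number_of_maps" penalty; the actual equation in nv_nn.py is not shown in this commit:

```python
# Hypothetical illustration only; nv_nn.py's real equation may differ.
def confidence_score_pre_137(nn_probability: float, number_of_maps: int) -> float:
    # Old behaviour (sketch): reads mapping to many loci were down-weighted,
    # suppressing SVs composed of repetitive elements.
    return nn_probability / number_of_maps

def confidence_score_137(nn_probability: float) -> float:
    # v1.3.7 behaviour (sketch): the "number_of_maps" term is dropped, so
    # repeat-derived SVs can retain a high confidence score.
    return nn_probability
```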


Version 1.3.6 - Apr 18, 2020
* Fixed Exception bug in nv_cluster.py (Thanks to jiadong324, https://github.com/cytham/nanovar/issues/9#issuecomment-609146494)
* Added mincov filter early in nv_cluster.py for faster computation
6 changes: 5 additions & 1 deletion README.md
@@ -141,4 +141,8 @@ Although NanoVar is provided with a universal model and threshold score, instruc

## Limitations
* Inaccurate basecalling of large homopolymers or low-complexity DNA regions may result in false deletion SV calls. We advise using an up-to-date ONT basecaller such as Guppy to minimize this possibility.

* For BND SVs, NanoVar cannot calculate the actual number of SV-opposing reads (normal reads) at the novel adjacency because the two breakends lie at distant locations. Since it is unclear whether the novel adjacency derives from one or both breakends, there is no way to know which breakend location(s) to use when counting normal reads. NanoVar currently approximates the normal read count by taking the minimum count of the two breakend locations (see the sketch below). This approximation helps capture more true BNDs but may lower precision.
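A minimal sketch of this approximation, with illustrative names (not NanoVar's actual code):

```python
def bnd_normal_read_count(normal_reads_bp1: int, normal_reads_bp2: int) -> int:
    # The novel adjacency may derive from either or both breakends, so the
    # smaller normal-read count of the two distant locations is used.
    return min(normal_reads_bp1, normal_reads_bp2)
```

Taking the minimum keeps the SV-opposing evidence low, which favours reporting the BND (better recall) at the cost of precision, as noted above.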
8 changes: 4 additions & 4 deletions nanovar/nanovar
@@ -295,13 +295,13 @@ def main():
thread_spin.join()
print('')
task = TaskProgress()
- msg = 'Clustering SV breakends and inferencing'
+ msg = 'Clustering SV breakends'
spinner = Spinner(msg + ' - ')
thread_spin = threading.Thread(target=task.run, args=(spinner,))
thread_spin.setDaemon(True)
thread_spin.start()
else:
- print('Clustering SV breakends and inferencing -')
+ print('Clustering SV breakends -')

# SV breakend clustering and extracting INS and INV SVs
run.cluster_extract()
@@ -312,13 +312,13 @@ def main():
thread_spin.join()
print('')
task = TaskProgress()
- msg = 'Re-evaluating INS and INV SVs'
+ msg = 'Re-evaluating SVs with BLAST and inferencing'
spinner = Spinner(msg + ' - ')
thread_spin = threading.Thread(target=task.run, args=(spinner,))
thread_spin.setDaemon(True)
thread_spin.start()
else:
- print('Re-evaluating INS and INV SVs -')
+ print('Re-evaluating SVs with BLAST and inferencing -')

# Wait for indexing or make index
if threads_index == 1:
622 changes: 376 additions & 246 deletions nanovar/nv_bam_parser.c

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions nanovar/nv_bam_parser.pyx
@@ -131,7 +131,7 @@ def bam_parse(bam, unsigned int minlen, float splitpct, unsigned int minalign, s
final = breakpoint_parser(out2, minlen, sig_index, seed)
total_out.extend(final)
for i in final:
- parse_dict[i.split('\t')[8]] = i
+ parse_dict[i.split('\t')[8]] = '\t'.join(i.split('\t')[0:5]) + '\tmm\t' + '\t'.join(i.split('\t')[6:])
else: # Multiple alignment read
seed += 1
total_lines = []
@@ -172,8 +172,8 @@ def bam_parse(bam, unsigned int minlen, float splitpct, unsigned int minalign, s
final = breakpoint_parser(out2, minlen, sig_index, seed)
total_out.extend(final)
for i in final:
- parse_dict[i.split('\t')[8]] = i
- return total_subdata, total_out, basecov, parse_dict, rlendict, len(repeat_dict), detect_out
+ parse_dict[i.split('\t')[8]] = '\t'.join(i.split('\t')[0:5]) + '\tmm\t' + '\t'.join(i.split('\t')[6:])
+ return total_subdata, total_out, basecov, parse_dict, rlendict, len(repeat_dict), detect_out, seed


# Analyze CIGAR for Indels in segment and return read advancement call
47 changes: 14 additions & 33 deletions nanovar/nv_characterize.py
@@ -67,11 +67,13 @@ def __init__(self, wk_dir, bam, splitpct, minalign, filter_path, minlen, buff, m
# HTML SV table SV ratio limit
self.ratio_limit = 1
self.maps = 0
+ self.seed = 0
+ self.seed2 = 1

def bam_parse_detect(self):
random.seed(1)
- self.total_subdata, self.total_out, self.basecov, self.parse_dict, self.rlendict, self.maps, self.detect_out = bam_parse(
- self.bam, self.minlen, self.splitpct, self.minalign, self.dir, self.filter, self.contig_omit)
+ self.total_subdata, self.total_out, self.basecov, self.parse_dict, self.rlendict, self.maps, self.detect_out, self.seed \
+ = bam_parse(self.bam, self.minlen, self.splitpct, self.minalign, self.dir, self.filter, self.contig_omit)
writer(os.path.join(self.dir, 'subdata.tsv'), self.total_subdata, self.debug)
writer(os.path.join(self.dir, 'detect.tsv'), self.detect_out, self.debug)
writer(os.path.join(self.dir, 'parse1.tsv'), self.total_out, self.debug)
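Threading `seed` out of bam_parse() appears to be how the changelog's "Fixed SV index duplications" is achieved: the later hsblastn pass (parse_detect_hsb, below) continues the read counter instead of restarting at zero. A runnable sketch of the idea, with placeholder data:

```python
# Sketch: sharing one counter across both passes means no two reads can
# receive the same index, which previously produced duplicate SV indexes.
minimap2_reads = ['read_a', 'read_b']   # placeholder first pass (bam_parse)
hsblastn_reads = ['read_c']             # placeholder second pass (parse_detect_hsb)

seed = 0
for read in minimap2_reads:
    seed += 1                           # index assigned per read
for read in hsblastn_reads:
    seed += 1                           # continues, rather than resetting to 0
print(seed)                             # 3 -> indexes 1..3, no collisions
```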
@@ -89,12 +91,12 @@ def coverage_stats(self):

def cluster_extract(self):
logging.info("Clustering SV breakends")
- cluster_out = sv_cluster(self.total_subdata, self.total_out, self.buff, self.maxovl, self.mincov, self.contig, False)
+ cluster_out, self.seed2 = sv_cluster(self.total_subdata, self.total_out, self.buff, self.maxovl, self.mincov,
+ self.contig, False, self.seed2)
logging.info("Filtering INS and INV SVs")
total_qnames = []
for line in cluster_out:
svtype = line.split('\t')[3].split(' ')[0]
- # cov = int(line.split('\t')[10])
if svtype in ['S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins']: # Some INS are actually DUP
qnames = [line.split('\t')[0]]
self.ins_out.append(self.parse_dict[line.split('\t')[8]])
@@ -103,22 +105,23 @@ def cluster_extract(self):
qnames.append(mate[:-6])
self.ins_out.append(self.parse_dict[mate])
total_qnames.extend(qnames)
- elif svtype in ['Inv', 'Inv(1)', 'Inv(2)']: # For more precise INV bps
+ elif svtype == 'Del':
+ self.out_rest.append(line)
+ else: # For more precise INV, DUP, Intra and Inter bps
qnames = [line.split('\t')[0]]
if line.split('\t')[11] != '.':
for mate in line.split('\t')[11].split(','):
qnames.append(mate[:-6])
total_qnames.extend(qnames)
- elif svtype not in ['S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins', 'Inv', 'Inv(1)', 'Inv(2)']:
- self.out_rest.append(line)
qnames_dict = {x: 1 for x in set(total_qnames)}
self.fasta_extract(qnames_dict)
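Together with the changelog entry, the new branches route every SV class except deletions to hsblastn re-evaluation. A distilled sketch of the routing (hypothetical helper, not NanoVar code):

```python
def route_sv(svtype: str) -> str:
    # Hypothetical summary of the branch logic above.
    if svtype in ('S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins'):
        return 'realign'  # some insertions are actually duplications
    if svtype == 'Del':
        return 'keep'     # deletions alone skip the hsblastn pass
    return 'realign'      # INV, DUP, intra-/inter-chromosomal breakends
```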

def cluster_nn(self, add_out):
logging.info("Re-clustering INS/INV SVs and merging")
# Merge old ins with new from hsblastn
sub_out = self.ins_out + add_out
- cluster_out_ins = sv_cluster(self.total_subdata, sub_out, self.buff, self.maxovl, self.mincov, self.contig, True)
+ cluster_out_ins, _ = sv_cluster(self.total_subdata, sub_out, self.buff, self.maxovl, self.mincov, self.contig, True,
+ self.seed2)
new_cluster_out = self.out_rest + cluster_out_ins
logging.info("Neural network inference")
new_total_out = self.total_out + add_out
@@ -140,7 +143,6 @@ def parse_detect_hsb(self):
data = open(self.bam, 'r').read().splitlines()
data.append('null\tnull\tnull\tnull\tnull\tnull')
nlines = len(data) - 1
- co = 0
for i in range(nlines):
if data[i].split('\t')[0] == data[i + 1].split('\t')[0]: # Grouping alignments by read name
temp1.append(align_info(data[i], self.rlendict))
@@ -150,7 +152,7 @@ def parse_detect_hsb(self):
temp1.append(align_info(data[i], self.rlendict))
if data[i].split('\t')[1].strip() not in chromocollect:
chromocollect.append(data[i].split('\t')[1].strip())
- co += 1
+ self.seed += 1
# Parse entries and correct overlap alignments
subdata = entry_parser(temp1, chromocollect, ovlt)
for entry in subdata:
@@ -163,7 +165,7 @@ def parse_detect_hsb(self):
pass
else:
# Parse breakpoints
- final = breakpoint_parser(out2, self.minlen, sig_index, co)
+ final = breakpoint_parser(out2, self.minlen, sig_index, self.seed)
self.total_out.extend(final)
temp1 = []
chromocollect = []
@@ -173,7 +175,7 @@ def vcf_report(self):
def vcf_report(self):
logging.info("Creating VCF")
create_vcf(self.dir, self.thres, self.out_nn, self.refpath, self.rpath, self.rname, self.mapcmd, self.contig,
- self.homo_t, self.het_t, self.minlen)
+ self.homo_t, self.het_t, self.minlen, self.depth)
logging.info("Creating HTML report")
create_report(self.dir, self.contig, self.thres, self.rpath, self.refpath, self.rlendict, self.rname,
self.num_limit, self.ratio_limit)
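create_vcf() now also receives self.depth, matching the changelog's "Added depth of coverage information in VCF header". One plausible rendering of such a meta-information line; the exact key NanoVar writes is not shown in this diff:

```python
def depth_header_line(depth: float) -> str:
    # Hypothetical VCF header line carrying depth of coverage.
    return '##NanoVar_depth_of_coverage=%.2f' % depth
```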
@@ -183,27 +185,6 @@ def write2file(self, add_out):
writer(os.path.join(self.dir, 'parse2.tsv'), add_out, self.debug)
writer(os.path.join(self.dir, 'cluster.tsv'), self.out_nn, self.debug)

- # Parse data and detect SVs
- def parse_detect(self, total_lines, contig_collect, seed, gapdict, ovlt, sig_index):
- lines_sort = sorted(sorted(total_lines, key=lambda x: x[1], reverse=True), key=lambda y: y[0])
- temp1 = [tup[2] for tup in lines_sort]
- # Parse entries and correct overlap alignments
- subdata = entry_parser(temp1, contig_collect, ovlt)
- for entry in subdata:
- self.total_subdata.append('\t'.join(entry.split('\t')[0:5]))
- # Add to base coverage
- self.basecov += int(entry.split('\t')[2])
- # SV detection
- out1, out2 = sv_detect(subdata, self.splitpct, self.minalign, gapdict)
- if out1 == '' and out2 == '':
- pass
- else:
- # Parse breakpoints
- final = breakpoint_parser(out2, self.minlen, sig_index, seed)
- self.total_out.extend(final)
- for i in final:
- self.parse_dict[i.split('\t')[8]] = i

# FASTA extractor
def fasta_extract(self, qnames):
fasta = os.path.join(self.dir, 'temp1.fa')
