
Commit

bump v1.3.7
cytham committed May 23, 2020
1 parent 55acb49 commit 5065d65
Showing 12 changed files with 519 additions and 354 deletions.
10 changes: 10 additions & 0 deletions CHANGELOG.txt
@@ -4,6 +4,16 @@ NanoVar Changelog
Release Summary:


Version 1.3.7 - May 23, 2020
* Changed version import approach in setup.py
* All SV classes except deletions now undergo secondary analysis by hsblastn alignment
* Allowed clustering of Nov_Ins to other SV classes
* Added depth of coverage information in VCF header
* Fixed SV index duplications
* Added BND limitation to README.md
* Removed "number_of_maps" from confidence score equation in nv_nn.py to capture SVs consisting of repetitive elements (see the sketch below)
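A hypothetical sketch of that last change, assuming the confidence score once down-weighted the neural-network output by a "number_of_maps" penalty; the actual equation in nv_nn.py is not shown in this commit:

```python
# Hypothetical illustration only; nv_nn.py's real equation may differ.
def confidence_score_pre_137(nn_probability: float, number_of_maps: int) -> float:
    # Old behaviour (sketch): reads mapping to many loci were down-weighted,
    # suppressing SVs composed of repetitive elements.
    return nn_probability / number_of_maps

def confidence_score_137(nn_probability: float) -> float:
    # v1.3.7 behaviour (sketch): the "number_of_maps" term is dropped, so
    # repeat-derived SVs can retain a high confidence score.
    return nn_probability
```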


Version 1.3.6 - Apr 18, 2020
* Fixed Exception bug in nv_cluster.py (Thanks to jiadong324, https://github.com/cytham/nanovar/issues/9#issuecomment-609146494)
* Added mincov filter early in nv_cluster.py for faster computation
6 changes: 5 additions & 1 deletion README.md
@@ -141,4 +141,8 @@ Although NanoVar is provided with a universal model and threshold score, instruc

## Limitations
* Inaccurate basecalling of large homopolymers or low-complexity DNA regions may result in false deletion SV calls. We advise using an up-to-date ONT basecaller such as Guppy to minimize this possibility.

* For BND SVs, NanoVar cannot calculate the actual number of SV-opposing reads (normal reads) at the novel adjacency because the two breakends lie at distant locations. Since it is unclear whether the novel adjacency derives from one or both breakends, there is no way to know which breakend location(s) to use when counting normal reads. NanoVar currently approximates the normal read count by taking the minimum count of the two breakend locations (see the sketch below). This approximation helps capture more true BNDs but may lower precision.
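A minimal sketch of this approximation, with illustrative names (not NanoVar's actual code):

```python
def bnd_normal_read_count(normal_reads_bp1: int, normal_reads_bp2: int) -> int:
    # The novel adjacency may derive from either or both breakends, so the
    # smaller normal-read count of the two distant locations is used.
    return min(normal_reads_bp1, normal_reads_bp2)
```

Taking the minimum keeps the SV-opposing evidence low, which favours reporting the BND (better recall) at the cost of precision, as noted above.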
8 changes: 4 additions & 4 deletions nanovar/nanovar
@@ -295,13 +295,13 @@ def main():
thread_spin.join()
print('')
task = TaskProgress()
- msg = 'Clustering SV breakends and inferencing'
+ msg = 'Clustering SV breakends'
spinner = Spinner(msg + ' - ')
thread_spin = threading.Thread(target=task.run, args=(spinner,))
thread_spin.setDaemon(True)
thread_spin.start()
else:
- print('Clustering SV breakends and inferencing -')
+ print('Clustering SV breakends -')

# SV breakend clustering and extracting INS and INV SVs
run.cluster_extract()
@@ -312,13 +312,13 @@ def main():
thread_spin.join()
print('')
task = TaskProgress()
- msg = 'Re-evaluating INS and INV SVs'
+ msg = 'Re-evaluating SVs with BLAST and inferencing'
spinner = Spinner(msg + ' - ')
thread_spin = threading.Thread(target=task.run, args=(spinner,))
thread_spin.setDaemon(True)
thread_spin.start()
else:
- print('Re-evaluating INS and INV SVs -')
+ print('Re-evaluating SVs with BLAST and inferencing -')

# Wait for indexing or make index
if threads_index == 1:
622 changes: 376 additions & 246 deletions nanovar/nv_bam_parser.c

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions nanovar/nv_bam_parser.pyx
@@ -131,7 +131,7 @@ def bam_parse(bam, unsigned int minlen, float splitpct, unsigned int minalign, s
final = breakpoint_parser(out2, minlen, sig_index, seed)
total_out.extend(final)
for i in final:
- parse_dict[i.split('\t')[8]] = i
+ parse_dict[i.split('\t')[8]] = '\t'.join(i.split('\t')[0:5]) + '\tmm\t' + '\t'.join(i.split('\t')[6:])
else: # Multiple alignment read
seed += 1
total_lines = []
@@ -172,8 +172,8 @@ def bam_parse(bam, unsigned int minlen, float splitpct, unsigned int minalign, s
final = breakpoint_parser(out2, minlen, sig_index, seed)
total_out.extend(final)
for i in final:
- parse_dict[i.split('\t')[8]] = i
- return total_subdata, total_out, basecov, parse_dict, rlendict, len(repeat_dict), detect_out
+ parse_dict[i.split('\t')[8]] = '\t'.join(i.split('\t')[0:5]) + '\tmm\t' + '\t'.join(i.split('\t')[6:])
+ return total_subdata, total_out, basecov, parse_dict, rlendict, len(repeat_dict), detect_out, seed


# Analyze CIGAR for Indels in segment and return read advancement call
47 changes: 14 additions & 33 deletions nanovar/nv_characterize.py
@@ -67,11 +67,13 @@ def __init__(self, wk_dir, bam, splitpct, minalign, filter_path, minlen, buff, m
# HTML SV table SV ratio limit
self.ratio_limit = 1
self.maps = 0
+ self.seed = 0
+ self.seed2 = 1

def bam_parse_detect(self):
random.seed(1)
- self.total_subdata, self.total_out, self.basecov, self.parse_dict, self.rlendict, self.maps, self.detect_out = bam_parse(
- self.bam, self.minlen, self.splitpct, self.minalign, self.dir, self.filter, self.contig_omit)
+ self.total_subdata, self.total_out, self.basecov, self.parse_dict, self.rlendict, self.maps, self.detect_out, self.seed \
+ = bam_parse(self.bam, self.minlen, self.splitpct, self.minalign, self.dir, self.filter, self.contig_omit)
writer(os.path.join(self.dir, 'subdata.tsv'), self.total_subdata, self.debug)
writer(os.path.join(self.dir, 'detect.tsv'), self.detect_out, self.debug)
writer(os.path.join(self.dir, 'parse1.tsv'), self.total_out, self.debug)
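Threading `seed` out of bam_parse() appears to be how the changelog's "Fixed SV index duplications" is achieved: the later hsblastn pass (parse_detect_hsb, below) continues the read counter instead of restarting at zero. A runnable sketch of the idea, with placeholder data:

```python
# Sketch: sharing one counter across both passes means no two reads can
# receive the same index, which previously produced duplicate SV indexes.
minimap2_reads = ['read_a', 'read_b']   # placeholder first pass (bam_parse)
hsblastn_reads = ['read_c']             # placeholder second pass (parse_detect_hsb)

seed = 0
for read in minimap2_reads:
    seed += 1                           # index assigned per read
for read in hsblastn_reads:
    seed += 1                           # continues, rather than resetting to 0
print(seed)                             # 3 -> indexes 1..3, no collisions
```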
@@ -89,12 +91,12 @@ def coverage_stats(self):

def cluster_extract(self):
logging.info("Clustering SV breakends")
- cluster_out = sv_cluster(self.total_subdata, self.total_out, self.buff, self.maxovl, self.mincov, self.contig, False)
+ cluster_out, self.seed2 = sv_cluster(self.total_subdata, self.total_out, self.buff, self.maxovl, self.mincov,
+ self.contig, False, self.seed2)
logging.info("Filtering INS and INV SVs")
total_qnames = []
for line in cluster_out:
svtype = line.split('\t')[3].split(' ')[0]
- # cov = int(line.split('\t')[10])
if svtype in ['S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins']: # Some INS are actually DUP
qnames = [line.split('\t')[0]]
self.ins_out.append(self.parse_dict[line.split('\t')[8]])
@@ -103,22 +105,23 @@ def cluster_extract(self):
qnames.append(mate[:-6])
self.ins_out.append(self.parse_dict[mate])
total_qnames.extend(qnames)
- elif svtype in ['Inv', 'Inv(1)', 'Inv(2)']: # For more precise INV bps
+ elif svtype == 'Del':
+ self.out_rest.append(line)
+ else: # For more precise INV, DUP, Intra and Inter bps
qnames = [line.split('\t')[0]]
if line.split('\t')[11] != '.':
for mate in line.split('\t')[11].split(','):
qnames.append(mate[:-6])
total_qnames.extend(qnames)
- elif svtype not in ['S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins', 'Inv', 'Inv(1)', 'Inv(2)']:
- self.out_rest.append(line)
qnames_dict = {x: 1 for x in set(total_qnames)}
self.fasta_extract(qnames_dict)
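Together with the changelog entry, the new branches route every SV class except deletions to hsblastn re-evaluation. A distilled sketch of the routing (hypothetical helper, not NanoVar code):

```python
def route_sv(svtype: str) -> str:
    # Hypothetical summary of the branch logic above.
    if svtype in ('S-Nov_Ins_bp', 'E-Nov_Ins_bp', 'Nov_Ins'):
        return 'realign'  # some insertions are actually duplications
    if svtype == 'Del':
        return 'keep'     # deletions alone skip the hsblastn pass
    return 'realign'      # INV, DUP, intra-/inter-chromosomal breakends
```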

def cluster_nn(self, add_out):
logging.info("Re-clustering INS/INV SVs and merging")
# Merge old ins with new from hsblastn
sub_out = self.ins_out + add_out
- cluster_out_ins = sv_cluster(self.total_subdata, sub_out, self.buff, self.maxovl, self.mincov, self.contig, True)
+ cluster_out_ins, _ = sv_cluster(self.total_subdata, sub_out, self.buff, self.maxovl, self.mincov, self.contig, True,
+ self.seed2)
new_cluster_out = self.out_rest + cluster_out_ins
logging.info("Neural network inference")
new_total_out = self.total_out + add_out
@@ -140,7 +143,6 @@ def parse_detect_hsb(self):
data = open(self.bam, 'r').read().splitlines()
data.append('null\tnull\tnull\tnull\tnull\tnull')
nlines = len(data) - 1
- co = 0
for i in range(nlines):
if data[i].split('\t')[0] == data[i + 1].split('\t')[0]: # Grouping alignments by read name
temp1.append(align_info(data[i], self.rlendict))
@@ -150,7 +152,7 @@ def parse_detect_hsb(self):
temp1.append(align_info(data[i], self.rlendict))
if data[i].split('\t')[1].strip() not in chromocollect:
chromocollect.append(data[i].split('\t')[1].strip())
- co += 1
+ self.seed += 1
# Parse entries and correct overlap alignments
subdata = entry_parser(temp1, chromocollect, ovlt)
for entry in subdata:
@@ -163,7 +165,7 @@ def parse_detect_hsb(self):
pass
else:
# Parse breakpoints
- final = breakpoint_parser(out2, self.minlen, sig_index, co)
+ final = breakpoint_parser(out2, self.minlen, sig_index, self.seed)
self.total_out.extend(final)
temp1 = []
chromocollect = []
@@ -173,7 +175,7 @@ def vcf_report(self):
def vcf_report(self):
logging.info("Creating VCF")
create_vcf(self.dir, self.thres, self.out_nn, self.refpath, self.rpath, self.rname, self.mapcmd, self.contig,
- self.homo_t, self.het_t, self.minlen)
+ self.homo_t, self.het_t, self.minlen, self.depth)
logging.info("Creating HTML report")
create_report(self.dir, self.contig, self.thres, self.rpath, self.refpath, self.rlendict, self.rname,
self.num_limit, self.ratio_limit)
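create_vcf() now also receives self.depth, matching the changelog's "Added depth of coverage information in VCF header". One plausible rendering of such a meta-information line; the exact key NanoVar writes is not shown in this diff:

```python
def depth_header_line(depth: float) -> str:
    # Hypothetical VCF header line carrying depth of coverage.
    return '##NanoVar_depth_of_coverage=%.2f' % depth
```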
@@ -183,27 +185,6 @@ def write2file(self, add_out):
writer(os.path.join(self.dir, 'parse2.tsv'), add_out, self.debug)
writer(os.path.join(self.dir, 'cluster.tsv'), self.out_nn, self.debug)

- # Parse data and detect SVs
- def parse_detect(self, total_lines, contig_collect, seed, gapdict, ovlt, sig_index):
- lines_sort = sorted(sorted(total_lines, key=lambda x: x[1], reverse=True), key=lambda y: y[0])
- temp1 = [tup[2] for tup in lines_sort]
- # Parse entries and correct overlap alignments
- subdata = entry_parser(temp1, contig_collect, ovlt)
- for entry in subdata:
- self.total_subdata.append('\t'.join(entry.split('\t')[0:5]))
- # Add to base coverage
- self.basecov += int(entry.split('\t')[2])
- # SV detection
- out1, out2 = sv_detect(subdata, self.splitpct, self.minalign, gapdict)
- if out1 == '' and out2 == '':
- pass
- else:
- # Parse breakpoints
- final = breakpoint_parser(out2, self.minlen, sig_index, seed)
- self.total_out.extend(final)
- for i in final:
- self.parse_dict[i.split('\t')[8]] = i

# FASTA extractor
def fasta_extract(self, qnames):
fasta = os.path.join(self.dir, 'temp1.fa')
