From 7adeeb9fac15619733dc4c377336e435ed13f507 Mon Sep 17 00:00:00 2001
From: Haibao Tang
Date: Mon, 29 Apr 2024 22:50:22 -0700
Subject: [PATCH] Deprecate some assembly code

---
 jcvi/assembly/ca.py         | 1216 -----------------------------------
 jcvi/assembly/preprocess.py |   93 +--
 jcvi/formats/bed.py         |  163 +----
 3 files changed, 39 insertions(+), 1433 deletions(-)
 delete mode 100644 jcvi/assembly/ca.py

diff --git a/jcvi/assembly/ca.py b/jcvi/assembly/ca.py
deleted file mode 100644
index 4dbbacd6..00000000
--- a/jcvi/assembly/ca.py
+++ /dev/null
@@ -1,1216 +0,0 @@
-#!/usr/bin/env python
-# -*- coding: UTF-8 -*-
-
-"""
-Prepare input files for Celera Assembler, dispatch based on file suffix::
-
-*.fasta: convert-fasta-to-v2.pl
-*.sff: sffToCA
-*.fastq: fastqToCA
-"""
-import logging
-import os.path as op
-import sys
-from pickle import dump, load
-
-import networkx as nx
-from collections import Counter
-from random import choice
-from Bio import SeqIO
-
-from jcvi.formats.base import must_open
-from jcvi.formats.fasta import Fasta, SeqRecord, filter, format, parse_fasta
-from jcvi.formats.blast import Blast
-from jcvi.utils.console import printf
-from jcvi.utils.range import range_minmax
-from jcvi.algorithms.graph import graph_stats, graph_local_neighborhood
-from jcvi.apps.base import (
-    OptionParser,
-    ActionDispatcher,
-    sh,
-    need_update,
-    glob,
-    get_abs_path,
-    popen,
-)
-
-
-def main():
-
-    actions = (
-        ("tracedb", "convert trace archive files to frg file"),
-        ("clr", "prepare vector clear range file based on BLAST to vectors"),
-        ("fasta", "convert fasta to frg file"),
-        ("sff", "convert 454 reads to frg file"),
-        ("fastq", "convert Illumina reads to frg file"),
-        ("shred", "shred contigs into pseudo-reads"),
-        ("astat", "generate the coverage-rho scatter plot"),
-        ("unitigs", "output uniquely extended unitigs based on best.edges"),
-        ("merger", "merge reads into unitigs offline"),
-        ("removecontains", "remove contained reads from gkpStore"),
-        ("graph", "visualize best.edges"),
-        ("prune", "prune overlap graph"),
-        ("overlap", "visualize overlaps for a given fragment"),
-    )
-    p = ActionDispatcher(actions)
-    p.dispatch(globals())
-
-
-frgTemplate = """{{FRG
-act:A
-acc:{fragID}
-rnd:1
-sta:G
-lib:{libID}
-pla:0
-loc:0
-src:
-.
-seq:
-{seq}
-.
-qlt:
-{qvs}
-.
-hps:
-.
-clr:{clr_beg},{clr_end}
-}}"""
-
-headerTemplate = """{{VER
-ver:2
-}}
-{{LIB
-act:A
-acc:{libID}
-ori:U
-mea:0.000
-std:0.000
-src:
-.
-nft:17
-fea:
-forceBOGunitigger=1
-isNotRandom=0
-doNotTrustHomopolymerRuns=0
-doTrim_initialNone=1
-doTrim_initialMerBased=0
-doTrim_initialFlowBased=0
-doTrim_initialQualityBased=0
-doRemoveDuplicateReads=1
-doTrim_finalLargestCovered=1
-doTrim_finalEvidenceBased=0
-doTrim_finalBestEdge=0
-doRemoveSpurReads=1
-doRemoveChimericReads=1
-doCheckForSubReads=0
-doConsensusCorrection=0
-forceShortReadFormat=0
-constantInsertSize=0
-.
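These two templates follow the grammar of CA's version-2 frg files: one LIB record per library, then one FRG record per read. As a rough standalone illustration (the template here is trimmed to a few required fields, and all names are made up), a fragment record can be rendered like so::

    # Trimmed-down rendering of a v2 FRG record; emitFragment() below fills
    # the full template the same way.
    FRG = "\n".join([
        "{{FRG", "act:A", "acc:{fragID}", "lib:{libID}",
        "seq:", "{seq}", ".",
        "qlt:", "{qvs}", ".",
        "clr:{clr_beg},{clr_end}", "}}",
    ])

    def render_frg(fragID, libID, seq, qvchar="l"):
        # Shredded/synthetic reads carry one constant, low quality value per base.
        return FRG.format(fragID=fragID, libID=libID, seq=seq,
                          qvs=qvchar * len(seq), clr_beg=0, clr_end=len(seq))

    print(render_frg("lib1.frag0", "lib1", "ACGTACGT"))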
-}}""" - - -class OverlapLine(object): - - # See doc: http://wgs-assembler.sourceforge.net/wiki/index.php/OverlapStore - def __init__(self, line): - args = line.split() - self.aid = int(args[0]) - self.bid = int(args[1]) - self.orientation = args[2] - self.ahang = int(args[3]) - self.bhang = int(args[4]) - self.erate = float(args[5]) - self.erate_adj = float(args[6]) - - -def add_graph_options(p): - p.add_option("--maxerr", default=100, type="int", help="Maximum error rate") - p.add_option( - "--frgctg", - default="../9-terminator/asm.posmap.frgctg", - help="Annotate graph with contig membership", - ) - - -def prune(args): - """ - %prog prune best.edges - - Prune overlap graph. - """ - from collections import defaultdict - - p = OptionParser(prune.__doc__) - add_graph_options(p) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (bestedges,) = args - G = read_graph(bestedges, maxerr=opts.maxerr) - reads_to_ctgs = parse_ctgs(opts.frgctg) - edges = defaultdict(int) - r = defaultdict(int) - for a, b, d in G.edges_iter(data=True): - ua, ub = reads_to_ctgs.get(a), reads_to_ctgs.get(b) - nn = (ua, ub).count(None) - if nn == 0: - if ua == ub: - r["Same tigs"] += 1 - else: - r["Diff tigs"] += 1 - if ua > ub: - ua, ub = ub, ua - edges[(ua, ub)] += 1 - elif nn == 1: - r["One null"] += 1 - else: - assert nn == 2 - r["Two nulls"] += 1 - - U = nx.Graph() - difftigs = "diff_tigs.txt" - neighbors = defaultdict(list) - fw = open(difftigs, "w") - for (ua, ub), count in edges.items(): - print("\t".join((ua, ub, str(count))), file=fw) - U.add_edge(ua, ub, weight=count) - neighbors[ua].append((ub, count)) - neighbors[ub].append((ua, count)) - fw.close() - - print("[Unitig edge property]", file=sys.stderr) - for k, v in r.items(): - print(": ".join((k, str(v))), file=sys.stderr) - print("Total: {0}".format(sum(r.values())), file=sys.stderr) - - print("[Unitig degree distribution]", file=sys.stderr) - degrees = U.degree() - degree_counter = Counter(degrees.values()) - for degree, count in sorted(degree_counter.items()): - print("{0}\t{1}".format(degree, count), file=sys.stderr) - - # To find associative contigs, one look for a contig that is connected and - # only connected to another single contig - and do that recursively until no - # more contigs can be found - associative = {} - for ua, ubs in neighbors.items(): - if len(ubs) == 1: # Only one neighbor - ub, count = ubs[0] - if count >= 2: # Bubble - associative[ua] = (ub, count) - print( - "A total of {0} associative contigs found".format(len(associative)), - file=sys.stderr, - ) - - # Keep only one for mutual associative - for ua, ub in associative.items(): - if ub in associative and ua < ub: - print(ua, "mutually associative with", ub, file=sys.stderr) - del associative[ub] - print( - "A total of {0} associative contigs retained".format(len(associative)), - file=sys.stderr, - ) - - assids = "associative.ids" - fw = open(assids, "w") - for ua, (ub, count) in sorted(associative.items(), key=lambda x: (x[1], x[0])): - print("\t".join((ua, ub, str(count))), file=fw) - fw.close() - logging.debug("Associative contigs written to `{0}`".format(assids)) - - -def removecontains(args): - """ - %prog removecontains 4-unitigger/best.contains asm.gkpStore - - Remove contained reads from gkpStore. This will improve assembly contiguity - without sacrificing accuracy, when using bogart unitigger. 
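The body that follows simply enumerates gatekeeper IIDs and emits one edit directive per contained read. A minimal sketch of the delete.edit file it writes, with made-up IIDs::

    contained = {3, 7}   # read IIDs parsed from best.contains
    last_iid = 8         # from `gatekeeper -dumpfragments -lastfragiid`
    with open("delete.edit", "w") as fw:
        for iid in range(1, last_iid + 1):
            if iid in contained:
                print("frg iid {0} isdeleted 1".format(iid), file=fw)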
- """ - p = OptionParser(removecontains.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - contains, gkpStore = args - - s = set() - fp = open(contains) - for row in fp: - if row[0] == "#": - continue - iid = int(row.split()[0]) - s.add(iid) - - cmd = "gatekeeper -dumpfragments -lastfragiid {}".format(gkpStore) - gkpmsg = popen(cmd).read() - last_iid = int(gkpmsg.strip().split()[-1]) - - ndeleted = 0 - editfile = "delete.edit" - fw = open(editfile, "w") - for iid in range(1, last_iid + 1): - if iid in s: - print("frg iid {0} isdeleted 1".format(iid), file=fw) - ndeleted += 1 - - fw.close() - assert len(s) == ndeleted - logging.debug("A total of {0} contained reads flagged as deleted.".format(ndeleted)) - print("Now you can run:", file=sys.stderr) - print("$ gatekeeper --edit {0} {1}".format(editfile, gkpStore), file=sys.stderr) - - -def overlap(args): - """ - %prog overlap best.contains iid - - Visualize overlaps for a given fragment. Must be run in 4-unitigger. All - overlaps for iid were retrieved, excluding the ones matching best.contains. - """ - p = OptionParser(overlap.__doc__) - p.add_option("--maxerr", default=2, type="int", help="Maximum error rate") - p.add_option("--canvas", default=100, type="int", help="Canvas size") - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - bestcontains, iid = args - canvas = opts.canvas - - bestcontainscache = bestcontains + ".cache" - if need_update(bestcontains, bestcontainscache): - fp = open(bestcontains) - fw = open(bestcontainscache, "w") - exclude = set() - for row in fp: - if row[0] == "#": - continue - j = int(row.split()[0]) - exclude.add(j) - dump(exclude, fw) - fw.close() - - exclude = load(open(bestcontainscache)) - logging.debug("A total of {0} reads to exclude".format(len(exclude))) - - cmd = "overlapStore -d ../asm.ovlStore -b {0} -e {0}".format(iid) - cmd += " -E {0}".format(opts.maxerr) - frags = [] - for row in popen(cmd): - r = OverlapLine(row) - if r.bid in exclude: - continue - frags.append(r) - - # Also include to query fragment - frags.append(OverlapLine("{0} {0} N 0 0 0 0".format(iid))) - frags.sort(key=lambda x: x.ahang) - - # Determine size of the query fragment - cmd = "gatekeeper -b {0} -e {0}".format(iid) - cmd += " -tabular -dumpfragments ../asm.gkpStore" - fp = popen(cmd) - size = int(next(fp).split()[-1]) - - # Determine size of canvas - xmin = min(x.ahang for x in frags) - xmax = max(x.bhang for x in frags) - xsize = -xmin + size + xmax - ratio = xsize / canvas - - for f in frags: - fsize = -f.ahang + size + f.bhang - a = (f.ahang - xmin) / ratio - b = fsize / ratio - t = "-" * b - if f.orientation == "N": - t = t[:-1] + ">" - else: - t = "<" + t[1:] - if f.ahang == 0 and f.bhang == 0: - t = "[green]{}".format(t) - c = canvas - a - b - printf( - "{}{}{}{} ({})".format( - " " * a, t, " " * c, str(f.bid).rjust(10), f.erate_adj - ), - ) - - -def parse_ctgs(frgtoctg): - cache = "frgtoctg.cache" - if need_update(frgtoctg, cache): - reads_to_ctgs = {} - frgtodeg = frgtoctg.replace(".frgctg", ".frgdeg") - iidtouid = frgtoctg.replace(".posmap.frgctg", ".iidtouid") - fp = open(iidtouid) - frgstore = {} - for row in fp: - tag, iid, uid = row.split() - if tag == "FRG": - frgstore[uid] = int(iid) - - for pf, f in zip(("ctg", "deg"), (frgtoctg, frgtodeg)): - fp = open(f) - logging.debug("Parse posmap file `{0}`".format(f)) - for row in fp: - frg, ctg = row.split()[:2] - frg = frgstore[frg] - reads_to_ctgs[frg] = pf + ctg - 
logging.debug("Loaded mapping: {0}".format(len(reads_to_ctgs))) - - fw = open(cache, "w") - dump(reads_to_ctgs, fw) - fw.close() - logging.debug("Contig mapping written to `{0}`".format(cache)) - - reads_to_ctgs = load(open(cache)) - logging.debug("Contig mapping loaded from `{0}`".format(cache)) - return reads_to_ctgs - - -def read_graph(bestedges, maxerr=100, directed=False): - logging.debug("Max error = {0}%".format(maxerr)) - tag = "dir." if directed else "" - bestgraph = bestedges.split(".")[0] + ".err{0}.{1}graph".format(maxerr, tag) - if need_update(bestedges, bestgraph): - G = {} if directed else nx.Graph() - fp = open(bestedges) - best_store = {} - for row in fp: - if row[0] == "#": - continue - id1, lib_id, best5, o5, best3, o3, j1, j2 = row.split() - id1, best5, best3 = int(id1), int(best5), int(best3) - j1, j2 = float(j1), float(j2) - if j1 <= maxerr or j2 <= maxerr: - if not directed: - G.add_node(id1) - id1p5, id1p3 = "{0}-5'".format(id1), "{0}-3'".format(id1) - best5o5 = "{0}-{1}".format(best5, o5) - best3o3 = "{0}-{1}".format(best3, o3) - best_store[id1p5] = best5o5 - best_store[id1p3] = best3o3 - if best5 and j1 <= maxerr: - if directed: - G[id1p5] = best5o5 - else: - G.add_edge(best5, id1, weight=10) - if best3 and j2 <= maxerr: - if directed: - G[id1p3] = best3o3 - else: - G.add_edge(id1, best3, weight=10) - - # Annotate edge weight for mutual best link, note that edge weights are - # (11) set close to 10, to minimize impact to layout (Yifan Hu's - # multilevel) - nmutuals = 0 - for k, v in best_store.items(): - if best_store.get(v) == k and k < v: - k, v = int(k.split("-")[0]), int(v.split("-")[0]) - G[k][v]["weight"] = 11 - nmutuals += 1 - logging.debug("Mutual best edges: {0}".format(nmutuals)) - - if directed: - fw = open(bestgraph, "w") - dump(G, fw) - fw.close() - else: - nx.write_gpickle(G, bestgraph) - logging.debug("Graph pickled to `{0}`".format(bestgraph)) - - # Compute node degree histogram and save in (degree, counts) tab file - degrees = G.degree() - degree_counter = Counter(degrees.values()) - degreesfile = "degrees.txt" - fw = open(degreesfile, "w") - for degree, count in sorted(degree_counter.items()): - print("{0}\t{1}".format(degree, count), file=fw) - fw.close() - logging.debug("Node degree distribution saved to `{0}`".format(degreesfile)) - - # Save high degree (top 1%) nodes in save in (node, degree) tab file - percentile = sorted(degrees.values(), reverse=True)[len(degrees) / 1000] - logging.debug("Top 0.1% has degree of at least {0}".format(percentile)) - hubs = [(k, v) for k, v in degrees.items() if v >= percentile] - hubs.sort(key=lambda x: x[1], reverse=True) # degress descending - hubsfile = "hubs.txt" - fw = open(hubsfile, "w") - for node, degree in hubs: - print("{0}\t{1}".format(node, degree), file=fw) - fw.close() - logging.debug("Hubs saved to `{0}`".format(hubsfile)) - - logging.debug("Read graph from `{0}`".format(bestgraph)) - if directed: - G = load(open(bestgraph)) - else: - G = nx.read_gpickle(bestgraph) - graph_stats(G) - return G - - -def merger(args): - """ - %prog merger layout gkpStore contigs.fasta - - Merge reads into one contig. 
- """ - p = OptionParser(merger.__doc__) - opts, args = p.parse_args(args) - - if len(args) != 3: - sys.exit(not p.print_help()) - - layout, gkpstore, contigs = args - fp = open(layout) - pf = "0" - iidfile = pf + ".iids" - for i, row in enumerate(fp): - logging.debug("Read unitig {0}".format(i)) - fw = open(iidfile, "w") - layout = row.split("|") - print("\n".join(layout), file=fw) - fw.close() - cmd = "gatekeeper -iid {0}.iids -dumpfasta {0} {1}".format(pf, gkpstore) - sh(cmd) - - fastafile = "{0}.fasta".format(pf) - newfastafile = "{0}.new.fasta".format(pf) - format( - [ - fastafile, - newfastafile, - "--sequential=replace", - "--sequentialoffset=1", - "--nodesc", - ] - ) - fasta([newfastafile]) - - sh("rm -rf {0}".format(pf)) - cmd = "runCA {0}.frg -p {0} -d {0} consensus=pbutgcns".format(pf) - cmd += " unitigger=bogart doFragmentCorrection=0 doUnitigSplitting=0" - sh(cmd) - outdir = "{0}/9-terminator".format(pf) - - cmd = "cat {0}/{1}.ctg.fasta {0}/{1}.deg.fasta {0}/{1}.singleton.fasta".format( - outdir, pf - ) - sh(cmd, outfile=contigs, append=True) - - -def unitigs(args): - """ - %prog unitigs best.edges - - Reads Celera Assembler's "best.edges" and extract all unitigs. - """ - p = OptionParser(unitigs.__doc__) - p.add_option("--maxerr", default=2, type="int", help="Maximum error rate") - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (bestedges,) = args - G = read_graph(bestedges, maxerr=opts.maxerr, directed=True) - H = nx.Graph() - intconv = lambda x: int(x.split("-")[0]) - for k, v in G.iteritems(): - if k == G.get(v, None): - H.add_edge(intconv(k), intconv(v)) - - nunitigs = nreads = 0 - for h in nx.connected_component_subgraphs(H, copy=False): - st = [x for x in h if h.degree(x) == 1] - if len(st) != 2: - continue - src, target = st - path = list(nx.all_simple_paths(h, src, target)) - assert len(path) == 1 - (path,) = path - print("|".join(str(x) for x in path)) - nunitigs += 1 - nreads += len(path) - logging.debug( - "A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads) - ) - - -def graph(args): - """ - %prog graph best.edges - - Convert Celera Assembler's "best.edges" to a GEXF which can be used to - feed into Gephi to check the topology of the best overlapping graph. Mutual - best edges are represented as thicker edges. 
- - Reference: - https://github.com/PacificBiosciences/Bioinformatics-Training/blob/master/scripts/CeleraToGephi.py - """ - p = OptionParser(graph.__doc__) - p.add_option( - "--query", - default=-1, - type="int", - help="Search from node, -1 to select random node, 0 to disable", - ) - p.add_option("--contig", help="Search from contigs, use comma to separate") - p.add_option( - "--largest", default=0, type="int", help="Only show largest components" - ) - p.add_option("--maxsize", default=500, type="int", help="Max graph size") - p.add_option( - "--nomutualbest", - default=False, - action="store_true", - help="Do not plot mutual best edges as heavy", - ) - add_graph_options(p) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (bestedges,) = args - query = opts.query - contig = opts.contig - largest = opts.largest - frgctg = opts.frgctg - edgeweight = not opts.nomutualbest - G = read_graph(bestedges, maxerr=opts.maxerr) - - if largest: - H = list(nx.connected_component_subgraphs(G)) - c = min(len(H), largest) - logging.debug("{0} components found, {1} retained".format(len(H), c)) - G = nx.Graph() - for x in H[:c]: - G.add_edges_from(x.edges()) - - if query: - if query == -1: - query = choice(G.nodes()) - reads_to_ctgs = parse_ctgs(frgctg) - if contig: - contigs = set(contig.split(",")) - core = [k for k, v in reads_to_ctgs.items() if v in contigs] - else: - ctg = reads_to_ctgs.get(query) - core = [k for k, v in reads_to_ctgs.items() if v == ctg] - logging.debug( - "Reads ({0}) extended from the same contig {1}".format(len(core), ctg) - ) - - # Extract a local neighborhood - SG = nx.Graph() - H = graph_local_neighborhood(G, query=core, maxsize=opts.maxsize) - SG.add_edges_from(H.edges(data=edgeweight)) - G = SG - - seen = [] - for n, attrib in G.nodes_iter(data=True): - contig = reads_to_ctgs.get(n, "na") - attrib["label"] = contig - seen.append(contig) - c = Counter(seen) - cc = ["{0}({1})".format(k, v) for k, v in c.most_common()] - print("Contigs: {0}".format(" ".join(cc)), file=sys.stderr) - - gexf = "best" - if query >= 0: - gexf += ".{0}".format(query) - gexf += ".gexf" - nx.write_gexf(G, gexf) - logging.debug( - "Graph written to `{0}` (|V|={1}, |E|={2})".format(gexf, len(G), G.size()) - ) - - -def astat(args): - """ - %prog astat coverage.log - - Create coverage-rho scatter plot. - """ - p = OptionParser(astat.__doc__) - p.add_option("--cutoff", default=1000, type="int", help="Length cutoff") - p.add_option("--genome", default="", help="Genome name") - p.add_option( - "--arrDist", - default=False, - action="store_true", - help="Use arrDist instead", - ) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (covfile,) = args - cutoff = opts.cutoff - genome = opts.genome - plot_arrDist = opts.arrDist - - suffix = ".{0}".format(cutoff) - small_covfile = covfile + suffix - update_covfile = need_update(covfile, small_covfile) - if update_covfile: - fw = open(small_covfile, "w") - else: - logging.debug("Found `{0}`, will use this one".format(small_covfile)) - covfile = small_covfile - - fp = open(covfile) - header = next(fp) - if update_covfile: - fw.write(header) - - data = [] - msg = "{0} tigs scanned ..." 
- for row in fp: - tigID, rho, covStat, arrDist = row.split() - tigID = int(tigID) - if tigID % 1000000 == 0: - sys.stderr.write(msg.format(tigID) + "\r") - - rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)] - if rho < cutoff: - continue - - if update_covfile: - fw.write(row) - data.append((tigID, rho, covStat, arrDist)) - - print(msg.format(tigID), file=sys.stderr) - - from jcvi.graphics.base import plt, savefig - - logging.debug("Plotting {0} data points.".format(len(data))) - tigID, rho, covStat, arrDist = zip(*data) - - y = arrDist if plot_arrDist else covStat - ytag = "arrDist" if plot_arrDist else "covStat" - - fig = plt.figure(1, (7, 7)) - ax = fig.add_axes([0.12, 0.1, 0.8, 0.8]) - ax.plot(rho, y, ".", color="lightslategrey") - - xtag = "rho" - info = (genome, xtag, ytag) - title = "{0} {1} vs. {2}".format(*info) - ax.set_title(title) - ax.set_xlabel(xtag) - ax.set_ylabel(ytag) - - if plot_arrDist: - ax.set_yscale("log") - - imagename = "{0}.png".format(".".join(info)) - savefig(imagename, dpi=150) - - -def emitFragment(fw, fragID, libID, shredded_seq, clr=None, qvchar="l", fasta=False): - """ - Print out the shredded sequence. - """ - if fasta: - s = SeqRecord(shredded_seq, id=fragID, description="") - SeqIO.write([s], fw, "fasta") - return - - seq = str(shredded_seq) - slen = len(seq) - qvs = qvchar * slen # shredded reads have default low qv - - if clr is None: - clr_beg, clr_end = 0, slen - else: - clr_beg, clr_end = clr - - print( - frgTemplate.format( - fragID=fragID, - libID=libID, - seq=seq, - qvs=qvs, - clr_beg=clr_beg, - clr_end=clr_end, - ), - file=fw, - ) - - -def shred(args): - """ - %prog shred fastafile - - Similar to the method of `shredContig` in runCA script. The contigs are - shredded into pseudo-reads with certain length and depth. 
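The shredding arithmetic in the body below reduces to evenly spaced windows of length shredlen whose start positions span [0, seqlen - shredlen]. A standalone sketch of the depth > 1 case (with // added so it also runs on Python 3)::

    def shred_ranges(seqlen, readlen=1000, depth=2, shift=50):
        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth // shredlen, 1)
        if numreads == 1:
            return [(0, shredlen)]
        step = (seqlen - shredlen) / (numreads - 1)
        ranges, prev = [], -1
        for i in range(numreads):
            begin = int(step * i)
            if begin != prev:           # skip duplicate start positions
                ranges.append((begin, begin + shredlen))
                prev = begin
        return ranges

    # 2500 bp contig at depth 2 -> five 1 kb pseudo-reads, evenly staggered
    print(shred_ranges(2500))
    # [(0, 1000), (375, 1375), (750, 1750), (1125, 2125), (1500, 2500)]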
- """ - p = OptionParser(shred.__doc__) - p.set_depth(depth=2) - p.add_option( - "--readlen", - default=1000, - type="int", - help="Desired length of the reads", - ) - p.add_option( - "--minctglen", - default=0, - type="int", - help="Ignore contig sequence less than", - ) - p.add_option( - "--shift", - default=50, - type="int", - help="Overlap between reads must be at least", - ) - p.add_option( - "--fasta", - default=False, - action="store_true", - help="Output shredded reads as FASTA sequences", - ) - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (fastafile,) = args - libID = fastafile.split(".")[0] - depth = opts.depth - readlen = opts.readlen - shift = opts.shift - - outfile = libID + ".depth{0}".format(depth) - if opts.fasta: - outfile += ".fasta" - else: - outfile += ".frg" - f = Fasta(fastafile, lazy=True) - - fw = must_open(outfile, "w", checkexists=True) - if not opts.fasta: - print(headerTemplate.format(libID=libID), file=fw) - - """ - Taken from runCA: - - |*********| - |###################| - |--------------------------------------------------| - ---------------1--------------- - ---------------2--------------- - ---------------3--------------- - *** - center_increments - ### - center_range_width - """ - for ctgID, (name, rec) in enumerate(f.iteritems_ordered()): - seq = rec.seq - seqlen = len(seq) - if seqlen < opts.minctglen: - continue - - shredlen = min(seqlen - shift, readlen) - numreads = max(seqlen * depth / shredlen, 1) - center_range_width = seqlen - shredlen - - ranges = [] - if depth == 1: - if seqlen < readlen: - ranges.append((0, seqlen)) - else: - for begin in range(0, seqlen, readlen - shift): - end = min(seqlen, begin + readlen) - ranges.append((begin, end)) - else: - if numreads == 1: - ranges.append((0, shredlen)) - else: - prev_begin = -1 - center_increments = center_range_width * 1.0 / (numreads - 1) - for i in range(numreads): - begin = center_increments * i - end = begin + shredlen - begin, end = int(begin), int(end) - - if begin == prev_begin: - continue - - ranges.append((begin, end)) - prev_begin = begin - - for shredID, (begin, end) in enumerate(ranges): - shredded_seq = seq[begin:end] - fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end) - emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta) - - fw.close() - logging.debug("Shredded reads are written to `{0}`.".format(outfile)) - return outfile - - -def tracedb(args): - """ - %prog tracedb - - Run `tracedb-to-frg.pl` within current folder. 
- """ - p = OptionParser(tracedb.__doc__) - - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(p.print_help()) - - (action,) = args - assert action in ("xml", "lib", "frg") - - CMD = "tracedb-to-frg.pl" - xmls = glob("xml*") - - if action == "xml": - for xml in xmls: - cmd = CMD + " -xml {0}".format(xml) - sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True) - - elif action == "lib": - cmd = CMD + " -lib {0}".format(" ".join(xmls)) - sh(cmd) - - elif action == "frg": - for xml in xmls: - cmd = CMD + " -frg {0}".format(xml) - sh(cmd, background=True) - - -def make_matepairs(fastafile): - """ - Assumes the mates are adjacent sequence records - """ - assert op.exists(fastafile) - - matefile = fastafile.rsplit(".", 1)[0] + ".mates" - if op.exists(matefile): - logging.debug("matepairs file `{0}` found".format(matefile)) - else: - logging.debug("parsing matepairs from `{0}`".format(fastafile)) - matefw = open(matefile, "w") - it = SeqIO.parse(fastafile, "fasta") - for fwd, rev in zip(it, it): - print("{0}\t{1}".format(fwd.id, rev.id), file=matefw) - - matefw.close() - - return matefile - - -get_mean_sv = lambda size: (size, size / 5) - - -def split_fastafile(fastafile, maxreadlen=32000): - pf = fastafile.split(".")[0] - smallfastafile = pf + "-small.fasta" - bigfastafile = pf + "-big.fasta" - shredfastafile = pf + "-big.depth1.fasta" - - if need_update(fastafile, (smallfastafile, shredfastafile)): - filter([fastafile, str(maxreadlen), "--less", "-o", smallfastafile]) - filter([fastafile, str(maxreadlen), "-o", bigfastafile]) - shred( - [ - "--depth=1", - "--shift={0}".format(maxreadlen / 100), - "--readlen={0}".format(maxreadlen), - "--fasta", - bigfastafile, - ] - ) - - return smallfastafile, shredfastafile - - -def fasta(args): - """ - %prog fasta fastafile - - Convert reads formatted as FASTA file, and convert to CA frg file. If .qual - file is found, then use it, otherwise just make a fake qual file. Mates are - assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a - matefile is given. - """ - from jcvi.formats.fasta import clean, make_qual - - p = OptionParser(fasta.__doc__) - p.add_option( - "--clean", - default=False, - action="store_true", - help="Clean up irregular chars in seq", - ) - p.add_option("--matefile", help="Matepairs file") - p.add_option( - "--maxreadlen", default=262143, type="int", help="Maximum read length allowed" - ) - p.add_option( - "--minreadlen", default=1000, type="int", help="Minimum read length allowed" - ) - p.add_option( - "--sequential", - default=False, - action="store_true", - help="Overwrite read name (e.g. 
long Pacbio name)", - ) - p.set_size() - opts, args = p.parse_args(args) - - if len(args) != 1: - sys.exit(not p.print_help()) - - (fastafile,) = args - maxreadlen = opts.maxreadlen - minreadlen = opts.minreadlen - if maxreadlen > 0: - split = False - f = Fasta(fastafile, lazy=True) - for id, size in f.itersizes_ordered(): - if size > maxreadlen: - logging.debug( - "Sequence {0} (size={1}) longer than max read len {2}".format( - id, size, maxreadlen - ) - ) - split = True - break - - if split: - for f in split_fastafile(fastafile, maxreadlen=maxreadlen): - fasta([f, "--maxreadlen=0"]) - return - - plate = op.basename(fastafile).split(".")[0] - - mated = opts.size != 0 - mean, sv = get_mean_sv(opts.size) - - if mated: - libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate - else: - libname = plate - - frgfile = libname + ".frg" - - if opts.clean: - cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta" - if need_update(fastafile, cleanfasta): - clean([fastafile, "--canonical", "-o", cleanfasta]) - fastafile = cleanfasta - - if mated: - qualfile = make_qual(fastafile, score=21) - if opts.matefile: - matefile = opts.matefile - assert op.exists(matefile) - else: - matefile = make_matepairs(fastafile) - - cmd = "convert-fasta-to-v2.pl" - cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile) - if mated: - cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile) - - sh(cmd, outfile=frgfile) - return - - fw = must_open(frgfile, "w") - print(headerTemplate.format(libID=libname), file=fw) - - sequential = opts.sequential - i = j = 0 - for fragID, seq in parse_fasta(fastafile): - if len(seq) < minreadlen: - j += 1 - continue - i += 1 - if sequential: - fragID = libname + str(100000000 + i) - emitFragment(fw, fragID, libname, seq) - fw.close() - - logging.debug( - "A total of {0} fragments written to `{1}` ({2} discarded).".format( - i, frgfile, j - ) - ) - - -def sff(args): - """ - %prog sff sffiles - - Convert reads formatted as 454 SFF file, and convert to CA frg file. - Turn --nodedup on if another deduplication mechanism is used (e.g. - CD-HIT-454). See assembly.sff.deduplicate(). - """ - p = OptionParser(sff.__doc__) - p.add_option( - "--prefix", dest="prefix", default=None, help="Output frg filename prefix" - ) - p.add_option( - "--nodedup", - default=False, - action="store_true", - help="Do not remove duplicates", - ) - p.set_size() - opts, args = p.parse_args(args) - - if len(args) < 1: - sys.exit(p.print_help()) - - sffiles = args - plates = [x.split(".")[0].split("_")[-1] for x in sffiles] - - mated = opts.size != 0 - mean, sv = get_mean_sv(opts.size) - - if len(plates) > 1: - plate = plates[0][:-1] + "X" - else: - plate = "_".join(plates) - - if mated: - libname = "Titan{0}Kb-".format(opts.size / 1000) + plate - else: - libname = "TitanFrags-" + plate - - if opts.prefix: - libname = opts.prefix - - cmd = "sffToCA" - cmd += " -libraryname {0} -output {0} ".format(libname) - cmd += " -clear 454 -trim chop " - if mated: - cmd += " -linker titanium -insertsize {0} {1} ".format(mean, sv) - if opts.nodedup: - cmd += " -nodedup " - - cmd += " ".join(sffiles) - - sh(cmd) - - -def fastq(args): - """ - %prog fastq fastqfile - - Convert reads formatted as FASTQ file, and convert to CA frg file. 
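Condensed, the command this function assembles for a mated (paired-end) library looks as follows; file names are illustrative, and the mean/stddev pair comes from the same size/5 heuristic as get_mean_sv()::

    import os.path as op

    fastqfiles = ["/data/lib_1_sequence.fastq", "/data/lib_2_sequence.fastq"]
    size = 3000                  # insert size; 0 would mean unmated
    mean, sv = size, size // 5   # get_mean_sv(), with integer division
    libname = op.basename(fastqfiles[0]).split(".")[0].replace("_1_sequence", "")

    cmd = "fastqToCA -libraryname {0} ".format(libname)
    if size:
        cmd += "-insertsize {0} {1} -mates {2}".format(mean, sv, ",".join(fastqfiles))
    else:
        cmd += " ".join("-reads {0}".format(x) for x in fastqfiles)
    print(cmd)   # fastqToCA -libraryname lib -insertsize 3000 600 -mates ...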
- """ - from jcvi.formats.fastq import guessoffset - - p = OptionParser(fastq.__doc__) - p.add_option( - "--outtie", - dest="outtie", - default=False, - action="store_true", - help="Are these outie reads?", - ) - p.set_phred() - p.set_size() - - opts, args = p.parse_args(args) - - if len(args) < 1: - sys.exit(p.print_help()) - - fastqfiles = [get_abs_path(x) for x in args] - size = opts.size - outtie = opts.outtie - if size > 1000 and (not outtie): - logging.debug("[warn] long insert size {0} but not outtie".format(size)) - - mated = size != 0 - libname = op.basename(args[0]).split(".")[0] - libname = libname.replace("_1_sequence", "") - - frgfile = libname + ".frg" - mean, sv = get_mean_sv(opts.size) - - cmd = "fastqToCA" - cmd += " -libraryname {0} ".format(libname) - fastqs = " ".join("-reads {0}".format(x) for x in fastqfiles) - if mated: - assert len(args) in (1, 2), "you need one or two fastq files for mated library" - fastqs = "-mates {0}".format(",".join(fastqfiles)) - cmd += "-insertsize {0} {1} ".format(mean, sv) - cmd += fastqs - - offset = int(opts.phred) if opts.phred else guessoffset([fastqfiles[0]]) - illumina = offset == 64 - if illumina: - cmd += " -type illumina" - if outtie: - cmd += " -outtie" - - sh(cmd, outfile=frgfile) - - -def clr(args): - """ - %prog blastfile fastafiles - - Calculate the vector clear range file based BLAST to the vectors. - """ - p = OptionParser(clr.__doc__) - opts, args = p.parse_args(args) - - if len(args) < 2: - sys.exit(not p.print_help()) - - blastfile = args[0] - fastafiles = args[1:] - - sizes = {} - for fa in fastafiles: - f = Fasta(fa) - sizes.update(f.itersizes()) - - b = Blast(blastfile) - for query, hits in b.iter_hits(): - - qsize = sizes[query] - vectors = list((x.qstart, x.qstop) for x in hits) - vmin, vmax = range_minmax(vectors) - - left_size = vmin - 1 - right_size = qsize - vmax - - if left_size > right_size: - clr_start, clr_end = 0, vmin - else: - clr_start, clr_end = vmax, qsize - - print("\t".join(str(x) for x in (query, clr_start, clr_end))) - del sizes[query] - - for q, size in sorted(sizes.items()): - print("\t".join(str(x) for x in (q, 0, size))) - - -if __name__ == "__main__": - main() diff --git a/jcvi/assembly/preprocess.py b/jcvi/assembly/preprocess.py index a9797994..49225191 100644 --- a/jcvi/assembly/preprocess.py +++ b/jcvi/assembly/preprocess.py @@ -7,28 +7,28 @@ import os import os.path as op import sys -import logging -from jcvi.formats.base import BaseFile, write_file, must_open -from jcvi.formats.fastq import guessoffset -from jcvi.utils.cbook import depends, human_size -from jcvi.apps.base import ( +from ..apps.base import ( OptionParser, ActionDispatcher, cleanup, + datadir, download, - sh, + logger, mkdir, need_update, - datadir, + sh, ) +from ..formats.base import BaseFile, must_open, write_file +from ..formats.fastq import guessoffset +from ..utils.cbook import depends, human_size class FastQCdata(BaseFile, dict): def __init__(self, filename, human=False): super(FastQCdata, self).__init__(filename) if not op.exists(filename): - logging.debug("File `{0}` not found.".format(filename)) + logger.debug("File `%s` not found.", filename) # Sample_RF37-1/RF37-1_GATCAG_L008_R2_fastqc => # RF37-1_GATCAG_L008_R2 self["Filename"] = op.basename(op.split(filename)[0]).rsplit("_", 1)[0] @@ -66,14 +66,13 @@ def __init__(self, filename, human=False): def main(): actions = ( - ("count", "count reads based on FASTQC results"), - ("trim", "trim reads using TRIMMOMATIC"), - ("correct", "correct reads using ALLPATHS-LG"), - 
("hetsmooth", "reduce K-mer diversity using het-smooth"), - ("alignextend", "increase read length by extending based on alignments"), ("contamination", "check reads contamination against Ecoli"), + ("correct", "correct reads using ALLPATHS-LG"), + ("count", "count reads based on FASTQC results"), ("diginorm", "run K-mer based normalization"), ("expand", "expand sequences using short reads"), + ("hetsmooth", "reduce K-mer diversity using het-smooth"), + ("trim", "trim reads using TRIMMOMATIC"), ) p = ActionDispatcher(actions) p.dispatch(globals()) @@ -205,7 +204,7 @@ def expand(args): ) samfile, mapped, _ = get_samfile(reads, bes, bowtie=True, mapped=True) - logging.debug("Extract first {0} reads from `{1}`.".format(nreads, mapped)) + logger.debug("Extract first %d reads from `%s`.", nreads, mapped) pf = mapped.split(".")[0] pf = pf.split("-")[0] @@ -252,9 +251,7 @@ def expand(args): fw.close() cleanup(samfile, logfile, mapped, reads, fastafile, qualfile, blastfile, pf) - logging.debug( - "Annotated seqs (n={0}) written to `{1}`.".format(len(recs), annotatedfasta) - ) + logger.debug("Annotated seqs (n=%d) written to `%s`.", len(recs), annotatedfasta) return annotatedfasta @@ -302,68 +299,6 @@ def contamination(args): fw.close() -def alignextend(args): - """ - %prog alignextend ref.fasta read.1.fastq read.2.fastq - - Wrapper around AMOS alignextend. - """ - choices = "prepare,align,filter,rmdup,genreads".split(",") - p = OptionParser(alignextend.__doc__) - p.add_option( - "--nosuffix", - default=False, - action="store_true", - help="Do not add /1/2 suffix to the read", - ) - p.add_option( - "--rc", - default=False, - action="store_true", - help="Reverse complement the reads before alignment", - ) - p.add_option("--len", default=100, type="int", help="Extend to this length") - p.add_option( - "--stage", default="prepare", choices=choices, help="Start from certain stage" - ) - p.add_option( - "--dup", - default=10, - type="int", - help="Filter duplicates with coordinates within this distance", - ) - p.add_option( - "--maxdiff", default=1, type="int", help="Maximum number of differences" - ) - p.set_home("amos") - p.set_cpus() - opts, args = p.parse_args(args) - - if len(args) != 3: - sys.exit(not p.print_help()) - - ref, r1, r2 = args - pf = op.basename(r1).split(".")[0] - cmd = op.join(opts.amos_home, "src/Experimental/alignextend.pl") - if not opts.nosuffix: - cmd += " -suffix" - bwa_idx = "{0}.ref.fa.sa".format(pf) - if not need_update(ref, bwa_idx): - cmd += " -noindex" - cmd += " -threads {0}".format(opts.cpus) - offset = guessoffset([r1]) - if offset == 64: - cmd += " -I" - if opts.rc: - cmd += " -rc" - cmd += " -allow -len {0} -dup {1}".format(opts.len, opts.dup) - cmd += " -min {0} -max {1}".format(2 * opts.len, 20 * opts.len) - cmd += " -maxdiff {0}".format(opts.maxdiff) - cmd += " -stage {0}".format(opts.stage) - cmd += " ".join(("", pf, ref, r1, r2)) - sh(cmd) - - def count(args): """ %prog count *.gz diff --git a/jcvi/formats/bed.py b/jcvi/formats/bed.py index df6a2bb6..6b17114e 100755 --- a/jcvi/formats/bed.py +++ b/jcvi/formats/bed.py @@ -452,40 +452,38 @@ def bed_sum(beds, seqid=None, unique=True): def main(): actions = ( - ("depth", "calculate average depth per feature using coverageBed"), - ("mergebydepth", "returns union of features beyond certain depth"), - ("sort", "sort bed file"), - ("merge", "merge bed files"), - ("index", "index bed file using tabix"), - ("bins", "bin bed lengths into each window"), - ("summary", "summarize the lengths of the intervals"), - ("evaluate", 
"make truth table and calculate sensitivity and specificity"), - ("pile", "find the ids that intersect"), - ("pairs", "estimate insert size between paired reads from bedfile"), - ("mates", "print paired reads from bedfile"), - ("sizes", "infer the sizes for each seqid"), - ("uniq", "remove overlapping features with higher scores"), - ("longest", "select longest feature within overlapping piles"), ("bedpe", "convert to bedpe format"), + ("bins", "bin bed lengths into each window"), + ("chain", "chain bed segments together"), + ("closest", "find closest BED feature"), + ("density", "calculates density of features per seqid"), + ("depth", "calculate average depth per feature using coverageBed"), ("distance", "calculate distance between bed features"), - ("sample", "sample bed file and remove high-coverage regions"), - ("refine", "refine bed file using a second bed file"), - ("flanking", "get n flanking features for a given position"), - ("some", "get a subset of bed features given a list"), - ("fix", "fix non-standard bed files"), + ("evaluate", "make truth table and calculate sensitivity and specificity"), ("filter", "filter bedfile to retain records between size range"), ("filterbedgraph", "filter bedgraph to extract unique regions"), - ("random", "extract a random subset of features"), + ("fix", "fix non-standard bed files"), + ("flanking", "get n flanking features for a given position"), + ("format", "reformat BED file"), + ("gaps", "define gaps in BED file using complementBed"), + ("index", "index bed file using tabix"), ("juncs", "trim junctions.bed overhang to get intron, merge multiple beds"), + ("longest", "select longest feature within overlapping piles"), + ("mates", "print paired reads from bedfile"), + ("merge", "merge bed files"), + ("mergebydepth", "returns union of features beyond certain depth"), + ("pairs", "estimate insert size between paired reads from bedfile"), + ("pile", "find the ids that intersect"), + ("random", "extract a random subset of features"), + ("refine", "refine bed file using a second bed file"), + ("sample", "sample bed file and remove high-coverage regions"), ("seqids", "print out all seqids on one line"), - ("alignextend", "alignextend based on BEDPE and FASTA ref"), - ("clr", "extract clear range based on BEDPE"), - ("chain", "chain bed segments together"), - ("density", "calculates density of features per seqid"), + ("sizes", "infer the sizes for each seqid"), + ("some", "get a subset of bed features given a list"), + ("sort", "sort bed file"), + ("summary", "summarize the lengths of the intervals"), ("tiling", "compute the minimum tiling path"), - ("format", "reformat BED file"), - ("closest", "find closest BED feature"), - ("gaps", "define gaps in BED file using complementBed"), + ("uniq", "remove overlapping features with higher scores"), ) p = ActionDispatcher(actions) p.dispatch(globals()) @@ -799,52 +797,6 @@ def density(args): print("\t".join(str(x) for x in (seqid, nfeats, size, "{0:.1f}".format(ds)))) -def clr(args): - """ - %prog clr [bamfile|bedpefile] ref.fasta - - Use mates from BEDPE to extract ranges where the ref is covered by mates. - This is useful in detection of chimeric contigs. 
- """ - p = OptionParser(clr.__doc__) - p.set_bedpe() - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - bedpe, ref = args - if bedpe.endswith(".bam"): - bedpefile = bedpe.replace(".bam", ".bedpe") - if need_update(bedpe, bedpefile): - cmd = "bamToBed -bedpe -i {0}".format(bedpe) - sh(cmd, outfile=bedpefile) - bedpe = bedpefile - - filtered = bedpe + ".filtered" - if need_update(bedpe, filtered): - filter_bedpe( - bedpe, filtered, ref, rc=opts.rc, minlen=opts.minlen, maxlen=opts.maxlen - ) - - rmdup = filtered + ".sorted.rmdup" - if need_update(filtered, rmdup): - rmdup_bedpe(filtered, rmdup, dupwiggle=opts.dup) - - converted = rmdup + ".converted" - if need_update(rmdup, converted): - fp = open(rmdup) - fw = open(converted, "w") - for row in fp: - r = BedpeLine(row) - print(r.bedline, file=fw) - fw.close() - - merged = converted + ".merge.bed" - if need_update(converted, merged): - mergeBed(converted) - - def sfa_to_fq(sfa, qvchar): fq = sfa.rsplit(".", 1)[0] + ".fq" fp = must_open(sfa) @@ -925,71 +877,6 @@ def rmdup_bedpe(filtered, rmdup, dupwiggle=10): fw.close() -def alignextend(args): - """ - %prog alignextend bedpefile ref.fasta - - Similar idea to alignextend, using mates from BEDPE and FASTA ref. See AMOS - script here: - - https://github.com/nathanhaigh/amos/blob/master/src/Experimental/alignextend.pl - """ - p = OptionParser(alignextend.__doc__) - p.add_option("--len", default=100, type="int", help="Extend to this length") - p.add_option( - "--qv", default=31, type="int", help="Dummy qv score for extended bases" - ) - p.add_option( - "--bedonly", - default=False, - action="store_true", - help="Only generate bed files, no FASTA", - ) - p.set_bedpe() - opts, args = p.parse_args(args) - - if len(args) != 2: - sys.exit(not p.print_help()) - - bedpe, ref = args - qvchar = chr(opts.qv + 33) - pf = bedpe.split(".")[0] - - filtered = bedpe + ".filtered" - if need_update(bedpe, filtered): - filter_bedpe( - bedpe, - filtered, - ref, - rc=opts.rc, - minlen=opts.minlen, - maxlen=opts.maxlen, - rlen=opts.rlen, - ) - - rmdup = filtered + ".filtered.sorted.rmdup" - if need_update(filtered, rmdup): - rmdup_bedpe(filtered, rmdup, dupwiggle=opts.dup) - - if opts.bedonly: - return - - bed1, bed2 = pf + ".1e.bed", pf + ".2e.bed" - if need_update(rmdup, (bed1, bed2)): - sh("cut -f1-3,7-9 {0}".format(rmdup), outfile=bed1) - sh("cut -f4-6,7-8,10 {0}".format(rmdup), outfile=bed2) - - sfa1, sfa2 = pf + ".1e.sfa", pf + ".2e.sfa" - if need_update((bed1, bed2, ref), (sfa1, sfa2)): - for bed in (bed1, bed2): - fastaFromBed(bed, ref, name=True, tab=True, stranded=True) - - fq1, fq2 = pf + ".1e.fq", pf + ".2e.fq" - if need_update((sfa1, sfa2), (fq1, fq2)): - for sfa in (sfa1, sfa2): - sfa_to_fq(sfa, qvchar) - - def seqids(args): """ %prog seqids bedfile
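sfa_to_fq() is only partially shown in this hunk; judging from its callers, it converts fastaFromBed -tab output (one "name<TAB>sequence" per line) into FASTQ records with a single constant quality character, e.g. chr(31 + 33) for Q31. A rough standalone sketch of that conversion::

    def sfa_to_fq_lines(rows, qvchar):
        # one "name<TAB>seq" row in, one 4-line FASTQ record out
        for row in rows:
            name, seq = row.rstrip().split("\t")
            yield "@{0}\n{1}\n+\n{2}".format(name, seq, qvchar * len(seq))

    for rec in sfa_to_fq_lines(["read1\tACGT"], chr(31 + 33)):
        print(rec)   # @read1 / ACGT / + / @@@@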