From c27dc372ad887f33c0cc46c3086c1022f3cfc652 Mon Sep 17 00:00:00 2001 From: Chen Tong Date: Wed, 27 Sep 2023 10:57:10 +0800 Subject: [PATCH 1/3] add ignore zero anchor parameter for skip no matches in batch search (#598) --- jcvi/compara/catalog.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/jcvi/compara/catalog.py b/jcvi/compara/catalog.py index 80488cc2..eb8e4749 100644 --- a/jcvi/compara/catalog.py +++ b/jcvi/compara/catalog.py @@ -666,6 +666,12 @@ def ortholog(args): dotplot_group.add_option( "--no_dotplot", default=False, action="store_true", help="Do not make dotplot" ) + p.add_option( + "--ignore_zero_anchor", + default=False, + action="store_true", + help="Ignore this pair of ortholog identification instead of throwing an error when performing many pairs of cataloging." + ) opts, args = p.parse_args(args) @@ -674,6 +680,7 @@ def ortholog(args): a, b = args dbtype = opts.dbtype + ignore_zero_anchor = opts.ignore_zero_anchor suffix = ".cds" if dbtype == "nucl" else ".pep" abed, afasta = a + ".bed", a + suffix bbed, bfasta = b + ".bed", b + suffix @@ -727,7 +734,15 @@ def ortholog(args): dargs += ["--no_strip_names"] if opts.liftover_dist: dargs += ["--liftover_dist={}".format(opts.liftover_dist)] - scan(dargs) + try: + scan(dargs) + except ValueError as e: + if ignore_zero_anchor: + logging.debug(f"{e}") + logging.debug("Ignoring this error and continuing...") + return + else: + raise ValueError(e) if quota: quota_main([lifted_anchors, "--quota={0}".format(quota), "--screen"]) if need_update(anchors, pdf, warn=True) and not opts.no_dotplot: From 7e4c5e5c2fd69970de54308788ad3b1b9d50ff3f Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Thu, 28 Sep 2023 02:59:35 -0700 Subject: [PATCH 2/3] Add a module logger (#600) * update base.py * update formats.base * update compara.base * update graphics.base * revert logging changes * use the correct Console --- jcvi/apps/base.py | 108 +++++++++++++++++++----------------------- jcvi/compara/base.py | 10 ++-- jcvi/formats/base.py | 44 +++++++---------- jcvi/graphics/base.py | 37 ++++++--------- 4 files changed, 85 insertions(+), 114 deletions(-) diff --git a/jcvi/apps/base.py b/jcvi/apps/base.py index 719c3cf2..100642c3 100644 --- a/jcvi/apps/base.py +++ b/jcvi/apps/base.py @@ -10,8 +10,6 @@ import sys import logging import fnmatch -import functools - from collections.abc import Iterable from http.client import HTTPSConnection @@ -29,7 +27,8 @@ from typing import Any, Collection, List, Optional, Union from natsort import natsorted -from rich.logging import Console, RichHandler +from rich.console import Console +from rich.logging import RichHandler from jcvi import __copyright__, __version__ @@ -40,28 +39,33 @@ JCVIHELP = "JCVI utility libraries {} [{}]\n".format(__version__, __copyright__) -def patch_debug(func): - @functools.wraps(func) - def wraps(*args, **kwargs): - import inspect +def debug(level=logging.DEBUG): + """ + Turn on the debugging + """ + logging.basicConfig( + level=level, + format="%(message)s", + datefmt="[%X]", + handlers=[RichHandler(console=Console(stderr=True))], + ) - callerframerecord = inspect.stack()[1] - frame = callerframerecord[0] - info = inspect.getframeinfo(frame) - # get caller function name - caller = frame.f_code.co_name - patch_message = f"{info.filename}:{info.lineno}:{caller}" - old_debug = logging.debug - def my_debug(message: str, *args, **kwargs): - old_debug(f"{patch_message} {message}", *args, **kwargs) +debug() - logging.debug = my_debug - ret = func(*args, **kwargs) - logging.debug = old_debug - return ret - return wraps +def get_logger(name: str): + """Return a logger with a default ColoredFormatter.""" + logger = logging.getLogger(name) + if logger.hasHandlers(): + logger.handlers.clear() + logger.addHandler(RichHandler()) + logger.propagate = False + logger.setLevel(logging.INFO) + return logger + + +logger = get_logger("jcvi") class ActionDispatcher(object): @@ -603,12 +607,12 @@ def set_image_options( iopts = ImageOptions(opts) if opts.notex: - logging.info("--notex={}. latex use is disabled.".format(opts.notex)) + logger.info("--notex=%s. latex use is disabled.", opts.notex) elif not is_tex_available(): if not bool(which("latex")): - logging.info("`latex` not found. latex use is disabled.") + logger.info("`latex` not found. latex use is disabled.") if not bool(which("lp")): - logging.info("`lp` not found. latex use is disabled.") + logger.info("`lp` not found. latex use is disabled.") setup_theme(style=opts.style, font=opts.font, usetex=iopts.usetex) @@ -1048,9 +1052,9 @@ def ConfigSectionMap(Config, section): try: cfg[option] = Config.get(section, option) if cfg[option] == -1: - logging.debug("skip: {0}".format(option)) + logger.debug("skip: %s", option) except: - logging.debug("exception on {0}!".format(option)) + logger.debug("exception on %s!", option) cfg[option] = None return cfg @@ -1131,7 +1135,7 @@ def dmain(mainfile, type="action"): def backup(filename): bakname = filename + ".bak" if op.exists(filename): - logging.debug("Backup `{0}` to `{1}`".format(filename, bakname)) + logger.debug("Backup `%s` to `%s`", filename, bakname) sh("mv {0} {1}".format(filename, bakname)) return bakname @@ -1205,7 +1209,7 @@ def sh( cmd += " &" if log: - logging.debug(cmd) + logger.debug(cmd) call_func = check_output if check else call return call_func(cmd, shell=True, executable=shell, stderr=redirect_error) @@ -1218,7 +1222,7 @@ def Popen(cmd, stdin=None, stdout=PIPE, debug=False, shell="/bin/bash"): from subprocess import Popen as P if debug: - logging.debug(cmd) + logger.debug(cmd) # See: proc = P(cmd, bufsize=1, stdin=stdin, stdout=stdout, shell=True, executable=shell) return proc @@ -1307,7 +1311,7 @@ def mkdir(dirname, overwrite=False): if overwrite: cleanup(dirname) os.mkdir(dirname) - logging.debug("Overwrite folder `{0}`.".format(dirname)) + logger.debug("Overwrite folder `%s`", dirname) else: return False # Nothing is changed else: @@ -1315,7 +1319,7 @@ def mkdir(dirname, overwrite=False): os.mkdir(dirname) except: os.makedirs(dirname) - logging.debug("`{0}` not found. Creating new.".format(dirname)) + logger.debug("`%s` not found. Creating new.", dirname) return True @@ -1377,7 +1381,7 @@ def need_update(a: str, b: str, warn: bool = False) -> bool: or any(is_newer_file(x, y) for x in a for y in b) ) if (not should_update) and warn: - logging.debug("File `{}` found. Computation skipped.".format(", ".join(b))) + logger.debug("File `%s` found. Computation skipped.", ", ".join(b)) return should_update @@ -1476,8 +1480,7 @@ def download( final_filename = filename or target if op.exists(final_filename): if debug: - msg = "File `{}` exists. Download skipped.".format(final_filename) - logging.info(msg) + logger.info("File `%s` exists. Download skipped.", final_filename) success = True else: from jcvi.utils.ez_setup import get_best_downloader @@ -1527,26 +1530,11 @@ def getfilesize(filename, ratio=None): while size < heuristicsize: size += 2**32 if size > 2**32: - logging.warning("Gzip file estimated uncompressed size: {0}.".format(size)) + logger.warning("Gzip file estimated uncompressed size: %d", size) return size -def debug(level=logging.DEBUG): - """ - Turn on the debugging - """ - logging.basicConfig( - level=level, - format="%(message)s", - datefmt="[%X]", - handlers=[RichHandler(console=Console(stderr=True))], - ) - - -debug() - - def main(): actions = ( ("less", "enhance the unix `less` command"), @@ -1603,7 +1591,7 @@ def expand(args): for a in args: oa = a.replace("/", "_") if oa in seen: - logging.debug("Name collision `{0}`, ignored.".format(oa)) + logger.debug("Name collision `%s`, ignored", oa) continue cmd = "cp -s" if opts.symlink else "mv" @@ -1995,7 +1983,7 @@ def notify(args): opts, args = p.parse_args(args) if len(args) == 0: - logging.error("Please provide a brief message to be sent") + logger.error("Please provide a brief message to be sent") sys.exit(not p.print_help()) subject = opts.subject @@ -2005,7 +1993,7 @@ def notify(args): toaddr = opts.email.split(",") # TO address should be in a list for addr in toaddr: if not is_valid_email(addr): - logging.debug("Email address `{0}` is not valid!".format(addr)) + logger.debug("Email address `%s` is not valid!", addr) sys.exit() send_email(fromaddr, toaddr, subject, message) else: @@ -2165,7 +2153,7 @@ def waitpid(args): msg = check_output(shlex.split(get_origcmd)).strip() _waitpid(pid, interval=opts.interval) else: - logging.debug("Process with PID {0} does not exist".format(pid)) + logger.debug("Process with PID %d does not exist", pid) sys.exit() if opts.notify: @@ -2188,9 +2176,9 @@ def get_config(path): config.read(path) except ParsingError: e = sys.exc_info()[1] - logging.error( - "There was a problem reading or parsing " - "your credentials file: %s" % (e.args[0],), + logger.error( + "There was a problem reading or parsing your credentials file: %s", + e.args[0], ) return config @@ -2241,16 +2229,16 @@ def getpath( else: err_msg = f"Cannot execute binary `{path}`. Please verify and rerun." if warn == "exit": - logging.fatal(err_msg) + logger.fatal(err_msg) else: - logging.warning(err_msg) + logger.warning(err_msg) return None if changed: configfile = open(cfg, "w") config.write(configfile) configfile.close() - logging.debug("Configuration written to `{0}`.".format(cfg)) + logger.debug("Configuration written to `%s`", cfg) return path diff --git a/jcvi/compara/base.py b/jcvi/compara/base.py index 9919159e..eae6ec6b 100644 --- a/jcvi/compara/base.py +++ b/jcvi/compara/base.py @@ -1,6 +1,6 @@ -import logging from collections import defaultdict +from ..apps.base import logger from ..formats.base import BaseFile, read_block, must_open from ..utils.range import Range @@ -69,9 +69,9 @@ def print_to_file(self, filename="stdout", accepted=None): print("\t".join((a, b, score)), file=fw) fw.close() - logging.debug("Removed %d existing anchors.", nremoved) - logging.debug("Corrected scores for %d anchors.", ncorrected) - logging.debug("Anchors written to `%s`.", filename) + logger.debug("Removed %d existing anchors", nremoved) + logger.debug("Corrected scores for %d anchors", ncorrected) + logger.debug("Anchors written to `%s`", filename) def blast(self, blastfile=None, outfile=None): """ @@ -102,7 +102,7 @@ def blast(self, blastfile=None, outfile=None): nlines += 1 fw.close() - logging.debug("A total of %d BLAST lines written to `%s`.", nlines, outfile) + logger.debug("A total of %d BLAST lines written to `%s`", nlines, outfile) return outfile diff --git a/jcvi/formats/base.py b/jcvi/formats/base.py index 79aa1ac0..8617a145 100644 --- a/jcvi/formats/base.py +++ b/jcvi/formats/base.py @@ -5,20 +5,20 @@ import os.path as op import math import sys -import logging from collections import OrderedDict from itertools import groupby, islice, cycle from Bio import SeqIO -from jcvi.apps.base import ( +from ..apps.base import ( OptionParser, ActionDispatcher, cleanup, - sh, - need_update, + logger, mkdir, + need_update, popen, + sh, ) @@ -28,10 +28,9 @@ class BaseFile(object): def __init__(self, filename): - self.filename = filename if filename: - logging.debug("Load file `{0}`".format(filename)) + logger.debug("Load file `%s`", filename) class LineFile(BaseFile, list): @@ -40,15 +39,12 @@ class LineFile(BaseFile, list): """ def __init__(self, filename, comment=None, load=False): - super(LineFile, self).__init__(filename) if load: fp = must_open(filename) self.lines = [l.strip() for l in fp if l[0] != comment] - logging.debug( - "Load {0} lines from `{1}`.".format(len(self.lines), filename) - ) + logger.debug("Load %d lines from `%s`", len(self.lines), filename) class DictFile(BaseFile, OrderedDict): @@ -66,7 +62,6 @@ def __init__( keycast=None, cast=None, ): - BaseFile.__init__(self, filename) OrderedDict.__init__(self) self.keypos = keypos @@ -84,7 +79,7 @@ def __init__( msg = "Must contain >= {0} columns. {1}.\n".format(ncols, action) msg += " --> Line {0}: {1}".format(lineno + 1, row) - logging.error(msg) + logger.error(msg) if strict: sys.exit(1) else: @@ -100,7 +95,7 @@ def __init__( assert thiscols, "File empty" self.ncols = thiscols - logging.debug("Imported {0} records from `{1}`.".format(len(self), filename)) + logger.debug("Imported %d records from `%s`", len(self), filename) @classmethod def num_columns(cls, filename, delimiter=None): @@ -136,7 +131,6 @@ class FileMerger(object): """ def __init__(self, filelist, outfile): - self.filelist = filelist self.outfile = outfile self.ingz = filelist[0].endswith(".gz") @@ -166,7 +160,7 @@ def __init__(self, filename, outputdir=None, format="fasta", mode="cycle"): self.mode = mode format = format or self._guess_format(filename) - logging.debug("format is %s" % format) + logger.debug("format is %s", format) if format in ("fasta", "fastq"): self.klass = "seqio" @@ -179,7 +173,6 @@ def __init__(self, filename, outputdir=None, format="fasta", mode="cycle"): mkdir(outputdir) def _open(self, filename): - if self.klass == "seqio": handle = SeqIO.parse(open(filename), self.format) elif self.klass == "clust": @@ -260,16 +253,14 @@ def split(self, N, force=False): """ mode = self.mode assert mode in ("batch", "cycle", "optimal") - logging.debug("set split mode=%s" % mode) + logger.debug("set split mode=%s", mode) self.names = self.__class__.get_names(self.filename, N) if self.outputdir: self.names = [op.join(self.outputdir, x) for x in self.names] if not need_update(self.filename, self.names) and not force: - logging.error( - "file %s already existed, skip file splitting" % self.names[0] - ) + logger.error("file %s already existed, skip file splitting", self.names[0]) return filehandles = [open(x, "w") for x in self.names] @@ -277,7 +268,7 @@ def split(self, N, force=False): if mode == "batch": for batch, fw in zip(self._batch_iterator(N), filehandles): count = self.write(fw, batch) - logging.debug("write %d records to %s" % (count, fw.name)) + logger.debug("write %d records to %s", count, fw.name) elif mode == "cycle": handle = self._open(self.filename) @@ -415,7 +406,7 @@ def must_open(filename, mode="r", checkexists=False, skipcheck=False, oappend=Fa else: fp = open(filename, "w") else: - logging.debug("File `{0}` already exists. Skipped.".format(filename)) + logger.debug("File `%s` already exists. Skipped.", filename) return None else: fp = open(filename, mode) @@ -462,7 +453,7 @@ def write_file(filename, contents, meta=None, skipcheck=False, append=False, tee fileop = "appended" if append else "written" message = "{0} {1} to `{2}`.".format(meta, fileop, filename) - logging.debug(message.capitalize()) + logger.debug(message.capitalize()) if meta == "run script" and not append: sh("chmod u+x {0}".format(filename)) @@ -542,7 +533,6 @@ def flexible_cast(s): def main(): - actions = ( ("pairwise", "convert a list of IDs into all pairs"), ("split", "split large file into N chunks"), @@ -821,7 +811,7 @@ def group(args): if len(cols) < len(atoms): cols = [x for x in range(len(atoms))] if groupby not in cols: - logging.error("groupby col index `{0}` is out of range".format(groupby)) + logger.error("groupby col index `%s` is out of range", groupby) sys.exit() key = atoms[groupby] @@ -922,13 +912,13 @@ def split(args): fs = FileSplitter(filename, outputdir=outdir, format=opts.format, mode=opts.mode) if opts.all: - logging.debug("option -all override N") + logger.debug("option -all override N") N = fs.num_records else: N = min(fs.num_records, int(N)) assert N > 0, "N must be > 0" - logging.debug("split file into %d chunks" % N) + logger.debug("split file into %d chunks", N) fs.split(N) return fs diff --git a/jcvi/graphics/base.py b/jcvi/graphics/base.py index 0a140998..5569a70a 100644 --- a/jcvi/graphics/base.py +++ b/jcvi/graphics/base.py @@ -11,6 +11,7 @@ logging.getLogger("numexpr").setLevel(logging.WARNING) logging.getLogger("PIL").setLevel(logging.INFO) + from functools import partial import numpy as np @@ -36,10 +37,8 @@ from matplotlib.path import Path from typing import Optional -from jcvi.formats.base import LineFile -from jcvi.apps.base import glob, listify, datadir, sample_N, which - -logging.getLogger().setLevel(logging.DEBUG) +from ..apps.base import datadir, glob, listify, logger, sample_N, which +from ..formats.base import LineFile CHARS = { @@ -95,10 +94,8 @@ def __init__(self, fig, usetex: bool = True): try: self.build_height_array(fig, usetex=usetex) except ValueError as e: - logging.debug( - "Failed to init heights (error: {}). Variable label sizes skipped.".format( - e - ) + logger.debug( + "Failed to init heights (error: %s). Variable label sizes skipped.", e ) @classmethod @@ -204,8 +201,8 @@ def load_image(filename): ret[:, :, 2] = ret[:, :, 1] = ret[:, :, 0] = img img = ret else: - h, w, c = img.shape - logging.debug("Image `{0}` loaded ({1}px x {2}px).".format(filename, w, h)) + h, w, _ = img.shape + logger.debug("Image `%s` loaded (%dpx x %dpx).", filename, w, h) return img @@ -315,24 +312,22 @@ def savefig(figname, dpi=150, iopts=None, cleanup=True): except: format = "pdf" try: - logging.debug(f"Matplotlib backend is: {mpl.get_backend()}") - logging.debug(f"Attempting save as: {figname}") + logger.debug("Matplotlib backend is: %s", mpl.get_backend()) + logger.debug("Attempting save as: %s", figname) plt.savefig(figname, dpi=dpi, format=format) except Exception as e: - message = "savefig failed with message:" - message += "\n{0}".format(str(e)) - logging.error(message) - logging.info("Try running again with --notex option to disable latex.") + logger.error("savefig failed with message:\n%s", e) + logger.info("Try running again with --notex option to disable latex.") if op.exists(figname): if op.getsize(figname) < 1000: - logging.debug(f"Cleaning up empty file: {figname}") + logger.debug("Cleaning up empty file: %s", figname) remove(figname) sys.exit(1) msg = "Figure saved to `{0}`".format(figname) if iopts: msg += " {0}".format(iopts) - logging.debug(msg) + logger.debug(msg) if cleanup: plt.rcdefaults() @@ -392,7 +387,7 @@ def fontprop(ax, name, size=12): fname = op.join(datadir, name) prop = fm.FontProperties(fname=fname, size=size) - logging.debug("Set font to `{0}` (`{1}`).".format(name, prop.get_file())) + logger.debug("Set font to `%s` (`%s`)", name, prop.get_file()) for text in ax.texts: text.set_fontproperties(prop) @@ -439,9 +434,7 @@ def setup_theme( if usetex: rc("text", usetex=True) else: - logging.info( - "Set text.usetex={}. Font styles may be inconsistent.".format(usetex) - ) + logger.info("Set text.usetex=%s. Font styles may be inconsistent.", usetex) rc("text", usetex=False) if font == "Helvetica": From b2f009cdd66214985c0f5221e855e747bc532636 Mon Sep 17 00:00:00 2001 From: Haibao Tang Date: Thu, 5 Oct 2023 15:53:31 -0700 Subject: [PATCH 3/3] Extra opts in gff.bed() (#602) * Add two options to gff.bed() * automatically find the other opts when --ensembl_cds --- jcvi/formats/gff.py | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/jcvi/formats/gff.py b/jcvi/formats/gff.py index 9eb20d16..8c4a68b0 100644 --- a/jcvi/formats/gff.py +++ b/jcvi/formats/gff.py @@ -497,7 +497,6 @@ def to_range(obj, score=None, id=None, strand=None): def main(): - actions = ( ("addparent", "merge sister features and infer their parent"), ("bed", "parse gff and produce bed file for particular feature type"), @@ -3022,6 +3021,18 @@ def bed(args): default="Parent", help="Parent gene key to group with --primary_only", ) + p.add_option( + "--add_chr", + default=False, + action="store_true", + help="Add `chr` prefix to seqid", + ) + p.add_option( + "--ensembl_cds", + default=False, + action="store_true", + help="Use transcript_name.exon_number as accn", + ) p.set_outfile() opts, args = p.parse_args(args) @@ -3036,11 +3047,15 @@ def bed(args): span = opts.span primary_only = opts.primary_only parent_key = opts.parent_key - + add_chr = opts.add_chr + ensembl_cds = opts.ensembl_cds if opts.type: type = set(x.strip() for x in opts.type.split(",")) if opts.source: source = set(x.strip() for x in opts.source.split(",")) + if ensembl_cds: + type = {"CDS"} + source = {"protein_coding"} gff = Gff( gffile, @@ -3053,6 +3068,8 @@ def bed(args): ) b = Bed() seen_parents = set() # used with --primary_only + seen = set() # used with --ensembl_cds + skipped_identical_range = 0 skipped_non_primary = 0 for g in gff: @@ -3072,6 +3089,18 @@ def bed(args): bl.accn = accn if span: bl.score = bl.span + if add_chr: + bl.seqid = "chr" + bl.seqid + if ensembl_cds: + bl.accn = "{0}.{1}".format( + g.get_attr("transcript_name"), g.get_attr("exon_number") + ) + position = (bl.seqid, bl.start, bl.end) + if position in seen: + skipped_identical_range += 1 + continue + seen.add(position) + b.append(bl) sorted = not opts.nosort @@ -3083,6 +3112,8 @@ def bed(args): ) if primary_only: logging.debug("Skipped non-primary: %d", skipped_non_primary) + if ensembl_cds: + logging.debug("Skipped due to identical range: %d", skipped_identical_range) def make_index(gff_file): @@ -3134,7 +3165,6 @@ def children(args): parents = set(opts.parents.split(",")) for feat in get_parents(gff_file, parents): - cc = [c.id for c in g.children(feat.id, 1)] if len(cc) <= 1: continue @@ -3513,7 +3543,6 @@ def get_coords(feature, site, fLen, seqlen, feat, children_list, gffdb): elif site in ["TrSS", "TrES"]: children = [] for c in gffdb.children(feat.id, 1): - if c.featuretype not in children_list: continue children.append((c.start, c.stop)) @@ -3686,7 +3715,6 @@ def bed12(args): fw = must_open(outfile, "w") for f in g.features_of_type(parent): - chrom = f.chrom chromStart = f.start - 1 chromEnd = f.stop @@ -3701,7 +3729,6 @@ def bed12(args): blocks = [] for c in g.children(name, 1): - cstart, cend = c.start - 1, c.stop if c.featuretype == block: