diff --git a/jcvi/apps/align.py b/jcvi/apps/align.py index 7b6a6696..c6a3ea2d 100644 --- a/jcvi/apps/align.py +++ b/jcvi/apps/align.py @@ -31,6 +31,13 @@ def run_formatdb(infile=None, outfile=None, dbtype="nucl"): sh(cmd) +@depends +def run_diamond_makedb(infile=None, outfile=None): + cmd = "diamond makedb" + cmd += " --in {0} --db {1} -p 5".format(infile, infile) + sh(cmd) + + @depends def run_blat( infile=None, @@ -586,5 +593,126 @@ def last(args, dbtype=None): return lastfile +def blast_main(args, dbtype=None): + """ + %prog database.fasta query.fasta + + Run blastp/blastn by calling BLAST+ blastp/blastn depends on dbtype. + """ + p = OptionParser(blast_main.__doc__) + p.add_option( + "--dbtype", + default="nucl", + choices=("nucl", "prot"), + help="Molecule type of subject database", + ) + p.add_option("--path", help="Specify BLAST path for blastn or blastp") + + p.set_cpus() + p.set_outdir() + p.set_params() + + opts, args = p.parse_args(args) + + if len(args) != 2: + sys.exit(not p.print_help()) + + subject, query = args + path = opts.path + cpus = opts.cpus + if not dbtype: + dbtype = opts.dbtype + + getpath = lambda x: op.join(path, x) if path else x + cmd = "blastn" if dbtype == "nucl" else "blastp" + lastdb_bin = getpath("makeblastdb") + lastal_bin = getpath(cmd) + for bin in (lastdb_bin, lastal_bin): + if not which(bin): + logging.fatal("`%s` not found on PATH. Have you installed BLAST?", bin) + sys.exit(1) + + db_suffix = '.nin' if dbtype == "nucl" else ".pin" + + run_formatdb( + infile=subject, + outfile=subject + db_suffix, + dbtype=dbtype) + + blastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir) + # Make several attempts to run LASTAL + try: + sh( + cmd + f" -num_threads {cpus} -query {query} -db {subject} -out {blastfile}" + + " -outfmt 6 -max_target_seqs 1000 -evalue 1e-5", + check=False, + redirect_error=STDOUT, + ) + except CalledProcessError as e: # multi-threading disabled + message = f"{cmd} failed with message:" + message += "\n{0}".format(e.output.decode()) + logging.error(message) + logging.fatal("Failed to run `blast`. Aborted.") + cleanup(blastfile) + sys.exit(1) + return blastfile + + +def diamond_blastp_main(args, dbtype="prot"): + """ + %prog database.fasta query.fasta + + Run diamond blastp for protein alignment. + """ + p = OptionParser(diamond_blastp_main.__doc__) + + p.add_option("--path", help="Specify diamond path for diamond blastp") + + p.set_cpus() + p.set_outdir() + p.set_params() + + opts, args = p.parse_args(args) + + if len(args) != 2: + sys.exit(not p.print_help()) + + subject, query = args + path = opts.path + cpus = opts.cpus + if not dbtype: + dbtype = opts.dbtype + + getpath = lambda x: op.join(path, x) if path else x + cmd = "diamond blastp" + diamond_bin = getpath("diamond") + for bin in (diamond_bin,): + if not which(bin): + logging.fatal("`%s` not found on PATH. Have you installed Diamond?", bin) + sys.exit(1) + + run_diamond_makedb( + infile=subject, + outfile=subject + '.dmnd',) + + blastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir) + # Make several attempts to run LASTAL + try: + sh( + cmd + f" --threads {cpus} --query {query} --db {subject} --out {blastfile}" + + " --ultra-sensitive --max-target-seqs 1000 --evalue 1e-5 --outfmt 6", + check=False, + redirect_error=STDOUT, + ) + except CalledProcessError as e: # multi-threading disabled + message = f"{cmd} failed with message:" + message += "\n{0}".format(e.output.decode()) + logging.error(message) + logging.fatal("Failed to run `diamond blastp`. Aborted.") + cleanup(blastfile) + sys.exit(1) + return blastfile + + if __name__ == "__main__": main() diff --git a/jcvi/compara/catalog.py b/jcvi/compara/catalog.py index eb8e4749..cc3539dd 100644 --- a/jcvi/compara/catalog.py +++ b/jcvi/compara/catalog.py @@ -609,6 +609,7 @@ def ortholog(args): match (RBH). """ from jcvi.apps.align import last as last_main + from jcvi.apps.align import diamond_blastp_main, blast_main from jcvi.compara.blastfilter import main as blastfilter_main from jcvi.compara.quota import main as quota_main from jcvi.compara.synteny import scan, mcscan, liftover @@ -621,6 +622,7 @@ def ortholog(args): choices=("nucl", "prot"), help="Molecule type of subject database", ) + p.add_option( "--full", default=False, @@ -673,6 +675,13 @@ def ortholog(args): help="Ignore this pair of ortholog identification instead of throwing an error when performing many pairs of cataloging." ) + p.add_option( + "--align_soft", + default="last", + choices=("last", "blast", "diamond_blastp"), + help="Sequence alignment software. Default for both and . Users could also use for both and , or for .", + ) + opts, args = p.parse_args(args) if len(args) != 2: @@ -690,6 +699,7 @@ def ortholog(args): dist = "--dist={0}".format(opts.dist) minsize_flag = "--min_size={}".format(opts.n) cpus_flag = "--cpus={}".format(opts.cpus) + align_soft = opts.align_soft aprefix = op.basename(a) bprefix = op.basename(b) @@ -697,7 +707,16 @@ def ortholog(args): qprefix = ".".join((bprefix, aprefix)) last = pprefix + ".last" if need_update((afasta, bfasta), last, warn=True): - last_main([bfasta, afasta, cpus_flag], dbtype) + if align_soft == "blast": + blast_main( + [bfasta, afasta, cpus_flag], dbtype + ) + elif dbtype == "prot" and align_soft == "diamond_blastp": + diamond_blastp_main( + [bfasta, afasta, cpus_flag], dbtype + ) + else: + last_main([bfasta, afasta, cpus_flag], dbtype) self_remove = opts.self_remove if a == b: