From 0ada5e4b5a61a09acde561dd5197e47e3abd374d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 19 Jun 2020 06:43:15 -0700 Subject: [PATCH 1/4] check for compatible moltypes on lca.load_databases; fix lca gather --- sourmash/lca/command_gather.py | 3 ++- sourmash/lca/lca_db.py | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py index 4e9a690afc..c5a1aff002 100644 --- a/sourmash/lca/command_gather.py +++ b/sourmash/lca/command_gather.py @@ -191,7 +191,8 @@ def gather_main(args): dblist, ksize, scaled = lca_utils.load_databases(args.db, None) # for each query, gather all the matches across databases - query_sig = sourmash_args.load_query_signature(args.query, ksize, 'DNA') + moltype = dblist[0].moltype + query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype) debug('classifying', query_sig.name()) # make sure we're looking at the same scaled value as database diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py index fec69ddcb4..ee5a1e5f07 100644 --- a/sourmash/lca/lca_db.py +++ b/sourmash/lca/lca_db.py @@ -519,6 +519,7 @@ def load_databases(filenames, scaled=None, verbose=True): "Load multiple LCA databases; return (dblist, ksize, scaled)" ksize_vals = set() scaled_vals = set() + moltype_vals = set() dblist = [] # load all the databases @@ -537,14 +538,19 @@ def load_databases(filenames, scaled=None, verbose=True): lca_db.downsample_scaled(scaled) scaled_vals.add(lca_db.scaled) + moltype_vals.add(lca_db.moltype) + if len(moltype_vals) > 1: + raise Exception('multiple moltypes, quitting') + dblist.append(lca_db) ksize = ksize_vals.pop() scaled = scaled_vals.pop() + moltype = moltype_vals.pop() if verbose: notify(u'\r\033[K', end=u'') - notify('loaded {} LCA databases. ksize={}, scaled={}', len(dblist), - ksize, scaled) + notify('loaded {} LCA databases. ksize={}, scaled={} moltype={}', + len(dblist), ksize, scaled, moltype) return dblist, ksize, scaled From ee17eb2cdf2fd3345954875b96499339fbf669d7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 19 Jun 2020 06:50:59 -0700 Subject: [PATCH 2/4] add tests for incompatible databases and lca gather --- tests/test_lca.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_lca.py b/tests/test_lca.py index 52a407828f..8a8de0a8da 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -1671,6 +1671,32 @@ def test_incompat_lca_db_scaled(c): assert 'new scaled 10000 is lower than current sample scaled 10000' in str(e.value) +@utils.in_thisdir +def test_lca_gather_protein(c): + # test lca gather on protein foo + testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + db1 = utils.get_test_data('prot/protein.lca.json.gz') + + c.run_sourmash('lca', 'gather', testquery, db1) + + assert c.last_result.status == 0 + assert 'loaded 1 LCA databases. ksize=57, scaled=100 moltype=protein' in c.last_result.err + assert '340.9 kbp 100.0% 100.0% s__B26-1 sp001593925 sp.' in c.last_result.out + + +@utils.in_thisdir +def test_incompat_lca_db_moltype(c): + # test load of incompatible LCA DBs + testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + db1 = utils.get_test_data('prot/protein.lca.json.gz') + db2 = utils.get_test_data('prot/dayhoff.lca.json.gz') + + with pytest.raises(ValueError) as e: + c.run_sourmash('lca', 'gather', testquery, db1, db2) + + assert 'Exception: multiple moltypes, quitting' in str(e.value) + + @utils.in_tempdir def test_incompat_lca_db_ksize(c): # create a database with ksize of 25 From 393722d04b4043b3f37e31b56de32c6c033d528f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 19 Jun 2020 06:58:35 -0700 Subject: [PATCH 3/4] removed unused import --- sourmash/sbt.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 96845b9a24..2885b3044e 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -57,8 +57,6 @@ def search_transcript(node, seq, threshold): import sys from tempfile import NamedTemporaryFile -from deprecation import deprecated - from .exceptions import IndexNotSupported from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage, ZipStorage from .logging import error, notify, debug From 9972bc0b27a24b709db778b80fc14bc4e26ae462 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 19 Jun 2020 06:58:40 -0700 Subject: [PATCH 4/4] add deprecation for sourmash lca gather --- sourmash/lca/command_gather.py | 4 ++++ tests/test_lca.py | 12 ++++++++++++ 2 files changed, 16 insertions(+) diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py index c5a1aff002..812c6b3b41 100644 --- a/sourmash/lca/command_gather.py +++ b/sourmash/lca/command_gather.py @@ -184,6 +184,10 @@ def gather_main(args): """ set_quiet(args.quiet, args.debug) + notify("** WARNING: lca gather is deprecated as of sourmash 3.4, and will") + notify("** be removed in sourmash 4.0; use 'gather' instead.") + notify('') + if not check_files_exist(args.query, *args.db): sys.exit(-1) diff --git a/tests/test_lca.py b/tests/test_lca.py index 8a8de0a8da..b38cdb22ea 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -1684,6 +1684,18 @@ def test_lca_gather_protein(c): assert '340.9 kbp 100.0% 100.0% s__B26-1 sp001593925 sp.' in c.last_result.out +@utils.in_thisdir +def test_lca_gather_deprecated_message(c): + # lca gather is deprecated for 4.0; check message + testquery = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + db1 = utils.get_test_data('prot/protein.lca.json.gz') + + c.run_sourmash('lca', 'gather', testquery, db1) + + assert c.last_result.status == 0 + assert 'WARNING: lca gather is deprecated as of sourmash 3.4' in c.last_result.err + + @utils.in_thisdir def test_incompat_lca_db_moltype(c): # test load of incompatible LCA DBs