Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix lca gather to work on non-DNA databases. #1031

Merged
merged 4 commits into from
Jun 20, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,14 +184,19 @@ def gather_main(args):
"""
set_quiet(args.quiet, args.debug)

notify("** WARNING: lca gather is deprecated as of sourmash 3.4, and will")
notify("** be removed in sourmash 4.0; use 'gather' instead.")
notify('')

if not check_files_exist(args.query, *args.db):
sys.exit(-1)

# load all the databases
dblist, ksize, scaled = lca_utils.load_databases(args.db, None)

# for each query, gather all the matches across databases
query_sig = sourmash_args.load_query_signature(args.query, ksize, 'DNA')
moltype = dblist[0].moltype
query_sig = sourmash_args.load_query_signature(args.query, ksize, moltype)
debug('classifying', query_sig.name())

# make sure we're looking at the same scaled value as database
Expand Down
10 changes: 8 additions & 2 deletions sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -519,6 +519,7 @@ def load_databases(filenames, scaled=None, verbose=True):
"Load multiple LCA databases; return (dblist, ksize, scaled)"
ksize_vals = set()
scaled_vals = set()
moltype_vals = set()
dblist = []

# load all the databases
Expand All @@ -537,14 +538,19 @@ def load_databases(filenames, scaled=None, verbose=True):
lca_db.downsample_scaled(scaled)
scaled_vals.add(lca_db.scaled)

moltype_vals.add(lca_db.moltype)
if len(moltype_vals) > 1:
raise Exception('multiple moltypes, quitting')

dblist.append(lca_db)

ksize = ksize_vals.pop()
scaled = scaled_vals.pop()
moltype = moltype_vals.pop()

if verbose:
notify(u'\r\033[K', end=u'')
notify('loaded {} LCA databases. ksize={}, scaled={}', len(dblist),
ksize, scaled)
notify('loaded {} LCA databases. ksize={}, scaled={} moltype={}',
len(dblist), ksize, scaled, moltype)

return dblist, ksize, scaled
2 changes: 0 additions & 2 deletions sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,6 @@ def search_transcript(node, seq, threshold):
import sys
from tempfile import NamedTemporaryFile

from deprecation import deprecated

from .exceptions import IndexNotSupported
from .sbt_storage import FSStorage, TarStorage, IPFSStorage, RedisStorage, ZipStorage
from .logging import error, notify, debug
Expand Down
38 changes: 38 additions & 0 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -1671,6 +1671,44 @@ def test_incompat_lca_db_scaled(c):
assert 'new scaled 10000 is lower than current sample scaled 10000' in str(e.value)


@utils.in_thisdir
def test_lca_gather_protein(c):
    """Run `lca gather` with a protein query against a protein LCA database.

    Checks that the command exits cleanly, that the database-loading
    message reports the protein moltype, and that the expected match line
    appears on stdout.
    """
    query_path = utils.get_test_data(
        'prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    protein_db = utils.get_test_data('prot/protein.lca.json.gz')

    c.run_sourmash('lca', 'gather', query_path, protein_db)

    result = c.last_result
    expected_load_msg = 'loaded 1 LCA databases. ksize=57, scaled=100 moltype=protein'
    expected_match = '340.9 kbp 100.0% 100.0% s__B26-1 sp001593925 sp.'

    assert result.status == 0
    # the load message goes to stderr; the gather results go to stdout
    assert expected_load_msg in result.err
    assert expected_match in result.out


@utils.in_thisdir
def test_lca_gather_deprecated_message(c):
    """Check that `lca gather` prints its deprecation warning on stderr."""
    query_path = utils.get_test_data(
        'prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    protein_db = utils.get_test_data('prot/protein.lca.json.gz')

    c.run_sourmash('lca', 'gather', query_path, protein_db)

    result = c.last_result
    deprecation_notice = 'WARNING: lca gather is deprecated as of sourmash 3.4'

    # the command must still succeed; the warning is informational only
    assert result.status == 0
    assert deprecation_notice in result.err


@utils.in_thisdir
def test_incompat_lca_db_moltype(c):
    """Loading LCA databases with differing moltypes must fail.

    Passes a protein database and a dayhoff database to `lca gather`
    together and expects the run to raise, with the 'multiple moltypes'
    message present in the raised error text.
    """
    query_path = utils.get_test_data(
        'prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
    protein_db = utils.get_test_data('prot/protein.lca.json.gz')
    dayhoff_db = utils.get_test_data('prot/dayhoff.lca.json.gz')

    with pytest.raises(ValueError) as excinfo:
        c.run_sourmash('lca', 'gather', query_path, protein_db, dayhoff_db)

    failure_text = str(excinfo.value)
    assert 'Exception: multiple moltypes, quitting' in failure_text


@utils.in_tempdir
def test_incompat_lca_db_ksize(c):
# create a database with ksize of 25
Expand Down