diff --git a/doc/command-line.md b/doc/command-line.md index 505c93b2c9..fa245e0a7f 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -993,6 +993,38 @@ size: 5177 signature license: CC0 ``` +### `sourmash signature fileinfo` - display a summary of the contents of a sourmash collection + +Display summary information about a signature file, database, or collection. + +For example, +``` +sourmash sig fileinfo tests/test-data/prot/all.zip +``` +will display: +``` +path filetype: ZipFileLinearIndex +location: /Users/t/dev/sourmash/tests/test-data/prot/all.zip +is database? yes +has manifest? yes +num signatures: 8 +** examining manifest... +total hashes: 31758 +summary of sketches: + 2 sketches with dayhoff, k=19, scaled=100 7945 total hashes + 2 sketches with hp, k=19, scaled=100 5184 total hashes + 2 sketches with protein, k=19, scaled=100 8214 total hashes + 2 sketches with DNA, k=31, scaled=1000 10415 total hashes +``` + +`sig fileinfo` will recognize +[all accepted sourmash input files](#loading-signatures-and-databases), +including individual .sig and .sig.gz files, Zip file collections, SBT +databases, LCA databases, and directory hierarchies. + +`sourmash sig fileinfo` provides optional JSON output via `--json-out`, and +that format is under semantic versioning. + ### `sourmash signature split` - split signatures into individual files Split each signature in the input file(s) into individual files, with @@ -1271,6 +1303,20 @@ exit on the first bad k-mer. If `--check-sequence --force` is provided, `sig kmers` will provide error messages (and skip bad sequences), but will continue processing input sequences. +### `sourmash signature manifest` - output a manifest for a file + +Output a manifest for a file, database, or collection. + +For example, +``` +sourmash sig manifest tests/test-data/prot/all.zip -o manifest.csv +``` +will create a CSV file, `manifest.csv`, in the internal sourmash +manifest format. The manifest will contain an entry for every +signature in the file, database, or collection. This format is largely +meant for internal use, but it can serve as a picklist for +subsetting large collections. + ## Advanced command-line usage ### Loading signatures and databases diff --git a/src/sourmash/cli/sig/__init__.py b/src/sourmash/cli/sig/__init__.py index 0bee9c126f..c240027ccf 100644 --- a/src/sourmash/cli/sig/__init__.py +++ b/src/sourmash/cli/sig/__init__.py @@ -11,6 +11,7 @@ from . import extract from . import filter from . import flatten +from . import fileinfo from . import kmers from . import intersect from . 
import manifest diff --git a/src/sourmash/cli/sig/fileinfo.py b/src/sourmash/cli/sig/fileinfo.py new file mode 100644 index 0000000000..8030db1dcc --- /dev/null +++ b/src/sourmash/cli/sig/fileinfo.py @@ -0,0 +1,31 @@ +"""provide summary information on the given file""" + + +def subparser(subparsers): + subparser = subparsers.add_parser('fileinfo') + subparser.add_argument('path') + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-d', '--debug', action='store_true', + help='output debug information' + ) + subparser.add_argument( + '-f', '--force', action='store_true', + help='try to load all files as signatures' + ) + subparser.add_argument( + '--rebuild-manifest', help='forcibly rebuild the manifest', + action='store_true' + ) + subparser.add_argument( + '--json-out', help='output information in JSON format only', + action='store_true' + ) + + +def main(args): + import sourmash + return sourmash.sig.__main__.fileinfo(args) diff --git a/src/sourmash/cli/sig/manifest.py b/src/sourmash/cli/sig/manifest.py index f6797be731..497208fce8 100644 --- a/src/sourmash/cli/sig/manifest.py +++ b/src/sourmash/cli/sig/manifest.py @@ -8,6 +8,10 @@ def subparser(subparsers): '-q', '--quiet', action='store_true', help='suppress non-error output' ) + subparser.add_argument( + '-d', '--debug', action='store_true', + help='output debug information' + ) subparser.add_argument( '-o', '--output', '--csv', metavar='FILE', help='output information to a CSV file', @@ -17,6 +21,10 @@ def subparser(subparsers): '-f', '--force', action='store_true', help='try to load all files as signatures' ) + subparser.add_argument( + '--no-rebuild-manifest', help='use existing manifest if available', + action='store_true' + ) def main(args): diff --git a/src/sourmash/index/__init__.py b/src/sourmash/index/__init__.py index e4b68ca58f..db8cb00c97 100644 --- a/src/sourmash/index/__init__.py +++ b/src/sourmash/index/__init__.py @@ -52,6 +52,10 @@ class Index(ABC): is_database = False manifest = None + @abstractmethod + def __len__(self): + "Return the number of signatures in this Index object." + @property def location(self): "Return a resolvable location for this index, if possible." @@ -408,11 +412,13 @@ def save(self, path): save_signatures(self.signatures(), fp) @classmethod - def load(cls, location): + def load(cls, location, filename=None): "Load signatures from a JSON signature file." si = load_signatures(location, do_raise=True) - lidx = LinearIndex(si, filename=location) + if filename is None: + filename=location + lidx = LinearIndex(si, filename=filename) return lidx def select(self, **kwargs): @@ -557,6 +563,14 @@ def __bool__(self): return True def __len__(self): + "calculate number of signatures." + + # use manifest, if available. + m = self.manifest + if self.manifest is not None: + return len(m) + + # otherwise, iterate across all signatures. n = 0 for _ in self.signatures(): n += 1 @@ -845,12 +859,20 @@ class MultiIndex(Index): Concrete class; signatures held in memory; builds and uses manifests. """ - def __init__(self, manifest, parent=""): + def __init__(self, manifest, parent, *, prepend_location=False): """Constructor; takes manifest containing signatures, together with - optional top-level location to prepend to internal locations. + the top-level location. 
""" self.manifest = manifest self.parent = parent + self.prepend_location = prepend_location + + if prepend_location and self.parent is None: + raise ValueError("must set 'parent' if 'prepend_location' is set") + + @property + def location(self): + return self.parent def signatures(self): for row in self.manifest.rows: @@ -861,7 +883,7 @@ def signatures_with_location(self): loc = row['internal_location'] # here, 'parent' may have been removed from internal_location # for directories; if so, add it back in. - if self.parent: + if self.prepend_location: loc = os.path.join(self.parent, loc) yield row['signature'], loc @@ -877,13 +899,16 @@ def _signatures_with_internal(self): def __len__(self): + if self.manifest is None: + return 0 + return len(self.manifest) def insert(self, *args): raise NotImplementedError @classmethod - def load(cls, index_list, source_list, parent=""): + def load(cls, index_list, source_list, parent, *, prepend_location=False): """Create a MultiIndex from already-loaded indices. Takes two arguments: a list of Index objects, and a matching list @@ -903,10 +928,11 @@ def sigloc_iter(): yield ss, iloc # build manifest; note, signatures are stored in memory. + # CTB: could do this on demand? manifest = CollectionManifest.create_manifest(sigloc_iter()) # create! - return cls(manifest, parent=parent) + return cls(manifest, parent, prepend_location=prepend_location) @classmethod def load_from_directory(cls, pathname, *, force=False): @@ -942,7 +968,8 @@ def load_from_directory(cls, pathname, *, force=False): if not index_list: raise ValueError(f"no signatures to load under directory '{pathname}'") - return cls.load(index_list, source_list, parent=pathname) + return cls.load(index_list, source_list, pathname, + prepend_location=True) @classmethod def load_from_path(cls, pathname, force=False): @@ -957,19 +984,20 @@ def load_from_path(cls, pathname, force=False): if os.path.isdir(pathname): # traverse return cls.load_from_directory(pathname, force=force) - else: # load as a .sig/JSON file - index_list = [] - source_list = [] - try: - idx = LinearIndex.load(pathname) - index_list = [idx] - source_list = [pathname] - except (IOError, sourmash.exceptions.SourmashError): - if not force: - raise ValueError(f"no signatures to load from '{pathname}'") - return None - return cls.load(index_list, source_list) + # load as a .sig/JSON file + index_list = [] + source_list = [] + try: + idx = LinearIndex.load(pathname) + index_list = [idx] + source_list = [pathname] + except (IOError, sourmash.exceptions.SourmashError): + if not force: + raise ValueError(f"no signatures to load from '{pathname}'") + return None + + return cls.load(index_list, source_list, pathname) @classmethod def load_from_pathlist(cls, filename): @@ -992,7 +1020,7 @@ def load_from_pathlist(cls, filename): idx_list.append(idx) src_list.append(src) - return cls.load(idx_list, src_list) + return cls.load(idx_list, src_list, filename) def save(self, *args): raise NotImplementedError @@ -1000,7 +1028,8 @@ def save(self, *args): def select(self, **kwargs): "Run 'select' on the manifest." 
new_manifest = self.manifest.select_to_manifest(**kwargs) - return MultiIndex(new_manifest, parent=self.parent) + return MultiIndex(new_manifest, self.parent, + prepend_location=self.prepend_location) class LazyLoadedIndex(Index): diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index 52cf8fdae7..fbd2fc8d8b 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -78,6 +78,9 @@ def __init__(self, ksize, scaled, moltype='DNA'): def location(self): return self.filename + def __len__(self): + return self._next_index + def _invalidate_cache(self): if hasattr(self, '_cache'): del self._cache @@ -177,6 +180,10 @@ def signatures(self): for v in self._signatures.values(): yield v + def _signatures_with_internal(self): + for idx, ss in self._signatures.items(): + yield ss, self.location, idx + def select(self, ksize=None, moltype=None, num=0, scaled=0, abund=None, containment=False, picklist=None): """Make sure this database matches the requested requirements. @@ -297,6 +304,15 @@ def load(cls, db_name): for k, v in load_d['idx_to_lid'].items(): db.idx_to_lid[int(k)] = v + if db.ident_to_idx: + db._next_index = max(db.ident_to_idx.values()) + 1 + else: + db._next_index = 0 + if db.idx_to_lid: + db._next_lid = max(db.idx_to_lid.values()) + 1 + else: + db._next_lid = 0 + db.filename = db_name return db diff --git a/src/sourmash/manifest.py b/src/sourmash/manifest.py index 541c9ed86e..96c3401c96 100644 --- a/src/sourmash/manifest.py +++ b/src/sourmash/manifest.py @@ -2,6 +2,7 @@ Manifests for collections of signatures. """ import csv +import ast from sourmash.picklist import SignaturePicklist @@ -40,6 +41,9 @@ def __bool__(self): def __len__(self): return len(self.rows) + def __eq__(self, other): + return self.rows == other.rows + @classmethod def load_from_csv(cls, fp): "load a manifest from a CSV file." @@ -70,7 +74,7 @@ def load_from_csv(cls, fp): for k in introws: row[k] = int(row[k]) for k in boolrows: - row[k] = bool(row[k]) + row[k] = bool(ast.literal_eval(str(row[k]))) row['signature'] = None manifest_list.append(row) diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index f7e1c0acd8..c4838db0d4 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -5,7 +5,8 @@ import csv import json import os -from collections import defaultdict +from collections import defaultdict, namedtuple, Counter +import json import screed import sourmash @@ -14,6 +15,7 @@ from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled +from sourmash.manifest import CollectionManifest usage=''' @@ -269,43 +271,27 @@ def manifest(args): """ build a signature manifest """ - from sourmash.index import CollectionManifest - - set_quiet(args.quiet) - - # CTB: might want to switch to sourmash_args.FileOutputCSV here? - csv_fp = open(args.output, 'w', newline='') - - CollectionManifest.write_csv_header(csv_fp) - w = csv.DictWriter(csv_fp, fieldnames=CollectionManifest.required_keys) + set_quiet(args.quiet, args.debug) try: loader = sourmash_args.load_file_as_index(args.location, yield_all_files=args.force) - except Exception as exc: - error('\nError while reading signatures from {}:'.format(args.location)) - error(str(exc)) - error('(continuing)') - raise - - n = 0 - # Need to ignore existing manifests here! otherwise circularity... 
- try: - manifest_iter = loader._signatures_with_internal() - except NotImplementedError: - error("ERROR: manifests cannot be generated for this file.") + except ValueError as exc: + error(f"Cannot open '{args.location}'.") sys.exit(-1) - for n, (sig, parent, loc) in enumerate(manifest_iter): - # extract info, write as appropriate. - row = CollectionManifest.make_manifest_row(sig, loc, - include_signature=False) - w.writerow(row) + rebuild = True + if args.no_rebuild_manifest: + rebuild = False - notify(f'built manifest for {n} signatures total.') + manifest = sourmash_args.get_manifest(loader, require=True, + rebuild=rebuild) - if csv_fp: - csv_fp.close() + with open(args.output, "w", newline='') as csv_fp: + manifest.write_to_csv(csv_fp, write_header=True) + + notify(f"built manifest for {len(manifest)} signatures total.") + notify(f"wrote manifest to '{args.output}'") def overlap(args): @@ -615,12 +601,14 @@ def extract(args): # further filtering on md5 or name? if args.md5 is not None or args.name is not None: - def filter_fn(ss): + def filter_fn(row): # match? keep = False - if args.name and args.name in str(ss): - keep = True - if args.md5 and args.md5 in ss.md5sum(): + if args.name: + name = row['name'] or row['filename'] + if args.name in name: + keep = True + if args.md5 and args.md5 in row['md5']: keep = True return keep @@ -633,19 +621,40 @@ def filter_fn(ss): save_sigs.open() # start loading! - progress = sourmash_args.SignatureLoadingProgress() - loader = sourmash_args.load_many_signatures(args.signatures, - ksize=args.ksize, - moltype=moltype, - picklist=picklist, - progress=progress, - yield_all_files=args.force, - force=args.force) - for ss, sigloc in loader: - if filter_fn(ss): + total_rows_examined = 0 + for filename in args.signatures: + idx = sourmash_args.load_file_as_index(filename, + yield_all_files=args.force) + + idx = idx.select(ksize=args.ksize, + moltype=moltype, + picklist=picklist) + + manifest = sourmash_args.get_manifest(idx) + + sub_rows = [] + for row in manifest.rows: + if filter_fn(row): + sub_rows.append(row) + total_rows_examined += 1 + + sub_manifest = CollectionManifest(sub_rows) + sub_picklist = sub_manifest.to_picklist() + + try: + idx = idx.select(picklist=sub_picklist) + except ValueError: + error("** This input collection doesn't support 'extract' with picklists.") + error("** EXITING.") + error("**") + error("** You can use 'sourmash sig cat' with a picklist,") + error("** and then pipe the output to 'sourmash sig extract") + sys.exit(-1) + + for ss in idx.signatures(): save_sigs.add(ss) - notify(f"loaded {len(progress)} total that matched ksize & molecule type") + notify(f"loaded {total_rows_examined} total that matched ksize & molecule type") if not save_sigs: error("no matching signatures to save!") sys.exit(-1) @@ -1115,6 +1124,96 @@ def kmers(args): notify("NOTE: see --save-kmers or --save-sequences for output options.") +_SketchInfo = namedtuple('_SketchInfo', 'ksize, moltype, scaled, num, abund') + + +def fileinfo(args): + """ + provide summary information on the given path (collection, index, etc.) + """ + set_quiet(args.quiet, args.debug) + + text_out = False + if not args.json_out: + text_out = True + + # load as index! 
+ try: + notify(f"** loading from '{args.path}'") + idx = sourmash_args.load_file_as_index(args.path, + yield_all_files=args.force) + except ValueError: + error(f"Cannot open '{args.path}'.") + sys.exit(-1) + + print_bool = lambda x: "yes" if x else "no" + print_none = lambda x: "n/a" if x is None else x + + info_d = {} + info_d['path_filetype'] = type(idx).__name__ + info_d['location'] = "" if not idx.location else idx.location + info_d['is_database'] = bool(idx.is_database) + info_d['has_manifest'] = bool(idx.manifest) + info_d['num_sketches'] = len(idx) + + if text_out: + print_results(f"path filetype: {info_d['path_filetype']}") + print_results(f"location: {info_d['location']}") + print_results(f"is database? {print_bool(info_d['is_database'])}") + print_results(f"has manifest? {print_bool(info_d['has_manifest'])}") + print_results(f"num signatures: {info_d['num_sketches']}") + + # also have arg to fileinfo to force recalculation + notify("** examining manifest...") + + manifest = sourmash_args.get_manifest(idx, rebuild=args.rebuild_manifest, + require=False) + + if manifest is None: + # actually can't find any file type to trigger this, but leaving it + # in for future eventualities, I guess? + notify("** no manifest and cannot be generated; exiting.") + sys.exit(0) + + # use a namedtuple to track counts of distinct sketch types and n hashes + total_size = 0 + counter = Counter() + hashcounts = Counter() + for row in manifest.rows: + ski = _SketchInfo(ksize=row['ksize'], moltype=row['moltype'], + scaled=row['scaled'], num=row['num'], + abund=row['with_abundance']) + counter[ski] += 1 + hashcounts[ski] += row['n_hashes'] + total_size += row['n_hashes'] + + # store in info_d + info_d['total_hashes'] = total_size + sketch_info = [] + for ski, count in counter.items(): + sketch_d = dict(ski._asdict()) + sketch_d['count'] = count + sketch_d['n_hashes'] = hashcounts[ski] + sketch_info.append(sketch_d) + info_d['sketch_info'] = sketch_info + + if text_out: + print_results(f"total hashes: {info_d['total_hashes']}") + print_results("summary of sketches:") + + for ski in info_d['sketch_info']: + mh_type = f"num={ski['num']}" if ski['num'] else f"scaled={ski['scaled']}" + mh_abund = ", abund" if ski['abund'] else "" + + sketch_str = f"{ski['count']} sketches with {ski['moltype']}, k={ski['ksize']}, {mh_type}{mh_abund}" + + print_results(f" {sketch_str: <50} {ski['n_hashes']} total hashes") + + else: + assert args.json_out + print(json.dumps(info_d)) + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index ac16da74a5..9885a63c15 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -1,5 +1,37 @@ """ Utility functions for sourmash CLI commands. + +The sourmash_args submodule contains functions that help with various +command-line functions. Library functions in this module often directly +send output to stdout/stderr in support of the CLI, and/or call +sys.exit to exit. 
+ +argparse functionality: + +* check_scaled_bounds(args) -- check that --scaled is reasonable +* check_num_bounds(args) -- check that --num is reasonable +* get_moltype(args) -- verify that moltype selected is legit +* calculate_moltype(args) -- confirm that only one moltype was selected +* load_picklist(args) -- create a SignaturePicklist from --picklist args +* report_picklist(args, picklist) -- report on picklist value usage/matches + +signature/database loading functionality: + +* load_query_signature(filename, ...) -- load a single signature for query +* traverse_find_sigs(filenames, ...) -- find all .sig and .sig.gz files +* load_dbs_and_sigs(filenames, query, ...) -- load databases & signatures +* load_file_as_index(filename, ...) -- load a sourmash.Index class +* load_file_as_signatures(filename, ...) -- load a list of signatures +* load_pathlist_from_file(filename) -- load a list of paths from a file +* load_many_signatures(locations) -- load many signatures from many files +* get_manifest(idx) -- retrieve or build a manifest from an Index +* class SignatureLoadingProgress - signature loading progress bar + +signature and file output functionality: + +* SaveSignaturesToLocation(filename) - bulk signature output +* class FileOutput - file output context manager that deals w/stdout well +* class FileOutputCSV - file output context manager for CSV files """ import sys import os @@ -282,7 +314,10 @@ def _load_stdin(filename, **kwargs): "Load collection from .sig file streamed in via stdin" db = None if filename == '-': - db = LinearIndex.load(sys.stdin) + # load as LinearIndex, then pass into MultiIndex to generate a + # manifest. + lidx = LinearIndex.load(sys.stdin, filename='-') + db = MultiIndex.load((lidx,), (None,), parent="-") return db @@ -360,7 +395,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): # but nothing else. for n, (desc, load_fn) in enumerate(_loader_functions): try: - debug_literal(f"_load_databases: trying loader fn {n} {desc}") + debug_literal(f"_load_databases: trying loader fn {n} '{desc}'") db = load_fn(filename, traverse_yield_all=traverse_yield_all, cache_size=cache_size) @@ -370,6 +405,7 @@ def _load_database(filename, traverse_yield_all, *, cache_size=None): if db is not None: loaded = True + debug_literal("_load_databases: success!") break # check to see if it's a FASTA/FASTQ record (i.e. screed loadable) @@ -523,6 +559,7 @@ def __exit__(self, type, value, traceback): return False + class FileOutputCSV(FileOutput): """A context manager for CSV file outputs. @@ -666,6 +703,50 @@ def load_many_signatures(locations, progress, *, yield_all_files=False, notify(f"loaded {len(progress)} signatures total, from {n_files} files") +def get_manifest(idx, *, require=True, rebuild=False): + """ + Retrieve a manifest for this idx, loaded with `load_file_as_index`. + + If a manifest exists and `rebuild` is False, return the manifest. + If a manifest does not exist or `rebuild` is True, try to build one. + If a manifest cannot be built and `require` is True, error exit. + + In the case where `require=False` and a manifest cannot be built, + may return None. Otherwise always returns a manifest. + """ + from sourmash.index import CollectionManifest + + m = idx.manifest + + # has one, and don't want to rebuild? easy! return! 
+ if m is not None and not rebuild: + debug_literal("get_manifest: found manifest") + return m + + debug_literal(f"get_manifest: no manifest found / rebuild={rebuild}") + + # CTB: CollectionManifest.create_manifest wants (ss, iloc). + # so this is an adaptor function! Might want to just change + # what `create_manifest` takes. + def manifest_iloc_iter(idx): + for (ss, loc, iloc) in idx._signatures_with_internal(): + yield ss, iloc + + # need to build one... + try: + m = CollectionManifest.create_manifest(manifest_iloc_iter(idx), + include_signature=False) + debug_literal("get_manifest: rebuilt manifest.") + except NotImplementedError: + if require: + error(f"ERROR: manifests cannot be generated for {idx.location}") + sys.exit(-1) + else: + debug_literal("get_manifest: cannot build manifest, not req'd") + return None + + return m + # # enum and classes for saving signatures progressively # diff --git a/tests/test-data/prot/dayhoff.sbt.zip b/tests/test-data/prot/dayhoff.sbt.zip index 9cb6ea8ca1..ca102815a9 100644 Binary files a/tests/test-data/prot/dayhoff.sbt.zip and b/tests/test-data/prot/dayhoff.sbt.zip differ diff --git a/tests/test-data/prot/hp.sbt.zip b/tests/test-data/prot/hp.sbt.zip index 9115c2f98b..d38aeb3fe5 100644 Binary files a/tests/test-data/prot/hp.sbt.zip and b/tests/test-data/prot/hp.sbt.zip differ diff --git a/tests/test-data/prot/protein.sbt.zip b/tests/test-data/prot/protein.sbt.zip index 845d41a676..03bfb0a4e9 100644 Binary files a/tests/test-data/prot/protein.sbt.zip and b/tests/test-data/prot/protein.sbt.zip differ diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index e8a824e788..4e4fc12d3c 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1210,6 +1210,7 @@ def test_sig_extract_1(runtmp): def test_sig_extract_1(runtmp): + # run sig extract with --from-file c = runtmp # extract 47 from 47... 
:) @@ -1245,6 +1246,26 @@ def test_sig_extract_2(c): assert actual_extract_sig == test_extract_sig +@utils.in_tempdir +def test_sig_extract_2_zipfile(c): + # extract matches to 47's md5sum from among several in a zipfile + all_zip = utils.get_test_data('prot/all.zip') + sig47 = utils.get_test_data('47.fa.sig') + + c.run_sourmash('sig', 'extract', all_zip, '--md5', '09a0869') + + # stdout should be new signature + out = c.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + print(test_extract_sig.minhash) + print(actual_extract_sig.minhash) + + assert actual_extract_sig == test_extract_sig + + @utils.in_tempdir def test_sig_extract_3(c): # extract nothing (no md5 match) @@ -1364,6 +1385,76 @@ def test_sig_extract_8_picklist_md5(runtmp): assert "extracted 1 signatures from 2 file(s)" in err assert "for given picklist, found 1 matches to 1 distinct values" in err + +def test_sig_extract_8_picklist_md5_zipfile(runtmp): + # extract 47 from a zipfile, using a picklist w/full md5 + allzip = utils.get_test_data('prot/all.zip') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='09a08691ce52952152f0e866a59f6261', + md5short='09a08691ce5295215', + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5full:md5" + runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg) + + # stdout should be new signature + out = runtmp.last_result.out + + test_extract_sig = sourmash.load_one_signature(sig47) + actual_extract_sig = sourmash.load_one_signature(out) + + assert actual_extract_sig == test_extract_sig + + err = runtmp.last_result.err + + print(err) + assert "loaded 1 distinct values into picklist." in err + assert "loaded 1 total that matched ksize & molecule type" in err + assert "extracted 1 signatures from 1 file(s)" in err + assert "for given picklist, found 1 matches to 1 distinct values" in err + + +def test_sig_extract_8_picklist_md5_lca(runtmp): + # extract 47 from an LCA database, using a picklist w/full md5 + allzip = utils.get_test_data('lca/47+63.lca.json') + sig47 = utils.get_test_data('47.fa.sig') + sig63 = utils.get_test_data('63.fa.sig') + + # select on any of these attributes + row = dict(exactName='NC_009665.1 Shewanella baltica OS185, complete genome', + md5full='50a9274021e43eda8b2e77f8fa60ae8e', + md5short='50a9274021e43eda8b2e77f8fa60ae8e'[:8], + fullIdent='NC_009665.1', + nodotIdent='NC_009665') + + # make picklist + picklist_csv = runtmp.output('pick.csv') + with open(picklist_csv, 'w', newline='') as csvfp: + w = csv.DictWriter(csvfp, fieldnames=row.keys()) + w.writeheader() + w.writerow(row) + + picklist_arg = f"{picklist_csv}:md5full:md5" + with pytest.raises(SourmashCommandFailed) as exc: + runtmp.sourmash('sig', 'extract', allzip, '--picklist', picklist_arg) + + # this happens b/c the implementation of 'extract' uses picklists. + print(runtmp.last_result.err) + assert "This input collection doesn't support 'extract' with picklists." 
in runtmp.last_result.err + + def test_sig_extract_8_picklist_md5_include(runtmp): # extract 47 from 47, using a picklist w/full md5:: explicit include sig47 = utils.get_test_data('47.fa.sig') @@ -2971,16 +3062,17 @@ def test_sig_manifest_3_sbt(runtmp): def test_sig_manifest_4_lca(runtmp): # make a manifest from a .lca.json file sigfile = utils.get_test_data('prot/protein.lca.json.gz') - with pytest.raises(SourmashCommandFailed): - runtmp.sourmash('sig', 'manifest', sigfile, '-o', - 'SOURMASH-MANIFEST.csv') + runtmp.sourmash('sig', 'manifest', sigfile, '-o', + 'SOURMASH-MANIFEST.csv') - status = runtmp.last_result.status - out = runtmp.last_result.out - err = runtmp.last_result.err + manifest_fn = runtmp.output('SOURMASH-MANIFEST.csv') + with open(manifest_fn, newline='') as csvfp: + manifest = CollectionManifest.load_from_csv(csvfp) - assert status != 0 - assert "ERROR: manifests cannot be generated for this file." in err + assert len(manifest) == 2 + md5_list = [ row['md5'] for row in manifest.rows ] + assert '16869d2c8a1d29d1c8e56f5c561e585e' in md5_list + assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list def test_sig_manifest_5_dir(runtmp): @@ -3027,6 +3119,58 @@ def test_sig_manifest_6_pathlist(runtmp): assert '120d311cc785cc9d0df9dc0646b2b857' in md5_list +def test_sig_manifest_does_not_exist(runtmp): + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash('sig', 'manifest', 'does-not-exist', + '-o', 'out.csv') + + assert "Cannot open 'does-not-exist'." in runtmp.last_result.err + + +def test_sig_manifest_7_allzip_1(runtmp): + # the rebuilt manifest w/o '-f' will miss dna-sig.noext + allzip = utils.get_test_data('prot/all.zip') + runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv') + + manifest_fn = runtmp.output('xyz.csv') + with open(manifest_fn, newline='') as csvfp: + manifest = CollectionManifest.load_from_csv(csvfp) + + assert len(manifest) == 7 + filenames = set( row['internal_location'] for row in manifest.rows ) + assert 'dna-sig.noext' not in filenames + + +def test_sig_manifest_7_allzip_2(runtmp): + # the rebuilt manifest w/ '-f' will contain dna-sig.noext + allzip = utils.get_test_data('prot/all.zip') + runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv', '-f') + + manifest_fn = runtmp.output('xyz.csv') + with open(manifest_fn, newline='') as csvfp: + manifest = CollectionManifest.load_from_csv(csvfp) + + assert len(manifest) == 8 + filenames = set( row['internal_location'] for row in manifest.rows ) + assert 'dna-sig.noext' in filenames + + +def test_sig_manifest_7_allzip_3(runtmp): + # the existing manifest contains 'dna-sig.noext' whther or not -f is + # used. + allzip = utils.get_test_data('prot/all.zip') + runtmp.sourmash('sig', 'manifest', allzip, '-o', 'xyz.csv', + '--no-rebuild') + + manifest_fn = runtmp.output('xyz.csv') + with open(manifest_fn, newline='') as csvfp: + manifest = CollectionManifest.load_from_csv(csvfp) + + assert len(manifest) == 8 + filenames = set( row['internal_location'] for row in manifest.rows ) + assert 'dna-sig.noext' in filenames + + def test_sig_kmers_1_dna(runtmp): # test sig kmers on dna seqfile = utils.get_test_data('short.fa') diff --git a/tests/test_cmd_signature_fileinfo.py b/tests/test_cmd_signature_fileinfo.py new file mode 100644 index 0000000000..fc94db9d64 --- /dev/null +++ b/tests/test_cmd_signature_fileinfo.py @@ -0,0 +1,314 @@ +""" +Tests for the 'sourmash signature fileinfo' command line. 
+""" +import csv +import shutil +import os +import glob + +import pytest +import screed +import json + +import sourmash_tst_utils as utils +import sourmash +from sourmash.signature import load_signatures +from sourmash.manifest import CollectionManifest +from sourmash_tst_utils import SourmashCommandFailed + +## command line tests + + +def test_fileinfo_1_sig(runtmp): + # get basic info on a signature + sig47 = utils.get_test_data('47.fa.sig') + + shutil.copyfile(sig47, runtmp.output('sig47.sig')) + runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: MultiIndex +location: sig47.sig +is database? no +has manifest? yes +num signatures: 1 +total hashes: 5177 +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000 5177 +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_1_sig_abund(runtmp): + # get basic info on a signature with abundance + sig47 = utils.get_test_data('47.abunds.fa.sig') + + shutil.copyfile(sig47, runtmp.output('sig47.sig')) + runtmp.run_sourmash('sig', 'fileinfo', 'sig47.sig') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: MultiIndex +location: sig47.sig +is database? no +has manifest? yes +num signatures: 1 +5177 total hashes +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000, abund 5177 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_2_lca(runtmp): + # get basic info on an LCA database + prot = utils.get_test_data('prot/protein.lca.json.gz') + + shutil.copyfile(prot, runtmp.output('protein.lca.json.gz')) + runtmp.run_sourmash('sig', 'fileinfo', 'protein.lca.json.gz') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: LCA_Database +location: protein.lca.json.gz +is database? yes +has manifest? no +num signatures: 2 +total hashes: 8214 +summary of sketches: + 2 sketches with protein, k=19, scaled=100 8214 +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_3_sbt_zip(runtmp): + # test on an SBT.zip + prot = utils.get_test_data('prot/protein.sbt.zip') + + shutil.copyfile(prot, runtmp.output('protein.sbt.zip')) + runtmp.run_sourmash('sig', 'fileinfo', 'protein.sbt.zip') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: SBT +location: protein.sbt.zip +is database? yes +has manifest? yes +num signatures: 3 +total hashes: 8214 +summary of sketches: + 2 sketches with protein, k=19, scaled=100 8214 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out, line.strip() + + +def test_fileinfo_4_zip(runtmp): + # test on a ZipFileLinearIndex + prot = utils.get_test_data('prot/all.zip') + + shutil.copyfile(prot, runtmp.output('all.zip')) + runtmp.run_sourmash('sig', 'fileinfo', 'all.zip') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + # 'location' will be fully resolved, ignore it for now + expected_output = f"""\ +path filetype: ZipFileLinearIndex +is database? yes +has manifest? 
yes +num signatures: 8 +total hashes: 31758 +summary of sketches: + 2 sketches with dayhoff, k=19, scaled=100 7945 total hashes + 2 sketches with hp, k=19, scaled=100 5184 total hashes + 2 sketches with protein, k=19, scaled=100 8214 total hashes + 2 sketches with DNA, k=31, scaled=1000 10415 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_4_zip_json_out(runtmp): + # check --json-out + prot = utils.get_test_data('prot/all.zip') + + shutil.copyfile(prot, runtmp.output('all.zip')) + runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--json-out') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + # should succeed as loading as JSON, with correct info + vals = json.loads(out) + + assert vals['has_manifest'] + assert vals['is_database'] + assert vals['num_sketches'] == 8 + assert vals['path_filetype'] == 'ZipFileLinearIndex' + assert vals['total_hashes'] == 31758 + + d1 = {'ksize': 19, 'moltype': 'dayhoff', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 7945} + d2 = {'ksize': 19, 'moltype': 'hp', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 5184} + d3 = {'ksize': 19, 'moltype': 'protein', 'scaled': 100, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 8214} + d4 = {'ksize': 31, 'moltype': 'DNA', 'scaled': 1000, 'num': 0, 'abund': False, 'count': 2, 'n_hashes': 10415} + + assert d1 in vals['sketch_info'] + assert d2 in vals['sketch_info'] + assert d3 in vals['sketch_info'] + assert d4 in vals['sketch_info'] + assert len(vals['sketch_info']) == 4 + + +def test_fileinfo_4_zip_rebuild(runtmp): + # test --rebuild + prot = utils.get_test_data('prot/all.zip') + + shutil.copyfile(prot, runtmp.output('all.zip')) + runtmp.run_sourmash('sig', 'fileinfo', 'all.zip', '--rebuild') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + # 'location' will be fully resolved, ignore it for now + # CTB: note we're missing one of the 8 in the rebuilt, dna-sig.noext, + # because it is not automatically included unless you load the zipfile + # with traverse. This is intentional. + expected_output = f"""\ +path filetype: ZipFileLinearIndex +is database? yes +has manifest? yes +num signatures: 8 +total hashes: 26581 +summary of sketches: + 2 sketches with dayhoff, k=19, scaled=100 7945 total hashes + 2 sketches with hp, k=19, scaled=100 5184 total hashes + 2 sketches with protein, k=19, scaled=100 8214 total hashes + 1 sketches with DNA, k=31, scaled=1000 5238 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_5_dir(runtmp): + # test on a directory + sig47 = utils.get_test_data('47.fa.sig') + + os.mkdir(runtmp.output('subdir')) + + shutil.copyfile(sig47, runtmp.output('subdir/sig47.sig')) + runtmp.run_sourmash('sig', 'fileinfo', 'subdir/') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: MultiIndex +location: subdir/ +is database? no +has manifest? 
yes +num signatures: 1 +total hashes: 5177 +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000 5177 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +def test_fileinfo_6_pathlist(runtmp): + # test on a pathlist + sig47 = utils.get_test_data('47.fa.sig') + shutil.copyfile(sig47, runtmp.output("47.fa.sig")) + + with open(runtmp.output('pathlist.txt'), 'wt') as fp: + fp.write("47.fa.sig\n") + + runtmp.run_sourmash('sig', 'fileinfo', 'pathlist.txt') + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = """\ +path filetype: MultiIndex +location: pathlist.txt +is database? no +has manifest? yes +num signatures: 1 +total hashes: 5177 +summary of sketches: + 1 sketches with DNA, k=31, scaled=1000 5177 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out + + +@pytest.mark.parametrize("db", ['v6.sbt.json', 'v5.sbt.json', 'v4.sbt.json', + 'v3.sbt.json', 'v2.sbt.json', 'v1.sbt.json']) +def test_fileinfo_7_sbt_json(runtmp, db): + # test on multiple versions of SBT JSON files + dbfile = utils.get_test_data(db) + + runtmp.run_sourmash('sig', 'fileinfo', dbfile) + + out = runtmp.last_result.out + print(runtmp.last_result.out) + + expected_output = f"""\ +path filetype: SBT +location: {dbfile} +is database? yes +has manifest? no +num signatures: 13 +total hashes: 3500 +summary of sketches: + 7 sketches with DNA, k=31, num=500 3500 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out, line.strip() + + +def test_sig_fileinfo_stdin(runtmp): + # test on stdin + sig = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig') + with open(sig, 'rt') as fp: + data = fp.read() + + runtmp.run_sourmash('sig', 'fileinfo', '-', stdin_data=data) + + out = runtmp.last_result.out + print(out) + + expected_output = """\ +path filetype: MultiIndex +location: - +is database? no +has manifest? yes +num signatures: 1 +total hashes: 3409 +summary of sketches: + 1 sketches with protein, k=19, scaled=100 3409 total hashes +""".splitlines() + for line in expected_output: + assert line.strip() in out, line.strip() + + +def test_sig_fileinfo_does_not_exist(runtmp): + # test on file that does not exist + with pytest.raises(SourmashCommandFailed): + runtmp.run_sourmash('sig', 'fileinfo', 'does-not-exist') + + assert "Cannot open 'does-not-exist'." 
in runtmp.last_result.err diff --git a/tests/test_index.py b/tests/test_index.py index 31d1a0ac18..ea9cc01630 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -944,6 +944,22 @@ def test_zipfile_API_signatures_traverse_yield_all_select(use_manifest): assert len(zipidx) == 2 +def test_zipfile_API_signatures_traverse_yield_all_manifest(): + # check that manifest len is correct + zipfile_db = utils.get_test_data('prot/all.zip') + + zipidx = ZipFileLinearIndex.load(zipfile_db, traverse_yield_all=True, + use_manifest=True) + assert len(zipidx) == 8, len(zipidx) + assert len(zipidx.manifest) == 8, len(zipidx.manifest) + + zipidx = zipidx.select(moltype='DNA') + siglist = list(zipidx.signatures()) + assert len(siglist) == 2 + assert len(zipidx) == 2 + assert len(zipidx.manifest) == 2 + + def test_zipfile_API_signatures_select(use_manifest): # include dna-sig.noext zipfile_db = utils.get_test_data('prot/all.zip') @@ -1124,7 +1140,8 @@ def test_multi_index_search(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], + None) lidx = lidx.select(ksize=31) # now, search for sig2 @@ -1177,7 +1194,8 @@ def test_multi_index_gather(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], + None) lidx = lidx.select(ksize=31) matches = lidx.gather(ss2) @@ -1206,7 +1224,8 @@ def test_multi_index_signatures(): lidx3 = LinearIndex.load(sig63) # create MultiIndex with source location override - lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C']) + lidx = MultiIndex.load([lidx1, lidx2, lidx3], ['A', None, 'C'], + None) lidx = lidx.select(ksize=31) siglist = list(lidx.signatures()) @@ -1216,11 +1235,23 @@ def test_multi_index_signatures(): assert ss63 in siglist -def test_multi_index_load_from_path(): +def test_multi_index_create(): + mi = MultiIndex(None, None, prepend_location=False) + assert len(mi) == 0 + + +def test_multi_index_create_prepend(): + with pytest.raises(ValueError): + mi = MultiIndex(None, None, prepend_location=True) + + +def test_multi_index_load_from_directory(): # test MultiIndex loading from a directory. The full paths to the # signature files should be available via 'signatures_with_location()' dirname = utils.get_test_data('prot/protein') - mi = MultiIndex.load_from_path(dirname, force=False) + mi = MultiIndex.load_from_directory(dirname, force=False) + + assert mi.location == dirname sigs = list(mi.signatures()) assert len(sigs) == 2 @@ -1245,17 +1276,17 @@ def test_multi_index_load_from_path(): assert endings[1] in ilocs, ilocs -def test_multi_index_load_from_path_2(): +def test_multi_index_load_from_directory_2(): # only load .sig files, currently; not the databases under that directory. 
dirname = utils.get_test_data('prot') - mi = MultiIndex.load_from_path(dirname, force=False) + mi = MultiIndex.load_from_directory(dirname, force=False) sigs = list(mi.signatures()) assert len(sigs) == 7 @utils.in_tempdir -def test_multi_index_load_from_path_3(c): +def test_multi_index_load_from_directory_3(c): # check that force works ok on a directory dirname = utils.get_test_data('prot') @@ -1269,11 +1300,11 @@ def test_multi_index_load_from_path_3(c): count += 1 with pytest.raises(sourmash.exceptions.SourmashError): - mi = MultiIndex.load_from_path(c.location, force=False) + mi = MultiIndex.load_from_directory(c.location, force=False) @utils.in_tempdir -def test_multi_index_load_from_path_3_yield_all_true(c): +def test_multi_index_load_from_directory_3_yield_all_true(c): # check that force works ok on a directory w/force=True dirname = utils.get_test_data('prot') @@ -1286,14 +1317,14 @@ def test_multi_index_load_from_path_3_yield_all_true(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_path(c.location, force=True) + mi = MultiIndex.load_from_directory(c.location, force=True) sigs = list(mi.signatures()) assert len(sigs) == 8 @utils.in_tempdir -def test_multi_index_load_from_path_3_yield_all_true_subdir(c): +def test_multi_index_load_from_directory_3_yield_all_true_subdir(c): # check that force works ok on subdirectories dirname = utils.get_test_data('prot') @@ -1309,14 +1340,14 @@ def test_multi_index_load_from_path_3_yield_all_true_subdir(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_path(c.location, force=True) + mi = MultiIndex.load_from_directory(c.location, force=True) sigs = list(mi.signatures()) assert len(sigs) == 8 @utils.in_tempdir -def test_multi_index_load_from_path_3_sig_gz(c): +def test_multi_index_load_from_directory_3_sig_gz(c): # check that we find .sig.gz files, too dirname = utils.get_test_data('prot') @@ -1331,14 +1362,16 @@ def test_multi_index_load_from_path_3_sig_gz(c): shutil.copyfile(fullname, copyto) count += 1 - mi = MultiIndex.load_from_path(c.location, force=False) + mi = MultiIndex.load_from_directory(c.location, force=False) + + assert mi.location == c.location sigs = list(mi.signatures()) assert len(sigs) == 6 @utils.in_tempdir -def test_multi_index_load_from_path_3_check_traverse_fn(c): +def test_multi_index_load_from_directory_3_check_traverse_fn(c): # test the actual traverse function... eventually this test can be # removed, probably, as we consolidate functionality and test MultiIndex # better. @@ -1350,10 +1383,24 @@ def test_multi_index_load_from_path_3_check_traverse_fn(c): assert len(files) == 20, files # if this fails, check for extra files! 
-def test_multi_index_load_from_path_no_exist(): +def test_multi_index_load_from_directory_no_exist(): dirname = utils.get_test_data('does-not-exist') with pytest.raises(ValueError): - mi = MultiIndex.load_from_path(dirname, force=True) + mi = MultiIndex.load_from_directory(dirname, force=True) + + +def test_multi_index_load_from_file_path(): + sig2 = utils.get_test_data('2.fa.sig') + + mi = MultiIndex.load_from_path(sig2) + assert len(mi) == 3 + assert mi.location == sig2 + + +def test_multi_index_load_from_file_path_no_exist(): + filename = utils.get_test_data('does-not-exist') + with pytest.raises(ValueError): + mi = MultiIndex.load_from_directory(filename, force=True) def test_multi_index_load_from_pathlist_no_exist(): @@ -1377,6 +1424,8 @@ def test_multi_index_load_from_pathlist_1(c): sigs = list(mi.signatures()) assert len(sigs) == 7 + assert mi.location == file_list + @utils.in_tempdir def test_multi_index_load_from_pathlist_2(c): @@ -2251,7 +2300,7 @@ def test_lazy_index_wraps_multi_index_location(): db_paths = (sigdir, sigzip, siglca, sigsbt) dbs = [ sourmash.load_file_as_index(db_path) for db_path in db_paths ] - mi = MultiIndex.load(dbs, db_paths) + mi = MultiIndex.load(dbs, db_paths, None) lazy = LazyLinearIndex(mi) mi2 = mi.select(moltype='protein') diff --git a/tests/test_lca.py b/tests/test_lca.py index 2467914b13..03a4ff6650 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -24,9 +24,15 @@ def test_api_create_search(): ksize=31) lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) + assert len(lca_db) == 0 + assert not lca_db + count = lca_db.insert(ss) assert count == len(ss.minhash) + assert len(lca_db) == 1 + assert lca_db + results = lca_db.search(ss, threshold=0.0) print(results) assert len(results) == 1 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 58ddda7d2c..b8fcd8eed4 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -3576,7 +3576,7 @@ def test_gather_metagenome_traverse_check_csv(runtmp, linear_gather, prefetch_ga r = csv.DictReader(fp) for row in r: filename = row['filename'] - assert filename.startswith(copy_testdata) + assert filename.startswith(copy_testdata), filename # should have full path to file sig was loaded from assert len(filename) > prefix_len @@ -4245,7 +4245,7 @@ def test_sbt_categorize(runtmp): out_csv = open(runtmp.output('out.csv')).read() print(out_csv) - assert './4.sig,genome-s10+s11,genome-s10,0.504' in out_csv + assert '4.sig,genome-s10+s11,genome-s10,0.504' in out_csv def test_sbt_categorize_ignore_abundance_1(runtmp): diff --git a/tests/test_sourmash_args.py b/tests/test_sourmash_args.py index 7b10fd2ea6..aea49a3d0b 100644 --- a/tests/test_sourmash_args.py +++ b/tests/test_sourmash_args.py @@ -12,6 +12,7 @@ import sourmash_tst_utils as utils import sourmash from sourmash import sourmash_args, manifest +from sourmash.index import LinearIndex def test_save_signatures_api_none(): @@ -404,3 +405,90 @@ def test_load_many_sigs_empty_file_force(runtmp): print(err) assert f"ERROR: Error while reading signatures from '{outloc}'." 
in err assert "(continuing)" in err + + +def test_get_manifest_1(): + # basic get_manifest retrieves a manifest + sig47 = utils.get_test_data('47.fa.sig') + idx = sourmash.load_file_as_index(sig47) + + manifest = sourmash_args.get_manifest(idx) + assert len(manifest) == 1 + + +def test_get_manifest_2_cannot_build(): + # test that get_manifest errors out when it cannot build a manifest + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47) + + idx = LinearIndex([ss47]) + + with pytest.raises(SystemExit) as exc: + m = sourmash_args.get_manifest(idx) + + +def test_get_manifest_2_cannot_build_no_require(): + # test get_manifest with require=False when it cannot build a manifest + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47) + + idx = LinearIndex([ss47]) + + m = sourmash_args.get_manifest(idx, require=False) + + assert m is None + + +def test_get_manifest_3_build(): + # check that a manifest is built when the index doesn't have one + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47) + + class FakeIndex(LinearIndex): + was_called = 0 + def _signatures_with_internal(self): + self.was_called = 1 + return [(ss47, "fakeloc", "fakeiloc")] + + idx = FakeIndex([ss47]) + + assert not idx.was_called + m = sourmash_args.get_manifest(idx) + assert idx.was_called + + print(m) + assert len(m) == 1 + assert m.rows[0]['internal_location'] == "fakeiloc" + + +def test_get_manifest_3_build_2(): + # check that an existing manifest is reused, and rebuilt only on request + sig47 = utils.get_test_data('47.fa.sig') + ss47 = sourmash.load_one_signature(sig47) + + class FakeIndex(LinearIndex): + manifest = None + was_called = 0 + + def _signatures_with_internal(self): + self.was_called = 1 + return [(ss47, "fakeloc", "fakeiloc")] + + idx = FakeIndex([ss47]) + + assert not idx.was_called + m = sourmash_args.get_manifest(idx) + assert idx.was_called + + # now set and ask again, should not be called + idx.manifest = m + idx.was_called = 0 + + m2 = sourmash_args.get_manifest(idx) + assert not idx.was_called + assert m == m2 + + # now, force rebuild + m3 = sourmash_args.get_manifest(idx, rebuild=True) + assert idx.was_called + assert m == m3
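
As a point of reference for the `--json-out` flag added above, here is a minimal sketch of consuming `sourmash sig fileinfo --json-out` from a script. It assumes `sourmash` is on `PATH`, uses the hypothetical collection path `collection.zip`, and relies only on the JSON keys asserted in `test_fileinfo_4_zip_json_out`.

```
# Sketch: parse the machine-readable output of `sig fileinfo --json-out`.
# 'collection.zip' is a placeholder path for any sourmash collection.
import json
import subprocess

proc = subprocess.run(
    ["sourmash", "sig", "fileinfo", "collection.zip", "--json-out"],
    capture_output=True, text=True, check=True)

info = json.loads(proc.stdout)
print(info["path_filetype"], info["num_sketches"], info["total_hashes"])

# one entry per distinct sketch type (moltype/ksize/scaled/num/abund)
for sk in info["sketch_info"]:
    print(sk["count"], sk["moltype"], sk["ksize"], sk["n_hashes"])
```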
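
Similarly, a small sketch of how the new `sourmash_args.get_manifest()` helper can be used from the Python API; `collection.zip` is again a hypothetical placeholder path, and the manifest column names come from the manifest handling elsewhere in this patch.

```
# Sketch: retrieve-or-build a manifest for a collection via the new helper.
import sourmash
from sourmash import sourmash_args

idx = sourmash.load_file_as_index("collection.zip")  # placeholder path

# returns the existing manifest if present; otherwise builds one.
# with require=False, returns None when a manifest cannot be generated.
m = sourmash_args.get_manifest(idx, require=False, rebuild=False)

if m is not None:
    print(f"{len(m)} signatures in manifest")
    for row in m.rows:
        print(row["md5"], row["moltype"], row["ksize"], row["n_hashes"])
```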