From 883732cafa560fef6ac8b66d9e5b3777729209e8 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 11:58:06 -0700 Subject: [PATCH 01/50] move sourmash._minhash to sourmash.minhash --- benchmarks/benchmarks.py | 5 +---- doc/conf.py | 2 +- doc/developer.md | 2 +- sourmash/__init__.py | 2 +- sourmash/cli/compute.py | 2 +- sourmash/lca/lca_db.py | 2 +- sourmash/{_minhash.py => minhash.py} | 0 sourmash/nodegraph.py | 2 +- sourmash/search.py | 2 +- sourmash/sig/__main__.py | 6 +++--- sourmash/signature.py | 2 +- tests/test__minhash.py | 2 +- tests/test__minhash_hypothesis.py | 2 +- tests/test_rustobj.py | 2 +- 14 files changed, 15 insertions(+), 18 deletions(-) rename sourmash/{_minhash.py => minhash.py} (100%) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index cb2ae91ddf..d9bdfaf6a9 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -2,10 +2,7 @@ import random -try: - from sourmash._minhash import MinHash -except: - from sourmash.minhash import MinHash +from sourmash.minhash import MinHash def load_sequences(): diff --git a/doc/conf.py b/doc/conf.py index 87dbf3ad88..802fadc71b 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -297,4 +297,4 @@ # If true, do not generate a @detailmenu in the "Top" node's menu. #texinfo_no_detailmenu = False -autodoc_mock_imports = ["sourmash._minhash"] +autodoc_mock_imports = ["sourmash.minhash"] diff --git a/doc/developer.md b/doc/developer.md index 35aec5345b..561f6b8857 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -34,7 +34,7 @@ run the Rust tests. ### If you're having trouble installing or using the development environment -If you are getting an error that contains `ImportError: cannot import name 'to_bytes' from 'sourmash._minhash'`, then it's likely you need to update Rust and clean up your environment. 
Some installation issues can be solved by simply removing the intermediate build files with: +If you are getting an error that contains `ImportError: cannot import name 'to_bytes' from 'sourmash.minhash'`, then it's likely you need to update Rust and clean up your environment. Some installation issues can be solved by simply removing the intermediate build files with: ``` make clean diff --git a/sourmash/__init__.py b/sourmash/__init__.py index dca58d86d2..e50b7f7f42 100644 --- a/sourmash/__init__.py +++ b/sourmash/__init__.py @@ -25,7 +25,7 @@ "use the PyPI ones." ) -from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash +from .minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash DEFAULT_SEED = get_minhash_default_seed() MAX_HASH = get_minhash_max_hash() diff --git a/sourmash/cli/compute.py b/sourmash/cli/compute.py index cdfcfa7645..d5e959e0a5 100644 --- a/sourmash/cli/compute.py +++ b/sourmash/cli/compute.py @@ -28,7 +28,7 @@ from argparse import FileType -from sourmash._minhash import get_minhash_default_seed +from sourmash.minhash import get_minhash_default_seed from sourmash.cli.utils import add_construct_moltype_args diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py index 6c9ecea3cd..1430637090 100644 --- a/sourmash/lca/lca_db.py +++ b/sourmash/lca/lca_db.py @@ -7,7 +7,7 @@ import functools import sourmash -from sourmash._minhash import get_max_hash_for_scaled +from sourmash.minhash import get_max_hash_for_scaled from sourmash.logging import notify, error, debug from sourmash.index import Index diff --git a/sourmash/_minhash.py b/sourmash/minhash.py similarity index 100% rename from sourmash/_minhash.py rename to sourmash/minhash.py diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py index c865a3c7c3..ec0165dca4 100644 --- a/sourmash/nodegraph.py +++ b/sourmash/nodegraph.py @@ -7,7 +7,7 @@ from ._compat import string_types, range_type from ._lowlevel import ffi, lib -from ._minhash import to_bytes, 
MinHash +from .minhash import to_bytes, MinHash from .utils import RustObject, rustcall, decode_str from .exceptions import SourmashError diff --git a/sourmash/search.py b/sourmash/search.py index ad2da3a92b..02424e719d 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -4,7 +4,7 @@ from .logging import notify, error from .signature import SourmashSignature -from ._minhash import get_max_hash_for_scaled +from .minhash import get_max_hash_for_scaled # generic SearchResult. diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 0d1cd0a258..938833df3a 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -12,9 +12,9 @@ import copy from sourmash.sourmash_args import FileOutput -from ..logging import set_quiet, error, notify, set_quiet, print_results, debug -from .. import sourmash_args -from .._minhash import get_max_hash_for_scaled +from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug +from sourmash import sourmash_args +from sourmash.minhash import get_max_hash_for_scaled usage=''' sourmash signature [] - manipulate/work with signature files. diff --git a/sourmash/signature.py b/sourmash/signature.py index 4bcd9293cc..e6d5d50d84 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -11,7 +11,7 @@ from .logging import error from . 
import MinHash -from ._minhash import to_bytes +from .minhash import to_bytes from ._lowlevel import ffi, lib from .utils import RustObject, rustcall, decode_str from ._compat import PY2 diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 866ceb1769..ccfb6f5fd5 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -43,7 +43,7 @@ import pytest import sourmash -from sourmash._minhash import ( +from sourmash.minhash import ( MinHash, hash_murmur, get_scaled_for_max_hash, diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py index f3a1446907..6055e1d870 100644 --- a/tests/test__minhash_hypothesis.py +++ b/tests/test__minhash_hypothesis.py @@ -4,7 +4,7 @@ import hypothesis.strategies as st from sourmash import MinHash -from sourmash._minhash import get_max_hash_for_scaled +from sourmash.minhash import get_max_hash_for_scaled @given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), diff --git a/tests/test_rustobj.py b/tests/test_rustobj.py index 4be7a0e1ee..2b9b8e6877 100644 --- a/tests/test_rustobj.py +++ b/tests/test_rustobj.py @@ -1,7 +1,7 @@ import pytest from sourmash.utils import RustObject -from sourmash._minhash import to_bytes +from sourmash.minhash import to_bytes def test_rustobj_init(): From 80f9bef51c5c11f8797e253f4ca9a04c957aaae7 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sat, 25 Jul 2020 12:23:44 -0700 Subject: [PATCH 02/50] deprecate max_hash throughout --- sourmash/lca/lca_db.py | 4 +- sourmash/minhash.py | 47 +++++------ sourmash/search.py | 4 +- sourmash/sig/__main__.py | 6 +- tests/test__minhash.py | 128 ++++++++++++++---------------- tests/test__minhash_hypothesis.py | 4 +- 6 files changed, 88 insertions(+), 105 deletions(-) diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py index 1430637090..3931813e19 100644 --- a/sourmash/lca/lca_db.py +++ b/sourmash/lca/lca_db.py @@ -7,7 +7,7 @@ import functools import sourmash -from sourmash.minhash import get_max_hash_for_scaled +from sourmash.minhash import _get_max_hash_for_scaled from sourmash.logging import notify, error, debug from sourmash.index import Index @@ -369,7 +369,7 @@ def downsample_scaled(self, scaled): self._invalidate_cache() - max_hash = get_max_hash_for_scaled(scaled) + max_hash = _get_max_hash_for_scaled(scaled) # filter out all hashes over max_hash in value. new_hashvals = {} diff --git a/sourmash/minhash.py b/sourmash/minhash.py index eee206db88..b624c6c24f 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -30,7 +30,7 @@ def get_minhash_max_hash(): return MINHASH_MAX_HASH -def get_max_hash_for_scaled(scaled): +def _get_max_hash_for_scaled(scaled): "Convert a 'scaled' value into a 'max_hash' value." if scaled == 0: return 0 @@ -40,7 +40,7 @@ def get_max_hash_for_scaled(scaled): return int(round(get_minhash_max_hash() / scaled, 0)) -def get_scaled_for_max_hash(max_hash): +def _get_scaled_for_max_hash(max_hash): "Convert a 'max_hash' value into a 'scaled' value." if max_hash == 0: return 0 @@ -130,24 +130,24 @@ def __init__( * track_abundance (default False) - track hash multiplicity * mins (default None) - list of hashvals, or (hashval, abund) pairs * seed (default 42) - murmurhash seed - - Deprecated: @CTB - * ``max_hash=``; use ``scaled`` instead. 
""" - if max_hash and scaled: - raise ValueError("cannot set both max_hash and scaled") - elif scaled: - max_hash = get_max_hash_for_scaled(scaled) + # support max_hash in constructor, for now. + if max_hash: + if scaled: + raise ValueError("cannot set both max_hash and scaled") + scaled = _get_scaled_for_max_hash(max_hash) - if max_hash and n: + if scaled and n: raise ValueError("cannot set both n and max_hash") - if not n and not (max_hash or scaled): + if not n and not scaled: raise ValueError("cannot omit both n and scaled") if dayhoff or hp: is_protein = False + # ok, for Rust API, go from scaled back to max_hash + max_hash = _get_max_hash_for_scaled(scaled) self._objptr = lib.kmerminhash_new( n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance ) @@ -313,10 +313,14 @@ def seed(self): def num(self): return self._methodcall(lib.kmerminhash_num) + @property + def max_hash(self): + return self._methodcall(lib.kmerminhash_max_hash) + @property def scaled(self): if self.max_hash: - return get_scaled_for_max_hash(self.max_hash) + return _get_scaled_for_max_hash(self.max_hash) return 0 @property @@ -339,10 +343,6 @@ def hp(self): def ksize(self): return self._methodcall(lib.kmerminhash_ksize) - @property - def max_hash(self): - return self._methodcall(lib.kmerminhash_max_hash) - @property def track_abundance(self): return self._methodcall(lib.kmerminhash_track_abundance) @@ -410,17 +410,6 @@ def downsample_n(self, new_num): return a - def downsample_max_hash(self, *others): - """Copy this object and downsample new object to min of ``*others``. - - Here, ``*others`` is one or more MinHash objects. - """ - max_hashes = [x.max_hash for x in others] - new_max_hash = min(self.max_hash, *max_hashes) - new_scaled = get_scaled_for_max_hash(new_max_hash) - - return self.downsample_scaled(new_scaled) - def downsample_scaled(self, new_scaled): """Copy this object and downsample new object to scaled=``new_scaled``. 
""" @@ -431,7 +420,7 @@ def downsample_scaled(self, new_scaled): if max_hash is None: raise ValueError("no max_hash available - cannot downsample") - old_scaled = get_scaled_for_max_hash(self.max_hash) + old_scaled = _get_scaled_for_max_hash(self.max_hash) if old_scaled > new_scaled: raise ValueError( "new scaled {} is lower than current sample scaled {}".format( @@ -439,7 +428,7 @@ def downsample_scaled(self, new_scaled): ) ) - new_max_hash = get_max_hash_for_scaled(new_scaled) + new_max_hash = _get_max_hash_for_scaled(new_scaled) a = MinHash( 0, diff --git a/sourmash/search.py b/sourmash/search.py index 02424e719d..be7cdc1cfe 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -4,7 +4,7 @@ from .logging import notify, error from .signature import SourmashSignature -from .minhash import get_max_hash_for_scaled +from .minhash import _get_max_hash_for_scaled # generic SearchResult. @@ -154,7 +154,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): # eliminate mins under this new resolution. # (CTB note: this means that if a high scaled/low res signature is # found early on, resolution will be low from then on.) - new_max_hash = get_max_hash_for_scaled(cmp_scaled) + new_max_hash = _get_max_hash_for_scaled(cmp_scaled) query_mins = set(_filter_max_hash(query_mins, new_max_hash)) found_mins = set(_filter_max_hash(found_mins, new_max_hash)) orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash)) diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 938833df3a..8f25aa46a9 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -14,7 +14,7 @@ from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug from sourmash import sourmash_args -from sourmash.minhash import get_max_hash_for_scaled +from sourmash.minhash import _get_max_hash_for_scaled usage=''' sourmash signature [] - manipulate/work with signature files. 
@@ -53,7 +53,7 @@ def _set_num_scaled(mh, num, scaled): # Number of hashes is 0th parameter mh_params[0] = num # Scale is 8th parameter - mh_params[8] = get_max_hash_for_scaled(scaled) + mh_params[8] = _get_max_hash_for_scaled(scaled) mh.__setstate__(mh_params) assert mh.num == num assert mh.scaled == scaled @@ -730,7 +730,7 @@ def downsample(args): mh_new = mh.downsample_scaled(args.scaled) else: # try to turn a num into a scaled # first check: can we? - max_hash = get_max_hash_for_scaled(args.scaled) + max_hash = _get_max_hash_for_scaled(args.scaled) mins = mh.get_mins() if max(mins) < max_hash: raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") diff --git a/tests/test__minhash.py b/tests/test__minhash.py index ccfb6f5fd5..1ec34b2f3a 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -46,8 +46,8 @@ from sourmash.minhash import ( MinHash, hash_murmur, - get_scaled_for_max_hash, - get_max_hash_for_scaled, + _get_scaled_for_max_hash, + _get_max_hash_for_scaled, ) from sourmash import signature @@ -61,6 +61,10 @@ # * nan on empty minhash # * define equals +scaled50 = _get_scaled_for_max_hash(50) +scaled100 = _get_scaled_for_max_hash(100) +scaled5000 = _get_scaled_for_max_hash(5000) + def test_basic_dna(track_abundance): # verify that MHs of size 1 stay size 1, & act properly as bottom sketches. 
@@ -235,23 +239,10 @@ def test_size_limit(track_abundance): assert mh.get_mins() == [5, 10, 20] -def test_max_hash(track_abundance): - # test behavior with max_hash - mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35) - mh.add_hash(10) - mh.add_hash(20) - mh.add_hash(30) - assert mh.get_mins() == [10, 20, 30] - mh.add_hash(40) - assert mh.get_mins() == [10, 20, 30] - mh.add_hash(36) - assert mh.get_mins() == [10, 20, 30] - - def test_scaled(track_abundance): - # test behavior with scaled (alt to max_hash) - scaled = get_scaled_for_max_hash(35) - print('XX', scaled, get_max_hash_for_scaled(scaled)) + # test behavior with scaled + scaled = _get_scaled_for_max_hash(35) + print('XX', scaled, _get_max_hash_for_scaled(scaled)) mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled) assert mh.max_hash == 35 @@ -273,29 +264,23 @@ def test_no_scaled(track_abundance): def test_max_hash_conversion(): SCALED=100000 - max_hash = get_max_hash_for_scaled(SCALED) - new_scaled = get_scaled_for_max_hash(max_hash) + max_hash = _get_max_hash_for_scaled(SCALED) + new_scaled = _get_scaled_for_max_hash(max_hash) assert new_scaled == SCALED def test_max_hash_and_scaled_zero(): - max_hash = get_max_hash_for_scaled(0) - new_scaled = get_scaled_for_max_hash(0) + max_hash = _get_max_hash_for_scaled(0) + new_scaled = _get_scaled_for_max_hash(0) assert max_hash == new_scaled assert max_hash == 0 -def test_max_hash_and_scaled_error(track_abundance): - # test behavior when supplying both max_hash and scaled - with pytest.raises(ValueError): - mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35, - scaled=5) - - def test_max_hash_cannot_limit(track_abundance): - # make sure you can't set both max_n and max_hash. + # make sure you can't set both n and scaled. 
with pytest.raises(ValueError): - mh = MinHash(2, 4, track_abundance=track_abundance, max_hash=35) + mh = MinHash(2, 4, track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(1)) def test_no_downsample_scaled_if_n(track_abundance): @@ -315,8 +300,8 @@ def test_scaled(track_abundance): def test_mh_jaccard_similarity(): # check actual Jaccard value for a non-trivial case - a = MinHash(0, 20, max_hash=50, track_abundance=False) - b = MinHash(0, 20, max_hash=50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled50, track_abundance=False) + b = MinHash(0, 20, scaled=scaled50, track_abundance=False) a.add_many([1, 3, 5, 8]) b.add_many([1, 3, 5, 6, 8, 10]) @@ -327,9 +312,9 @@ def test_mh_similarity_downsample_jaccard_value(): # check jaccard value after downsampling # max_hash = 50 - a = MinHash(0, 20, max_hash=50, track_abundance=False) + a = MinHash(0, 20, scaled=scaled50, track_abundance=False) # max_hash = 100 - b = MinHash(0, 20, max_hash=100, track_abundance=False) + b = MinHash(0, 20, scaled=scaled100, track_abundance=False) a.add_many([1, 3, 5, 8, 70]) b.add_many([1, 3, 5, 6, 8, 10, 70 ]) @@ -343,8 +328,8 @@ def test_mh_angular_similarity(): # https://www.sciencedirect.com/topics/computer-science/cosine-similarity # note: angular similarity is 1 - 2*(acos(sim) / pi), when elements # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity) - a = MinHash(0, 20, max_hash=50, track_abundance=True) - b = MinHash(0, 20, max_hash=50, track_abundance=True) + a = MinHash(0, 20, scaled=scaled50, track_abundance=True) + b = MinHash(0, 20, scaled=scaled50, track_abundance=True) a.set_abundances({ 1:5, 3:3, 5:2, 8:2}) b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }) @@ -357,8 +342,8 @@ def test_mh_angular_similarity(): def test_mh_angular_similarity_2(): # check actual angular similarity for a second non-trivial case - a = MinHash(0, 20, max_hash=100, track_abundance=True) - b = MinHash(0, 20, max_hash=100, track_abundance=True) + a = 
MinHash(0, 20, scaled=scaled100, track_abundance=True) + b = MinHash(0, 20, scaled=scaled100, track_abundance=True) a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) @@ -372,9 +357,9 @@ def test_mh_similarity_downsample_angular_value(): # test downsample=True argument to MinHash.similarity # max_hash = 50 - a = MinHash(0, 20, max_hash=50, track_abundance=True) + a = MinHash(0, 20, scaled=scaled50, track_abundance=True) # max_hash = 100 - b = MinHash(0, 20, max_hash=100, track_abundance=True) + b = MinHash(0, 20, scaled=scaled100, track_abundance=True) a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 }) b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 }) @@ -392,9 +377,9 @@ def test_mh_similarity_downsample_true(track_abundance): # verify sim(a, b) == sim(b, a), with and without ignore_abundance # max_hash = 50 - a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance) + a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance) # max_hash = 100 - b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance) + b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) a_values = { 1:5, 3:3, 5:2, 8:2} b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } @@ -420,9 +405,9 @@ def test_mh_similarity_downsample_errors(track_abundance): # test downsample=False (default) argument to MinHash.similarity # max_hash = 50 - a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance) + a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance) # max_hash = 100 - b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance) + b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance) a_values = { 1:5, 3:3, 5:2, 8:2} b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 } @@ -680,8 +665,10 @@ def test_mh_count_common_diff_protein(track_abundance): def test_mh_count_common_diff_maxhash(track_abundance): - a = MinHash(0, 5, False, track_abundance=track_abundance, max_hash=1) - b = 
MinHash(0, 5, True, track_abundance=track_abundance, max_hash=2) + a = MinHash(0, 5, False, track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(1)) + b = MinHash(0, 5, True, track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(2)) with pytest.raises(ValueError): a.count_common(b) @@ -955,8 +942,11 @@ def test_mh_compare_diff_seed(track_abundance): def test_mh_compare_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5) - b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10) + a = MinHash(0, 5, track_abundance=track_abundance, + scaled=_get_max_hash_for_scaled(5)) + + b = MinHash(0, 5, track_abundance=track_abundance, + scaled=_get_max_hash_for_scaled(10)) with pytest.raises(ValueError): a.compare(b) @@ -979,8 +969,10 @@ def test_mh_concat_diff_ksize(track_abundance): def test_mh_concat_diff_max_hash(track_abundance): - a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5) - b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10) + a = MinHash(0, 5, track_abundance=track_abundance, + scaled=_get_max_hash_for_scaled(5)) + b = MinHash(0, 5, track_abundance=track_abundance, + scaled=_get_max_hash_for_scaled(10)) with pytest.raises(ValueError): a += b @@ -1236,8 +1228,8 @@ def test_set_abundance_initialized(): def test_reviving_minhash(): # simulate reading a MinHash from disk - mh = MinHash(0, 21, max_hash=184467440737095520, seed=42, - track_abundance=False) + scaled = _get_max_hash_for_scaled(184467440737095520) + mh = MinHash(0, 21, scaled=scaled, seed=42, track_abundance=False) mins = (28945103950853965, 74690756200987412, 82962372765557409, 93503551367950366, 106923350319729608, 135116761470196737, 160165359281648267, 162390811417732001, 177939655451276972) @@ -1274,7 +1266,8 @@ def test_mh_copy_and_clear(track_abundance): def test_mh_copy_and_clear_with_max_hash(track_abundance): # test basic creation of new, empty MinHash w/max_hash param set - a = MinHash(0, 10, 
track_abundance=track_abundance, max_hash=20) + a = MinHash(0, 10, track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(20)) for i in range(0, 40, 2): a.add_hash(i) @@ -1292,8 +1285,7 @@ def test_mh_copy_and_clear_with_max_hash(track_abundance): def test_scaled_property(track_abundance): scaled = 10000 - a = MinHash(0, 10, track_abundance=track_abundance, - max_hash=round(2**64 / scaled)) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled) assert a.scaled == scaled @@ -1311,7 +1303,8 @@ def test_mh_subtract(track_abundance): def test_pickle_max_hash(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, max_hash=20) + a = MinHash(0, 10, track_abundance=track_abundance, + scaled=_get_scaled_for_max_hash(20)) for i in range(0, 40, 2): a.add_hash(i) @@ -1353,7 +1346,7 @@ def test_minhash_abund_add(): # std::vector iterators upon vector resizing - in this case, there # was also a bug in inserting into the middle of mins when scaled was set. - a = MinHash(0, 10, track_abundance=True, max_hash=5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) n = 0 for i in range(10, 0, -1): @@ -1369,7 +1362,7 @@ def test_minhash_abund_capacity_increase(): # this should set capacity to 1000 - see KmerMinHash constructor call # to 'reserve' when n > 0 for specific parameter. - a = MinHash(0, 10, track_abundance=True, max_hash=5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) # 1001 is dependent on the value passed to reserve (currently 1000). for i in range(1001, 0, -1): @@ -1381,8 +1374,8 @@ def test_minhash_abund_merge_flat(): # of a signature with abundance and a signature without abundance. # the correct behavior for now is to calculate simple Jaccard, # i.e. 'flatten' both of them. 
- a = MinHash(0, 10, track_abundance=True, max_hash=5000) - b = MinHash(0, 10, max_hash=5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) + b = MinHash(0, 10, scaled=scaled5000) for i in range(0, 10, 2): a.add_hash(i) @@ -1399,8 +1392,8 @@ def test_minhash_abund_merge_flat_2(): # this targets a segfault caused by trying to merge # a signature with abundance and a signature without abundance. - a = MinHash(0, 10, track_abundance=True, max_hash=5000) - b = MinHash(0, 10, max_hash=5000) + a = MinHash(0, 10, track_abundance=True, scaled=scaled5000) + b = MinHash(0, 10, scaled=scaled5000) for i in range(0, 10, 2): a.add_hash(i) @@ -1436,7 +1429,7 @@ def test_distance_matrix(track_abundance): def test_remove_many(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) @@ -1456,8 +1449,8 @@ def test_remove_many(track_abundance): def test_add_many(track_abundance): - a = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000) - b = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000) + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) a.add_many(list(range(0, 100, 2))) @@ -1475,7 +1468,8 @@ def test_add_many(track_abundance): def test_set_abundances_huge(): max_hash = 4000000 - a = MinHash(0, 10, track_abundance=True, max_hash=max_hash) + a = MinHash(0, 10, track_abundance=True, + scaled=_get_scaled_for_max_hash(max_hash)) hashes = list(range(max_hash)) abundances = itertools.repeat(2) diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py index 6055e1d870..9271ae6eed 100644 --- a/tests/test__minhash_hypothesis.py +++ b/tests/test__minhash_hypothesis.py @@ -4,7 +4,7 @@ import hypothesis.strategies as st from sourmash import MinHash -from 
sourmash.minhash import get_max_hash_for_scaled +from sourmash.minhash import _get_max_hash_for_scaled @given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000), @@ -35,7 +35,7 @@ def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled): a.set_abundances(oracle) - max_hash = get_max_hash_for_scaled(scaled) + max_hash = _get_max_hash_for_scaled(scaled) below_max_hash = sum(1 for (k, v) in oracle.items() if k <= max_hash and v > 0) mins = a.get_mins(with_abundance=True) From 1584283ba11b70dc380ff78e80dc43e3c908eed0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 13:41:21 -0700 Subject: [PATCH 03/50] change MinHash.add(...) to MinHash.add_kmer(...) --- sourmash/minhash.py | 2 +- tests/test_jaccard.py | 4 ++-- tests/test_signature.py | 26 +++++++++++++------------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index b624c6c24f..2eca595dcf 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -243,7 +243,7 @@ def add_sequence(self, sequence, force=False): self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) - def add(self, kmer): + def add_kmer(self, kmer): "Add a kmer into the sketch." 
self.add_sequence(kmer) diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index 93bda700b9..43a4c355a4 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -76,7 +76,7 @@ def test_dna_mh(track_abundance): seq = 'ATGGCAGTGACGATGCCAG' e1.add_sequence(seq) for i in range(len(seq) - 3): - e2.add(seq[i:i + 4]) + e2.add_kmer(seq[i:i + 4]) assert e1.get_mins() == e2.get_mins() print(e1.get_mins()) @@ -95,7 +95,7 @@ def test_protein_mh(track_abundance): for i in range(len(seq) - 5): kmer = seq[i:i + 6] - e2.add(kmer) + e2.add_kmer(kmer) assert e1.get_mins() == e2.get_mins() assert 901193879228338100 in e1.get_mins() diff --git a/tests/test_signature.py b/tests/test_signature.py index 94ef3770e0..7ceaf2ee70 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -13,11 +13,11 @@ def test_compare(track_abundance): # same content, same name -> equal e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig1 = SourmashSignature(e, name='foo') f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - f.add("AT" * 10) + f.add_kmer("AT" * 10) sig2 = SourmashSignature(f, name='foo') assert e == f @@ -26,11 +26,11 @@ def test_compare(track_abundance): def test_compare_ne(track_abundance): # same content, different names -> different e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig1 = SourmashSignature(e, name='foo') f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - f.add("AT" * 10) + f.add_kmer("AT" * 10) sig2 = SourmashSignature(f, name='bar') assert sig1 != sig2 @@ -39,11 +39,11 @@ def test_compare_ne(track_abundance): def test_compare_ne2(track_abundance): # same content, different filename -> different e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig1 = SourmashSignature(e, name='foo', filename='a') f = 
sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - f.add("AT" * 10) + f.add_kmer("AT" * 10) sig2 = SourmashSignature(f, name='foo', filename='b') assert sig1 != sig2 @@ -53,11 +53,11 @@ def test_compare_ne2(track_abundance): def test_compare_ne2_reverse(track_abundance): # same content, one has filename, other does not -> different e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig1 = SourmashSignature(e, name='foo') f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - f.add("AT" * 10) + f.add_kmer("AT" * 10) sig2 = SourmashSignature(f, filename='b') assert sig2 != sig1 @@ -67,7 +67,7 @@ def test_compare_ne2_reverse(track_abundance): def test_hashable(track_abundance): # check: can we use signatures as keys in dictionaries and sets? e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig = SourmashSignature(e) @@ -78,7 +78,7 @@ def test_hashable(track_abundance): def test_str(track_abundance): # signatures should be printable e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig = SourmashSignature(e) @@ -93,7 +93,7 @@ def test_str(track_abundance): def test_roundtrip(track_abundance): e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig = SourmashSignature(e) s = save_signatures([sig]) siglist = list(load_signatures(s)) @@ -106,7 +106,7 @@ def test_roundtrip(track_abundance): def test_load_signature_ksize_nonint(track_abundance): e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) sig = SourmashSignature(e) s = save_signatures([sig]) siglist = list(load_signatures(s, ksize='20')) @@ -312,7 +312,7 @@ def test_load_compressed(track_abundance): def test_binary_fp(tmpdir, track_abundance): e = sourmash.MinHash(n=1, ksize=20, 
track_abundance=track_abundance) - e.add("AT" * 10) + e.add_kmer("AT" * 10) path = tmpdir.join("1.sig") with open(str(path), 'wb') as fp: From 72b3ab946b49b4e8dc2d5352c2f0c022ee1c4f3b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 13:46:27 -0700 Subject: [PATCH 04/50] remove update and is_molecule_type from MinHash --- sourmash/minhash.py | 13 ++----------- sourmash/sourmash_args.py | 11 +++-------- tests/test_sourmash_compute.py | 20 ++++++++++---------- 3 files changed, 15 insertions(+), 29 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 2eca595dcf..8ccba4ee27 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -262,7 +262,7 @@ def remove_many(self, hashes): "Remove many hashes at once; ``hashes`` must be an iterable." self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) - def update(self, other): + def update_xxx(self, other): "Update this sketch from all the hashes in the other." self.add_many(other) @@ -570,16 +570,6 @@ def add_protein(self, sequence): "Add a protein sequence." self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence)) - def is_molecule_type(self, molecule): - """Check if this MinHash is a particular human-readable molecule type. - - Supports 'protein', 'dayhoff', 'hp', 'DNA'. - @CTB deprecate for 4.0? 
- """ - if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'): - raise ValueError("unknown moltype in query, '{}'".format(molecule)) - return molecule == self.moltype - @property def moltype(self): # TODO: test in minhash tests if self.is_protein: @@ -590,3 +580,4 @@ def moltype(self): # TODO: test in minhash tests return 'hp' else: return 'DNA' + diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py index 5c82692170..b3bccbe82d 100644 --- a/sourmash/sourmash_args.py +++ b/sourmash/sourmash_args.py @@ -28,14 +28,9 @@ def get_moltype(sig, require=False): - if sig.minhash.is_molecule_type('DNA'): - moltype = 'DNA' - elif sig.minhash.is_molecule_type('dayhoff'): - moltype = 'dayhoff' - elif sig.minhash.is_molecule_type('hp'): - moltype = 'hp' - elif sig.minhash.is_molecule_type('protein'): - moltype = 'protein' + mh = sig.minhash + if mh.moltype in ('DNA', 'dayhoff', 'hp', 'protein'): + moltype = mh.moltype else: raise ValueError('unknown molecule type for sig {}'.format(sig.name())) diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py index bc2ee59ef1..bd61737a7d 100644 --- a/tests/test_sourmash_compute.py +++ b/tests/test_sourmash_compute.py @@ -436,8 +436,8 @@ def test_do_sourmash_compute_multik_with_dayhoff_and_dna(): ksizes = set([ x.minhash.ksize for x in siglist ]) assert 21 in ksizes assert 30 in ksizes - assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2 - assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2 + assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 + assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 def test_do_sourmash_compute_multik_with_hp(): @@ -493,9 +493,9 @@ def test_do_sourmash_compute_multik_with_dayhoff_dna_protein(): ksizes = set([ x.minhash.ksize for x in siglist ]) assert 21 in ksizes assert 30 in ksizes - assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2 - assert sum(x.minhash.is_molecule_type('dayhoff') for x 
in siglist) == 2 - assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2 + assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 + assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 + assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2 def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein(): @@ -516,11 +516,11 @@ def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein(): ksizes = set([ x.minhash.ksize for x in siglist ]) assert 21 in ksizes assert 30 in ksizes - assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2 - assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2 - assert sum(x.minhash.is_molecule_type('hp') for x in siglist) == 2 + assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2 + assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2 + assert sum(x.minhash.moltype == 'hp' for x in siglist) == 2 # 2 = dayhoff, 2 = hp = 4 protein - assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2 + assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2 def test_do_sourmash_compute_multik_with_nothing(): @@ -641,7 +641,7 @@ def test_do_sourmash_compute_multik_input_is_protein(): assert 21 in ksizes assert 30 in ksizes - moltype = set([ x.minhash.is_molecule_type('protein') + moltype = set([ x.minhash.moltype == 'protein' for x in siglist ]) assert len(moltype) == 1 assert True in moltype From c3567ed2fb3f0ab337e208ae5053273a484baaeb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 13:48:27 -0700 Subject: [PATCH 05/50] remove subtract_mins --- sourmash/minhash.py | 8 -------- tests/test__minhash.py | 13 ------------- 2 files changed, 21 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 8ccba4ee27..589b4bb96a 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -297,14 +297,6 @@ def get_hashes(self): "Return the list of hashes." 
return self.get_mins() - def subtract_mins(self, other): - """Get the list of mins in this MinHash, after removing the ones in - ``other``. - """ - a = set(self.get_mins()) - b = set(other.get_mins()) - return a - b - @property def seed(self): return self._methodcall(lib.kmerminhash_seed) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 1ec34b2f3a..340aeaecf4 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1289,19 +1289,6 @@ def test_scaled_property(track_abundance): assert a.scaled == scaled -def test_mh_subtract(track_abundance): - # test subtracting two identically configured minhashes - a = MinHash(20, 10, track_abundance=track_abundance) - for i in range(0, 40, 2): - a.add_hash(i) - - b = MinHash(20, 10, track_abundance=track_abundance) - for i in range(0, 80, 4): - b.add_hash(i) - - assert a.subtract_mins(b) == set(range(2, 40, 4)) - - def test_pickle_max_hash(track_abundance): a = MinHash(0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20)) From 19a436598191a9d7c1075fc04240a702da0c5bac Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 13:51:36 -0700 Subject: [PATCH 06/50] rename downsample_n to downsample_num --- sourmash/minhash.py | 2 +- sourmash/sig/__main__.py | 2 +- tests/test__minhash.py | 14 +++++++------- tests/test_cmd_signature.py | 2 +- tests/test_jaccard.py | 28 ++++++++++++++-------------- 5 files changed, 24 insertions(+), 24 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 589b4bb96a..50a0e1d228 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -387,7 +387,7 @@ def count_common(self, other, downsample=False): raise TypeError("Must be a MinHash!") return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) - def downsample_n(self, new_num): + def downsample_num(self, new_num): "Copy this object and downsample new object to num=``new_num``." 
if self.num and self.num < new_num: raise ValueError("new sample n is higher than current sample n") diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 8f25aa46a9..5af9662c12 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -739,7 +739,7 @@ def downsample(args): _set_num_scaled(mh_new, 0, args.scaled) elif args.num: if mh.num: - mh_new = mh.downsample_n(args.num) + mh_new = mh.downsample_num(args.num) else: # try to turn a scaled into a num # first check: can we? if len(mh) < args.num: diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 340aeaecf4..c245102c6d 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -698,10 +698,10 @@ def test_mh_count_common_notmh(track_abundance): a.count_common(b) -def test_mh_downsample_n_error(track_abundance): +def test_mh_downsample_num_error(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) with pytest.raises(ValueError): - a.downsample_n(30) + a.downsample_num(30) def test_mh_jaccard_asymmetric_num(track_abundance): @@ -720,7 +720,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance): with pytest.raises(TypeError): a.compare(b) - a = a.downsample_n(10) + a = a.downsample_num(10) assert a.compare(b) == 0.5 assert b.compare(a) == 0.5 @@ -837,12 +837,12 @@ def test_mh_asymmetric_merge(track_abundance): with pytest.raises(TypeError): d.compare(a) - a = a.downsample_n(d.num) + a = a.downsample_num(d.num) print(a.get_mins()) print(d.get_mins()) assert d.compare(a) == 1.0 - c = c.downsample_n(b.num) + c = c.downsample_num(b.num) assert c.compare(b) == 1.0 @@ -873,10 +873,10 @@ def test_mh_inplace_concat_asymmetric(track_abundance): except TypeError as exc: assert 'must have same num' in str(exc) - a = a.downsample_n(d.num) + a = a.downsample_num(d.num) assert d.compare(a) == 1.0 # see: d += a, above. 
- c = c.downsample_n(b.num) + c = c.downsample_num(b.num) assert c.compare(b) == 0.5 diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 73a8aa0706..41bda45fb8 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1114,7 +1114,7 @@ def test_sig_downsample_2_num(c): test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, select_moltype='DNA') actual_downsample_sig = sourmash.load_one_signature(out) - test_mh = test_downsample_sig.minhash.downsample_n(500) + test_mh = test_downsample_sig.minhash.downsample_num(500) assert actual_downsample_sig.minhash == test_mh diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index 43a4c355a4..679e0723f4 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -189,18 +189,18 @@ def test_jaccard_on_real_data(): assert mh1.compare(mh2) == 0.0183 assert mh2.compare(mh1) == 0.0183 - mh1 = mh1.downsample_n(1000) - mh2 = mh2.downsample_n(1000) + mh1 = mh1.downsample_num(1000) + mh2 = mh2.downsample_num(1000) assert mh1.compare(mh2) == 0.011 assert mh2.compare(mh1) == 0.011 - mh1 = mh1.downsample_n(100) - mh2 = mh2.downsample_n(100) + mh1 = mh1.downsample_num(100) + mh2 = mh2.downsample_num(100) assert mh1.compare(mh2) == 0.01 assert mh2.compare(mh1) == 0.01 - mh1 = mh1.downsample_n(10) - mh2 = mh2.downsample_n(10) + mh1 = mh1.downsample_num(10) + mh2 = mh2.downsample_num(10) assert mh1.compare(mh2) == 0.0 assert mh2.compare(mh1) == 0.0 @@ -221,24 +221,24 @@ def test_scaled_on_real_data(): assert round(mh1.compare(mh2), 5) == 0.01644 assert round(mh2.compare(mh1), 5) == 0.01644 - mh1 = mh1.downsample_n(10000) - mh2 = mh2.downsample_n(10000) + mh1 = mh1.downsample_num(10000) + mh2 = mh2.downsample_num(10000) assert mh1.compare(mh2) == 0.0183 assert mh2.compare(mh1) == 0.0183 - mh1 = mh1.downsample_n(1000) - mh2 = mh2.downsample_n(1000) + mh1 = mh1.downsample_num(1000) + mh2 = mh2.downsample_num(1000) assert mh1.compare(mh2) == 0.011 assert mh2.compare(mh1) == 
0.011 - mh1 = mh1.downsample_n(100) - mh2 = mh2.downsample_n(100) + mh1 = mh1.downsample_num(100) + mh2 = mh2.downsample_num(100) assert mh1.compare(mh2) == 0.01 assert mh2.compare(mh1) == 0.01 - mh1 = mh1.downsample_n(10) - mh2 = mh2.downsample_n(10) + mh1 = mh1.downsample_num(10) + mh2 = mh2.downsample_num(10) assert mh1.compare(mh2) == 0.0 assert mh2.compare(mh1) == 0.0 From 01e8a4592e5906a98c9ededb9d0c61bbb8b5a46c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 13:59:11 -0700 Subject: [PATCH 07/50] switch to hashes property instead of using get_mins() --- sourmash/minhash.py | 4 + tests/test__minhash.py | 176 ++++++++++++++++++++--------------------- 2 files changed, 92 insertions(+), 88 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 50a0e1d228..eb49cde095 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -297,6 +297,10 @@ def get_hashes(self): "Return the list of hashes." return self.get_mins() + @property + def hashes(self): + return self.get_mins(with_abundance=True) + @property def seed(self): return self._methodcall(lib.kmerminhash_seed) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index c245102c6d..81f951672e 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -72,15 +72,15 @@ def test_basic_dna(track_abundance): assert mh.moltype == 'DNA' mh.add_sequence('ATGC') - a = mh.get_mins() + a = mh.hashes mh.add_sequence('GCAT') # this will not get added; hash > ATGC - b = mh.get_mins() + b = mh.hashes print(a, b) - assert a == b + assert list(a) == list(b) assert len(b) == 1 - assert a[0] == b[0] == 12415348535738636339 + assert list(a)[0] == list(b)[0] == 12415348535738636339 def test_div_zero(track_abundance): @@ -108,15 +108,15 @@ def test_bytes_dna(track_abundance): mh.add_sequence('ATGC') mh.add_sequence(b'ATGC') mh.add_sequence('ATGC') - a = mh.get_mins() + a = mh.hashes mh.add_sequence('GCAT') # this will not get added; hash > ATGC mh.add_sequence(b'GCAT') # 
this will not get added; hash > ATGC mh.add_sequence('GCAT') # this will not get added; hash > ATGC - b = mh.get_mins() + b = mh.hashes print(a, b) - assert a == b + assert list(a) == list(b) assert len(b) == 1 @@ -134,7 +134,7 @@ def test_bytes_protein_dayhoff(track_abundance, dayhoff): mh.add_protein('AGYYG') mh.add_protein(b'AGYYG') - assert len(mh.get_mins()) == 4 + assert len(mh.hashes) == 4 def test_protein_dayhoff(track_abundance, dayhoff): @@ -142,7 +142,7 @@ def test_protein_dayhoff(track_abundance, dayhoff): mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance) mh.add_protein('AGYYG') - assert len(mh.get_mins()) == 4 + assert len(mh.hashes) == 4 def test_bytes_protein_hp(track_abundance, hp): @@ -158,9 +158,9 @@ def test_bytes_protein_hp(track_abundance, hp): mh.add_protein(b'AGYYG') if hp: - assert len(mh.get_mins()) == 1 + assert len(mh.hashes) == 1 else: - assert len(mh.get_mins()) == 4 + assert len(mh.hashes) == 4 def test_protein_hp(track_abundance, hp): @@ -169,9 +169,9 @@ def test_protein_hp(track_abundance, hp): mh.add_protein('AGYYG') if hp: - assert len(mh.get_mins()) == 1 + assert len(mh.hashes) == 1 else: - assert len(mh.get_mins()) == 4 + assert len(mh.hashes) == 4 def test_translate_codon(track_abundance): @@ -194,13 +194,13 @@ def test_dayhoff(track_abundance): dayhoff=True, hp=False, track_abundance=track_abundance) mh_dayhoff.add_sequence('ACTGAC') - assert len(mh_dayhoff.get_mins()) == 2 + assert len(mh_dayhoff.hashes) == 2 # verify that dayhoff-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance) mh_protein.add_sequence('ACTGAC') - assert len(mh_protein.get_mins()) == 2 - assert mh_protein.get_mins() != mh_dayhoff.get_mins() + assert len(mh_protein.hashes) == 2 + assert mh_protein.hashes != mh_dayhoff.hashes def test_hp(track_abundance): @@ -211,13 +211,13 @@ def test_hp(track_abundance): mh_hp.add_sequence('ACTGAC') - assert 
len(mh_hp.get_mins()) == 2 + assert len(mh_hp.hashes) == 2 # verify that hp-encoded hashes are different from protein/aa hashes mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance) mh_protein.add_sequence('ACTGAC') - assert len(mh_protein.get_mins()) == 2 - assert mh_protein.get_mins() != mh_hp.get_mins() + assert len(mh_protein.hashes) == 2 + assert mh_protein.hashes != mh_hp.hashes def test_protein_short(track_abundance): @@ -225,7 +225,7 @@ def test_protein_short(track_abundance): mh = MinHash(10, 9, True, track_abundance=track_abundance) mh.add_protein('AG') - assert len(mh.get_mins()) == 0, mh.get_mins() + assert len(mh.hashes) == 0, mh.hashes def test_size_limit(track_abundance): @@ -234,9 +234,9 @@ def test_size_limit(track_abundance): mh.add_hash(10) mh.add_hash(20) mh.add_hash(30) - assert mh.get_mins() == [10, 20, 30] + assert list(mh.hashes) == [10, 20, 30] mh.add_hash(5) # -> should push 30 off end - assert mh.get_mins() == [5, 10, 20] + assert list(mh.hashes) == [5, 10, 20] def test_scaled(track_abundance): @@ -249,11 +249,11 @@ def test_scaled(track_abundance): mh.add_hash(10) mh.add_hash(20) mh.add_hash(30) - assert mh.get_mins() == [10, 20, 30] + assert mh.hashes == [10, 20, 30] mh.add_hash(40) - assert mh.get_mins() == [10, 20, 30] + assert mh.hashes == [10, 20, 30] mh.add_hash(36) - assert mh.get_mins() == [10, 20, 30] + assert mh.hashes == [10, 20, 30] def test_no_scaled(track_abundance): @@ -458,26 +458,26 @@ def test_basic_dna_bad_2(track_abundance): def test_basic_dna_bad_force(track_abundance): # test behavior on bad DNA; use 100 so multiple hashes get added. mh = MinHash(100, 4, track_abundance=track_abundance) - assert len(mh.get_mins()) == 0 + assert len(mh.hashes) == 0 mh.add_sequence('ATGN', True) # ambiguous kmer skipped. - assert len(mh.get_mins()) == 0 + assert len(mh.hashes) == 0 mh.add_sequence('AATGN', True) # but good k-mers still used. 
- assert len(mh.get_mins()) == 1 + assert len(mh.hashes) == 1 mh.add_sequence('AATG', True) # checking that right kmer was added - assert len(mh.get_mins()) == 1 # (only 1 hash <- this is a dup) + assert len(mh.hashes) == 1 # (only 1 hash <- this is a dup) def test_basic_dna_bad_force_2(track_abundance): # test behavior on bad DNA mh = MinHash(100, 4, track_abundance=track_abundance) - assert len(mh.get_mins()) == 0 + assert len(mh.hashes) == 0 mh.add_sequence('AAGNCGG', True) # ambiguous kmers skipped. - assert len(mh.get_mins()) == 0 + assert len(mh.hashes) == 0 mh.add_sequence('AATGNGCGG', True) # ambiguous kmers skipped. - assert len(mh.get_mins()) == 2 + assert len(mh.hashes) == 2 mh.add_sequence('AATG', True) # checking that right kmers were added mh.add_sequence('GCGG', True) - assert len(mh.get_mins()) == 2 # (only 2 hashes should be there) + assert len(mh.hashes) == 2 # (only 2 hashes should be there) def test_consume_lowercase(track_abundance): @@ -531,7 +531,7 @@ def test_intersection_errors(track_abundance): a.add_sequence("TGCCGCCCAGCA") b.add_sequence("TGCCGCCCAGCA") - common = set(a.get_mins()) + common = set(a.hashes) combined_size = 3 intersection, size = a.intersection(b, in_common=False) @@ -554,7 +554,7 @@ def test_intersection_1(track_abundance): a.add_sequence('TGCCGCCCAGCA') b.add_sequence('TGCCGCCCAGCA') - common = set(a.get_mins()) + common = set(a.hashes) combined_size = 3 intersection, size = a.intersection(b, in_common=True) @@ -595,7 +595,7 @@ def test_intersection_1(track_abundance): a.add_sequence('GTCCGCCCAGTGA') b.add_sequence('GTCCGCCCAGTGG') - new_in_common = set(a.get_mins()).intersection(set(b.get_mins())) + new_in_common = set(a.hashes).intersection(set(b.hashes)) new_combined_size = 8 intersection, size = a.intersection(b, in_common=True) @@ -607,10 +607,10 @@ def test_intersection_1(track_abundance): assert size == new_combined_size intersection, size = a.intersection(a, in_common=True) - assert intersection == 
set(a.get_mins()) + assert intersection == set(a.hashes) intersection, size = b.intersection(b, in_common=True) - assert intersection == set(b.get_mins()) + assert intersection == set(b.hashes) def test_mh_copy(track_abundance): @@ -634,13 +634,13 @@ def test_mh_len(track_abundance): for i in range(0, 40, 2): a.add_hash(i) - assert a.get_mins() == list(range(0, 40, 2)) + assert list(a.hashes) == list(range(0, 40, 2)) def test_mh_unsigned_long_long(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) a.add_hash(9227159859419181011) # too big for a C long int. - assert 9227159859419181011 in a.get_mins() + assert 9227159859419181011 in a.hashes def test_mh_count_common(track_abundance): @@ -745,7 +745,7 @@ def test_mh_merge(track_abundance): d = b.merge(a) assert len(c) == len(d) - assert c.get_mins() == d.get_mins() + assert list(c.hashes) == list(d.hashes) assert c.compare(d) == 1.0 assert d.compare(c) == 1.0 @@ -763,7 +763,7 @@ def test_mh_merge_empty_num(track_abundance): assert len(c) assert len(c) == len(d) - assert c.get_mins() == d.get_mins() + assert list(c.hashes) == list(d.hashes) assert c.compare(d) == 1.0 assert d.compare(c) == 1.0 @@ -781,7 +781,7 @@ def test_mh_merge_empty_scaled(track_abundance): assert len(c) assert len(c) == len(d) - assert c.get_mins() == d.get_mins() + assert list(c.hashes) == list(d.hashes) assert c.compare(d) == 1.0 assert d.compare(c) == 1.0 @@ -796,7 +796,7 @@ def test_mh_merge_check_length(track_abundance): b.add_hash(i) c = a.merge(b) - assert len(c.get_mins()) == 20 + assert len(c.hashes) == 20 def test_mh_merge_check_length2(track_abundance): @@ -812,7 +812,7 @@ def test_mh_merge_check_length2(track_abundance): b.add_hash(4) c = a.merge(b) - assert len(c.get_mins()) == 3 + assert len(c.hashes) == 3 def test_mh_asymmetric_merge(track_abundance): # test merging two asymmetric (different size) MHs @@ -838,8 +838,8 @@ def test_mh_asymmetric_merge(track_abundance): d.compare(a) a = a.downsample_num(d.num) - 
print(a.get_mins()) - print(d.get_mins()) + print(a.hashes) + print(d.hashes) assert d.compare(a) == 1.0 c = c.downsample_num(b.num) @@ -896,7 +896,7 @@ def test_mh_inplace_concat(track_abundance): d += a assert len(c) == len(d) - assert c.get_mins() == d.get_mins() + assert c.hashes == d.hashes assert c.compare(d) == 1.0 assert d.compare(c) == 1.0 @@ -990,7 +990,7 @@ def test_short_sequence(track_abundance): a = MinHash(20, 5, track_abundance=track_abundance) a.add_sequence('GGGG') # adding a short sequence should fail silently - assert len(a.get_mins()) == 0 + assert len(a.hashes) == 0 def test_bytes_murmur(): @@ -1025,25 +1025,25 @@ def test_abundance_simple(): a = MinHash(20, 5, False, track_abundance=True) a.add_sequence('AAAAA') - assert a.get_mins() == [2110480117637990133] - assert a.get_mins(with_abundance=True) == {2110480117637990133: 1} + assert list(a.hashes) == [2110480117637990133] + assert a.hashes == {2110480117637990133: 1} a.add_sequence('AAAAA') - assert a.get_mins() == [2110480117637990133] - assert a.get_mins(with_abundance=True) == {2110480117637990133: 2} + assert list(a.hashes) == [2110480117637990133] + assert a.hashes == {2110480117637990133: 2} def test_add_hash_with_abundance(): a = MinHash(20, 5, False, track_abundance=True) a.add_hash_with_abundance(10, 1) - assert a.get_mins(with_abundance=True) == {10: 1} + assert a.hashes == {10: 1} a.add_hash_with_abundance(20, 2) - assert a.get_mins(with_abundance=True) == {10: 1, 20: 2} + assert a.hashes == {10: 1, 20: 2} a.add_hash_with_abundance(10, 2) - assert a.get_mins(with_abundance=True) == {10: 3, 20: 2} + assert a.hashes == {10: 3, 20: 2} def test_add_hash_with_abundance_2(): @@ -1059,20 +1059,20 @@ def test_clear(): a = MinHash(20, 5, False, track_abundance=True) a.add_hash(10) - assert a.get_mins(with_abundance=True) == {10: 1} + assert a.hashes == {10: 1} a.clear() - assert a.get_mins(with_abundance=True) == {} + assert a.hashes == {} def test_clear_2(): a = MinHash(20, 5, False, 
track_abundance=False) a.add_hash(10) - assert a.get_mins() == [10] + assert list(a.hashes) == [10] a.clear() - assert a.get_mins() == [] + assert list(a.hashes) == [] def test_abundance_simple_2(): @@ -1080,12 +1080,12 @@ def test_abundance_simple_2(): b = MinHash(20, 5, False, track_abundance=True) a.add_sequence('AAAAA') - assert a.get_mins() == [2110480117637990133] - assert a.get_mins(with_abundance=True) == {2110480117637990133: 1} + assert list(a.hashes) == [2110480117637990133] + assert a.hashes == {2110480117637990133: 1} a.add_sequence('AAAAA') - assert a.get_mins() == [2110480117637990133] - assert a.get_mins(with_abundance=True) == {2110480117637990133: 2} + assert list(a.hashes) == [2110480117637990133] + assert a.hashes == {2110480117637990133: 2} b.add_sequence('AAAAA') assert a.count_common(b) == 1 @@ -1097,15 +1097,15 @@ def test_abundance_count_common(): a.add_sequence('AAAAA') a.add_sequence('AAAAA') - assert a.get_mins() == [2110480117637990133] - assert a.get_mins(with_abundance=True) == {2110480117637990133: 2} + assert list(a.hashes) == [2110480117637990133] + assert a.hashes == {2110480117637990133: 2} b.add_sequence('AAAAA') b.add_sequence('GGGGG') assert a.count_common(b) == 1 assert a.count_common(b) == b.count_common(a) - assert b.get_mins(with_abundance=True) == [2110480117637990133, + assert b.hashes == [2110480117637990133, 10798773792509008305] @@ -1153,12 +1153,12 @@ def test_set_abundance_2(): ksize=30, select_moltype='dna') new_mh = sig.minhash.copy_and_clear() - mins = sig.minhash.get_mins() + mins = sig.minhash.hashes mins = {k: 1 for k in mins} new_mh.track_abundance = True new_mh.set_abundances(mins) - assert new_mh.get_mins(with_abundance=True) == mins + assert new_mh.hashes == mins def test_set_abundance_clear(): @@ -1169,7 +1169,7 @@ def test_set_abundance_clear(): a.set_abundances({1: 3, 2: 4}, clear=True) b.set_abundances({1: 3, 2: 4}, clear=False) - assert a.get_mins() == b.get_mins() + assert list(a.hashes) == 
list(b.hashes) def test_set_abundance_clear_2(): @@ -1177,20 +1177,20 @@ def test_set_abundance_clear_2(): a = MinHash(20, 5, False, track_abundance=True) a.add_hash(10) - assert a.get_mins(with_abundance=True) == {10: 1} + assert a.hashes == {10: 1} a.set_abundances({20: 2}) - assert a.get_mins(with_abundance=True) == {20: 2} + assert a.hashes == {20: 2} def test_set_abundance_clear_3(): a = MinHash(20, 5, False, track_abundance=True) a.add_hash(10) - assert a.get_mins(with_abundance=True) == {10: 1} + assert a.hashes == {10: 1} a.set_abundances({20: 1, 30: 4}, clear=False) - assert a.get_mins(with_abundance=True) == {10: 1, 20: 1, 30: 4} + assert a.hashes == {10: 1, 20: 1, 30: 4} def test_set_abundance_clear_4(): @@ -1199,10 +1199,10 @@ def test_set_abundance_clear_4(): a = MinHash(20, 5, False, track_abundance=True) a.set_abundances({20: 2, 10: 1}, clear=False) # should also sort the hashes - assert a.get_mins(with_abundance=True) == {10: 1, 20: 2} + assert a.hashes == {10: 1, 20: 2} a.set_abundances({20: 1, 10: 2}, clear=False) - assert a.get_mins(with_abundance=True) == {10: 3, 20: 3} + assert a.hashes == {10: 3, 20: 3} def test_reset_abundance_initialized(): @@ -1213,7 +1213,7 @@ def test_reset_abundance_initialized(): # Convert from Abundance to Regular MinHash a.track_abundance = False - assert a.get_mins(with_abundance=True) == [12415348535738636339] + assert a.hashes == [12415348535738636339] def test_set_abundance_initialized(): @@ -1243,7 +1243,7 @@ def test_set_abundance_num(): a.set_abundances({1: 3, 2: 4}) - assert a.get_mins(with_abundance=True) == {1: 3, 2: 4} + assert a.hashes == {1: 3, 2: 4} def test_mh_copy_and_clear(track_abundance): @@ -1259,7 +1259,7 @@ def test_mh_copy_and_clear(track_abundance): assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed - assert len(b.get_mins()) == 0 + assert len(b.hashes) == 0 assert a.scaled == b.scaled assert b.scaled == 0 @@ -1278,7 +1278,7 @@ def 
test_mh_copy_and_clear_with_max_hash(track_abundance): assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed - assert len(b.get_mins()) == 0 + assert len(b.hashes) == 0 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1303,8 +1303,8 @@ def test_pickle_max_hash(track_abundance): assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed - assert len(b.get_mins()) == len(a.get_mins()) - assert len(b.get_mins()) == 11 + assert len(b.hashes) == len(a.hashes) + assert len(b.hashes) == 11 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1322,8 +1322,8 @@ def test_pickle_scaled(track_abundance): assert not b.is_protein assert b.track_abundance == track_abundance assert b.seed == a.seed - assert len(b.get_mins()) == len(a.get_mins()) - assert len(b.get_mins()) == 11 + assert len(b.hashes) == len(a.hashes) + assert len(b.hashes) == 11 assert a.scaled == b.scaled assert b.scaled != 0 @@ -1339,8 +1339,8 @@ def test_minhash_abund_add(): for i in range(10, 0, -1): a.add_hash(i) n += 1 - assert len(a.get_mins()) == n - print(len(a.get_mins())) + assert len(a.hashes) == n + print(len(a.hashes)) def test_minhash_abund_capacity_increase(): @@ -1432,7 +1432,7 @@ def test_remove_many(track_abundance): assert orig_md5 != new_md5 assert len(a) == 33 - assert all(c % 6 != 0 for c in a.get_mins()) + assert all(c % 6 != 0 for c in a.hashes) def test_add_many(track_abundance): @@ -1443,7 +1443,7 @@ def test_add_many(track_abundance): a.add_many(list(range(0, 100, 2))) assert len(a) == 50 - assert all(c % 2 == 0 for c in a.get_mins()) + assert all(c % 2 == 0 for c in a.hashes) for h in range(0, 100, 2): b.add_hash(h) From de589ea50a3a4536f5b9b7633f28a88e4e8dd52c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 25 Jul 2020 14:37:01 -0700 Subject: [PATCH 08/50] replace get_mins(...) 
with hashes thruout --- sourmash/commands.py | 6 +++--- sourmash/lca/command_classify.py | 2 +- sourmash/lca/command_gather.py | 6 +++--- sourmash/lca/command_summarize.py | 4 ++-- sourmash/lca/lca_db.py | 4 ++-- sourmash/search.py | 2 +- sourmash/sig/__main__.py | 22 +++++++++++----------- 7 files changed, 23 insertions(+), 23 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index 7ae42f32fb..7249d3ee9d 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -703,10 +703,10 @@ def gather(args): e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash, track_abundance=with_abundance) if with_abundance: - abunds = next_query.minhash.get_mins(with_abundance=True) + abunds = next_query.minhash.hashes e.set_abundances(abunds) else: - e.add_many(next_query.minhash.get_mins()) + e.add_many(next_query.minhash.hashes) with FileOutput(args.output_unassigned, 'wt') as fp: sig.save_signatures([ sig.SourmashSignature(e) ], fp) @@ -849,7 +849,7 @@ def multigather(args): notify('saving unassigned hashes to "{}"', output_unassigned) e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash) - e.add_many(next_query.minhash.get_mins()) + e.add_many(next_query.minhash.hashes) sig.save_signatures([ sig.SourmashSignature(e) ], fp) n += 1 diff --git a/sourmash/lca/command_classify.py b/sourmash/lca/command_classify.py index 568040c56d..e4e8c7b452 100644 --- a/sourmash/lca/command_classify.py +++ b/sourmash/lca/command_classify.py @@ -36,7 +36,7 @@ def classify_signature(query_sig, dblist, threshold, majority): """ # gather assignments from across all the databases - assignments = lca_utils.gather_assignments(query_sig.minhash.get_mins(), + assignments = lca_utils.gather_assignments(query_sig.minhash.hashes, dblist) # now convert to trees -> do LCA & counts diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py index 812c6b3b41..03c72f58ec 100644 --- a/sourmash/lca/command_gather.py +++ b/sourmash/lca/command_gather.py 
@@ -65,11 +65,11 @@ def gather_signature(query_sig, dblist, ignore_abundance): query_sig.minhash.ksize) # extract the basic set of mins - query_mins = set(query_sig.minhash.get_mins()) + query_mins = set(query_sig.minhash.hashes) n_mins = len(query_mins) if query_sig.minhash.track_abundance and not ignore_abundance: - orig_abunds = query_sig.minhash.get_mins(with_abundance=True) + orig_abunds = query_sig.minhash.hashes else: if query_sig.minhash.track_abundance and ignore_abundance: notify('** ignoring abundance') @@ -236,7 +236,7 @@ def gather_main(args): print_results('') # nothing found. else: - est_bp = len(query_sig.minhash.get_mins()) * query_sig.minhash.scaled + est_bp = len(query_sig.minhash) * query_sig.minhash.scaled print_results('') print_results('No assignment for est {} of sequence.', format_bp(est_bp)) diff --git a/sourmash/lca/command_summarize.py b/sourmash/lca/command_summarize.py index 74155a2d6c..2ac453e8be 100644 --- a/sourmash/lca/command_summarize.py +++ b/sourmash/lca/command_summarize.py @@ -130,11 +130,11 @@ def count_signature(sig, scaled, hashvals): mh = sig.minhash.downsample_scaled(scaled) if mh.track_abundance: - abunds = mh.get_mins(with_abundance=True) + abunds = mh.hashes for hashval, count in abunds.items(): hashvals[hashval] += count else: - for hashval in mh.get_mins(): + for hashval in mh.hashes: hashvals[hashval] += 1 diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py index 3931813e19..dbf8a015ad 100644 --- a/sourmash/lca/lca_db.py +++ b/sourmash/lca/lca_db.py @@ -156,7 +156,7 @@ def insert(self, sig, ident=None, lineage=None): except TypeError: raise ValueError('lineage cannot be used as a key?!') - for hashval in minhash.get_mins(): + for hashval in minhash.hashes: self.hashval_to_idx[hashval].add(idx) return len(minhash) @@ -462,7 +462,7 @@ def _find_signatures(self, minhash, threshold, containment=False, # note that containment can be calculated w/o matching scaled. 
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled)) - query_mins = set(minhash.get_mins()) + query_mins = set(minhash.hashes) # collect matching hashes for the query: c = Counter() diff --git a/sourmash/search.py b/sourmash/search.py index be7cdc1cfe..9281571767 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -125,7 +125,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance): orig_query_abunds = { k: 1 for k in orig_query_mins } if track_abundance: import numpy as np - orig_query_abunds = orig_query_mh.get_mins(with_abundance=True) + orig_query_abunds = orig_query_mh.hashes cmp_scaled = query.minhash.scaled # initialize with resolution of query while query.minhash: diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 5af9662c12..da575394de 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -298,8 +298,8 @@ def overlap(args): scaled = sig1.minhash.scaled - hashes_1 = set(sig1.minhash.get_mins()) - hashes_2 = set(sig2.minhash.get_mins()) + hashes_1 = set(sig1.minhash.hashes) + hashes_2 = set(sig2.minhash.hashes) num_common = len(hashes_1.intersection(hashes_2)) disjoint_1 = len(hashes_1 - hashes_2) @@ -418,14 +418,14 @@ def intersect(args): progress=progress): if first_sig is None: first_sig = sigobj - mins = set(sigobj.minhash.get_mins()) + mins = set(sigobj.minhash.hashes) else: # check signature compatibility -- if not sigobj.minhash.is_compatible(first_sig.minhash): error("incompatible minhashes; specify -k and/or molecule type.") sys.exit(-1) - mins.intersection_update(sigobj.minhash.get_mins()) + mins.intersection_update(sigobj.minhash.hashes) total_loaded += 1 notify('loaded and intersected signatures from {}...', sigfile, end='\r') @@ -449,7 +449,7 @@ def intersect(args): error("--track-abundance not set on loaded signature?! 
exiting.") sys.exit(-1) intersect_mh = abund_sig.minhash.copy_and_clear() - abund_mins = abund_sig.minhash.get_mins(with_abundance=True) + abund_mins = abund_sig.minhash.hashes # do one last intersection mins.intersection_update(abund_mins) @@ -479,7 +479,7 @@ def subtract(args): error('Cannot use subtract on signatures with abundance tracking, sorry!') sys.exit(1) - subtract_mins = set(from_mh.get_mins()) + subtract_mins = set(from_mh.hashes) notify('loaded signature from {}...', from_sigfile, end='\r') @@ -500,7 +500,7 @@ def subtract(args): error('Cannot use subtract on signatures with abundance tracking, sorry!') sys.exit(1) - subtract_mins -= set(sigobj.minhash.get_mins()) + subtract_mins -= set(sigobj.minhash.hashes) notify('loaded and subtracted signatures from {}...', sigfile, end='\r') total_loaded += 1 @@ -625,7 +625,7 @@ def filter(args): ss) continue - abunds = mh.get_mins(with_abundance=True) + abunds = mh.hashes abunds2 = {} for k, v in abunds.items(): if v >= args.min_abundance: @@ -679,7 +679,7 @@ def flatten(args): for ss in siglist: flattened_mh = ss.minhash.copy_and_clear() flattened_mh.track_abundance = False - flattened_mh.add_many(ss.minhash.get_mins()) + flattened_mh.add_many(ss.minhash.hashes) ss.minhash = flattened_mh @@ -731,7 +731,7 @@ def downsample(args): else: # try to turn a num into a scaled # first check: can we? max_hash = _get_max_hash_for_scaled(args.scaled) - mins = mh.get_mins() + mins = mh.hashes if max(mins) < max_hash: raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.") @@ -810,7 +810,7 @@ def export(args): x['hashBits'] = 64 x['hashSeed'] = mh.seed - ll = list(mh.get_mins()) + ll = list(mh.hashes) x['sketches'] = [{ 'hashes': ll }] with FileOutput(args.output, 'wt') as fp: From c3b4416eb5d9f089d12c7c0c5f24233f2745db1d Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 26 Jul 2020 08:20:15 -0700 Subject: [PATCH 09/50] change deprecated 'compare' usage to 'similarity' in test_jaccard --- tests/test_jaccard.py | 52 +++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index 93bda700b9..99716d496e 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -186,23 +186,23 @@ def test_jaccard_on_real_data(): sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash - assert mh1.compare(mh2) == 0.0183 - assert mh2.compare(mh1) == 0.0183 + assert mh1.similarity(mh2) == 0.0183 + assert mh2.similarity(mh1) == 0.0183 mh1 = mh1.downsample_n(1000) mh2 = mh2.downsample_n(1000) - assert mh1.compare(mh2) == 0.011 - assert mh2.compare(mh1) == 0.011 + assert mh1.similarity(mh2) == 0.011 + assert mh2.similarity(mh1) == 0.011 mh1 = mh1.downsample_n(100) mh2 = mh2.downsample_n(100) - assert mh1.compare(mh2) == 0.01 - assert mh2.compare(mh1) == 0.01 + assert mh1.similarity(mh2) == 0.01 + assert mh2.similarity(mh1) == 0.01 mh1 = mh1.downsample_n(10) mh2 = mh2.downsample_n(10) - assert mh1.compare(mh2) == 0.0 - assert mh2.compare(mh1) == 0.0 + assert mh1.similarity(mh2) == 0.0 + assert mh2.similarity(mh1) == 0.0 def test_scaled_on_real_data(): @@ -218,29 +218,29 @@ def test_scaled_on_real_data(): sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash - assert round(mh1.compare(mh2), 5) == 0.01644 - assert round(mh2.compare(mh1), 5) == 0.01644 + assert round(mh1.similarity(mh2), 5) == 0.01644 + assert round(mh2.similarity(mh1), 5) == 0.01644 mh1 = mh1.downsample_n(10000) mh2 = mh2.downsample_n(10000) - assert mh1.compare(mh2) == 0.0183 - assert mh2.compare(mh1) == 0.0183 + assert mh1.similarity(mh2) == 0.0183 + assert mh2.similarity(mh1) == 0.0183 mh1 = mh1.downsample_n(1000) mh2 = mh2.downsample_n(1000) - assert mh1.compare(mh2) == 0.011 - assert mh2.compare(mh1) == 0.011 + assert mh1.similarity(mh2) == 0.011 + assert 
mh2.similarity(mh1) == 0.011 mh1 = mh1.downsample_n(100) mh2 = mh2.downsample_n(100) - assert mh1.compare(mh2) == 0.01 - assert mh2.compare(mh1) == 0.01 + assert mh1.similarity(mh2) == 0.01 + assert mh2.similarity(mh1) == 0.01 mh1 = mh1.downsample_n(10) mh2 = mh2.downsample_n(10) - assert mh1.compare(mh2) == 0.0 - assert mh2.compare(mh1) == 0.0 + assert mh1.similarity(mh2) == 0.0 + assert mh2.similarity(mh1) == 0.0 def test_scaled_on_real_data_2(): @@ -256,21 +256,21 @@ def test_scaled_on_real_data_2(): sig2 = list(load_signatures(b))[0] mh2 = sig2.minhash - assert round(mh1.compare(mh2), 5) == 0.01644 - assert round(mh2.compare(mh1), 5) == 0.01644 + assert round(mh1.similarity(mh2), 5) == 0.01644 + assert round(mh2.similarity(mh1), 5) == 0.01644 mh1 = mh1.downsample_scaled(1000) mh2 = mh2.downsample_scaled(1000) - assert round(mh1.compare(mh2), 4) == 0.0187 - assert round(mh2.compare(mh1), 4) == 0.0187 + assert round(mh1.similarity(mh2), 4) == 0.0187 + assert round(mh2.similarity(mh1), 4) == 0.0187 mh1 = mh1.downsample_scaled(10000) mh2 = mh2.downsample_scaled(10000) - assert round(mh1.compare(mh2), 3) == 0.01 - assert round(mh2.compare(mh1), 3) == 0.01 + assert round(mh1.similarity(mh2), 3) == 0.01 + assert round(mh2.similarity(mh1), 3) == 0.01 mh1 = mh1.downsample_scaled(100000) mh2 = mh2.downsample_scaled(100000) - assert round(mh1.compare(mh2), 2) == 0.01 - assert round(mh2.compare(mh1), 2) == 0.01 + assert round(mh1.similarity(mh2), 2) == 0.01 + assert round(mh2.similarity(mh1), 2) == 0.01 From 7a7bba98844d2f5a05c7f80ab7ea21a5acdf0eb4 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 26 Jul 2020 08:34:52 -0700 Subject: [PATCH 10/50] eliminate most of the deprecation warnings in test__minhash by switching compare to similarity --- tests/test__minhash.py | 123 ++++++++++++++++++++++------------------- 1 file changed, 66 insertions(+), 57 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 866ceb1769..98cb015cd7 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -502,43 +502,44 @@ def test_consume_lowercase(track_abundance): a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower()) b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - assert a.compare(b) == 1.0 - assert b.compare(b) == 1.0 - assert b.compare(a) == 1.0 - assert a.compare(a) == 1.0 + assert round(a.similarity(b), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 + assert round(b.similarity(a), 3) == 1.0 + assert round(a.similarity(a), 3) == 1.0 -def test_compare_1(track_abundance): +def test_similarity_1(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - assert a.compare(b) == 1.0 - assert b.compare(b) == 1.0 - assert b.compare(a) == 1.0 - assert a.compare(a) == 1.0 + assert round(a.similarity(b), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 + assert round(b.similarity(a), 3) == 1.0 + assert round(a.similarity(a), 3) == 1.0 # add same sequence again b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - assert a.compare(b) == 1.0 - assert b.compare(b) == 1.0 - assert b.compare(a) == 1.0 - assert a.compare(a) == 1.0 + assert round(a.similarity(b), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 + assert round(b.similarity(a), 3) == 1.0 + assert round(a.similarity(a), 3) == 1.0 b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') 
- x = a.compare(b) + x = a.similarity(b) assert x >= 0.3, x - x = b.compare(a) + x = b.similarity(a) assert x >= 0.3, x - assert a.compare(a) == 1.0 - assert b.compare(b) == 1.0 + assert round(a.similarity(a), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 def test_intersection_errors(track_abundance): + # CTB: remove this test in 4.0 a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) c = MinHash(30, 10, track_abundance=track_abundance) @@ -563,6 +564,7 @@ def test_intersection_errors(track_abundance): # this filter doesn't work, but leaving it in pour encourages les autres. @pytest.mark.filterwarnings("ignore") def test_intersection_1(track_abundance): + # CTB: remove this test in 4.0 a = MinHash(20, 10, track_abundance=track_abundance) b = MinHash(20, 10, track_abundance=track_abundance) @@ -633,7 +635,7 @@ def test_mh_copy(track_abundance): a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') b = a.__copy__() - assert b.compare(a) == 1.0 + assert round(b.similarity(a), 3) == 1.0 def test_mh_len(track_abundance): @@ -730,12 +732,14 @@ def test_mh_jaccard_asymmetric_num(track_abundance): assert a.count_common(b) == 10 assert b.count_common(a) == 10 + # with 'jaccard', this will raise an error b/c different num with pytest.raises(TypeError): - a.compare(b) + a.jaccard(b) a = a.downsample_n(10) - assert a.compare(b) == 0.5 - assert b.compare(a) == 0.5 + # CTB note: this used to be 'compare', is now 'jaccard'; @CTB check compat + assert a.jaccard(b) == 0.5 + assert b.jaccard(a) == 0.5 def test_mh_merge_typeerror(track_abundance): @@ -759,8 +763,9 @@ def test_mh_merge(track_abundance): assert len(c) == len(d) assert c.get_mins() == d.get_mins() - assert c.compare(d) == 1.0 - assert d.compare(c) == 1.0 + # @CTB + #assert round(c.similarity(d), 3) == 1.0 + #assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_empty_num(track_abundance): @@ -777,8 +782,8 @@ def 
test_mh_merge_empty_num(track_abundance): assert len(c) assert len(c) == len(d) assert c.get_mins() == d.get_mins() - assert c.compare(d) == 1.0 - assert d.compare(c) == 1.0 + assert round(c.similarity(d), 3) == 1.0 + assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_empty_scaled(track_abundance): @@ -795,8 +800,8 @@ def test_mh_merge_empty_scaled(track_abundance): assert len(c) assert len(c) == len(d) assert c.get_mins() == d.get_mins() - assert c.compare(d) == 1.0 - assert d.compare(c) == 1.0 + assert round(c.similarity(d), 3) == 1.0 + assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_check_length(track_abundance): @@ -846,17 +851,19 @@ def test_mh_asymmetric_merge(track_abundance): assert len(c) == len(a) assert len(d) == len(b) - # can't compare different sizes without downsampling + # can't use jaccard on different nums without downsampling with pytest.raises(TypeError): - d.compare(a) + d.jaccard(a) a = a.downsample_n(d.num) print(a.get_mins()) print(d.get_mins()) - assert d.compare(a) == 1.0 + # @CTB + #assert round(d.similarity(a), 3) == 1.0 c = c.downsample_n(b.num) - assert c.compare(b) == 1.0 + # @CTB + #assert c.similarity(b) == 1.0 def test_mh_inplace_concat_asymmetric(track_abundance): @@ -882,15 +889,17 @@ def test_mh_inplace_concat_asymmetric(track_abundance): assert len(d) == len(b) try: - d.compare(a) + d.similarity(a) except TypeError as exc: assert 'must have same num' in str(exc) a = a.downsample_n(d.num) - assert d.compare(a) == 1.0 # see: d += a, above. + # @CTB + #assert d.similarity(a) == 1.0 # see: d += a, above. 
c = c.downsample_n(b.num) - assert c.compare(b) == 0.5 + # @CTB + # assert c.similarity(b) == 0.5 def test_mh_inplace_concat(track_abundance): @@ -910,8 +919,8 @@ def test_mh_inplace_concat(track_abundance): assert len(c) == len(d) assert c.get_mins() == d.get_mins() - assert c.compare(d) == 1.0 - assert d.compare(c) == 1.0 + assert round(c.similarity(d), 3) == 1.0 + assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_diff_protein(track_abundance): @@ -930,36 +939,36 @@ def test_mh_merge_diff_ksize(track_abundance): a.merge(b) -def test_mh_compare_diff_protein(track_abundance): +def test_mh_similarity_diff_protein(track_abundance): a = MinHash(20, 5, False, track_abundance=track_abundance) b = MinHash(20, 5, True, track_abundance=track_abundance) with pytest.raises(ValueError): - a.compare(b) + a.similarity(b) -def test_mh_compare_diff_ksize(track_abundance): +def test_mh_similarity_diff_ksize(track_abundance): a = MinHash(20, 5, track_abundance=track_abundance) b = MinHash(20, 6, track_abundance=track_abundance) with pytest.raises(ValueError): - a.compare(b) + a.similarity(b) -def test_mh_compare_diff_seed(track_abundance): +def test_mh_similarity_diff_seed(track_abundance): a = MinHash(20, 5, track_abundance=track_abundance, seed=1) b = MinHash(20, 5, track_abundance=track_abundance, seed=2) with pytest.raises(ValueError): - a.compare(b) + a.similarity(b) -def test_mh_compare_diff_max_hash(track_abundance): +def test_mh_similarity_diff_max_hash(track_abundance): a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5) b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10) with pytest.raises(ValueError): - a.compare(b) + a.similarity(b) def test_mh_concat_diff_protein(track_abundance): @@ -1117,33 +1126,33 @@ def test_abundance_count_common(): 10798773792509008305] -def test_abundance_compare(): +def test_abundance_similarity(): a = MinHash(20, 10, track_abundance=True) b = MinHash(20, 10, track_abundance=False) 
a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - assert a.compare(b) == 1.0 - assert b.compare(b) == 1.0 - assert b.compare(a) == 1.0 - assert a.compare(a) == 1.0 + assert round(a.similarity(b), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 + assert round(b.similarity(a), 3) == 1.0 + assert round(a.similarity(a), 3) == 1.0 # add same sequence again b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA') - assert a.compare(b) == 1.0 - assert b.compare(b) == 1.0 - assert b.compare(a) == 1.0 - assert a.compare(a) == 1.0 + assert round(a.similarity(b), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 + assert round(b.similarity(a), 3) == 1.0 + assert round(a.similarity(a), 3) == 1.0 b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT') - x = a.compare(b) + x = a.similarity(b) assert x >= 0.3, x - x = b.compare(a) + x = b.similarity(a) assert x >= 0.3, x - assert a.compare(a) == 1.0 - assert b.compare(b) == 1.0 + assert round(a.similarity(a), 3) == 1.0 + assert round(b.similarity(b), 3) == 1.0 def test_set_abundance(): From f97bf396f2635fec3789751b7305ac04acc79e4a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 26 Jul 2020 08:40:06 -0700 Subject: [PATCH 11/50] fix remaining tests in test__minhash --- tests/test__minhash.py | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 98cb015cd7..62bf6acbac 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -763,9 +763,13 @@ def test_mh_merge(track_abundance): assert len(c) == len(d) assert c.get_mins() == d.get_mins() - # @CTB - #assert round(c.similarity(d), 3) == 1.0 - #assert round(d.similarity(c), 3) == 1.0 + + if track_abundance: + assert round(c.similarity(d), 3) == 0.91 + assert round(d.similarity(c), 3) == 0.91 + else: + assert round(c.similarity(d), 3) == 1.0 + assert round(d.similarity(c), 3) == 1.0 def test_mh_merge_empty_num(track_abundance): @@ -858,12 +862,17 @@ def test_mh_asymmetric_merge(track_abundance): a = a.downsample_n(d.num) print(a.get_mins()) print(d.get_mins()) - # @CTB - #assert round(d.similarity(a), 3) == 1.0 + + if track_abundance: + assert round(d.similarity(a), 3) == 0.91 + else: + assert round(d.similarity(a), 3) == 1.0 c = c.downsample_n(b.num) - # @CTB - #assert c.similarity(b) == 1.0 + if track_abundance: + assert round(c.similarity(b), 3) == 0.91 + else: + assert c.similarity(b) == 1.0 def test_mh_inplace_concat_asymmetric(track_abundance): @@ -894,12 +903,16 @@ def test_mh_inplace_concat_asymmetric(track_abundance): assert 'must have same num' in str(exc) a = a.downsample_n(d.num) - # @CTB - #assert d.similarity(a) == 1.0 # see: d += a, above. + if track_abundance: + assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. + else: + assert d.similarity(a) == 1.0 # see: d += a, above. 
c = c.downsample_n(b.num) - # @CTB - # assert c.similarity(b) == 0.5 + if track_abundance: + assert round(c.similarity(b), 3) == 0.436 + else: + assert c.similarity(b) == 0.5 def test_mh_inplace_concat(track_abundance): From 1bd9df82b5ba3f0c66ed30d3ff8ca0719e5adeec Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 26 Jul 2020 09:02:48 -0700 Subject: [PATCH 12/50] fix compat message --- tests/test__minhash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 62bf6acbac..e4fe15d311 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -737,7 +737,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance): a.jaccard(b) a = a.downsample_n(10) - # CTB note: this used to be 'compare', is now 'jaccard'; @CTB check compat + # CTB note: this used to be 'compare', is now 'jaccard' assert a.jaccard(b) == 0.5 assert b.jaccard(a) == 0.5 From 7743a95152f8eeffa1a740c30df2bb4e8c03d14d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 27 Jul 2020 07:57:15 -0700 Subject: [PATCH 13/50] restore removed functions, sigh :) --- sourmash/minhash.py | 43 +++++++++++++++++++++++++++++++++++++----- tests/test__minhash.py | 12 ++++++------ tests/test_jaccard.py | 28 +++++++++++++-------------- 3 files changed, 58 insertions(+), 25 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index eb49cde095..48cee4ae5d 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -301,6 +301,14 @@ def get_hashes(self): def hashes(self): return self.get_mins(with_abundance=True) + def subtract_mins(self, other): + """Get the list of mins in this MinHash, after removing the ones in + ``other``. 
+ """ + a = set(self.get_mins()) + b = set(other.get_mins()) + return a - b + @property def seed(self): return self._methodcall(lib.kmerminhash_seed) @@ -309,28 +317,28 @@ def seed(self): def num(self): return self._methodcall(lib.kmerminhash_num) - @property - def max_hash(self): - return self._methodcall(lib.kmerminhash_max_hash) - @property def scaled(self): if self.max_hash: return _get_scaled_for_max_hash(self.max_hash) return 0 + # @CTB @property def is_dna(self): return not (self.is_protein or self.dayhoff or self.hp) + # @CTB @property def is_protein(self): return self._methodcall(lib.kmerminhash_is_protein) + # @CTB @property def dayhoff(self): return self._methodcall(lib.kmerminhash_dayhoff) + # @CTB @property def hp(self): return self._methodcall(lib.kmerminhash_hp) @@ -339,6 +347,10 @@ def hp(self): def ksize(self): return self._methodcall(lib.kmerminhash_ksize) + @property + def max_hash(self): + return self._methodcall(lib.kmerminhash_max_hash) + @property def track_abundance(self): return self._methodcall(lib.kmerminhash_track_abundance) @@ -391,7 +403,7 @@ def count_common(self, other, downsample=False): raise TypeError("Must be a MinHash!") return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) - def downsample_num(self, new_num): + def downsample_n(self, new_num): "Copy this object and downsample new object to num=``new_num``." if self.num and self.num < new_num: raise ValueError("new sample n is higher than current sample n") @@ -406,6 +418,17 @@ def downsample_num(self, new_num): return a + def downsample_max_hash(self, *others): + """Copy this object and downsample new object to min of ``*others``. + + Here, ``*others`` is one or more MinHash objects. 
+ """ + max_hashes = [x.max_hash for x in others] + new_max_hash = min(self.max_hash, *max_hashes) + new_scaled = get_scaled_for_max_hash(new_max_hash) + + return self.downsample_scaled(new_scaled) + def downsample_scaled(self, new_scaled): """Copy this object and downsample new object to scaled=``new_scaled``. """ @@ -566,6 +589,16 @@ def add_protein(self, sequence): "Add a protein sequence." self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence)) + def is_molecule_type(self, molecule): + """Check if this MinHash is a particular human-readable molecule type. + + Supports 'protein', 'dayhoff', 'hp', 'DNA'. + @CTB deprecate for 4.0? + """ + if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'): + raise ValueError("unknown moltype in query, '{}'".format(molecule)) + return molecule == self.moltype + @property def moltype(self): # TODO: test in minhash tests if self.is_protein: diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 81f951672e..06293883ac 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -701,7 +701,7 @@ def test_mh_count_common_notmh(track_abundance): def test_mh_downsample_num_error(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) with pytest.raises(ValueError): - a.downsample_num(30) + a.downsample_n(30) def test_mh_jaccard_asymmetric_num(track_abundance): @@ -720,7 +720,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance): with pytest.raises(TypeError): a.compare(b) - a = a.downsample_num(10) + a = a.downsample_n(10) assert a.compare(b) == 0.5 assert b.compare(a) == 0.5 @@ -837,12 +837,12 @@ def test_mh_asymmetric_merge(track_abundance): with pytest.raises(TypeError): d.compare(a) - a = a.downsample_num(d.num) + a = a.downsample_n(d.num) print(a.hashes) print(d.hashes) assert d.compare(a) == 1.0 - c = c.downsample_num(b.num) + c = c.downsample_n(b.num) assert c.compare(b) == 1.0 @@ -873,10 +873,10 @@ def test_mh_inplace_concat_asymmetric(track_abundance): except TypeError as 
exc: assert 'must have same num' in str(exc) - a = a.downsample_num(d.num) + a = a.downsample_n(d.num) assert d.compare(a) == 1.0 # see: d += a, above. - c = c.downsample_num(b.num) + c = c.downsample_n(b.num) assert c.compare(b) == 0.5 diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index 679e0723f4..43a4c355a4 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -189,18 +189,18 @@ def test_jaccard_on_real_data(): assert mh1.compare(mh2) == 0.0183 assert mh2.compare(mh1) == 0.0183 - mh1 = mh1.downsample_num(1000) - mh2 = mh2.downsample_num(1000) + mh1 = mh1.downsample_n(1000) + mh2 = mh2.downsample_n(1000) assert mh1.compare(mh2) == 0.011 assert mh2.compare(mh1) == 0.011 - mh1 = mh1.downsample_num(100) - mh2 = mh2.downsample_num(100) + mh1 = mh1.downsample_n(100) + mh2 = mh2.downsample_n(100) assert mh1.compare(mh2) == 0.01 assert mh2.compare(mh1) == 0.01 - mh1 = mh1.downsample_num(10) - mh2 = mh2.downsample_num(10) + mh1 = mh1.downsample_n(10) + mh2 = mh2.downsample_n(10) assert mh1.compare(mh2) == 0.0 assert mh2.compare(mh1) == 0.0 @@ -221,24 +221,24 @@ def test_scaled_on_real_data(): assert round(mh1.compare(mh2), 5) == 0.01644 assert round(mh2.compare(mh1), 5) == 0.01644 - mh1 = mh1.downsample_num(10000) - mh2 = mh2.downsample_num(10000) + mh1 = mh1.downsample_n(10000) + mh2 = mh2.downsample_n(10000) assert mh1.compare(mh2) == 0.0183 assert mh2.compare(mh1) == 0.0183 - mh1 = mh1.downsample_num(1000) - mh2 = mh2.downsample_num(1000) + mh1 = mh1.downsample_n(1000) + mh2 = mh2.downsample_n(1000) assert mh1.compare(mh2) == 0.011 assert mh2.compare(mh1) == 0.011 - mh1 = mh1.downsample_num(100) - mh2 = mh2.downsample_num(100) + mh1 = mh1.downsample_n(100) + mh2 = mh2.downsample_n(100) assert mh1.compare(mh2) == 0.01 assert mh2.compare(mh1) == 0.01 - mh1 = mh1.downsample_num(10) - mh2 = mh2.downsample_num(10) + mh1 = mh1.downsample_n(10) + mh2 = mh2.downsample_n(10) assert mh1.compare(mh2) == 0.0 assert mh2.compare(mh1) == 0.0 From 
70edc789cea406dc680d3665e1a45aaa677d86a3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Mon, 27 Jul 2020 07:58:46 -0700 Subject: [PATCH 14/50] minor upd --- sourmash/minhash.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 48cee4ae5d..ffea1e6ab4 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -262,7 +262,7 @@ def remove_many(self, hashes): "Remove many hashes at once; ``hashes`` must be an iterable." self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) - def update_xxx(self, other): + def update(self, other): "Update this sketch from all the hashes in the other." self.add_many(other) @@ -609,4 +609,3 @@ def moltype(self): # TODO: test in minhash tests return 'hp' else: return 'DNA' - From eb6b971ed2b62cdda0481d9aea5ecf3fdbf637e9 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 28 Jul 2020 06:12:43 -0700 Subject: [PATCH 15/50] add deprecations --- sourmash/minhash.py | 38 ++++++++++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index ffea1e6ab4..21b2cba1b1 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -243,6 +243,14 @@ def add_sequence(self, sequence, force=False): self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence), force) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use add_kmer instead.') + def add(self, kmer): + "Add a kmer into the sketch." + self.add_sequence(kmer) + + def add_kmer(self, kmer): "Add a kmer into the sketch." self.add_sequence(kmer) @@ -262,6 +270,9 @@ def remove_many(self, hashes): "Remove many hashes at once; ``hashes`` must be an iterable." 
self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes)) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use add_many instead.') def update(self, other): "Update this sketch from all the hashes in the other." self.add_many(other) @@ -270,6 +281,9 @@ def __len__(self): "Number of hashes." return self._methodcall(lib.kmerminhash_get_mins_size) + @deprecated(deprecated_in="3.5", removed_in="5.0", + current_version=VERSION, + details='Use .hashes property instead.') def get_mins(self, with_abundance=False): """Return list of hashes or if ``with_abundance`` a list of (hash, abund). @@ -293,6 +307,9 @@ def get_mins(self, with_abundance=False): return result + @deprecated(deprecated_in="3.5", removed_in="5.0", + current_version=VERSION, + details='Use .hashes property instead.') def get_hashes(self): "Return the list of hashes." return self.get_mins() @@ -301,6 +318,8 @@ def get_hashes(self): def hashes(self): return self.get_mins(with_abundance=True) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION) def subtract_mins(self, other): """Get the list of mins in this MinHash, after removing the ones in ``other``. 
@@ -323,22 +342,18 @@ def scaled(self): return _get_scaled_for_max_hash(self.max_hash) return 0 - # @CTB @property def is_dna(self): return not (self.is_protein or self.dayhoff or self.hp) - # @CTB @property def is_protein(self): return self._methodcall(lib.kmerminhash_is_protein) - # @CTB @property def dayhoff(self): return self._methodcall(lib.kmerminhash_dayhoff) - # @CTB @property def hp(self): return self._methodcall(lib.kmerminhash_hp) @@ -348,6 +363,9 @@ def ksize(self): return self._methodcall(lib.kmerminhash_ksize) @property + @deprecated(deprecated_in="3.5", removed_in="5.0", + current_version=VERSION, + details='Use scaled instead.') def max_hash(self): return self._methodcall(lib.kmerminhash_max_hash) @@ -403,6 +421,9 @@ def count_common(self, other, downsample=False): raise TypeError("Must be a MinHash!") return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use downsample(num=...) instead.') def downsample_n(self, new_num): "Copy this object and downsample new object to num=``new_num``." if self.num and self.num < new_num: @@ -418,6 +439,9 @@ def downsample_n(self, new_num): return a + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use scaled instead.') def downsample_max_hash(self, *others): """Copy this object and downsample new object to min of ``*others``. @@ -429,6 +453,9 @@ def downsample_max_hash(self, *others): return self.downsample_scaled(new_scaled) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use downsample(scaled=...) instead.') def downsample_scaled(self, new_scaled): """Copy this object and downsample new object to scaled=``new_scaled``. """ @@ -589,6 +616,9 @@ def add_protein(self, sequence): "Add a protein sequence." 
self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence)) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use the moltype property instead.') def is_molecule_type(self, molecule): """Check if this MinHash is a particular human-readable molecule type. From 464dcca6551f77122916a1383fe9ac45c9bb577c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 28 Jul 2020 16:42:46 -0700 Subject: [PATCH 16/50] use a wrapper object for .hashes and make it read-only --- sourmash/minhash.py | 31 ++++++++++++++++++++++++++++++- tests/test__minhash.py | 20 ++++++++++++++++---- 2 files changed, 46 insertions(+), 5 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 21b2cba1b1..0381dec07e 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -3,6 +3,7 @@ import math import copy +import collections from . import VERSION from ._compat import string_types, range_type @@ -73,6 +74,30 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): return lib.hash_murmur(to_bytes(kmer), seed) +class _HashesWrapper(collections.Mapping): + def __init__(self, h): + self._data = h + + def __getitem__(self, key): + print(key, self._data) + return self._data[key] + + def __repr__(self): + return repr(self._data) + + def __len__(self): + return len(self._data) + + def __iter__(self): + return iter(self._data) + + def __eq__(self, other): + return list(self.items()) == list(other.items()) + + def __setitem__(self, k, v): + raise RuntimeError("cannot modify hashes directly; use 'add' methods") + + class MinHash(RustObject): """\ The core sketch object for sourmash. 
@@ -316,7 +341,11 @@ def get_hashes(self): @property def hashes(self): - return self.get_mins(with_abundance=True) + if self.track_abundance: + return _HashesWrapper(self.get_mins(with_abundance=True)) + else: + d = self.get_mins() + return _HashesWrapper({ k : 1 for k in d }) @deprecated(deprecated_in="3.5", removed_in="4.0", current_version=VERSION) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 5dc518e6f5..2608dd5317 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -200,6 +200,8 @@ def test_dayhoff(track_abundance): mh_protein.add_sequence('ACTGAC') assert len(mh_protein.hashes) == 2 + print(mh_protein.hashes) + print(mh_dayhoff.hashes) assert mh_protein.hashes != mh_dayhoff.hashes @@ -1127,8 +1129,7 @@ def test_abundance_count_common(): assert a.count_common(b) == 1 assert a.count_common(b) == b.count_common(a) - assert b.hashes == [2110480117637990133, - 10798773792509008305] + assert list(b.hashes) == [2110480117637990133, 10798773792509008305] def test_abundance_similarity(): @@ -1235,7 +1236,7 @@ def test_reset_abundance_initialized(): # Convert from Abundance to Regular MinHash a.track_abundance = False - assert a.hashes == [12415348535738636339] + assert list(a.hashes) == [12415348535738636339] def test_set_abundance_initialized(): @@ -1462,7 +1463,7 @@ def test_add_many(track_abundance): b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) a.add_many(list(range(0, 100, 2))) - a.add_many(list(range(0, 100, 2))) + a.add_many(list(range(0, 100, 2))) # => abundance = 2 assert len(a) == 50 assert all(c % 2 == 0 for c in a.hashes) @@ -1484,3 +1485,14 @@ def test_set_abundances_huge(): abundances = itertools.repeat(2) a.set_abundances(dict(zip(hashes, abundances))) + + +def test_try_change_hashes(track_abundance): + a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000) + + a.add_many(list(range(0, 100, 2))) + + h = 
a.hashes + with pytest.raises(RuntimeError): + h[5] = 10 From 23171d9e5cf015d79c9ae020313645431534ca63 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 28 Jul 2020 16:58:56 -0700 Subject: [PATCH 17/50] refactor to use downsample(num/scaled= --- sourmash/minhash.py | 82 ++++++++++++++++++------------------- sourmash/sig/__main__.py | 4 +- tests/test_cmd_signature.py | 4 +- 3 files changed, 44 insertions(+), 46 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 0381dec07e..dbc4cf7172 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -79,7 +79,6 @@ def __init__(self, h): self._data = h def __getitem__(self, key): - print(key, self._data) return self._data[key] def __repr__(self): @@ -450,17 +449,41 @@ def count_common(self, other, downsample=False): raise TypeError("Must be a MinHash!") return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample) - @deprecated(deprecated_in="3.5", removed_in="4.0", - current_version=VERSION, - details='Use downsample(num=...) instead.') - def downsample_n(self, new_num): - "Copy this object and downsample new object to num=``new_num``." - if self.num and self.num < new_num: - raise ValueError("new sample n is higher than current sample n") + def downsample(self, num=None, scaled=None): + """Copy this object and downsample new object to either `num` or + `scaled`. 
+ """ + if num is None and scaled is None: + raise ValueError('must specify either num or scaled to downsample') + elif num is not None: + if self.num and self.num < num: + raise ValueError("new sample num is higher than current sample num") + max_hash=0 + elif scaled is not None: + if self.num: + raise ValueError("num != 0 - cannot downsample a standard MinHash") + max_hash = self.max_hash + if max_hash is None: + raise ValueError("no max_hash available - cannot downsample") + + old_scaled = _get_scaled_for_max_hash(self.max_hash) + if old_scaled > scaled: + raise ValueError( + "new scaled {} is lower than current sample scaled {}".format( + scaled, old_scaled + ) + ) + + max_hash = _get_max_hash_for_scaled(scaled) + num = 0 + ### + # create new object: a = MinHash( - new_num, self.ksize, self.is_protein, self.dayhoff, self.hp, self.track_abundance, self.seed, 0 + num, self.ksize, self.is_protein, self.dayhoff, self.hp, + self.track_abundance, self.seed, max_hash ) + # copy over hashes: if self.track_abundance: a.set_abundances(self.get_mins(with_abundance=True)) else: @@ -468,6 +491,13 @@ def downsample_n(self, new_num): return a + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use downsample(num=...) instead.') + def downsample_n(self, new_num): + "Copy this object and downsample new object to num=``new_num``." + return self.downsample(num=new_num) + @deprecated(deprecated_in="3.5", removed_in="4.0", current_version=VERSION, details='Use scaled instead.') @@ -488,39 +518,7 @@ def downsample_max_hash(self, *others): def downsample_scaled(self, new_scaled): """Copy this object and downsample new object to scaled=``new_scaled``. 
""" - if self.num: - raise ValueError("num != 0 - cannot downsample a standard MinHash") - - max_hash = self.max_hash - if max_hash is None: - raise ValueError("no max_hash available - cannot downsample") - - old_scaled = _get_scaled_for_max_hash(self.max_hash) - if old_scaled > new_scaled: - raise ValueError( - "new scaled {} is lower than current sample scaled {}".format( - new_scaled, old_scaled - ) - ) - - new_max_hash = _get_max_hash_for_scaled(new_scaled) - - a = MinHash( - 0, - self.ksize, - self.is_protein, - self.dayhoff, - self.hp, - self.track_abundance, - self.seed, - new_max_hash, - ) - if self.track_abundance: - a.set_abundances(self.get_mins(with_abundance=True)) - else: - a.add_many(self) - - return a + return self.downsample(scaled=new_scaled) @deprecated(deprecated_in="3.3", removed_in="4.0", current_version=VERSION, diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index da575394de..ae7aa6d5d6 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -727,7 +727,7 @@ def downsample(args): total_loaded += 1 if args.scaled: if mh.scaled: - mh_new = mh.downsample_scaled(args.scaled) + mh_new = mh.downsample(scaled=args.scaled) else: # try to turn a num into a scaled # first check: can we? max_hash = _get_max_hash_for_scaled(args.scaled) @@ -739,7 +739,7 @@ def downsample(args): _set_num_scaled(mh_new, 0, args.scaled) elif args.num: if mh.num: - mh_new = mh.downsample_num(args.num) + mh_new = mh.downsample(num=args.num) else: # try to turn a scaled into a num # first check: can we? 
if len(mh) < args.num: diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 41bda45fb8..c2dff1f758 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1042,7 +1042,7 @@ def test_sig_downsample_1_scaled(c): test_downsample_sig = sourmash.load_one_signature(sig47) actual_downsample_sig = sourmash.load_one_signature(out) - test_mh = test_downsample_sig.minhash.downsample_scaled(10000) + test_mh = test_downsample_sig.minhash.downsample(scaled=10000) assert actual_downsample_sig.minhash == test_mh @@ -1114,7 +1114,7 @@ def test_sig_downsample_2_num(c): test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21, select_moltype='DNA') actual_downsample_sig = sourmash.load_one_signature(out) - test_mh = test_downsample_sig.minhash.downsample_num(500) + test_mh = test_downsample_sig.minhash.downsample(num=500) assert actual_downsample_sig.minhash == test_mh From 02239d977ffb958aaa622504712b5b58085657b4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 29 Jul 2020 06:39:37 -0700 Subject: [PATCH 18/50] refactor to use downsample(scaled=...) 
--- tests/test__minhash.py | 14 +++++++------- tests/test_jaccard.py | 12 ++++++------ tests/test_lca.py | 20 ++++++++++---------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 2608dd5317..9d33c17cff 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -289,7 +289,7 @@ def test_no_downsample_scaled_if_n(track_abundance): # make sure you can't set max_n and then downsample scaled mh = MinHash(2, 4, track_abundance=track_abundance) with pytest.raises(ValueError) as excinfo: - mh.downsample_scaled(100000000) + mh.downsample(scaled=100000000) assert 'cannot downsample a standard MinHash' in str(excinfo.value) @@ -705,7 +705,7 @@ def test_mh_count_common_notmh(track_abundance): def test_mh_downsample_num_error(track_abundance): a = MinHash(20, 10, track_abundance=track_abundance) with pytest.raises(ValueError): - a.downsample_n(30) + a.downsample(num=30) def test_mh_jaccard_asymmetric_num(track_abundance): @@ -725,7 +725,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance): with pytest.raises(TypeError): a.jaccard(b) - a = a.downsample_n(10) + a = a.downsample(num=10) # CTB note: this used to be 'compare', is now 'jaccard' assert a.jaccard(b) == 0.5 assert b.jaccard(a) == 0.5 @@ -850,14 +850,14 @@ def test_mh_asymmetric_merge(track_abundance): with pytest.raises(TypeError): d.jaccard(a) - a = a.downsample_n(d.num) + a = a.downsample(num=d.num) if track_abundance: assert round(d.similarity(a), 3) == 0.91 else: assert round(d.similarity(a), 3) == 1.0 - c = c.downsample_n(b.num) + c = c.downsample(num=b.num) if track_abundance: assert round(c.similarity(b), 3) == 0.91 else: @@ -891,13 +891,13 @@ def test_mh_inplace_concat_asymmetric(track_abundance): except TypeError as exc: assert 'must have same num' in str(exc) - a = a.downsample_n(d.num) + a = a.downsample(num=d.num) if track_abundance: assert round(d.similarity(a), 3) == 0.795 # see: d += a, above. 
else: assert d.similarity(a) == 1.0 # see: d += a, above. - c = c.downsample_n(b.num) + c = c.downsample(num=b.num) if track_abundance: assert round(c.similarity(b), 3) == 0.436 else: diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index eede4d1110..34987a0c18 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -259,18 +259,18 @@ def test_scaled_on_real_data_2(): assert round(mh1.similarity(mh2), 5) == 0.01644 assert round(mh2.similarity(mh1), 5) == 0.01644 - mh1 = mh1.downsample_scaled(1000) - mh2 = mh2.downsample_scaled(1000) + mh1 = mh1.downsample(scaled=1000) + mh2 = mh2.downsample(scaled=1000) assert round(mh1.similarity(mh2), 4) == 0.0187 assert round(mh2.similarity(mh1), 4) == 0.0187 - mh1 = mh1.downsample_scaled(10000) - mh2 = mh2.downsample_scaled(10000) + mh1 = mh1.downsample(scaled=10000) + mh2 = mh2.downsample(scaled=10000) assert round(mh1.similarity(mh2), 3) == 0.01 assert round(mh2.similarity(mh1), 3) == 0.01 - mh1 = mh1.downsample_scaled(100000) - mh2 = mh2.downsample_scaled(100000) + mh1 = mh1.downsample(scaled=100000) + mh2 = mh2.downsample(scaled=100000) assert round(mh1.similarity(mh2), 2) == 0.01 assert round(mh2.similarity(mh1), 2) == 0.01 diff --git a/tests/test_lca.py b/tests/test_lca.py index ff8312d798..568661fd8e 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -322,8 +322,8 @@ def test_api_create_insert_two_then_scale(): # downsample everything to 5000 lca_db.downsample_scaled(5000) - ss.minhash = ss.minhash.downsample_scaled(5000) - ss2.minhash = ss2.minhash.downsample_scaled(5000) + ss.minhash = ss.minhash.downsample(scaled=5000) + ss2.minhash = ss2.minhash.downsample(scaled=5000) # & check... 
combined_mins = set(ss.minhash.get_mins()) @@ -342,12 +342,12 @@ def test_api_create_insert_scale_two(): lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=5000) count = lca_db.insert(ss) assert count == 1037 - assert count == len(ss.minhash.downsample_scaled(5000)) + assert count == len(ss.minhash.downsample(scaled=5000)) lca_db.insert(ss2) # downsample sigs to 5000 - ss.minhash = ss.minhash.downsample_scaled(5000) - ss2.minhash = ss2.minhash.downsample_scaled(5000) + ss.minhash = ss.minhash.downsample(scaled=5000) + ss2.minhash = ss2.minhash.downsample(scaled=5000) # & check... combined_mins = set(ss.minhash.get_mins()) @@ -431,7 +431,7 @@ def test_search_db_scaled_gt_sig_scaled(): results = db.search(sig, threshold=.01, ignore_abundance=True) match_sig = results[0][1] - sig.minhash = sig.minhash.downsample_scaled(10000) + sig.minhash = sig.minhash.downsample(scaled=10000) assert sig.minhash == match_sig.minhash @@ -439,7 +439,7 @@ def test_search_db_scaled_lt_sig_scaled(): dbfile = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(dbfile) sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) - sig.minhash = sig.minhash.downsample_scaled(100000) + sig.minhash = sig.minhash.downsample(scaled=100000) with pytest.raises(ValueError) as e: results = db.search(sig, threshold=.01, ignore_abundance=True) @@ -453,7 +453,7 @@ def test_gather_db_scaled_gt_sig_scaled(): results = db.gather(sig, threshold=.01, ignore_abundance=True) match_sig = results[0][1] - sig.minhash = sig.minhash.downsample_scaled(10000) + sig.minhash = sig.minhash.downsample(scaled=10000) assert sig.minhash == match_sig.minhash @@ -461,12 +461,12 @@ def test_gather_db_scaled_lt_sig_scaled(): dbfile = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(dbfile) sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig')) - sig.minhash = sig.minhash.downsample_scaled(100000) + sig.minhash = 
sig.minhash.downsample(scaled=100000) results = db.gather(sig, threshold=.01, ignore_abundance=True) match_sig = results[0][1] - match_sig.minhash = match_sig.minhash.downsample_scaled(100000) + match_sig.minhash = match_sig.minhash.downsample(scaled=100000) assert sig.minhash == match_sig.minhash From 07cb47491b735664f9c225aab3738e73d5473406 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 29 Jul 2020 06:42:09 -0700 Subject: [PATCH 19/50] return two deleted tests --- tests/test__minhash.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 9d33c17cff..98bbe13bcb 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -278,6 +278,13 @@ def test_max_hash_and_scaled_zero(): assert max_hash == 0 +def test_max_hash_and_scaled_error(track_abundance): + # test behavior when supplying both max_hash and scaled + with pytest.raises(ValueError): + mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35, + scaled=5) + + def test_max_hash_cannot_limit(track_abundance): # make sure you can't set both n and scaled. with pytest.raises(ValueError): @@ -1312,6 +1319,19 @@ def test_scaled_property(track_abundance): assert a.scaled == scaled +def test_mh_subtract(track_abundance): + # test subtracting two identically configured minhashes + a = MinHash(20, 10, track_abundance=track_abundance) + for i in range(0, 40, 2): + a.add_hash(i) + + b = MinHash(20, 10, track_abundance=track_abundance) + for i in range(0, 80, 4): + b.add_hash(i) + + assert a.subtract_mins(b) == set(range(2, 40, 4)) + + def test_pickle_max_hash(track_abundance): a = MinHash(0, 10, track_abundance=track_abundance, scaled=_get_scaled_for_max_hash(20)) From 9d178c950fb45a53b9274903b2c118cd577aea58 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 29 Jul 2020 06:47:08 -0700 Subject: [PATCH 20/50] fixed test that was masked by another test --- sourmash/minhash.py | 1 + tests/test__minhash.py | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index dbc4cf7172..14390a0465 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -75,6 +75,7 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): class _HashesWrapper(collections.Mapping): + "A read-only view of the hashes contained by a MinHash object." def __init__(self, h): self._data = h diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 98bbe13bcb..b6a7d0cf7d 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -251,11 +251,12 @@ def test_scaled(track_abundance): mh.add_hash(10) mh.add_hash(20) mh.add_hash(30) - assert mh.hashes == [10, 20, 30] + + assert list(mh.hashes) == [10, 20, 30] mh.add_hash(40) - assert mh.hashes == [10, 20, 30] + assert list(mh.hashes) == [10, 20, 30] mh.add_hash(36) - assert mh.hashes == [10, 20, 30] + assert list(mh.hashes) == [10, 20, 30] def test_no_scaled(track_abundance): @@ -301,7 +302,7 @@ def test_no_downsample_scaled_if_n(track_abundance): assert 'cannot downsample a standard MinHash' in str(excinfo.value) -def test_scaled(track_abundance): +def test_scaled_num_both(track_abundance): # make sure you can't set both max_n and scaled. with pytest.raises(ValueError): mh = MinHash(2, 4, track_abundance=track_abundance, scaled=2) From 3b2b35bb83e43de6cae5b4a59263970263680753 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 29 Jul 2020 07:04:06 -0700 Subject: [PATCH 21/50] add explicit check for length of kmer in add_kmer --- sourmash/minhash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 14390a0465..3f2e9cb9d7 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -275,9 +275,9 @@ def add(self, kmer): "Add a kmer into the sketch." self.add_sequence(kmer) - def add_kmer(self, kmer): "Add a kmer into the sketch." + assert len(kmer) == self.ksize self.add_sequence(kmer) def add_many(self, hashes): From f6faf89736cff787c53dcc464a8d96b5a0976e82 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 29 Jul 2020 09:22:53 -0700 Subject: [PATCH 22/50] fix ordering in hash retrieval --- tests/test__minhash.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index b6a7d0cf7d..8e7fe40438 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -236,9 +236,9 @@ def test_size_limit(track_abundance): mh.add_hash(10) mh.add_hash(20) mh.add_hash(30) - assert list(mh.hashes) == [10, 20, 30] + assert list(sorted(mh.hashes)) == [10, 20, 30] mh.add_hash(5) # -> should push 30 off end - assert list(mh.hashes) == [5, 10, 20] + assert list(sorted(mh.hashes)) == [5, 10, 20] def test_scaled(track_abundance): @@ -252,11 +252,11 @@ def test_scaled(track_abundance): mh.add_hash(20) mh.add_hash(30) - assert list(mh.hashes) == [10, 20, 30] + assert list(sorted(mh.hashes)) == [10, 20, 30] mh.add_hash(40) - assert list(mh.hashes) == [10, 20, 30] + assert list(sorted(mh.hashes)) == [10, 20, 30] mh.add_hash(36) - assert list(mh.hashes) == [10, 20, 30] + assert list(sorted(mh.hashes)) == [10, 20, 30] def test_no_scaled(track_abundance): @@ -646,7 +646,7 @@ def test_mh_len(track_abundance): for i in range(0, 40, 2): a.add_hash(i) - assert list(a.hashes) == list(range(0, 40, 2)) + assert list(sorted(a.hashes)) == 
list(range(0, 40, 2)) def test_mh_unsigned_long_long(track_abundance): @@ -759,7 +759,7 @@ def test_mh_merge(track_abundance): d = b.merge(a) assert len(c) == len(d) - assert list(c.hashes) == list(d.hashes) + assert list(sorted(c.hashes)) == list(d.hashes) if track_abundance: assert round(c.similarity(d), 3) == 0.91 @@ -783,7 +783,7 @@ def test_mh_merge_empty_num(track_abundance): assert len(c) assert len(c) == len(d) - assert list(c.hashes) == list(d.hashes) + assert list(sorted(c.hashes)) == list(sorted(d.hashes)) assert round(c.similarity(d), 3) == 1.0 assert round(d.similarity(c), 3) == 1.0 @@ -802,7 +802,7 @@ def test_mh_merge_empty_scaled(track_abundance): assert len(c) assert len(c) == len(d) - assert list(c.hashes) == list(d.hashes) + assert list(sorted(c.hashes)) == list(sorted(d.hashes)) assert round(c.similarity(d), 3) == 1.0 assert round(d.similarity(c), 3) == 1.0 @@ -1137,7 +1137,7 @@ def test_abundance_count_common(): assert a.count_common(b) == 1 assert a.count_common(b) == b.count_common(a) - assert list(b.hashes) == [2110480117637990133, 10798773792509008305] + assert list(sorted(b.hashes)) == [2110480117637990133, 10798773792509008305] def test_abundance_similarity(): @@ -1200,7 +1200,7 @@ def test_set_abundance_clear(): a.set_abundances({1: 3, 2: 4}, clear=True) b.set_abundances({1: 3, 2: 4}, clear=False) - assert list(a.hashes) == list(b.hashes) + assert list(sorted(a.hashes)) == list(sorted(b.hashes)) def test_set_abundance_clear_2(): From aa5441f37530d7269035e5be8c7f80d3bc88dd2a Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 29 Jul 2020 09:25:31 -0700 Subject: [PATCH 23/50] fix more tests for py2 --- tests/test__minhash.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 8e7fe40438..e5f8541662 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -759,7 +759,7 @@ def test_mh_merge(track_abundance): d = b.merge(a) assert len(c) == len(d) - assert list(sorted(c.hashes)) == list(d.hashes) + assert list(sorted(c.hashes)) == list(sorted(d.hashes)) if track_abundance: assert round(c.similarity(d), 3) == 0.91 @@ -1189,7 +1189,7 @@ def test_set_abundance_2(): new_mh.track_abundance = True new_mh.set_abundances(mins) - assert new_mh.hashes == mins + assert set(new_mh.hashes) == set(mins) def test_set_abundance_clear(): From 64f99e34b28f74cc8708d1efadf3bc49d357ae89 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 29 Jul 2020 17:05:23 -0700 Subject: [PATCH 24/50] add 'flatten' method to MinHash --- sourmash/minhash.py | 11 +++++++++++ sourmash/sig/__main__.py | 6 +----- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 3f2e9cb9d7..2c8c37c19a 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -555,6 +555,17 @@ def intersection(self, other, in_common=False): return common, max(size, 1) + def flatten(self): + """Return a new MinHash with track_abundance=False.""" + # create new object: + a = MinHash( + self.num, self.ksize, self.is_protein, self.dayhoff, self.hp, + False, self.seed, self.max_hash + ) + a.add_many(self) + + return a + def jaccard(self, other, downsample=False): "Calculate Jaccard similarity of two MinHash objects." 
if self.num != other.num: diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index ae7aa6d5d6..5c892668a1 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -677,11 +677,7 @@ def flatten(args): siglist = [ ss for ss in siglist if args.name in ss.name() ] for ss in siglist: - flattened_mh = ss.minhash.copy_and_clear() - flattened_mh.track_abundance = False - flattened_mh.add_many(ss.minhash.hashes) - - ss.minhash = flattened_mh + ss.minhash = ss.minhash.flatten() outlist.extend(siglist) From d6222b6e3cdfe9a9266c88e7958c19de6fd1b005 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 29 Jul 2020 17:10:09 -0700 Subject: [PATCH 25/50] add test for MinHash.flatten --- tests/test__minhash.py | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index e5f8541662..c14233432e 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1517,3 +1517,30 @@ def test_try_change_hashes(track_abundance): h = a.hashes with pytest.raises(RuntimeError): h[5] = 10 + + +def test_flatten(): + # test behavior with scaled + scaled = _get_scaled_for_max_hash(35) + mh = MinHash(0, 4, track_abundance=True, scaled=scaled) + assert mh.max_hash == 35 + + mh.add_hash(10) + mh.add_hash(10) + mh.add_hash(10) + mh.add_hash(20) + mh.add_hash(20) + mh.add_hash(30) + mh.add_hash(30) + mh.add_hash(30) + + assert mh.hashes[10] == 3 + assert mh.hashes[20] == 2 + assert mh.hashes[30] == 3 + + mh2 = mh.flatten() + + assert mh2.hashes[10] == 1 + assert mh2.hashes[20] == 1 + assert mh2.hashes[30] == 1 + assert len(mh2) == 3 From 372f4ec23469f7865902d43d584580698e6215a4 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 30 Jul 2020 06:09:40 -0700 Subject: [PATCH 26/50] add tests for add and add_kmer --- sourmash/minhash.py | 3 ++- tests/test__minhash.py | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 1 deletion(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 2c8c37c19a..5630b37f70 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -277,7 +277,8 @@ def add(self, kmer): def add_kmer(self, kmer): "Add a kmer into the sketch." - assert len(kmer) == self.ksize + if len(kmer) != self.ksize: + raise ValueError("kmer to add is not {} in length".format(self.ksize)) self.add_sequence(kmer) def add_many(self, hashes): diff --git a/tests/test__minhash.py b/tests/test__minhash.py index c14233432e..4eedf87bd2 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1544,3 +1544,47 @@ def test_flatten(): assert mh2.hashes[20] == 1 assert mh2.hashes[30] == 1 assert len(mh2) == 3 + + +def test_add_kmer(track_abundance): + # test add_kmer method + mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) + mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) + + mh1.add_sequence('ATGCGTGC') + a = mh1.hashes + + mh2.add_kmer('ATGC') + mh2.add_kmer('TGCG') + mh2.add_kmer('GCGT') + mh2.add_kmer('CGTG') + mh2.add_kmer('GTGC') + b = mh2.hashes + + assert set(a.items()) == set(b.items()) + + +def test_add_kmer_too_long(track_abundance): + # test add_kmer method - should only take length k + mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) + + with pytest.raises(ValueError): + mh1.add_kmer('ATGCGTGC') + + +def test_add_deprecated(track_abundance): + # test 'add' method, now deprecated + mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) + mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance) + + mh1.add_sequence('ATGCGTGC') + a = mh1.hashes + + mh2.add('ATGC') + mh2.add('TGCG') + mh2.add('GCGT') + mh2.add('CGTG') + mh2.add('GTGC') + b = mh2.hashes + + assert 
set(a.items()) == set(b.items()) From 4ef2505d59e5c328343f2f27dfdcfc2fcf0fae5f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 30 Jul 2020 06:21:34 -0700 Subject: [PATCH 27/50] remove nonsense test --- tests/test__minhash.py | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 4eedf87bd2..9fd4b27a07 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1257,18 +1257,6 @@ def test_set_abundance_initialized(): assert "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0] -def test_reviving_minhash(): - # simulate reading a MinHash from disk - scaled = _get_max_hash_for_scaled(184467440737095520) - mh = MinHash(0, 21, scaled=scaled, seed=42, track_abundance=False) - mins = (28945103950853965, 74690756200987412, 82962372765557409, - 93503551367950366, 106923350319729608, 135116761470196737, - 160165359281648267, 162390811417732001, 177939655451276972) - - for m in mins: - mh.add_hash(m) - - def test_set_abundance_num(): a = MinHash(2, 10, track_abundance=True) From 7b77b770f7a5dc5ac22e35f4d8938de046af9efe Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 30 Jul 2020 06:24:00 -0700 Subject: [PATCH 28/50] test the (now deprecated) get_mins function --- tests/test__minhash.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 9fd4b27a07..b9b15bf19b 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1576,3 +1576,20 @@ def test_add_deprecated(track_abundance): b = mh2.hashes assert set(a.items()) == set(b.items()) + + +def test_get_mins_deprecated(track_abundance): + mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) + mins = (28945103950853965, 74690756200987412, 82962372765557409) + + mh.add_many(mins) + mh.add_many(mins) + mh.add_many(mins) + mh.add_many(mins) + + assert set(mh.get_mins()) == set(mins) + if track_abundance: + d = mh.get_mins(with_abundance=True) + for k in mins: + assert d[k] == 4 + assert len(d) == len(mins) From 1cb391ca2f50b35e85fc62f2589104900fd0644d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 30 Jul 2020 06:25:00 -0700 Subject: [PATCH 29/50] test (deprecated) get_hashes --- tests/test__minhash.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index b9b15bf19b..67e01a5ba6 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1593,3 +1593,15 @@ def test_get_mins_deprecated(track_abundance): for k in mins: assert d[k] == 4 assert len(d) == len(mins) + + +def test_get_hashes_deprecated(track_abundance): + mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) + mins = (28945103950853965, 74690756200987412, 82962372765557409) + + mh.add_many(mins) + mh.add_many(mins) + mh.add_many(mins) + mh.add_many(mins) + + assert set(mh.get_hashes()) == set(mins) From 899ec4c65f3d6524d1a031a080add8da94039272 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 30 Jul 2020 06:37:38 -0700 Subject: [PATCH 30/50] add tests for downsample and is_molecule_type --- tests/test__minhash.py | 94 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 94 insertions(+) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 67e01a5ba6..3779179d83 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1605,3 +1605,97 @@ def test_get_hashes_deprecated(track_abundance): mh.add_many(mins) assert set(mh.get_hashes()) == set(mins) + + +def test_downsample_num(track_abundance): + # test downsample(num=...) function + mh = MinHash(10, 21, track_abundance=track_abundance) + for i in range(20): + mh.add_hash(i) + + assert mh.num == 10 + assert len(mh) == 10 + + assert list(sorted(mh.hashes)) == list(range(10)) + + mh2 = mh.downsample(num=5) + assert mh2.num == 5 + assert len(mh2) == 5 + + assert list(sorted(mh2.hashes)) == list(range(5)) + + +def test_downsample_n_deprecated(track_abundance): + # test downsample_n(...) function, now deprecated + mh = MinHash(10, 21, track_abundance=track_abundance) + for i in range(20): + mh.add_hash(i) + + assert mh.num == 10 + assert len(mh) == 10 + + assert list(sorted(mh.hashes)) == list(range(10)) + + mh2 = mh.downsample_n(5) + assert mh2.num == 5 + assert len(mh2) == 5 + + assert list(sorted(mh2.hashes)) == list(range(5)) + + +def test_downsample_scaled(track_abundance): + # test downsample(scaled...) method + mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) + + mins = (1, 2, 3, + 9223372036854775808 + 1, 9223372036854775808 + 2, + 9223372036854775808 + 3) + mh.add_many(mins) + + assert len(mh) == 6 + assert list(mh.hashes) == list(mins) + + mh2 = mh.downsample(scaled=2) + print(mh.max_hash, mh2.max_hash) + + assert len(mh2) == 3 + assert list(mh2.hashes) == list(mins[:3]) + + +def test_downsample_scaled_deprecated(track_abundance): + # test downsample_scaled(...) 
method, now deprecated + mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance) + + mins = (1, 2, 3, + 9223372036854775808 + 1, 9223372036854775808 + 2, + 9223372036854775808 + 3) + mh.add_many(mins) + + assert len(mh) == 6 + assert list(mh.hashes) == list(mins) + + mh2 = mh.downsample_scaled(2) + print(mh.max_hash, mh2.max_hash) + + assert len(mh2) == 3 + assert list(mh2.hashes) == list(mins[:3]) + + +def test_is_molecule_type_1_deprecated(track_abundance): + mh = MinHash(1, 21, track_abundance=track_abundance) + assert mh.is_molecule_type('DNA') + + +def test_is_molecule_type_2_deprecated(track_abundance): + mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True) + assert mh.is_molecule_type('protein') + + +def test_is_molecule_type_3_deprecated(track_abundance): + mh = MinHash(1, 21, track_abundance=track_abundance, hp=True) + assert mh.is_molecule_type('hp') + + +def test_is_molecule_type_4_deprecated(track_abundance): + mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True) + assert mh.is_molecule_type('dayhoff') From 6b33685bda3de62e335e9631b24a9ccde4fbdba2 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 30 Jul 2020 10:35:02 -0700 Subject: [PATCH 31/50] test moltype properties more explicitly --- tests/test__minhash.py | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 3779179d83..7a97dae309 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1681,21 +1681,42 @@ def test_downsample_scaled_deprecated(track_abundance): assert list(mh2.hashes) == list(mins[:3]) -def test_is_molecule_type_1_deprecated(track_abundance): +def test_is_molecule_type_1(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance) assert mh.is_molecule_type('DNA') + assert mh.moltype == 'DNA' + assert mh.is_dna + assert not mh.is_protein + assert not mh.hp + assert not mh.dayhoff -def test_is_molecule_type_2_deprecated(track_abundance): +def test_is_molecule_type_2(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True) assert mh.is_molecule_type('protein') + assert mh.moltype == 'protein' + assert not mh.is_dna + assert mh.is_protein + assert not mh.hp + assert not mh.dayhoff -def test_is_molecule_type_3_deprecated(track_abundance): +def test_is_molecule_type_3(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, hp=True) assert mh.is_molecule_type('hp') + assert mh.moltype == 'hp' + assert not mh.is_dna + assert not mh.is_protein + assert mh.hp + assert not mh.dayhoff + -def test_is_molecule_type_4_deprecated(track_abundance): +def test_is_molecule_type_4(track_abundance): mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True) assert mh.is_molecule_type('dayhoff') + assert mh.moltype == 'dayhoff' + assert not mh.is_dna + assert not mh.is_protein + assert not mh.hp + assert mh.dayhoff From 09068a6756ee795377ac712dd6c67ec8a4bad819 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Thu, 30 Jul 2020 12:20:51 -0700 Subject: [PATCH 32/50] fix py27 --- tests/test__minhash.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 7a97dae309..2c6e551aef 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1653,13 +1653,13 @@ def test_downsample_scaled(track_abundance): mh.add_many(mins) assert len(mh) == 6 - assert list(mh.hashes) == list(mins) + assert list(sorted(mh.hashes)) == list(mins) mh2 = mh.downsample(scaled=2) print(mh.max_hash, mh2.max_hash) assert len(mh2) == 3 - assert list(mh2.hashes) == list(mins[:3]) + assert list(sorted(mh2.hashes)) == list(mins[:3]) def test_downsample_scaled_deprecated(track_abundance): @@ -1672,13 +1672,13 @@ def test_downsample_scaled_deprecated(track_abundance): mh.add_many(mins) assert len(mh) == 6 - assert list(mh.hashes) == list(mins) + assert list(sorted(mh.hashes)) == list(mins) mh2 = mh.downsample_scaled(2) print(mh.max_hash, mh2.max_hash) assert len(mh2) == 3 - assert list(mh2.hashes) == list(mins[:3]) + assert list(sorted(mh2.hashes)) == list(mins[:3]) def test_is_molecule_type_1(track_abundance): From 2f6909c450d65082cee1a8235bca1cbe9fdd061b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 1 Aug 2020 14:08:36 -0700 Subject: [PATCH 33/50] move translate_codon to module level --- sourmash/minhash.py | 12 ++++++++++++ tests/test__minhash.py | 16 ++++++++++++++-- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 5630b37f70..5ddd64aaf6 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -74,6 +74,15 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED): return lib.hash_murmur(to_bytes(kmer), seed) +def translate_codon(codon): + "Translate a codon into an amino acid." 
+ try: + return rustcall(lib.sourmash_translate_codon, + to_bytes(codon)).decode('utf-8') + except SourmashError as e: + raise ValueError(e.message) + + class _HashesWrapper(collections.Mapping): "A read-only view of the hashes contained by a MinHash object." def __init__(self, h): @@ -433,6 +442,9 @@ def clear(self): "Clears all hashes and abundances." return self._methodcall(lib.kmerminhash_clear) + @deprecated(deprecated_in="3.5", removed_in="4.0", + current_version=VERSION, + details='Use translate_codon function at module level instead.') def translate_codon(self, codon): "Translate a codon into an amino acid." try: diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 2c6e551aef..077a639ff9 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -48,6 +48,7 @@ hash_murmur, _get_scaled_for_max_hash, _get_max_hash_for_scaled, + translate_codon ) from sourmash import signature @@ -174,8 +175,8 @@ def test_protein_hp(track_abundance, hp): assert len(mh.hashes) == 4 -def test_translate_codon(track_abundance): - # Ensure that translation occurs properly +def test_translate_codon_method_deprecated(track_abundance): + # Ensure that translation occurs properly - deprecated => module function mh = MinHash(10, 6, is_protein=True) assert mh.moltype == 'protein' @@ -188,6 +189,17 @@ def test_translate_codon(track_abundance): mh.translate_codon("TCTA") +def test_module_translate_codon(track_abundance): + # Ensure that translation occurs properly - module level function tests + assert "S" == translate_codon('TCT') + assert "S" == translate_codon('TC') + assert "X" == translate_codon("T") + + with pytest.raises(ValueError): + translate_codon("") + translate_codon("TCTA") + + def test_dayhoff(track_abundance): # verify that we can hash to dayhoff-encoded protein/aa sequences mh_dayhoff = MinHash(10, 6, is_protein=True, From 8d3c083af123e824baf5960b397a575193bdabd9 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Sun, 2 Aug 2020 08:40:58 -0700 Subject: [PATCH 34/50] put a stub in place of _minhash with a FutureWarning --- sourmash/_minhash.py | 6 ++++++ tests/test__minhash.py | 10 ++++++++++ 2 files changed, 16 insertions(+) create mode 100644 sourmash/_minhash.py diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py new file mode 100644 index 0000000000..8e41fe9149 --- /dev/null +++ b/sourmash/_minhash.py @@ -0,0 +1,6 @@ +"Legacy / deprecated; will be removed in sourmash 4.0." +import warnings + +warnings.warn("Please import from the top level sourmash module instead of using _minhash, which will be renamed in 4.x", FutureWarning) + +from .minhash import * diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 077a639ff9..6818f6f49b 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1732,3 +1732,13 @@ def test_is_molecule_type_4(track_abundance): assert not mh.is_protein assert not mh.hp assert mh.dayhoff + + +def test__minhash_import(): + from sourmash._minhash import ( + MinHash, + hash_murmur, + _get_scaled_for_max_hash, + _get_max_hash_for_scaled, + translate_codon + ) From 9a8cf64d07127249e590ca24bce0a23e9aee79ef Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 2 Aug 2020 09:01:52 -0700 Subject: [PATCH 35/50] adjust import req --- tests/test__minhash.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/test__minhash.py b/tests/test__minhash.py index 6818f6f49b..899690c941 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -1738,7 +1738,5 @@ def test__minhash_import(): from sourmash._minhash import ( MinHash, hash_murmur, - _get_scaled_for_max_hash, - _get_max_hash_for_scaled, translate_codon ) From f8c9c008b07569773ff65119614ba2fb83dcaddd Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 4 Aug 2020 11:28:03 -0700 Subject: [PATCH 36/50] remove __future__ imports --- benchmarks/benchmarks.py | 1 - setup.py | 1 - sourmash/__init__.py | 1 - sourmash/__main__.py | 1 - sourmash/_minhash.py | 2 -- sourmash/command_compute.py | 2 -- sourmash/commands.py | 2 -- sourmash/index.py | 1 - sourmash/lca/command_classify.py | 1 - sourmash/lca/command_compare_csv.py | 1 - sourmash/lca/command_gather.py | 1 - sourmash/lca/command_index.py | 1 - sourmash/lca/command_rankinfo.py | 1 - sourmash/lca/command_summarize.py | 1 - sourmash/lca/lca_db.py | 1 - sourmash/lca/lca_utils.py | 1 - sourmash/logging.py | 1 - sourmash/nodegraph.py | 1 - sourmash/sbt.py | 1 - sourmash/sbt_storage.py | 2 -- sourmash/sbtmh.py | 3 --- sourmash/search.py | 1 - sourmash/sig/__main__.py | 1 - sourmash/signature.py | 2 -- tests/sourmash_tst_utils.py | 1 - tests/test__minhash.py | 3 --- tests/test_api.py | 1 - tests/test_bugs.py | 1 - tests/test_cmd_signature.py | 1 - tests/test_index.py | 2 -- tests/test_jaccard.py | 2 -- tests/test_lca.py | 1 - tests/test_sbt.py | 2 -- tests/test_signature.py | 2 -- tests/test_sourmash.py | 1 - tests/test_sourmash_compute.py | 1 - 36 files changed, 49 deletions(-) diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py index cb2ae91ddf..4cbde86b39 100644 --- a/benchmarks/benchmarks.py +++ b/benchmarks/benchmarks.py @@ -1,4 +1,3 @@ -from __future__ import unicode_literals import random diff --git a/setup.py b/setup.py index 290325174d..c748f3eef3 100644 --- a/setup.py +++ b/setup.py @@ -1,4 +1,3 @@ -from __future__ import print_function import os from setuptools import setup, find_packages import sys diff --git a/sourmash/__init__.py b/sourmash/__init__.py index dca58d86d2..d95234a3ae 100644 --- a/sourmash/__init__.py +++ b/sourmash/__init__.py @@ -2,7 +2,6 @@ """ An implementation of a MinHash bottom sketch, applied to k-mers in DNA. 
""" -from __future__ import print_function import re import math import os diff --git a/sourmash/__main__.py b/sourmash/__main__.py index 70428bb349..ef6b8665c4 100644 --- a/sourmash/__main__.py +++ b/sourmash/__main__.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sourmash diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py index eee206db88..68760e296b 100644 --- a/sourmash/_minhash.py +++ b/sourmash/_minhash.py @@ -1,6 +1,4 @@ # -*- coding: UTF-8 -*- -from __future__ import unicode_literals, division - import math import copy diff --git a/sourmash/command_compute.py b/sourmash/command_compute.py index b3ebc199e4..bd63dcf3d0 100644 --- a/sourmash/command_compute.py +++ b/sourmash/command_compute.py @@ -1,8 +1,6 @@ """ Functions implementing the 'compute' command and related functions. """ -from __future__ import print_function, division, absolute_import - import os import os.path import sys diff --git a/sourmash/commands.py b/sourmash/commands.py index 7ae42f32fb..7adccc7152 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -1,8 +1,6 @@ """ Functions implementing the main command-line subcommands. """ -from __future__ import print_function, division, absolute_import - import csv import os import os.path diff --git a/sourmash/index.py b/sourmash/index.py index 2a9eb8faef..9b33fe5899 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,6 +1,5 @@ "An Abstract Base Class for collections of signatures." -from __future__ import division from abc import abstractmethod from collections import namedtuple diff --git a/sourmash/lca/command_classify.py b/sourmash/lca/command_classify.py index 568040c56d..817ddcaf2d 100644 --- a/sourmash/lca/command_classify.py +++ b/sourmash/lca/command_classify.py @@ -2,7 +2,6 @@ """ Classify individual signature files down to deepest possible node. 
""" -from __future__ import print_function import sys import csv diff --git a/sourmash/lca/command_compare_csv.py b/sourmash/lca/command_compare_csv.py index 70e4780fba..3182cba9ef 100644 --- a/sourmash/lca/command_compare_csv.py +++ b/sourmash/lca/command_compare_csv.py @@ -2,7 +2,6 @@ """ Compare two taxonomy spreadsheets. """ -from __future__ import print_function import sys from collections import defaultdict diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py index 812c6b3b41..3ae54fc81e 100644 --- a/sourmash/lca/command_gather.py +++ b/sourmash/lca/command_gather.py @@ -4,7 +4,6 @@ Mimics `sourmash gather` but provides taxonomic information. """ -from __future__ import print_function, division import sys import csv from collections import Counter, defaultdict, namedtuple diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py index 78b6a0663f..6735f6c290 100644 --- a/sourmash/lca/command_index.py +++ b/sourmash/lca/command_index.py @@ -2,7 +2,6 @@ """ Build a lowest-common-ancestor database with given taxonomy and genome sigs. """ -from __future__ import print_function import sys import csv from collections import defaultdict diff --git a/sourmash/lca/command_rankinfo.py b/sourmash/lca/command_rankinfo.py index cb1454ba7e..081f1bf481 100644 --- a/sourmash/lca/command_rankinfo.py +++ b/sourmash/lca/command_rankinfo.py @@ -2,7 +2,6 @@ """ Summarize rank-specific information from LCAs in one or more databases. """ -from __future__ import print_function import sys from collections import defaultdict diff --git a/sourmash/lca/command_summarize.py b/sourmash/lca/command_summarize.py index 74155a2d6c..efbb6560df 100644 --- a/sourmash/lca/command_summarize.py +++ b/sourmash/lca/command_summarize.py @@ -2,7 +2,6 @@ """ Summarize the taxonomic content of the given signatures, combined. 
""" -from __future__ import print_function import sys import csv from collections import defaultdict diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py index 6c9ecea3cd..b7d9958a75 100644 --- a/sourmash/lca/lca_db.py +++ b/sourmash/lca/lca_db.py @@ -1,6 +1,5 @@ "LCA database class and utilities." -from __future__ import print_function, division import json import gzip from collections import OrderedDict, defaultdict, Counter diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py index 98186fc603..3f8478e421 100644 --- a/sourmash/lca/lca_utils.py +++ b/sourmash/lca/lca_utils.py @@ -1,7 +1,6 @@ """ Utility functions for lowest-common-ancestor analysis tools. """ -from __future__ import print_function, division from os.path import exists from collections import namedtuple, defaultdict, Counter diff --git a/sourmash/logging.py b/sourmash/logging.py index 2c1de32d2a..49c3dc26b3 100644 --- a/sourmash/logging.py +++ b/sourmash/logging.py @@ -1,4 +1,3 @@ -from __future__ import print_function import sys from io import StringIO diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py index c865a3c7c3..ccaf35697c 100644 --- a/sourmash/nodegraph.py +++ b/sourmash/nodegraph.py @@ -1,5 +1,4 @@ # -*- coding: UTF-8 -*- -from __future__ import unicode_literals, division, print_function from struct import pack, unpack import sys diff --git a/sourmash/sbt.py b/sourmash/sbt.py index 25226402ee..3b7a08de6b 100644 --- a/sourmash/sbt.py +++ b/sourmash/sbt.py @@ -41,7 +41,6 @@ def search_transcript(node, seq, threshold): return 0 """ -from __future__ import print_function, unicode_literals, division from collections import namedtuple try: diff --git a/sourmash/sbt_storage.py b/sourmash/sbt_storage.py index 88d79e4886..4cc9b9baaf 100644 --- a/sourmash/sbt_storage.py +++ b/sourmash/sbt_storage.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals, division - import abc from io import BytesIO import os diff --git a/sourmash/sbtmh.py 
b/sourmash/sbtmh.py index 751be6cdcd..4dd6cc8d00 100644 --- a/sourmash/sbtmh.py +++ b/sourmash/sbtmh.py @@ -1,6 +1,3 @@ -from __future__ import print_function -from __future__ import division - from io import BytesIO import sys diff --git a/sourmash/search.py b/sourmash/search.py index ad2da3a92b..dbd08c873b 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -1,4 +1,3 @@ -from __future__ import division from collections import namedtuple import sys diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py index 0d1cd0a258..f4cc183fc9 100644 --- a/sourmash/sig/__main__.py +++ b/sourmash/sig/__main__.py @@ -1,7 +1,6 @@ """ Command-line entry point for 'python -m sourmash.sig' """ -from __future__ import print_function, unicode_literals import sys import csv import json diff --git a/sourmash/signature.py b/sourmash/signature.py index 4bcd9293cc..9ef3c466a7 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -2,8 +2,6 @@ """ Save and load MinHash sketches in a JSON format, along with some metadata. """ -from __future__ import print_function - import sys import os import weakref diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py index 34ce47513f..cf33c89b49 100644 --- a/tests/sourmash_tst_utils.py +++ b/tests/sourmash_tst_utils.py @@ -1,6 +1,5 @@ "Various utilities used by sourmash tests." 
-from __future__ import print_function import sys import os import tempfile diff --git a/tests/test__minhash.py b/tests/test__minhash.py index e4fe15d311..3cca807a90 100644 --- a/tests/test__minhash.py +++ b/tests/test__minhash.py @@ -33,9 +33,6 @@ # Contact: titus@idyll.org # pylint: disable=missing-docstring,protected-access -from __future__ import print_function -from __future__ import absolute_import, unicode_literals - import itertools import pickle import math diff --git a/tests/test_api.py b/tests/test_api.py index 243cc7ca62..fd8a66cabd 100644 --- a/tests/test_api.py +++ b/tests/test_api.py @@ -1,4 +1,3 @@ -from __future__ import print_function, unicode_literals import pytest import sourmash diff --git a/tests/test_bugs.py b/tests/test_bugs.py index d1276cfe28..20608f0a0d 100644 --- a/tests/test_bugs.py +++ b/tests/test_bugs.py @@ -1,4 +1,3 @@ -from __future__ import print_function, unicode_literals from . import sourmash_tst_utils as utils def test_bug_781(): diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 73a8aa0706..9d06624477 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1,7 +1,6 @@ """ Tests for the 'sourmash signature' command line. """ -from __future__ import print_function, unicode_literals import csv import shutil import os diff --git a/tests/test_index.py b/tests/test_index.py index 3904030663..8a335d24f0 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals - import glob import os import zipfile diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py index 99716d496e..f48c014949 100644 --- a/tests/test_jaccard.py +++ b/tests/test_jaccard.py @@ -3,8 +3,6 @@ objects. """ -from __future__ import print_function, unicode_literals - import pytest from sourmash import MinHash from . 
import sourmash_tst_utils as utils diff --git a/tests/test_lca.py b/tests/test_lca.py index ff8312d798..7285952441 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -1,7 +1,6 @@ """ Tests for the 'sourmash lca' command line and high level API. """ -from __future__ import print_function, unicode_literals import os import shutil import csv diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 05f654584a..827bcd5e4b 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals - import json import shutil import os diff --git a/tests/test_signature.py b/tests/test_signature.py index 94ef3770e0..d1ea5062f3 100644 --- a/tests/test_signature.py +++ b/tests/test_signature.py @@ -1,5 +1,3 @@ -from __future__ import print_function, unicode_literals - import os import pytest diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 251ed2b2b0..3735f4b6b7 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -1,7 +1,6 @@ """ Tests for the 'sourmash' command line. """ -from __future__ import print_function, unicode_literals import os import gzip import shutil diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py index bc2ee59ef1..efa67d53a0 100644 --- a/tests/test_sourmash_compute.py +++ b/tests/test_sourmash_compute.py @@ -1,7 +1,6 @@ """ Tests for sourmash compute command-line functionality. """ -from __future__ import print_function, unicode_literals import os import gzip import shutil From 5d8602096c67b182d97affbb95ee18e3b289dbc5 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 4 Aug 2020 11:29:45 -0700 Subject: [PATCH 37/50] remove sys.version checks for py 2 --- sourmash/lca/command_index.py | 2 -- sourmash/signature.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py index 6735f6c290..f43a20755f 100644 --- a/sourmash/lca/command_index.py +++ b/sourmash/lca/command_index.py @@ -24,8 +24,6 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2, lineage tuples. """ mode = 'rt' - if sys.version_info < (3, ): - mode = 'rtU' # parse spreadsheet! fp = open(filename, mode) diff --git a/sourmash/signature.py b/sourmash/signature.py index 9ef3c466a7..421a5c67ce 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -268,8 +268,7 @@ def load_signatures( try: if input_type == SigInput.FILE_LIKE: if hasattr(data, "mode") and "t" in data.mode: # need to reopen handler as binary - if sys.version_info >= (3,): - data = data.buffer + data = data.buffer buf = data.read() data.close() From 274be2e69791498426281cd06809a88e365c4ba8 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 4 Aug 2020 11:30:30 -0700 Subject: [PATCH 38/50] remove requirement for enum34 --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index c748f3eef3..399afafd7e 100644 --- a/setup.py +++ b/setup.py @@ -62,7 +62,6 @@ def build_native(spec): ] }, "install_requires": ['screed>=0.9', 'cffi>=1.14.0', 'numpy', - 'enum34; python_version < "3.4"', 'matplotlib', 'scipy', 'deprecation>=2.0.6'], "setup_requires": [ "setuptools>=38.6.0", From cace054e4b532815fcb91112d913a15dd71c0766 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Tue, 4 Aug 2020 13:44:52 -0700 Subject: [PATCH 39/50] remove __reduce__ from MinHash class (#1144) --- sourmash/_minhash.py | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py index eee206db88..cbdaa92eac 100644 --- a/sourmash/_minhash.py +++ b/sourmash/_minhash.py @@ -202,24 +202,6 @@ def __setstate__(self, tup): else: self.add_many(mins) - def __reduce__(self): - "alternative pickling protocol." - return ( - MinHash, - ( - self.num, - self.ksize, - self.is_protein, - self.dayhoff, - self.hp, - self.track_abundance, - self.seed, - self.max_hash, - self.get_mins(with_abundance=self.track_abundance), - 0, - ), - ) - def __eq__(self, other): "equality testing via ==" return self.__getstate__() == other.__getstate__() From f10d632e89567bd5bcf567e3fa3fe0ba95f3a20f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 05:25:49 -0700 Subject: [PATCH 40/50] avoid the DeprecationWarning --- sourmash/minhash.py | 5 +++-- tests/test_sourmash.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index 1bba1db6f9..66750ca4e9 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -359,8 +359,9 @@ def num(self): @property def scaled(self): - if self.max_hash: - return _get_scaled_for_max_hash(self.max_hash) + mx = self._methodcall(lib.kmerminhash_max_hash) + if mx: + return _get_scaled_for_max_hash(mx) return 0 @property diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 251ed2b2b0..f64927cc42 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4002,7 +4002,7 @@ def test_do_sourmash_index_zipfile_append(c): c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip', *first_half) # UserWarning is raised when there are duplicated entries in the zipfile - assert not record + assert not record, record outfile = c.output('zzz.sbt.zip') assert os.path.exists(outfile) From 
57679d215ff058e7a4eacbc92adf9c8dca7fbc0d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 05:42:16 -0700 Subject: [PATCH 41/50] update docs: only python 3.7 and 3.8 --- README.md | 2 +- doc/developer.md | 2 +- doc/requirements.md | 5 ++--- setup.py | 5 ++--- 4 files changed, 6 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 93ced683f9..852567db76 100644 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t ### Requirements -sourmash runs under both Python 2.7.x and Python 3.5+. The base +sourmash runs under Python 3.7 and later. The base requirements are screed, cffi, numpy, matplotlib, and scipy. Conda (see below) will install everything necessary, and is our recommended installation method. diff --git a/doc/developer.md b/doc/developer.md index 52d80eac1e..3d4cb35189 100644 --- a/doc/developer.md +++ b/doc/developer.md @@ -7,7 +7,7 @@ You can get the latest development master branch with: ``` git clone https://github.com/dib-lab/sourmash.git ``` -sourmash runs under both Python 2.7.x and Python 3.5+. The base +sourmash runs under Python 3.7 and later. The base requirements are screed and cffi, together with a Rust environment (for the extension code). We suggest using `rustup` to install the Rust environment: diff --git a/doc/requirements.md b/doc/requirements.md index dd95ea8dd7..60d545a3d9 100644 --- a/doc/requirements.md +++ b/doc/requirements.md @@ -1,6 +1,5 @@ # Computational requirements - sourmash has no particular memory requirements; it will need to hold the largest single sequence you have in memory, but the individual signatures are quite small and we do no special buffer allocation. @@ -11,8 +10,8 @@ in a second or so on a rather slow 2016 Mac laptop. MinHash sketches and signatures are quite small on disk. -sourmash should run with little modification on Linux and Mac OS X, -under Python 2.7.11 and Python 3.5. 
Please see [the development repository README][0] +sourmash should run with no modification on Linux and Mac OS X, +under Python 3.7 and later. Please see [the development repository README][0] for information on source code, tests, and continuous integration. [0]:https://github.com/dib-lab/sourmash/blob/master/README.md diff --git a/setup.py b/setup.py index 399afafd7e..0a1cbc8da2 100644 --- a/setup.py +++ b/setup.py @@ -36,9 +36,8 @@ def build_native(spec): "Operating System :: POSIX :: Linux", "Operating System :: MacOS :: MacOS X", "Programming Language :: Rust", - "Programming Language :: Python :: 2.7", - "Programming Language :: Python :: 3.5", - "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Programming Language :: Python :: 3.8", "Topic :: Scientific/Engineering :: Bio-Informatics", ] From d00e77d881ee14ccc0a3599b1c85ec24b5427a2f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 05:43:48 -0700 Subject: [PATCH 42/50] remove 2.7 from travis --- .travis.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 5c6a228bcf..ae968cea90 100644 --- a/.travis.yml +++ b/.travis.yml @@ -110,8 +110,6 @@ jobs: python: 3.7 env: - TOXENV=docs - - <<: *test - python: 2.7 - &wheel stage: build wheel and send to github releases From fef2c6464977a9e4c5b75d937a11d6d08a4e5330 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 5 Aug 2020 05:54:06 -0700 Subject: [PATCH 43/50] remove _compat from signature.py --- sourmash/signature.py | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/sourmash/signature.py b/sourmash/signature.py index 421a5c67ce..695ba74769 100644 --- a/sourmash/signature.py +++ b/sourmash/signature.py @@ -12,7 +12,6 @@ from ._minhash import to_bytes from ._lowlevel import ffi, lib from .utils import RustObject, rustcall, decode_str -from ._compat import PY2 SIGNATURE_VERSION = 0.4 @@ -208,12 +207,6 @@ def _detect_input_type(data): try: if data.find("sourmash_signature") > 0: return SigInput.BUFFER - elif PY2: - try: - if data.startswith(b'\x1F\x8B'): # gzip compressed - return SigInput.BUFFER - except UnicodeDecodeError: - pass except TypeError: if data.find(b"sourmash_signature") > 0: return SigInput.BUFFER @@ -286,7 +279,7 @@ def load_signatures( ) if input_type == SigInput.BUFFER: - if hasattr(data, "encode") and not PY2: + if hasattr(data, "encode"): data = data.encode("utf-8") sigs_ptr = rustcall( From 5745db13f8a8bf0e494b1c7fdacf243c25ee5dc6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 05:56:33 -0700 Subject: [PATCH 44/50] remove _compat from exceptions.py --- sourmash/exceptions.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/sourmash/exceptions.py b/sourmash/exceptions.py index 6f73a59593..4895c50947 100644 --- a/sourmash/exceptions.py +++ b/sourmash/exceptions.py @@ -1,4 +1,3 @@ -from ._compat import implements_to_string from ._lowlevel import lib @@ -6,7 +5,6 @@ exceptions_by_code = {} -@implements_to_string class SourmashError(Exception): code = None From 1fe766865a1086a83847a40685dfd10dcbf55d46 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 5 Aug 2020 05:58:45 -0700 Subject: [PATCH 45/50] remove _compat from index and sbt_storage --- sourmash/index.py | 4 +--- sourmash/sbt_storage.py | 3 +-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/sourmash/index.py b/sourmash/index.py index 9b33fe5899..d8fb3ce8ca 100644 --- a/sourmash/index.py +++ b/sourmash/index.py @@ -1,10 +1,8 @@ "An Abstract Base Class for collections of signatures." -from abc import abstractmethod +from abc import abstractmethod, ABC from collections import namedtuple -from ._compat import ABC - class Index(ABC): @abstractmethod diff --git a/sourmash/sbt_storage.py b/sourmash/sbt_storage.py index 4cc9b9baaf..4cd4cccc5f 100644 --- a/sourmash/sbt_storage.py +++ b/sourmash/sbt_storage.py @@ -6,8 +6,7 @@ import tarfile from tempfile import NamedTemporaryFile import zipfile - -from ._compat import ABC +from abc import ABC class Storage(ABC): From 26e4c0d8f6c845d616e2377109f2d184bf3f130a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 06:01:11 -0700 Subject: [PATCH 46/50] remove _compat from nodegraph --- sourmash/nodegraph.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py index ccaf35697c..b3270ab62d 100644 --- a/sourmash/nodegraph.py +++ b/sourmash/nodegraph.py @@ -4,7 +4,6 @@ import sys from tempfile import NamedTemporaryFile -from ._compat import string_types, range_type from ._lowlevel import ffi, lib from ._minhash import to_bytes, MinHash from .utils import RustObject, rustcall, decode_str @@ -51,12 +50,12 @@ def update(self, other): raise TypeError("Must be a Nodegraph or MinHash") def count(self, h): - if isinstance(h, string_types): + if isinstance(h, str): return self._methodcall(lib.nodegraph_count_kmer, to_bytes(h)) return self._methodcall(lib.nodegraph_count, h) def get(self, h): - if isinstance(h, string_types): + if isinstance(h, str): return self._methodcall(lib.nodegraph_get_kmer, to_bytes(h)) 
return self._methodcall(lib.nodegraph_get, h) From c5f1c4395b8ebf8a5eba302cead2bc46dd751ed0 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 06:02:49 -0700 Subject: [PATCH 47/50] remove _compat completely --- sourmash/_compat.py | 27 --------------------------- sourmash/_minhash.py | 5 ++--- 2 files changed, 2 insertions(+), 30 deletions(-) delete mode 100644 sourmash/_compat.py diff --git a/sourmash/_compat.py b/sourmash/_compat.py deleted file mode 100644 index 90f7afabf2..0000000000 --- a/sourmash/_compat.py +++ /dev/null @@ -1,27 +0,0 @@ -import abc -import sys - - -PY2 = sys.version_info[0] == 2 - -if PY2: - text_type = unicode - int_types = (int, long) - string_types = (str, unicode) - range_type = xrange - itervalues = lambda x: x.itervalues() - NUL = '\x00' - def implements_to_string(cls): - cls.__unicode__ = cls.__str__ - cls.__str__ = lambda x: x.__unicode__().encode('utf-8') - return cls - ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()}) -else: - text_type = str - int_types = (int,) - string_types = (str,) - range_type = range - itervalues = lambda x: x.values() - NUL = 0 - implements_to_string = lambda x: x - from abc import ABC diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py index 68760e296b..f528389cd5 100644 --- a/sourmash/_minhash.py +++ b/sourmash/_minhash.py @@ -3,7 +3,6 @@ import copy from . import VERSION -from ._compat import string_types, range_type from ._lowlevel import ffi, lib from .utils import RustObject, rustcall, decode_str from .exceptions import SourmashError @@ -52,10 +51,10 @@ def to_bytes(s): if isinstance(s, bytes): return s - if not isinstance(s, string_types + (bytes, int)): + if not isinstance(s, (str, bytes, int)): raise TypeError("Requires a string-like sequence") - if isinstance(s, string_types): + if isinstance(s, str): s = s.encode("utf-8") elif isinstance(s, int): s = bytes([s]) From ac6e2fcfc8fbff120a618d5ce7ed6f5e7210951d Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 5 Aug 2020 06:11:44 -0700 Subject: [PATCH 48/50] make signature -> sig in CLI using py3 'aliases' --- sourmash/cli/__init__.py | 2 +- sourmash/cli/sig/__init__.py | 2 +- sourmash/cli/signature/__init__.py | 45 ------------------------------ 3 files changed, 2 insertions(+), 47 deletions(-) delete mode 100644 sourmash/cli/signature/__init__.py diff --git a/sourmash/cli/__init__.py b/sourmash/cli/__init__.py index 083d097f7e..124427a55d 100644 --- a/sourmash/cli/__init__.py +++ b/sourmash/cli/__init__.py @@ -34,7 +34,7 @@ # Subcommand groups from . import lca from . import sig -from . import signature +from . import sig as signature from . import storage diff --git a/sourmash/cli/sig/__init__.py b/sourmash/cli/sig/__init__.py index 5d7d1dc4fa..36a224ef86 100644 --- a/sourmash/cli/sig/__init__.py +++ b/sourmash/cli/sig/__init__.py @@ -25,7 +25,7 @@ def subparser(subparsers): - subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) + subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['signature']) desc = 'Operations\n' clidir = os.path.dirname(__file__) ops = command_list(clidir) diff --git a/sourmash/cli/signature/__init__.py b/sourmash/cli/signature/__init__.py deleted file mode 100644 index 7bc2d0ab7f..0000000000 --- a/sourmash/cli/signature/__init__.py +++ /dev/null @@ -1,45 +0,0 @@ -"""Define the command line interface for sourmash signature. - -Copy commands over from 'sourmash sig'. - -This can be removed once Python 2.7 is no longer supported, in favor of an -'aliases' argument to add_subparser in ../sig/__init__.py. 
-""" - -from ..sig import cat -from ..sig import split -from ..sig import describe -from ..sig import downsample -from ..sig import extract -from ..sig import filter -from ..sig import flatten -from ..sig import intersect -from ..sig import merge -from ..sig import rename -from ..sig import subtract -from ..sig import ingest -from ..sig import export -from ..sig import overlap -from ..utils import command_list -from argparse import SUPPRESS, RawDescriptionHelpFormatter -import os -import sys - - -def subparser(subparsers): - subparser = subparsers.add_parser('signature', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS) - desc = 'Operations\n' - clidir = os.path.join(os.path.dirname(__file__), '../sig/') - ops = command_list(clidir) - for subcmd in ops: - docstring = getattr(sys.modules[__name__], subcmd).__doc__ - helpstring = 'sourmash signature {op:s} --help'.format(op=subcmd) - desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) - s = subparser.add_subparsers( - title='Manipulate signature files', dest='subcmd', metavar='subcmd', help=SUPPRESS, - description=desc - ) - for subcmd in ops: - getattr(sys.modules[__name__], subcmd).subparser(s) - subparser._action_groups.reverse() - subparser._optionals.title = 'Options' From 310b2676aef046934deafbb9b5ee8239ed047500 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Wed, 5 Aug 2020 06:12:54 -0700 Subject: [PATCH 49/50] put back assert that didn't work in py2 --- tests/test_cmd_signature.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 9d06624477..bd4a4b2db7 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -16,16 +16,14 @@ def test_run_sourmash_signature_cmd(): status, out, err = utils.runscript('sourmash', ['signature'], fail_ok=True) assert not 'sourmash: error: argument cmd: invalid choice:' in err - # doesn't work in py2.7 - # assert 'Manipulate signature files:' in out + assert 'Manipulate signature files:' in out assert status != 0 # no args provided, ok ;) def test_run_sourmash_sig_cmd(): status, out, err = utils.runscript('sourmash', ['sig'], fail_ok=True) assert not 'sourmash: error: argument cmd: invalid choice:' in err - # doesn't work in py2.7 - # assert 'Manipulate signature files:' in out + assert 'Manipulate signature files:' in out assert status != 0 # no args provided, ok ;) From 91de874456bd50e7f68430bf16987224fb6a7051 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 5 Aug 2020 15:46:30 -0700 Subject: [PATCH 50/50] Update sourmash/minhash.py Co-authored-by: Luiz Irber --- sourmash/minhash.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sourmash/minhash.py b/sourmash/minhash.py index d334d854ac..915a32c996 100644 --- a/sourmash/minhash.py +++ b/sourmash/minhash.py @@ -504,7 +504,7 @@ def downsample_max_hash(self, *others): """ max_hashes = [x.max_hash for x in others] new_max_hash = min(self.max_hash, *max_hashes) - new_scaled = get_scaled_for_max_hash(new_max_hash) + new_scaled = _get_scaled_for_max_hash(new_max_hash) return self.downsample_scaled(new_scaled)