From 883732cafa560fef6ac8b66d9e5b3777729209e8 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 11:58:06 -0700
Subject: [PATCH 01/50] move sourmash._minhash to sourmash.minhash

---
 benchmarks/benchmarks.py             | 5 +----
 doc/conf.py                          | 2 +-
 doc/developer.md                     | 2 +-
 sourmash/__init__.py                 | 2 +-
 sourmash/cli/compute.py              | 2 +-
 sourmash/lca/lca_db.py               | 2 +-
 sourmash/{_minhash.py => minhash.py} | 0
 sourmash/nodegraph.py                | 2 +-
 sourmash/search.py                   | 2 +-
 sourmash/sig/__main__.py             | 6 +++---
 sourmash/signature.py                | 2 +-
 tests/test__minhash.py               | 2 +-
 tests/test__minhash_hypothesis.py    | 2 +-
 tests/test_rustobj.py                | 2 +-
 14 files changed, 15 insertions(+), 18 deletions(-)
 rename sourmash/{_minhash.py => minhash.py} (100%)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index cb2ae91ddf..d9bdfaf6a9 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -2,10 +2,7 @@
 import random
 
 
-try:
-    from sourmash._minhash import MinHash
-except:
-    from sourmash.minhash import MinHash
+from sourmash.minhash import MinHash
 
 
 def load_sequences():
diff --git a/doc/conf.py b/doc/conf.py
index 87dbf3ad88..802fadc71b 100644
--- a/doc/conf.py
+++ b/doc/conf.py
@@ -297,4 +297,4 @@
 # If true, do not generate a @detailmenu in the "Top" node's menu.
 #texinfo_no_detailmenu = False
 
-autodoc_mock_imports = ["sourmash._minhash"]
+autodoc_mock_imports = ["sourmash.minhash"]
diff --git a/doc/developer.md b/doc/developer.md
index 35aec5345b..561f6b8857 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -34,7 +34,7 @@ run the Rust tests.
 
 ### If you're having trouble installing or using the development environment
 
-If you are getting an error that contains `ImportError: cannot import name 'to_bytes' from 'sourmash._minhash'`, then it's likely you need to update Rust and clean up your environment. Some installation issues can be solved by simply removing the intermediate build files with: 
+If you are getting an error that contains `ImportError: cannot import name 'to_bytes' from 'sourmash.minhash'`, then it's likely you need to update Rust and clean up your environment. Some installation issues can be solved by simply removing the intermediate build files with: 
 
 ```
 make clean
diff --git a/sourmash/__init__.py b/sourmash/__init__.py
index dca58d86d2..e50b7f7f42 100644
--- a/sourmash/__init__.py
+++ b/sourmash/__init__.py
@@ -25,7 +25,7 @@
             "use the PyPI ones."
         )
 
-from ._minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash
+from .minhash import MinHash, get_minhash_default_seed, get_minhash_max_hash
 
 DEFAULT_SEED = get_minhash_default_seed()
 MAX_HASH = get_minhash_max_hash()
diff --git a/sourmash/cli/compute.py b/sourmash/cli/compute.py
index cdfcfa7645..d5e959e0a5 100644
--- a/sourmash/cli/compute.py
+++ b/sourmash/cli/compute.py
@@ -28,7 +28,7 @@
 
 from argparse import FileType
 
-from sourmash._minhash import get_minhash_default_seed
+from sourmash.minhash import get_minhash_default_seed
 from sourmash.cli.utils import add_construct_moltype_args
 
 
diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
index 6c9ecea3cd..1430637090 100644
--- a/sourmash/lca/lca_db.py
+++ b/sourmash/lca/lca_db.py
@@ -7,7 +7,7 @@
 import functools
 
 import sourmash
-from sourmash._minhash import get_max_hash_for_scaled
+from sourmash.minhash import get_max_hash_for_scaled
 from sourmash.logging import notify, error, debug
 from sourmash.index import Index
 
diff --git a/sourmash/_minhash.py b/sourmash/minhash.py
similarity index 100%
rename from sourmash/_minhash.py
rename to sourmash/minhash.py
diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py
index c865a3c7c3..ec0165dca4 100644
--- a/sourmash/nodegraph.py
+++ b/sourmash/nodegraph.py
@@ -7,7 +7,7 @@
 
 from ._compat import string_types, range_type
 from ._lowlevel import ffi, lib
-from ._minhash import to_bytes, MinHash
+from .minhash import to_bytes, MinHash
 from .utils import RustObject, rustcall, decode_str
 from .exceptions import SourmashError
 
diff --git a/sourmash/search.py b/sourmash/search.py
index ad2da3a92b..02424e719d 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -4,7 +4,7 @@
 
 from .logging import notify, error
 from .signature import SourmashSignature
-from ._minhash import get_max_hash_for_scaled
+from .minhash import get_max_hash_for_scaled
 
 
 # generic SearchResult.
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index 0d1cd0a258..938833df3a 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -12,9 +12,9 @@
 import copy
 from sourmash.sourmash_args import FileOutput
 
-from ..logging import set_quiet, error, notify, set_quiet, print_results, debug
-from .. import sourmash_args
-from .._minhash import get_max_hash_for_scaled
+from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug
+from sourmash import sourmash_args
+from sourmash.minhash import get_max_hash_for_scaled
 
 usage='''
 sourmash signature <command> [<args>] - manipulate/work with signature files.
diff --git a/sourmash/signature.py b/sourmash/signature.py
index 4bcd9293cc..e6d5d50d84 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -11,7 +11,7 @@
 
 from .logging import error
 from . import MinHash
-from ._minhash import to_bytes
+from .minhash import to_bytes
 from ._lowlevel import ffi, lib
 from .utils import RustObject, rustcall, decode_str
 from ._compat import PY2
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 866ceb1769..ccfb6f5fd5 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -43,7 +43,7 @@
 import pytest
 
 import sourmash
-from sourmash._minhash import (
+from sourmash.minhash import (
     MinHash,
     hash_murmur,
     get_scaled_for_max_hash,
diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py
index f3a1446907..6055e1d870 100644
--- a/tests/test__minhash_hypothesis.py
+++ b/tests/test__minhash_hypothesis.py
@@ -4,7 +4,7 @@
 import hypothesis.strategies as st
 
 from sourmash import MinHash
-from sourmash._minhash import get_max_hash_for_scaled
+from sourmash.minhash import get_max_hash_for_scaled
 
 
 @given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
diff --git a/tests/test_rustobj.py b/tests/test_rustobj.py
index 4be7a0e1ee..2b9b8e6877 100644
--- a/tests/test_rustobj.py
+++ b/tests/test_rustobj.py
@@ -1,7 +1,7 @@
 import pytest
 
 from sourmash.utils import RustObject
-from sourmash._minhash import to_bytes
+from sourmash.minhash import to_bytes
 
 
 def test_rustobj_init():

From 80f9bef51c5c11f8797e253f4ca9a04c957aaae7 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 12:23:44 -0700
Subject: [PATCH 02/50] deprecate max_hash throughout

---
 sourmash/lca/lca_db.py            |   4 +-
 sourmash/minhash.py               |  47 +++++------
 sourmash/search.py                |   4 +-
 sourmash/sig/__main__.py          |   6 +-
 tests/test__minhash.py            | 128 ++++++++++++++----------------
 tests/test__minhash_hypothesis.py |   4 +-
 6 files changed, 88 insertions(+), 105 deletions(-)

diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
index 1430637090..3931813e19 100644
--- a/sourmash/lca/lca_db.py
+++ b/sourmash/lca/lca_db.py
@@ -7,7 +7,7 @@
 import functools
 
 import sourmash
-from sourmash.minhash import get_max_hash_for_scaled
+from sourmash.minhash import _get_max_hash_for_scaled
 from sourmash.logging import notify, error, debug
 from sourmash.index import Index
 
@@ -369,7 +369,7 @@ def downsample_scaled(self, scaled):
 
         self._invalidate_cache()
 
-        max_hash = get_max_hash_for_scaled(scaled)
+        max_hash = _get_max_hash_for_scaled(scaled)
 
         # filter out all hashes over max_hash in value.
         new_hashvals = {}
diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index eee206db88..b624c6c24f 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -30,7 +30,7 @@ def get_minhash_max_hash():
     return MINHASH_MAX_HASH
 
 
-def get_max_hash_for_scaled(scaled):
+def _get_max_hash_for_scaled(scaled):
     "Convert a 'scaled' value into a 'max_hash' value."
     if scaled == 0:
         return 0
@@ -40,7 +40,7 @@ def get_max_hash_for_scaled(scaled):
     return int(round(get_minhash_max_hash() / scaled, 0))
 
 
-def get_scaled_for_max_hash(max_hash):
+def _get_scaled_for_max_hash(max_hash):
     "Convert a 'max_hash' value into a 'scaled' value."
     if max_hash == 0:
         return 0
@@ -130,24 +130,24 @@ def __init__(
            * track_abundance (default False) - track hash multiplicity
            * mins (default None) - list of hashvals, or (hashval, abund) pairs
            * seed (default 42) - murmurhash seed
-
-        Deprecated: @CTB
-           * ``max_hash=<int>``; use ``scaled`` instead.
         """
-        if max_hash and scaled:
-            raise ValueError("cannot set both max_hash and scaled")
-        elif scaled:
-            max_hash = get_max_hash_for_scaled(scaled)
+        # support max_hash in constructor, for now.
+        if max_hash:
+            if scaled:
+                raise ValueError("cannot set both max_hash and scaled")
+            scaled = _get_scaled_for_max_hash(max_hash)
 
-        if max_hash and n:
+        if scaled and n:
             raise ValueError("cannot set both n and max_hash")
 
-        if not n and not (max_hash or scaled):
+        if not n and not scaled:
             raise ValueError("cannot omit both n and scaled")
 
         if dayhoff or hp:
             is_protein = False
 
+        # ok, for Rust API, go from scaled back to max_hash
+        max_hash = _get_max_hash_for_scaled(scaled)
         self._objptr = lib.kmerminhash_new(
             n, ksize, is_protein, dayhoff, hp, seed, int(max_hash), track_abundance
         )
@@ -313,10 +313,14 @@ def seed(self):
     def num(self):
         return self._methodcall(lib.kmerminhash_num)
 
+    @property
+    def max_hash(self):
+        return self._methodcall(lib.kmerminhash_max_hash)
+
     @property
     def scaled(self):
         if self.max_hash:
-            return get_scaled_for_max_hash(self.max_hash)
+            return _get_scaled_for_max_hash(self.max_hash)
         return 0
 
     @property
@@ -339,10 +343,6 @@ def hp(self):
     def ksize(self):
         return self._methodcall(lib.kmerminhash_ksize)
 
-    @property
-    def max_hash(self):
-        return self._methodcall(lib.kmerminhash_max_hash)
-
     @property
     def track_abundance(self):
         return self._methodcall(lib.kmerminhash_track_abundance)
@@ -410,17 +410,6 @@ def downsample_n(self, new_num):
 
         return a
 
-    def downsample_max_hash(self, *others):
-        """Copy this object and downsample new object to min of ``*others``.
-
-        Here, ``*others`` is one or more MinHash objects.
-        """
-        max_hashes = [x.max_hash for x in others]
-        new_max_hash = min(self.max_hash, *max_hashes)
-        new_scaled = get_scaled_for_max_hash(new_max_hash)
-
-        return self.downsample_scaled(new_scaled)
-
     def downsample_scaled(self, new_scaled):
         """Copy this object and downsample new object to scaled=``new_scaled``.
         """
@@ -431,7 +420,7 @@ def downsample_scaled(self, new_scaled):
         if max_hash is None:
             raise ValueError("no max_hash available - cannot downsample")
 
-        old_scaled = get_scaled_for_max_hash(self.max_hash)
+        old_scaled = _get_scaled_for_max_hash(self.max_hash)
         if old_scaled > new_scaled:
             raise ValueError(
                 "new scaled {} is lower than current sample scaled {}".format(
@@ -439,7 +428,7 @@ def downsample_scaled(self, new_scaled):
                 )
             )
 
-        new_max_hash = get_max_hash_for_scaled(new_scaled)
+        new_max_hash = _get_max_hash_for_scaled(new_scaled)
 
         a = MinHash(
             0,
diff --git a/sourmash/search.py b/sourmash/search.py
index 02424e719d..be7cdc1cfe 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -4,7 +4,7 @@
 
 from .logging import notify, error
 from .signature import SourmashSignature
-from .minhash import get_max_hash_for_scaled
+from .minhash import _get_max_hash_for_scaled
 
 
 # generic SearchResult.
@@ -154,7 +154,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         # eliminate mins under this new resolution.
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
-        new_max_hash = get_max_hash_for_scaled(cmp_scaled)
+        new_max_hash = _get_max_hash_for_scaled(cmp_scaled)
         query_mins = set(_filter_max_hash(query_mins, new_max_hash))
         found_mins = set(_filter_max_hash(found_mins, new_max_hash))
         orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash))
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index 938833df3a..8f25aa46a9 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -14,7 +14,7 @@
 
 from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug
 from sourmash import sourmash_args
-from sourmash.minhash import get_max_hash_for_scaled
+from sourmash.minhash import _get_max_hash_for_scaled
 
 usage='''
 sourmash signature <command> [<args>] - manipulate/work with signature files.
@@ -53,7 +53,7 @@ def _set_num_scaled(mh, num, scaled):
     # Number of hashes is 0th parameter
     mh_params[0] = num
     # Scale is 8th parameter
-    mh_params[8] = get_max_hash_for_scaled(scaled)
+    mh_params[8] = _get_max_hash_for_scaled(scaled)
     mh.__setstate__(mh_params)
     assert mh.num == num
     assert mh.scaled == scaled
@@ -730,7 +730,7 @@ def downsample(args):
                     mh_new = mh.downsample_scaled(args.scaled)
                 else:                         # try to turn a num into a scaled
                     # first check: can we?
-                    max_hash = get_max_hash_for_scaled(args.scaled)
+                    max_hash = _get_max_hash_for_scaled(args.scaled)
                     mins = mh.get_mins()
                     if max(mins) < max_hash:
                         raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index ccfb6f5fd5..1ec34b2f3a 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -46,8 +46,8 @@
 from sourmash.minhash import (
     MinHash,
     hash_murmur,
-    get_scaled_for_max_hash,
-    get_max_hash_for_scaled,
+    _get_scaled_for_max_hash,
+    _get_max_hash_for_scaled,
 )
 from sourmash import signature
 
@@ -61,6 +61,10 @@
 # * nan on empty minhash
 # * define equals
 
+scaled50 = _get_scaled_for_max_hash(50)
+scaled100 = _get_scaled_for_max_hash(100)
+scaled5000 = _get_scaled_for_max_hash(5000)
+
 
 def test_basic_dna(track_abundance):
     # verify that MHs of size 1 stay size 1, & act properly as bottom sketches.
@@ -235,23 +239,10 @@ def test_size_limit(track_abundance):
     assert mh.get_mins() == [5, 10, 20]
 
 
-def test_max_hash(track_abundance):
-    # test behavior with max_hash
-    mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35)
-    mh.add_hash(10)
-    mh.add_hash(20)
-    mh.add_hash(30)
-    assert mh.get_mins() == [10, 20, 30]
-    mh.add_hash(40)
-    assert mh.get_mins() == [10, 20, 30]
-    mh.add_hash(36)
-    assert mh.get_mins() == [10, 20, 30]
-
-
 def test_scaled(track_abundance):
-    # test behavior with scaled (alt to max_hash)
-    scaled = get_scaled_for_max_hash(35)
-    print('XX', scaled, get_max_hash_for_scaled(scaled))
+    # test behavior with scaled
+    scaled = _get_scaled_for_max_hash(35)
+    print('XX', scaled, _get_max_hash_for_scaled(scaled))
     mh = MinHash(0, 4, track_abundance=track_abundance, scaled=scaled)
     assert mh.max_hash == 35
 
@@ -273,29 +264,23 @@ def test_no_scaled(track_abundance):
 
 def test_max_hash_conversion():
     SCALED=100000
-    max_hash = get_max_hash_for_scaled(SCALED)
-    new_scaled = get_scaled_for_max_hash(max_hash)
+    max_hash = _get_max_hash_for_scaled(SCALED)
+    new_scaled = _get_scaled_for_max_hash(max_hash)
     assert new_scaled == SCALED
 
 
 def test_max_hash_and_scaled_zero():
-    max_hash = get_max_hash_for_scaled(0)
-    new_scaled = get_scaled_for_max_hash(0)
+    max_hash = _get_max_hash_for_scaled(0)
+    new_scaled = _get_scaled_for_max_hash(0)
     assert max_hash == new_scaled
     assert max_hash == 0
 
 
-def test_max_hash_and_scaled_error(track_abundance):
-    # test behavior when supplying both max_hash and scaled
-    with pytest.raises(ValueError):
-        mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35,
-                     scaled=5)
-
-
 def test_max_hash_cannot_limit(track_abundance):
-    # make sure you can't set both max_n and max_hash.
+    # make sure you can't set both n and scaled.
     with pytest.raises(ValueError):
-        mh = MinHash(2, 4, track_abundance=track_abundance, max_hash=35)
+        mh = MinHash(2, 4, track_abundance=track_abundance,
+                     scaled=_get_scaled_for_max_hash(1))
 
 
 def test_no_downsample_scaled_if_n(track_abundance):
@@ -315,8 +300,8 @@ def test_scaled(track_abundance):
 
 def test_mh_jaccard_similarity():
     # check actual Jaccard value for a non-trivial case
-    a = MinHash(0, 20, max_hash=50, track_abundance=False)
-    b = MinHash(0, 20, max_hash=50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=False)
+    b = MinHash(0, 20, scaled=scaled50, track_abundance=False)
     a.add_many([1, 3, 5, 8])
     b.add_many([1, 3, 5, 6, 8, 10])
 
@@ -327,9 +312,9 @@ def test_mh_similarity_downsample_jaccard_value():
     # check jaccard value after downsampling
 
     # max_hash = 50
-    a = MinHash(0, 20, max_hash=50, track_abundance=False)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=False)
     # max_hash = 100
-    b = MinHash(0, 20, max_hash=100, track_abundance=False)
+    b = MinHash(0, 20, scaled=scaled100, track_abundance=False)
 
     a.add_many([1, 3, 5, 8, 70])
     b.add_many([1, 3, 5, 6, 8, 10, 70 ])
@@ -343,8 +328,8 @@ def test_mh_angular_similarity():
     # https://www.sciencedirect.com/topics/computer-science/cosine-similarity
     # note: angular similarity is 1 - 2*(acos(sim) / pi), when elements
     # are always positive (https://en.wikipedia.org/wiki/Cosine_similarity)
-    a = MinHash(0, 20, max_hash=50, track_abundance=True)
-    b = MinHash(0, 20, max_hash=50, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled50, track_abundance=True)
     a.set_abundances({ 1:5, 3:3, 5:2, 8:2})
     b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 })
 
@@ -357,8 +342,8 @@ def test_mh_angular_similarity():
 
 def test_mh_angular_similarity_2():
     # check actual angular similarity for a second non-trivial case
-    a = MinHash(0, 20, max_hash=100, track_abundance=True)
-    b = MinHash(0, 20, max_hash=100, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled100, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled100, track_abundance=True)
     a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 })
     b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 })
 
@@ -372,9 +357,9 @@ def test_mh_similarity_downsample_angular_value():
     # test downsample=True argument to MinHash.similarity
 
     # max_hash = 50
-    a = MinHash(0, 20, max_hash=50, track_abundance=True)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=True)
     # max_hash = 100
-    b = MinHash(0, 20, max_hash=100, track_abundance=True)
+    b = MinHash(0, 20, scaled=scaled100, track_abundance=True)
 
     a.set_abundances({ 1:5, 3:3, 5:2, 8:2, 70:70 })
     b.set_abundances({ 1:3, 3:2, 5:1, 6:1, 8:1, 10:1, 70:70 })
@@ -392,9 +377,9 @@ def test_mh_similarity_downsample_true(track_abundance):
     # verify sim(a, b) == sim(b, a), with and without ignore_abundance
 
     # max_hash = 50
-    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance)
     # max_hash = 100
-    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)
+    b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance)
 
     a_values = { 1:5, 3:3, 5:2, 8:2}
     b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
@@ -420,9 +405,9 @@ def test_mh_similarity_downsample_errors(track_abundance):
     # test downsample=False (default) argument to MinHash.similarity
 
     # max_hash = 50
-    a = MinHash(0, 20, max_hash=50, track_abundance=track_abundance)
+    a = MinHash(0, 20, scaled=scaled50, track_abundance=track_abundance)
     # max_hash = 100
-    b = MinHash(0, 20, max_hash=100, track_abundance=track_abundance)
+    b = MinHash(0, 20, scaled=scaled100, track_abundance=track_abundance)
 
     a_values = { 1:5, 3:3, 5:2, 8:2}
     b_values = { 1:3, 3:2, 5:1, 6:1, 8:1, 10:1 }
@@ -680,8 +665,10 @@ def test_mh_count_common_diff_protein(track_abundance):
 
 
 def test_mh_count_common_diff_maxhash(track_abundance):
-    a = MinHash(0, 5, False, track_abundance=track_abundance, max_hash=1)
-    b = MinHash(0, 5, True, track_abundance=track_abundance, max_hash=2)
+    a = MinHash(0, 5, False, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(1))
+    b = MinHash(0, 5, True, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(2))
 
     with pytest.raises(ValueError):
         a.count_common(b)
@@ -955,8 +942,11 @@ def test_mh_compare_diff_seed(track_abundance):
 
 
 def test_mh_compare_diff_max_hash(track_abundance):
-    a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5)
-    b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10)
+    a = MinHash(0, 5, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(5))   # was max_hash=5
+
+    b = MinHash(0, 5, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(10))  # was max_hash=10
 
     with pytest.raises(ValueError):
         a.compare(b)
@@ -979,8 +969,10 @@ def test_mh_concat_diff_ksize(track_abundance):
 
 
 def test_mh_concat_diff_max_hash(track_abundance):
-    a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5)
-    b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10)
+    a = MinHash(0, 5, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(5))   # was max_hash=5
+    b = MinHash(0, 5, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(10))  # was max_hash=10
 
     with pytest.raises(ValueError):
         a += b
@@ -1236,8 +1228,8 @@ def test_set_abundance_initialized():
 
 def test_reviving_minhash():
     # simulate reading a MinHash from disk
-    mh = MinHash(0, 21, max_hash=184467440737095520, seed=42,
-                 track_abundance=False)
+    scaled = _get_scaled_for_max_hash(184467440737095520)  # stored max_hash
+    mh = MinHash(0, 21, scaled=scaled, seed=42, track_abundance=False)
     mins = (28945103950853965, 74690756200987412, 82962372765557409,
             93503551367950366, 106923350319729608, 135116761470196737,
             160165359281648267, 162390811417732001, 177939655451276972)
@@ -1274,7 +1266,8 @@ def test_mh_copy_and_clear(track_abundance):
 
 def test_mh_copy_and_clear_with_max_hash(track_abundance):
     # test basic creation of new, empty MinHash w/max_hash param set
-    a = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
+    a = MinHash(0, 10, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(20))
     for i in range(0, 40, 2):
         a.add_hash(i)
 
@@ -1292,8 +1285,7 @@ def test_mh_copy_and_clear_with_max_hash(track_abundance):
 
 def test_scaled_property(track_abundance):
     scaled = 10000
-    a = MinHash(0, 10, track_abundance=track_abundance,
-                max_hash=round(2**64 / scaled))
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled)
     assert a.scaled == scaled
 
 
@@ -1311,7 +1303,8 @@ def test_mh_subtract(track_abundance):
 
 
 def test_pickle_max_hash(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, max_hash=20)
+    a = MinHash(0, 10, track_abundance=track_abundance,
+                scaled=_get_scaled_for_max_hash(20))
     for i in range(0, 40, 2):
         a.add_hash(i)
 
@@ -1353,7 +1346,7 @@ def test_minhash_abund_add():
     # std::vector iterators upon vector resizing - in this case, there
     # was also a bug in inserting into the middle of mins when scaled was set.
 
-    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
 
     n = 0
     for i in range(10, 0, -1):
@@ -1369,7 +1362,7 @@ def test_minhash_abund_capacity_increase():
 
     # this should set capacity to 1000 - see KmerMinHash constructor call
     # to 'reserve' when n > 0 for specific parameter.
-    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
 
     # 1001 is dependent on the value passed to reserve (currently 1000).
     for i in range(1001, 0, -1):
@@ -1381,8 +1374,8 @@ def test_minhash_abund_merge_flat():
     # of a signature with abundance and a signature without abundance.
     # the correct behavior for now is to calculate simple Jaccard,
     # i.e. 'flatten' both of them.
-    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
-    b = MinHash(0, 10, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
+    b = MinHash(0, 10, scaled=scaled5000)
 
     for i in range(0, 10, 2):
         a.add_hash(i)
@@ -1399,8 +1392,8 @@ def test_minhash_abund_merge_flat_2():
     # this targets a segfault caused by trying to merge
     # a signature with abundance and a signature without abundance.
 
-    a = MinHash(0, 10, track_abundance=True, max_hash=5000)
-    b = MinHash(0, 10, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=True, scaled=scaled5000)
+    b = MinHash(0, 10, scaled=scaled5000)
 
     for i in range(0, 10, 2):
         a.add_hash(i)
@@ -1436,7 +1429,7 @@ def test_distance_matrix(track_abundance):
 
 
 def test_remove_many(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
 
     a.add_many(list(range(0, 100, 2)))
 
@@ -1456,8 +1449,8 @@ def test_remove_many(track_abundance):
 
 
 def test_add_many(track_abundance):
-    a = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
-    b = MinHash(0, 10, track_abundance=track_abundance, max_hash=5000)
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
 
     a.add_many(list(range(0, 100, 2)))
     a.add_many(list(range(0, 100, 2)))
@@ -1475,7 +1468,8 @@ def test_add_many(track_abundance):
 
 def test_set_abundances_huge():
     max_hash = 4000000
-    a = MinHash(0, 10, track_abundance=True, max_hash=max_hash)
+    a = MinHash(0, 10, track_abundance=True,
+                scaled=_get_scaled_for_max_hash(max_hash))
 
     hashes = list(range(max_hash))
     abundances = itertools.repeat(2)
diff --git a/tests/test__minhash_hypothesis.py b/tests/test__minhash_hypothesis.py
index 6055e1d870..9271ae6eed 100644
--- a/tests/test__minhash_hypothesis.py
+++ b/tests/test__minhash_hypothesis.py
@@ -4,7 +4,7 @@
 import hypothesis.strategies as st
 
 from sourmash import MinHash
-from sourmash.minhash import get_max_hash_for_scaled
+from sourmash.minhash import _get_max_hash_for_scaled
 
 
 @given(st.lists(st.integers(min_value=0, max_value=2**64 - 1), min_size=10, max_size=1000),
@@ -35,7 +35,7 @@ def test_set_abundance_scaled_hypothesis(hashes, abundances, scaled):
 
     a.set_abundances(oracle)
 
-    max_hash = get_max_hash_for_scaled(scaled)
+    max_hash = _get_max_hash_for_scaled(scaled)
     below_max_hash = sum(1 for (k, v) in oracle.items() if k <= max_hash and v > 0)
 
     mins = a.get_mins(with_abundance=True)

From 1584283ba11b70dc380ff78e80dc43e3c908eed0 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 13:41:21 -0700
Subject: [PATCH 03/50] change MinHash.add(...) to MinHash.add_kmer(...)

---
 sourmash/minhash.py     |  2 +-
 tests/test_jaccard.py   |  4 ++--
 tests/test_signature.py | 26 +++++++++++++-------------
 3 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index b624c6c24f..2eca595dcf 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -243,7 +243,7 @@ def add_sequence(self, sequence, force=False):
         self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence),
                          force)
 
-    def add(self, kmer):
+    def add_kmer(self, kmer):
         "Add a kmer into the sketch."
         self.add_sequence(kmer)
 
diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index 93bda700b9..43a4c355a4 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -76,7 +76,7 @@ def test_dna_mh(track_abundance):
     seq = 'ATGGCAGTGACGATGCCAG'
     e1.add_sequence(seq)
     for i in range(len(seq) - 3):
-        e2.add(seq[i:i + 4])
+        e2.add_kmer(seq[i:i + 4])
 
     assert e1.get_mins() == e2.get_mins()
     print(e1.get_mins())
@@ -95,7 +95,7 @@ def test_protein_mh(track_abundance):
 
     for i in range(len(seq) - 5):
         kmer = seq[i:i + 6]
-        e2.add(kmer)
+        e2.add_kmer(kmer)
 
     assert e1.get_mins() == e2.get_mins()
     assert 901193879228338100 in e1.get_mins()
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 94ef3770e0..7ceaf2ee70 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
@@ -13,11 +13,11 @@
 def test_compare(track_abundance):
     # same content, same name -> equal
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig1 = SourmashSignature(e, name='foo')
 
     f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    f.add("AT" * 10)
+    f.add_kmer("AT" * 10)
     sig2 = SourmashSignature(f, name='foo')
 
     assert e == f
@@ -26,11 +26,11 @@ def test_compare(track_abundance):
 def test_compare_ne(track_abundance):
     # same content, different names -> different
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig1 = SourmashSignature(e, name='foo')
 
     f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    f.add("AT" * 10)
+    f.add_kmer("AT" * 10)
     sig2 = SourmashSignature(f, name='bar')
 
     assert sig1 != sig2
@@ -39,11 +39,11 @@ def test_compare_ne(track_abundance):
 def test_compare_ne2(track_abundance):
     # same content, different filename -> different
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig1 = SourmashSignature(e, name='foo', filename='a')
 
     f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    f.add("AT" * 10)
+    f.add_kmer("AT" * 10)
     sig2 = SourmashSignature(f, name='foo', filename='b')
 
     assert sig1 != sig2
@@ -53,11 +53,11 @@ def test_compare_ne2(track_abundance):
 def test_compare_ne2_reverse(track_abundance):
     # same content, one has filename, other does not -> different
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig1 = SourmashSignature(e, name='foo')
 
     f = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    f.add("AT" * 10)
+    f.add_kmer("AT" * 10)
     sig2 = SourmashSignature(f, filename='b')
 
     assert sig2 != sig1
@@ -67,7 +67,7 @@ def test_compare_ne2_reverse(track_abundance):
 def test_hashable(track_abundance):
     # check: can we use signatures as keys in dictionaries and sets?
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
 
     sig = SourmashSignature(e)
 
@@ -78,7 +78,7 @@ def test_hashable(track_abundance):
 def test_str(track_abundance):
     # signatures should be printable
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
 
     sig = SourmashSignature(e)
 
@@ -93,7 +93,7 @@ def test_str(track_abundance):
 
 def test_roundtrip(track_abundance):
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
     siglist = list(load_signatures(s))
@@ -106,7 +106,7 @@ def test_roundtrip(track_abundance):
 
 def test_load_signature_ksize_nonint(track_abundance):
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
     sig = SourmashSignature(e)
     s = save_signatures([sig])
     siglist = list(load_signatures(s, ksize='20'))
@@ -312,7 +312,7 @@ def test_load_compressed(track_abundance):
 
 def test_binary_fp(tmpdir, track_abundance):
     e = sourmash.MinHash(n=1, ksize=20, track_abundance=track_abundance)
-    e.add("AT" * 10)
+    e.add_kmer("AT" * 10)
 
     path = tmpdir.join("1.sig")
     with open(str(path), 'wb') as fp:

From 72b3ab946b49b4e8dc2d5352c2f0c022ee1c4f3b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 13:46:27 -0700
Subject: [PATCH 04/50] rename update to update_xxx; remove is_molecule_type from MinHash

---
 sourmash/minhash.py            | 13 ++-----------
 sourmash/sourmash_args.py      | 11 +++--------
 tests/test_sourmash_compute.py | 20 ++++++++++----------
 3 files changed, 15 insertions(+), 29 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 2eca595dcf..8ccba4ee27 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -262,7 +262,7 @@ def remove_many(self, hashes):
         "Remove many hashes at once; ``hashes`` must be an iterable."
         self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
 
-    def update(self, other):
+    def update_xxx(self, other):
         "Update this sketch from all the hashes in the other."
         self.add_many(other)
 
@@ -570,16 +570,6 @@ def add_protein(self, sequence):
         "Add a protein sequence."
         self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence))
 
-    def is_molecule_type(self, molecule):
-        """Check if this MinHash is a particular human-readable molecule type.
-
-        Supports 'protein', 'dayhoff', 'hp', 'DNA'.
-        @CTB deprecate for 4.0?
-        """
-        if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
-            raise ValueError("unknown moltype in query, '{}'".format(molecule))
-        return molecule == self.moltype
-
     @property
     def moltype(self):                    # TODO: test in minhash tests
         if self.is_protein:
@@ -590,3 +580,4 @@ def moltype(self):                    # TODO: test in minhash tests
             return 'hp'
         else:
             return 'DNA'
+
diff --git a/sourmash/sourmash_args.py b/sourmash/sourmash_args.py
index 5c82692170..b3bccbe82d 100644
--- a/sourmash/sourmash_args.py
+++ b/sourmash/sourmash_args.py
@@ -28,14 +28,9 @@
 
 
 def get_moltype(sig, require=False):
-    if sig.minhash.is_molecule_type('DNA'):
-        moltype = 'DNA'
-    elif sig.minhash.is_molecule_type('dayhoff'):
-        moltype = 'dayhoff'
-    elif sig.minhash.is_molecule_type('hp'):
-        moltype = 'hp'
-    elif sig.minhash.is_molecule_type('protein'):
-        moltype = 'protein'
+    mh = sig.minhash
+    if mh.moltype in ('DNA', 'dayhoff', 'hp', 'protein'):
+        moltype = mh.moltype
     else:
         raise ValueError('unknown molecule type for sig {}'.format(sig.name()))
 
diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py
index bc2ee59ef1..bd61737a7d 100644
--- a/tests/test_sourmash_compute.py
+++ b/tests/test_sourmash_compute.py
@@ -436,8 +436,8 @@ def test_do_sourmash_compute_multik_with_dayhoff_and_dna():
             ksizes = set([ x.minhash.ksize for x in siglist ])
             assert 21 in ksizes
             assert 30 in ksizes
-            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
-            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
 
 
 def test_do_sourmash_compute_multik_with_hp():
@@ -493,9 +493,9 @@ def test_do_sourmash_compute_multik_with_dayhoff_dna_protein():
             ksizes = set([ x.minhash.ksize for x in siglist ])
             assert 21 in ksizes
             assert 30 in ksizes
-            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
-            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
-            assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2
 
 
 def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein():
@@ -516,11 +516,11 @@ def test_do_sourmash_compute_multik_with_dayhoff_hp_dna_protein():
             ksizes = set([ x.minhash.ksize for x in siglist ])
             assert 21 in ksizes
             assert 30 in ksizes
-            assert sum(x.minhash.is_molecule_type('DNA') for x in siglist) == 2
-            assert sum(x.minhash.is_molecule_type('dayhoff') for x in siglist) == 2
-            assert sum(x.minhash.is_molecule_type('hp') for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'DNA' for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'dayhoff' for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'hp' for x in siglist) == 2
             # 2 = dayhoff, 2 = hp = 4 protein
-            assert sum(x.minhash.is_molecule_type('protein') for x in siglist) == 2
+            assert sum(x.minhash.moltype == 'protein' for x in siglist) == 2
 
 
 def test_do_sourmash_compute_multik_with_nothing():
@@ -641,7 +641,7 @@ def test_do_sourmash_compute_multik_input_is_protein():
             assert 21 in ksizes
             assert 30 in ksizes
 
-            moltype = set([ x.minhash.is_molecule_type('protein')
+            moltype = set([ x.minhash.moltype == 'protein'
                             for x in siglist ])
             assert len(moltype) == 1
             assert True in moltype

From c3567ed2fb3f0ab337e208ae5053273a484baaeb Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 13:48:27 -0700
Subject: [PATCH 05/50] remove subtract_mins

---
 sourmash/minhash.py    |  8 --------
 tests/test__minhash.py | 13 -------------
 2 files changed, 21 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 8ccba4ee27..589b4bb96a 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -297,14 +297,6 @@ def get_hashes(self):
         "Return the list of hashes."
         return self.get_mins()
 
-    def subtract_mins(self, other):
-        """Get the list of mins in this MinHash, after removing the ones in
-        ``other``.
-        """
-        a = set(self.get_mins())
-        b = set(other.get_mins())
-        return a - b
-
     @property
     def seed(self):
         return self._methodcall(lib.kmerminhash_seed)
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 1ec34b2f3a..340aeaecf4 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1289,19 +1289,6 @@ def test_scaled_property(track_abundance):
     assert a.scaled == scaled
 
 
-def test_mh_subtract(track_abundance):
-    # test subtracting two identically configured minhashes
-    a = MinHash(20, 10, track_abundance=track_abundance)
-    for i in range(0, 40, 2):
-        a.add_hash(i)
-
-    b = MinHash(20, 10, track_abundance=track_abundance)
-    for i in range(0, 80, 4):
-        b.add_hash(i)
-
-    assert a.subtract_mins(b) == set(range(2, 40, 4))
-
-
 def test_pickle_max_hash(track_abundance):
     a = MinHash(0, 10, track_abundance=track_abundance,
                 scaled=_get_scaled_for_max_hash(20))

From 19a436598191a9d7c1075fc04240a702da0c5bac Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 13:51:36 -0700
Subject: [PATCH 06/50] rename downsample_n to downsample_num

---
 sourmash/minhash.py         |  2 +-
 sourmash/sig/__main__.py    |  2 +-
 tests/test__minhash.py      | 14 +++++++-------
 tests/test_cmd_signature.py |  2 +-
 tests/test_jaccard.py       | 28 ++++++++++++++--------------
 5 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 589b4bb96a..50a0e1d228 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -387,7 +387,7 @@ def count_common(self, other, downsample=False):
             raise TypeError("Must be a MinHash!")
         return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample)
 
-    def downsample_n(self, new_num):
+    def downsample_num(self, new_num):
         "Copy this object and downsample new object to num=``new_num``."
         if self.num and self.num < new_num:
             raise ValueError("new sample n is higher than current sample n")
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index 8f25aa46a9..5af9662c12 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -739,7 +739,7 @@ def downsample(args):
                     _set_num_scaled(mh_new, 0, args.scaled)
             elif args.num:
                 if mh.num:
-                    mh_new = mh.downsample_n(args.num)
+                    mh_new = mh.downsample_num(args.num)
                 else:                         # try to turn a scaled into a num
                     # first check: can we?
                     if len(mh) < args.num:
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 340aeaecf4..c245102c6d 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -698,10 +698,10 @@ def test_mh_count_common_notmh(track_abundance):
         a.count_common(b)
 
 
-def test_mh_downsample_n_error(track_abundance):
+def test_mh_downsample_num_error(track_abundance):
     a = MinHash(20, 10, track_abundance=track_abundance)
     with pytest.raises(ValueError):
-        a.downsample_n(30)
+        a.downsample_num(30)
 
 
 def test_mh_jaccard_asymmetric_num(track_abundance):
@@ -720,7 +720,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance):
     with pytest.raises(TypeError):
         a.compare(b)
 
-    a = a.downsample_n(10)
+    a = a.downsample_num(10)
     assert a.compare(b) == 0.5
     assert b.compare(a) == 0.5
 
@@ -837,12 +837,12 @@ def test_mh_asymmetric_merge(track_abundance):
     with pytest.raises(TypeError):
         d.compare(a)
 
-    a = a.downsample_n(d.num)
+    a = a.downsample_num(d.num)
     print(a.get_mins())
     print(d.get_mins())
     assert d.compare(a) == 1.0
 
-    c = c.downsample_n(b.num)
+    c = c.downsample_num(b.num)
     assert c.compare(b) == 1.0
 
 
@@ -873,10 +873,10 @@ def test_mh_inplace_concat_asymmetric(track_abundance):
     except TypeError as exc:
         assert 'must have same num' in str(exc)
 
-    a = a.downsample_n(d.num)
+    a = a.downsample_num(d.num)
     assert d.compare(a) == 1.0 # see: d += a, above.
 
-    c = c.downsample_n(b.num)
+    c = c.downsample_num(b.num)
     assert c.compare(b) == 0.5
 
 
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 73a8aa0706..41bda45fb8 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -1114,7 +1114,7 @@ def test_sig_downsample_2_num(c):
     test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                                       select_moltype='DNA')
     actual_downsample_sig = sourmash.load_one_signature(out)
-    test_mh = test_downsample_sig.minhash.downsample_n(500)
+    test_mh = test_downsample_sig.minhash.downsample_num(500)
 
     assert actual_downsample_sig.minhash == test_mh
 
diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index 43a4c355a4..679e0723f4 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -189,18 +189,18 @@ def test_jaccard_on_real_data():
     assert mh1.compare(mh2) == 0.0183
     assert mh2.compare(mh1) == 0.0183
 
-    mh1 = mh1.downsample_n(1000)
-    mh2 = mh2.downsample_n(1000)
+    mh1 = mh1.downsample_num(1000)
+    mh2 = mh2.downsample_num(1000)
     assert mh1.compare(mh2) == 0.011
     assert mh2.compare(mh1) == 0.011
 
-    mh1 = mh1.downsample_n(100)
-    mh2 = mh2.downsample_n(100)
+    mh1 = mh1.downsample_num(100)
+    mh2 = mh2.downsample_num(100)
     assert mh1.compare(mh2) == 0.01
     assert mh2.compare(mh1) == 0.01
 
-    mh1 = mh1.downsample_n(10)
-    mh2 = mh2.downsample_n(10)
+    mh1 = mh1.downsample_num(10)
+    mh2 = mh2.downsample_num(10)
     assert mh1.compare(mh2) == 0.0
     assert mh2.compare(mh1) == 0.0
 
@@ -221,24 +221,24 @@ def test_scaled_on_real_data():
     assert round(mh1.compare(mh2), 5) == 0.01644
     assert round(mh2.compare(mh1), 5) == 0.01644
 
-    mh1 = mh1.downsample_n(10000)
-    mh2 = mh2.downsample_n(10000)
+    mh1 = mh1.downsample_num(10000)
+    mh2 = mh2.downsample_num(10000)
 
     assert mh1.compare(mh2) == 0.0183
     assert mh2.compare(mh1) == 0.0183
 
-    mh1 = mh1.downsample_n(1000)
-    mh2 = mh2.downsample_n(1000)
+    mh1 = mh1.downsample_num(1000)
+    mh2 = mh2.downsample_num(1000)
     assert mh1.compare(mh2) == 0.011
     assert mh2.compare(mh1) == 0.011
 
-    mh1 = mh1.downsample_n(100)
-    mh2 = mh2.downsample_n(100)
+    mh1 = mh1.downsample_num(100)
+    mh2 = mh2.downsample_num(100)
     assert mh1.compare(mh2) == 0.01
     assert mh2.compare(mh1) == 0.01
 
-    mh1 = mh1.downsample_n(10)
-    mh2 = mh2.downsample_n(10)
+    mh1 = mh1.downsample_num(10)
+    mh2 = mh2.downsample_num(10)
     assert mh1.compare(mh2) == 0.0
     assert mh2.compare(mh1) == 0.0
 

From 01e8a4592e5906a98c9ededb9d0c61bbb8b5a46c Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 13:59:11 -0700
Subject: [PATCH 07/50] switch to hashes property instead of using get_mins()

---
 sourmash/minhash.py    |   4 +
 tests/test__minhash.py | 176 ++++++++++++++++++++---------------------
 2 files changed, 92 insertions(+), 88 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 50a0e1d228..eb49cde095 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -297,6 +297,10 @@ def get_hashes(self):
         "Return the list of hashes."
         return self.get_mins()
 
+    @property
+    def hashes(self):
+        return self.get_mins(with_abundance=True)
+
     @property
     def seed(self):
         return self._methodcall(lib.kmerminhash_seed)
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index c245102c6d..81f951672e 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -72,15 +72,15 @@ def test_basic_dna(track_abundance):
     assert mh.moltype == 'DNA'
 
     mh.add_sequence('ATGC')
-    a = mh.get_mins()
+    a = mh.hashes
 
     mh.add_sequence('GCAT')             # this will not get added; hash > ATGC
-    b = mh.get_mins()
+    b = mh.hashes
 
     print(a, b)
-    assert a == b
+    assert list(a) == list(b)
     assert len(b) == 1
-    assert a[0] == b[0] == 12415348535738636339
+    assert list(a)[0] == list(b)[0] == 12415348535738636339
 
 
 def test_div_zero(track_abundance):
@@ -108,15 +108,15 @@ def test_bytes_dna(track_abundance):
     mh.add_sequence('ATGC')
     mh.add_sequence(b'ATGC')
     mh.add_sequence('ATGC')
-    a = mh.get_mins()
+    a = mh.hashes
 
     mh.add_sequence('GCAT')             # this will not get added; hash > ATGC
     mh.add_sequence(b'GCAT')             # this will not get added; hash > ATGC
     mh.add_sequence('GCAT')             # this will not get added; hash > ATGC
-    b = mh.get_mins()
+    b = mh.hashes
 
     print(a, b)
-    assert a == b
+    assert list(a) == list(b)
     assert len(b) == 1
 
 
@@ -134,7 +134,7 @@ def test_bytes_protein_dayhoff(track_abundance, dayhoff):
     mh.add_protein('AGYYG')
     mh.add_protein(b'AGYYG')
 
-    assert len(mh.get_mins()) == 4
+    assert len(mh.hashes) == 4
 
 
 def test_protein_dayhoff(track_abundance, dayhoff):
@@ -142,7 +142,7 @@ def test_protein_dayhoff(track_abundance, dayhoff):
     mh = MinHash(10, 6, True, dayhoff=dayhoff, hp=False, track_abundance=track_abundance)
     mh.add_protein('AGYYG')
 
-    assert len(mh.get_mins()) == 4
+    assert len(mh.hashes) == 4
 
 
 def test_bytes_protein_hp(track_abundance, hp):
@@ -158,9 +158,9 @@ def test_bytes_protein_hp(track_abundance, hp):
     mh.add_protein(b'AGYYG')
 
     if hp:
-        assert len(mh.get_mins()) == 1
+        assert len(mh.hashes) == 1
     else:
-        assert len(mh.get_mins()) == 4
+        assert len(mh.hashes) == 4
 
 
 def test_protein_hp(track_abundance, hp):
@@ -169,9 +169,9 @@ def test_protein_hp(track_abundance, hp):
     mh.add_protein('AGYYG')
 
     if hp:
-        assert len(mh.get_mins()) == 1
+        assert len(mh.hashes) == 1
     else:
-        assert len(mh.get_mins()) == 4
+        assert len(mh.hashes) == 4
 
 
 def test_translate_codon(track_abundance):
@@ -194,13 +194,13 @@ def test_dayhoff(track_abundance):
                          dayhoff=True, hp=False, track_abundance=track_abundance)
     mh_dayhoff.add_sequence('ACTGAC')
 
-    assert len(mh_dayhoff.get_mins()) == 2
+    assert len(mh_dayhoff.hashes) == 2
     # verify that dayhoff-encoded hashes are different from protein/aa hashes
     mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance)
     mh_protein.add_sequence('ACTGAC')
 
-    assert len(mh_protein.get_mins()) == 2
-    assert mh_protein.get_mins() != mh_dayhoff.get_mins()
+    assert len(mh_protein.hashes) == 2
+    assert mh_protein.hashes != mh_dayhoff.hashes
 
 
 def test_hp(track_abundance):
@@ -211,13 +211,13 @@ def test_hp(track_abundance):
 
     mh_hp.add_sequence('ACTGAC')
 
-    assert len(mh_hp.get_mins()) == 2
+    assert len(mh_hp.hashes) == 2
     # verify that hp-encoded hashes are different from protein/aa hashes
     mh_protein = MinHash(10, 6, is_protein=True, track_abundance=track_abundance)
     mh_protein.add_sequence('ACTGAC')
 
-    assert len(mh_protein.get_mins()) == 2
-    assert mh_protein.get_mins() != mh_hp.get_mins()
+    assert len(mh_protein.hashes) == 2
+    assert mh_protein.hashes != mh_hp.hashes
 
 
 def test_protein_short(track_abundance):
@@ -225,7 +225,7 @@ def test_protein_short(track_abundance):
     mh = MinHash(10, 9, True, track_abundance=track_abundance)
     mh.add_protein('AG')
 
-    assert len(mh.get_mins()) == 0, mh.get_mins()
+    assert len(mh.hashes) == 0, mh.hashes
 
 
 def test_size_limit(track_abundance):
@@ -234,9 +234,9 @@ def test_size_limit(track_abundance):
     mh.add_hash(10)
     mh.add_hash(20)
     mh.add_hash(30)
-    assert mh.get_mins() == [10, 20, 30]
+    assert list(mh.hashes) == [10, 20, 30]
     mh.add_hash(5) # -> should push 30 off end
-    assert mh.get_mins() == [5, 10, 20]
+    assert list(mh.hashes) == [5, 10, 20]
 
 
 def test_scaled(track_abundance):
@@ -249,11 +249,11 @@ def test_scaled(track_abundance):
     mh.add_hash(10)
     mh.add_hash(20)
     mh.add_hash(30)
-    assert mh.get_mins() == [10, 20, 30]
+    assert mh.hashes == [10, 20, 30]
     mh.add_hash(40)
-    assert mh.get_mins() == [10, 20, 30]
+    assert mh.hashes == [10, 20, 30]
     mh.add_hash(36)
-    assert mh.get_mins() == [10, 20, 30]
+    assert mh.hashes == [10, 20, 30]
 
 
 def test_no_scaled(track_abundance):
@@ -458,26 +458,26 @@ def test_basic_dna_bad_2(track_abundance):
 def test_basic_dna_bad_force(track_abundance):
     # test behavior on bad DNA; use 100 so multiple hashes get added.
     mh = MinHash(100, 4, track_abundance=track_abundance)
-    assert len(mh.get_mins()) == 0
+    assert len(mh.hashes) == 0
     mh.add_sequence('ATGN', True)     # ambiguous kmer skipped.
-    assert len(mh.get_mins()) == 0
+    assert len(mh.hashes) == 0
     mh.add_sequence('AATGN', True)    # but good k-mers still used.
-    assert len(mh.get_mins()) == 1
+    assert len(mh.hashes) == 1
     mh.add_sequence('AATG', True)     # checking that right kmer was added
-    assert len(mh.get_mins()) == 1    # (only 1 hash <- this is a dup)
+    assert len(mh.hashes) == 1    # (only 1 hash <- this is a dup)
 
 
 def test_basic_dna_bad_force_2(track_abundance):
     # test behavior on bad DNA
     mh = MinHash(100, 4, track_abundance=track_abundance)
-    assert len(mh.get_mins()) == 0
+    assert len(mh.hashes) == 0
     mh.add_sequence('AAGNCGG', True)     # ambiguous kmers skipped.
-    assert len(mh.get_mins()) == 0
+    assert len(mh.hashes) == 0
     mh.add_sequence('AATGNGCGG', True)  # ambiguous kmers skipped.
-    assert len(mh.get_mins()) == 2
+    assert len(mh.hashes) == 2
     mh.add_sequence('AATG', True)        # checking that right kmers were added
     mh.add_sequence('GCGG', True)
-    assert len(mh.get_mins()) == 2       # (only 2 hashes should be there)
+    assert len(mh.hashes) == 2       # (only 2 hashes should be there)
 
 
 def test_consume_lowercase(track_abundance):
@@ -531,7 +531,7 @@ def test_intersection_errors(track_abundance):
     a.add_sequence("TGCCGCCCAGCA")
     b.add_sequence("TGCCGCCCAGCA")
 
-    common = set(a.get_mins())
+    common = set(a.hashes)
     combined_size = 3
 
     intersection, size = a.intersection(b, in_common=False)
@@ -554,7 +554,7 @@ def test_intersection_1(track_abundance):
     a.add_sequence('TGCCGCCCAGCA')
     b.add_sequence('TGCCGCCCAGCA')
 
-    common = set(a.get_mins())
+    common = set(a.hashes)
     combined_size = 3
 
     intersection, size = a.intersection(b, in_common=True)
@@ -595,7 +595,7 @@ def test_intersection_1(track_abundance):
     a.add_sequence('GTCCGCCCAGTGA')
     b.add_sequence('GTCCGCCCAGTGG')
 
-    new_in_common = set(a.get_mins()).intersection(set(b.get_mins()))
+    new_in_common = set(a.hashes).intersection(set(b.hashes))
     new_combined_size = 8
 
     intersection, size = a.intersection(b, in_common=True)
@@ -607,10 +607,10 @@ def test_intersection_1(track_abundance):
     assert size == new_combined_size
 
     intersection, size = a.intersection(a, in_common=True)
-    assert intersection == set(a.get_mins())
+    assert intersection == set(a.hashes)
 
     intersection, size = b.intersection(b, in_common=True)
-    assert intersection == set(b.get_mins())
+    assert intersection == set(b.hashes)
 
 
 def test_mh_copy(track_abundance):
@@ -634,13 +634,13 @@ def test_mh_len(track_abundance):
     for i in range(0, 40, 2):
         a.add_hash(i)
 
-    assert a.get_mins() == list(range(0, 40, 2))
+    assert list(a.hashes) == list(range(0, 40, 2))
 
 
 def test_mh_unsigned_long_long(track_abundance):
     a = MinHash(20, 10, track_abundance=track_abundance)
     a.add_hash(9227159859419181011)        # too big for a C long int.
-    assert 9227159859419181011 in a.get_mins()
+    assert 9227159859419181011 in a.hashes
 
 
 def test_mh_count_common(track_abundance):
@@ -745,7 +745,7 @@ def test_mh_merge(track_abundance):
     d = b.merge(a)
 
     assert len(c) == len(d)
-    assert c.get_mins() == d.get_mins()
+    assert list(c.hashes) == list(d.hashes)
     assert c.compare(d) == 1.0
     assert d.compare(c) == 1.0
 
@@ -763,7 +763,7 @@ def test_mh_merge_empty_num(track_abundance):
 
     assert len(c)
     assert len(c) == len(d)
-    assert c.get_mins() == d.get_mins()
+    assert list(c.hashes) == list(d.hashes)
     assert c.compare(d) == 1.0
     assert d.compare(c) == 1.0
 
@@ -781,7 +781,7 @@ def test_mh_merge_empty_scaled(track_abundance):
 
     assert len(c)
     assert len(c) == len(d)
-    assert c.get_mins() == d.get_mins()
+    assert list(c.hashes) == list(d.hashes)
     assert c.compare(d) == 1.0
     assert d.compare(c) == 1.0
 
@@ -796,7 +796,7 @@ def test_mh_merge_check_length(track_abundance):
         b.add_hash(i)
 
     c = a.merge(b)
-    assert len(c.get_mins()) == 20
+    assert len(c.hashes) == 20
 
 
 def test_mh_merge_check_length2(track_abundance):
@@ -812,7 +812,7 @@ def test_mh_merge_check_length2(track_abundance):
     b.add_hash(4)
 
     c = a.merge(b)
-    assert len(c.get_mins()) == 3
+    assert len(c.hashes) == 3
 
 def test_mh_asymmetric_merge(track_abundance):
     # test merging two asymmetric (different size) MHs
@@ -838,8 +838,8 @@ def test_mh_asymmetric_merge(track_abundance):
         d.compare(a)
 
     a = a.downsample_num(d.num)
-    print(a.get_mins())
-    print(d.get_mins())
+    print(a.hashes)
+    print(d.hashes)
     assert d.compare(a) == 1.0
 
     c = c.downsample_num(b.num)
@@ -896,7 +896,7 @@ def test_mh_inplace_concat(track_abundance):
     d += a
 
     assert len(c) == len(d)
-    assert c.get_mins() == d.get_mins()
+    assert c.hashes == d.hashes
     assert c.compare(d) == 1.0
     assert d.compare(c) == 1.0
 
@@ -990,7 +990,7 @@ def test_short_sequence(track_abundance):
     a = MinHash(20, 5, track_abundance=track_abundance)
     a.add_sequence('GGGG')
     # adding a short sequence should fail silently
-    assert len(a.get_mins()) == 0
+    assert len(a.hashes) == 0
 
 
 def test_bytes_murmur():
@@ -1025,25 +1025,25 @@ def test_abundance_simple():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.add_sequence('AAAAA')
-    assert a.get_mins() == [2110480117637990133]
-    assert a.get_mins(with_abundance=True) == {2110480117637990133: 1}
+    assert list(a.hashes) == [2110480117637990133]
+    assert a.hashes == {2110480117637990133: 1}
 
     a.add_sequence('AAAAA')
-    assert a.get_mins() == [2110480117637990133]
-    assert a.get_mins(with_abundance=True) == {2110480117637990133: 2}
+    assert list(a.hashes) == [2110480117637990133]
+    assert a.hashes == {2110480117637990133: 2}
 
 
 def test_add_hash_with_abundance():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.add_hash_with_abundance(10, 1)
-    assert a.get_mins(with_abundance=True) == {10: 1}
+    assert a.hashes == {10: 1}
 
     a.add_hash_with_abundance(20, 2)
-    assert a.get_mins(with_abundance=True) == {10: 1, 20: 2}
+    assert a.hashes == {10: 1, 20: 2}
 
     a.add_hash_with_abundance(10, 2)
-    assert a.get_mins(with_abundance=True) == {10: 3, 20: 2}
+    assert a.hashes == {10: 3, 20: 2}
 
 
 def test_add_hash_with_abundance_2():
@@ -1059,20 +1059,20 @@ def test_clear():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.add_hash(10)
-    assert a.get_mins(with_abundance=True) == {10: 1}
+    assert a.hashes == {10: 1}
 
     a.clear()
-    assert a.get_mins(with_abundance=True) == {}
+    assert a.hashes == {}
 
 
 def test_clear_2():
     a = MinHash(20, 5, False, track_abundance=False)
 
     a.add_hash(10)
-    assert a.get_mins() == [10]
+    assert list(a.hashes) == [10]
 
     a.clear()
-    assert a.get_mins() == []
+    assert list(a.hashes) == []
 
 
 def test_abundance_simple_2():
@@ -1080,12 +1080,12 @@ def test_abundance_simple_2():
     b = MinHash(20, 5, False, track_abundance=True)
 
     a.add_sequence('AAAAA')
-    assert a.get_mins() == [2110480117637990133]
-    assert a.get_mins(with_abundance=True) == {2110480117637990133: 1}
+    assert list(a.hashes) == [2110480117637990133]
+    assert a.hashes == {2110480117637990133: 1}
 
     a.add_sequence('AAAAA')
-    assert a.get_mins() == [2110480117637990133]
-    assert a.get_mins(with_abundance=True) == {2110480117637990133: 2}
+    assert list(a.hashes) == [2110480117637990133]
+    assert a.hashes == {2110480117637990133: 2}
 
     b.add_sequence('AAAAA')
     assert a.count_common(b) == 1
@@ -1097,15 +1097,15 @@ def test_abundance_count_common():
 
     a.add_sequence('AAAAA')
     a.add_sequence('AAAAA')
-    assert a.get_mins() == [2110480117637990133]
-    assert a.get_mins(with_abundance=True) == {2110480117637990133: 2}
+    assert list(a.hashes) == [2110480117637990133]
+    assert a.hashes == {2110480117637990133: 2}
 
     b.add_sequence('AAAAA')
     b.add_sequence('GGGGG')
     assert a.count_common(b) == 1
     assert a.count_common(b) == b.count_common(a)
 
-    assert b.get_mins(with_abundance=True) == [2110480117637990133,
+    assert b.hashes == [2110480117637990133,
                                                10798773792509008305]
 
 
@@ -1153,12 +1153,12 @@ def test_set_abundance_2():
                                       ksize=30,
                                       select_moltype='dna')
     new_mh = sig.minhash.copy_and_clear()
-    mins = sig.minhash.get_mins()
+    mins = sig.minhash.hashes
     mins = {k: 1 for k in mins}
     new_mh.track_abundance = True
     new_mh.set_abundances(mins)
 
-    assert new_mh.get_mins(with_abundance=True) == mins
+    assert new_mh.hashes == mins
 
 
 def test_set_abundance_clear():
@@ -1169,7 +1169,7 @@ def test_set_abundance_clear():
     a.set_abundances({1: 3, 2: 4}, clear=True)
     b.set_abundances({1: 3, 2: 4}, clear=False)
 
-    assert a.get_mins() == b.get_mins()
+    assert list(a.hashes) == list(b.hashes)
 
 
 def test_set_abundance_clear_2():
@@ -1177,20 +1177,20 @@ def test_set_abundance_clear_2():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.add_hash(10)
-    assert a.get_mins(with_abundance=True) == {10: 1}
+    assert a.hashes == {10: 1}
 
     a.set_abundances({20: 2})
-    assert a.get_mins(with_abundance=True) == {20: 2}
+    assert a.hashes == {20: 2}
 
 
 def test_set_abundance_clear_3():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.add_hash(10)
-    assert a.get_mins(with_abundance=True) == {10: 1}
+    assert a.hashes == {10: 1}
     
     a.set_abundances({20: 1, 30: 4}, clear=False)
-    assert a.get_mins(with_abundance=True) == {10: 1, 20: 1, 30: 4}
+    assert a.hashes == {10: 1, 20: 1, 30: 4}
 
 
 def test_set_abundance_clear_4():
@@ -1199,10 +1199,10 @@ def test_set_abundance_clear_4():
     a = MinHash(20, 5, False, track_abundance=True)
 
     a.set_abundances({20: 2, 10: 1}, clear=False)   # should also sort the hashes
-    assert a.get_mins(with_abundance=True) == {10: 1, 20: 2}
+    assert a.hashes == {10: 1, 20: 2}
 
     a.set_abundances({20: 1, 10: 2}, clear=False)
-    assert a.get_mins(with_abundance=True) == {10: 3, 20: 3}
+    assert a.hashes == {10: 3, 20: 3}
 
 
 def test_reset_abundance_initialized():
@@ -1213,7 +1213,7 @@ def test_reset_abundance_initialized():
     # Convert from Abundance to Regular MinHash
     a.track_abundance = False
 
-    assert a.get_mins(with_abundance=True) == [12415348535738636339]
+    assert a.hashes == [12415348535738636339]
 
 
 def test_set_abundance_initialized():
@@ -1243,7 +1243,7 @@ def test_set_abundance_num():
 
     a.set_abundances({1: 3, 2: 4})
 
-    assert a.get_mins(with_abundance=True) == {1: 3, 2: 4}
+    assert a.hashes == {1: 3, 2: 4}
 
 
 def test_mh_copy_and_clear(track_abundance):
@@ -1259,7 +1259,7 @@ def test_mh_copy_and_clear(track_abundance):
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
-    assert len(b.get_mins()) == 0
+    assert len(b.hashes) == 0
     assert a.scaled == b.scaled
     assert b.scaled == 0
 
@@ -1278,7 +1278,7 @@ def test_mh_copy_and_clear_with_max_hash(track_abundance):
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
-    assert len(b.get_mins()) == 0
+    assert len(b.hashes) == 0
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1303,8 +1303,8 @@ def test_pickle_max_hash(track_abundance):
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
-    assert len(b.get_mins()) == len(a.get_mins())
-    assert len(b.get_mins()) == 11
+    assert len(b.hashes) == len(a.hashes)
+    assert len(b.hashes) == 11
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1322,8 +1322,8 @@ def test_pickle_scaled(track_abundance):
     assert not b.is_protein
     assert b.track_abundance == track_abundance
     assert b.seed == a.seed
-    assert len(b.get_mins()) == len(a.get_mins())
-    assert len(b.get_mins()) == 11
+    assert len(b.hashes) == len(a.hashes)
+    assert len(b.hashes) == 11
     assert a.scaled == b.scaled
     assert b.scaled != 0
 
@@ -1339,8 +1339,8 @@ def test_minhash_abund_add():
     for i in range(10, 0, -1):
         a.add_hash(i)
         n += 1
-        assert len(a.get_mins()) == n
-        print(len(a.get_mins()))
+        assert len(a.hashes) == n
+        print(len(a.hashes))
 
 
 def test_minhash_abund_capacity_increase():
@@ -1432,7 +1432,7 @@ def test_remove_many(track_abundance):
     assert orig_md5 != new_md5
 
     assert len(a) == 33
-    assert all(c % 6 != 0 for c in a.get_mins())
+    assert all(c % 6 != 0 for c in a.hashes)
 
 
 def test_add_many(track_abundance):
@@ -1443,7 +1443,7 @@ def test_add_many(track_abundance):
     a.add_many(list(range(0, 100, 2)))
 
     assert len(a) == 50
-    assert all(c % 2 == 0 for c in a.get_mins())
+    assert all(c % 2 == 0 for c in a.hashes)
 
     for h in range(0, 100, 2):
         b.add_hash(h)

From de589ea50a3a4536f5b9b7633f28a88e4e8dd52c Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 25 Jul 2020 14:37:01 -0700
Subject: [PATCH 08/50] replace get_mins(...) with hashes throughout

---
 sourmash/commands.py              |  6 +++---
 sourmash/lca/command_classify.py  |  2 +-
 sourmash/lca/command_gather.py    |  6 +++---
 sourmash/lca/command_summarize.py |  4 ++--
 sourmash/lca/lca_db.py            |  4 ++--
 sourmash/search.py                |  2 +-
 sourmash/sig/__main__.py          | 22 +++++++++++-----------
 7 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/sourmash/commands.py b/sourmash/commands.py
index 7ae42f32fb..7249d3ee9d 100644
--- a/sourmash/commands.py
+++ b/sourmash/commands.py
@@ -703,10 +703,10 @@ def gather(args):
             e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash,
                         track_abundance=with_abundance)
             if with_abundance:
-                abunds = next_query.minhash.get_mins(with_abundance=True)
+                abunds = next_query.minhash.hashes
                 e.set_abundances(abunds)
             else:
-                e.add_many(next_query.minhash.get_mins())
+                e.add_many(next_query.minhash.hashes)
 
             with FileOutput(args.output_unassigned, 'wt') as fp:
                 sig.save_signatures([ sig.SourmashSignature(e) ], fp)
@@ -849,7 +849,7 @@ def multigather(args):
                     notify('saving unassigned hashes to "{}"', output_unassigned)
 
                     e = MinHash(ksize=query.minhash.ksize, n=0, max_hash=new_max_hash)
-                    e.add_many(next_query.minhash.get_mins())
+                    e.add_many(next_query.minhash.hashes)
                     sig.save_signatures([ sig.SourmashSignature(e) ], fp)
             n += 1
 
diff --git a/sourmash/lca/command_classify.py b/sourmash/lca/command_classify.py
index 568040c56d..e4e8c7b452 100644
--- a/sourmash/lca/command_classify.py
+++ b/sourmash/lca/command_classify.py
@@ -36,7 +36,7 @@ def classify_signature(query_sig, dblist, threshold, majority):
 
       """
     # gather assignments from across all the databases
-    assignments = lca_utils.gather_assignments(query_sig.minhash.get_mins(),
+    assignments = lca_utils.gather_assignments(query_sig.minhash.hashes,
                                                dblist)
 
     # now convert to trees -> do LCA & counts
diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py
index 812c6b3b41..03c72f58ec 100644
--- a/sourmash/lca/command_gather.py
+++ b/sourmash/lca/command_gather.py
@@ -65,11 +65,11 @@ def gather_signature(query_sig, dblist, ignore_abundance):
                                          query_sig.minhash.ksize)
 
     # extract the basic set of mins
-    query_mins = set(query_sig.minhash.get_mins())
+    query_mins = set(query_sig.minhash.hashes)
     n_mins = len(query_mins)
 
     if query_sig.minhash.track_abundance and not ignore_abundance:
-        orig_abunds = query_sig.minhash.get_mins(with_abundance=True)
+        orig_abunds = query_sig.minhash.hashes
     else:
         if query_sig.minhash.track_abundance and ignore_abundance:
             notify('** ignoring abundance')
@@ -236,7 +236,7 @@ def gather_main(args):
             print_results('')
     # nothing found.
     else:
-        est_bp = len(query_sig.minhash.get_mins()) * query_sig.minhash.scaled
+        est_bp = len(query_sig.minhash) * query_sig.minhash.scaled
         print_results('')
         print_results('No assignment for est {} of sequence.',
                       format_bp(est_bp))
diff --git a/sourmash/lca/command_summarize.py b/sourmash/lca/command_summarize.py
index 74155a2d6c..2ac453e8be 100644
--- a/sourmash/lca/command_summarize.py
+++ b/sourmash/lca/command_summarize.py
@@ -130,11 +130,11 @@ def count_signature(sig, scaled, hashvals):
     mh = sig.minhash.downsample_scaled(scaled)
 
     if mh.track_abundance:
-        abunds = mh.get_mins(with_abundance=True)
+        abunds = mh.hashes
         for hashval, count in abunds.items():
             hashvals[hashval] += count
     else:
-        for hashval in mh.get_mins():
+        for hashval in mh.hashes:
             hashvals[hashval] += 1
 
 
diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
index 3931813e19..dbf8a015ad 100644
--- a/sourmash/lca/lca_db.py
+++ b/sourmash/lca/lca_db.py
@@ -156,7 +156,7 @@ def insert(self, sig, ident=None, lineage=None):
             except TypeError:
                 raise ValueError('lineage cannot be used as a key?!')
 
-        for hashval in minhash.get_mins():
+        for hashval in minhash.hashes:
             self.hashval_to_idx[hashval].add(idx)
 
         return len(minhash)
@@ -462,7 +462,7 @@ def _find_signatures(self, minhash, threshold, containment=False,
             # note that containment can be calculated w/o matching scaled.
             raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))
 
-        query_mins = set(minhash.get_mins())
+        query_mins = set(minhash.hashes)
 
         # collect matching hashes for the query:
         c = Counter()
diff --git a/sourmash/search.py b/sourmash/search.py
index be7cdc1cfe..9281571767 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -125,7 +125,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
     orig_query_abunds = { k: 1 for k in orig_query_mins }
     if track_abundance:
         import numpy as np
-        orig_query_abunds = orig_query_mh.get_mins(with_abundance=True)
+        orig_query_abunds = orig_query_mh.hashes
 
     cmp_scaled = query.minhash.scaled    # initialize with resolution of query
     while query.minhash:
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index 5af9662c12..da575394de 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -298,8 +298,8 @@ def overlap(args):
 
     scaled = sig1.minhash.scaled
 
-    hashes_1 = set(sig1.minhash.get_mins())
-    hashes_2 = set(sig2.minhash.get_mins())
+    hashes_1 = set(sig1.minhash.hashes)
+    hashes_2 = set(sig2.minhash.hashes)
 
     num_common = len(hashes_1.intersection(hashes_2))
     disjoint_1 = len(hashes_1 - hashes_2)
@@ -418,14 +418,14 @@ def intersect(args):
                                                progress=progress):
             if first_sig is None:
                 first_sig = sigobj
-                mins = set(sigobj.minhash.get_mins())
+                mins = set(sigobj.minhash.hashes)
             else:
                 # check signature compatibility --
                 if not sigobj.minhash.is_compatible(first_sig.minhash):
                     error("incompatible minhashes; specify -k and/or molecule type.")
                     sys.exit(-1)
 
-            mins.intersection_update(sigobj.minhash.get_mins())
+            mins.intersection_update(sigobj.minhash.hashes)
             total_loaded += 1
         notify('loaded and intersected signatures from {}...', sigfile, end='\r')
 
@@ -449,7 +449,7 @@ def intersect(args):
             error("--track-abundance not set on loaded signature?! exiting.")
             sys.exit(-1)
         intersect_mh = abund_sig.minhash.copy_and_clear()
-        abund_mins = abund_sig.minhash.get_mins(with_abundance=True)
+        abund_mins = abund_sig.minhash.hashes
 
         # do one last intersection
         mins.intersection_update(abund_mins)
@@ -479,7 +479,7 @@ def subtract(args):
         error('Cannot use subtract on signatures with abundance tracking, sorry!')
         sys.exit(1)
 
-    subtract_mins = set(from_mh.get_mins())
+    subtract_mins = set(from_mh.hashes)
 
     notify('loaded signature from {}...', from_sigfile, end='\r')
 
@@ -500,7 +500,7 @@ def subtract(args):
                 error('Cannot use subtract on signatures with abundance tracking, sorry!')
                 sys.exit(1)
 
-            subtract_mins -= set(sigobj.minhash.get_mins())
+            subtract_mins -= set(sigobj.minhash.hashes)
 
             notify('loaded and subtracted signatures from {}...', sigfile, end='\r')
             total_loaded += 1
@@ -625,7 +625,7 @@ def filter(args):
                        ss)
                 continue
 
-            abunds = mh.get_mins(with_abundance=True)
+            abunds = mh.hashes
             abunds2 = {}
             for k, v in abunds.items():
                 if v >= args.min_abundance:
@@ -679,7 +679,7 @@ def flatten(args):
         for ss in siglist:
             flattened_mh = ss.minhash.copy_and_clear()
             flattened_mh.track_abundance = False
-            flattened_mh.add_many(ss.minhash.get_mins())
+            flattened_mh.add_many(ss.minhash.hashes)
 
             ss.minhash = flattened_mh
 
@@ -731,7 +731,7 @@ def downsample(args):
                 else:                         # try to turn a num into a scaled
                     # first check: can we?
                     max_hash = _get_max_hash_for_scaled(args.scaled)
-                    mins = mh.get_mins()
+                    mins = mh.hashes
                     if max(mins) < max_hash:
                         raise ValueError("this num MinHash does not have enough hashes to convert it into a scaled MinHash.")
 
@@ -810,7 +810,7 @@ def export(args):
     x['hashBits'] = 64
     x['hashSeed'] = mh.seed
 
-    ll = list(mh.get_mins())
+    ll = list(mh.hashes)
     x['sketches'] = [{ 'hashes': ll }]
 
     with FileOutput(args.output, 'wt') as fp:

From c3b4416eb5d9f089d12c7c0c5f24233f2745db1d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 26 Jul 2020 08:20:15 -0700
Subject: [PATCH 09/50] change deprecated 'compare' usage to 'similarity' in
 test_jaccard

---
 tests/test_jaccard.py | 52 +++++++++++++++++++++----------------------
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index 93bda700b9..99716d496e 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -186,23 +186,23 @@ def test_jaccard_on_real_data():
     sig2 = list(load_signatures(b))[0]
     mh2 = sig2.minhash
 
-    assert mh1.compare(mh2) == 0.0183
-    assert mh2.compare(mh1) == 0.0183
+    assert mh1.similarity(mh2) == 0.0183
+    assert mh2.similarity(mh1) == 0.0183
 
     mh1 = mh1.downsample_n(1000)
     mh2 = mh2.downsample_n(1000)
-    assert mh1.compare(mh2) == 0.011
-    assert mh2.compare(mh1) == 0.011
+    assert mh1.similarity(mh2) == 0.011
+    assert mh2.similarity(mh1) == 0.011
 
     mh1 = mh1.downsample_n(100)
     mh2 = mh2.downsample_n(100)
-    assert mh1.compare(mh2) == 0.01
-    assert mh2.compare(mh1) == 0.01
+    assert mh1.similarity(mh2) == 0.01
+    assert mh2.similarity(mh1) == 0.01
 
     mh1 = mh1.downsample_n(10)
     mh2 = mh2.downsample_n(10)
-    assert mh1.compare(mh2) == 0.0
-    assert mh2.compare(mh1) == 0.0
+    assert mh1.similarity(mh2) == 0.0
+    assert mh2.similarity(mh1) == 0.0
 
 
 def test_scaled_on_real_data():
@@ -218,29 +218,29 @@ def test_scaled_on_real_data():
     sig2 = list(load_signatures(b))[0]
     mh2 = sig2.minhash
 
-    assert round(mh1.compare(mh2), 5) == 0.01644
-    assert round(mh2.compare(mh1), 5) == 0.01644
+    assert round(mh1.similarity(mh2), 5) == 0.01644
+    assert round(mh2.similarity(mh1), 5) == 0.01644
 
     mh1 = mh1.downsample_n(10000)
     mh2 = mh2.downsample_n(10000)
 
-    assert mh1.compare(mh2) == 0.0183
-    assert mh2.compare(mh1) == 0.0183
+    assert mh1.similarity(mh2) == 0.0183
+    assert mh2.similarity(mh1) == 0.0183
 
     mh1 = mh1.downsample_n(1000)
     mh2 = mh2.downsample_n(1000)
-    assert mh1.compare(mh2) == 0.011
-    assert mh2.compare(mh1) == 0.011
+    assert mh1.similarity(mh2) == 0.011
+    assert mh2.similarity(mh1) == 0.011
 
     mh1 = mh1.downsample_n(100)
     mh2 = mh2.downsample_n(100)
-    assert mh1.compare(mh2) == 0.01
-    assert mh2.compare(mh1) == 0.01
+    assert mh1.similarity(mh2) == 0.01
+    assert mh2.similarity(mh1) == 0.01
 
     mh1 = mh1.downsample_n(10)
     mh2 = mh2.downsample_n(10)
-    assert mh1.compare(mh2) == 0.0
-    assert mh2.compare(mh1) == 0.0
+    assert mh1.similarity(mh2) == 0.0
+    assert mh2.similarity(mh1) == 0.0
 
 
 def test_scaled_on_real_data_2():
@@ -256,21 +256,21 @@ def test_scaled_on_real_data_2():
     sig2 = list(load_signatures(b))[0]
     mh2 = sig2.minhash
 
-    assert round(mh1.compare(mh2), 5) == 0.01644
-    assert round(mh2.compare(mh1), 5) == 0.01644
+    assert round(mh1.similarity(mh2), 5) == 0.01644
+    assert round(mh2.similarity(mh1), 5) == 0.01644
 
     mh1 = mh1.downsample_scaled(1000)
     mh2 = mh2.downsample_scaled(1000)
 
-    assert round(mh1.compare(mh2), 4) == 0.0187
-    assert round(mh2.compare(mh1), 4) == 0.0187
+    assert round(mh1.similarity(mh2), 4) == 0.0187
+    assert round(mh2.similarity(mh1), 4) == 0.0187
 
     mh1 = mh1.downsample_scaled(10000)
     mh2 = mh2.downsample_scaled(10000)
-    assert round(mh1.compare(mh2), 3) == 0.01
-    assert round(mh2.compare(mh1), 3) == 0.01
+    assert round(mh1.similarity(mh2), 3) == 0.01
+    assert round(mh2.similarity(mh1), 3) == 0.01
 
     mh1 = mh1.downsample_scaled(100000)
     mh2 = mh2.downsample_scaled(100000)
-    assert round(mh1.compare(mh2), 2) == 0.01
-    assert round(mh2.compare(mh1), 2) == 0.01
+    assert round(mh1.similarity(mh2), 2) == 0.01
+    assert round(mh2.similarity(mh1), 2) == 0.01

From 7a7bba98844d2f5a05c7f80ab7ea21a5acdf0eb4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 26 Jul 2020 08:34:52 -0700
Subject: [PATCH 10/50] eliminate most of the deprecation warnings in
 test__minhash by switching compare to similarity

---
 tests/test__minhash.py | 123 ++++++++++++++++++++++-------------------
 1 file changed, 66 insertions(+), 57 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 866ceb1769..98cb015cd7 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -502,43 +502,44 @@ def test_consume_lowercase(track_abundance):
     a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA'.lower())
     b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
 
-    assert a.compare(b) == 1.0
-    assert b.compare(b) == 1.0
-    assert b.compare(a) == 1.0
-    assert a.compare(a) == 1.0
+    assert round(a.similarity(b), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
 
 
-def test_compare_1(track_abundance):
+def test_similarity_1(track_abundance):
     a = MinHash(20, 10, track_abundance=track_abundance)
     b = MinHash(20, 10, track_abundance=track_abundance)
 
     a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
     b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
 
-    assert a.compare(b) == 1.0
-    assert b.compare(b) == 1.0
-    assert b.compare(a) == 1.0
-    assert a.compare(a) == 1.0
+    assert round(a.similarity(b), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
 
     # add same sequence again
     b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
-    assert a.compare(b) == 1.0
-    assert b.compare(b) == 1.0
-    assert b.compare(a) == 1.0
-    assert a.compare(a) == 1.0
+    assert round(a.similarity(b), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
 
 
     b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT')
-    x = a.compare(b)
+    x = a.similarity(b)
     assert x >= 0.3, x
 
-    x = b.compare(a)
+    x = b.similarity(a)
     assert x >= 0.3, x
-    assert a.compare(a) == 1.0
-    assert b.compare(b) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
 
 
 def test_intersection_errors(track_abundance):
+    # CTB: remove this test in 4.0
     a = MinHash(20, 10, track_abundance=track_abundance)
     b = MinHash(20, 10, track_abundance=track_abundance)
     c = MinHash(30, 10, track_abundance=track_abundance)
@@ -563,6 +564,7 @@ def test_intersection_errors(track_abundance):
 # this filter doesn't work, but leaving it in pour encourages les autres.
 @pytest.mark.filterwarnings("ignore")
 def test_intersection_1(track_abundance):
+    # CTB: remove this test in 4.0
     a = MinHash(20, 10, track_abundance=track_abundance)
     b = MinHash(20, 10, track_abundance=track_abundance)
 
@@ -633,7 +635,7 @@ def test_mh_copy(track_abundance):
 
     a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
     b = a.__copy__()
-    assert b.compare(a) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
 
 
 def test_mh_len(track_abundance):
@@ -730,12 +732,14 @@ def test_mh_jaccard_asymmetric_num(track_abundance):
     assert a.count_common(b) == 10
     assert b.count_common(a) == 10
 
+    # with 'jaccard', this will raise an error b/c different num
     with pytest.raises(TypeError):
-        a.compare(b)
+        a.jaccard(b)
 
     a = a.downsample_n(10)
-    assert a.compare(b) == 0.5
-    assert b.compare(a) == 0.5
+    # CTB note: this used to be 'compare', is now 'jaccard'; @CTB check compat
+    assert a.jaccard(b) == 0.5
+    assert b.jaccard(a) == 0.5
 
 
 def test_mh_merge_typeerror(track_abundance):
@@ -759,8 +763,9 @@ def test_mh_merge(track_abundance):
 
     assert len(c) == len(d)
     assert c.get_mins() == d.get_mins()
-    assert c.compare(d) == 1.0
-    assert d.compare(c) == 1.0
+    # @CTB
+    #assert round(c.similarity(d), 3) == 1.0
+    #assert round(d.similarity(c), 3) == 1.0
 
 
 def test_mh_merge_empty_num(track_abundance):
@@ -777,8 +782,8 @@ def test_mh_merge_empty_num(track_abundance):
     assert len(c)
     assert len(c) == len(d)
     assert c.get_mins() == d.get_mins()
-    assert c.compare(d) == 1.0
-    assert d.compare(c) == 1.0
+    assert round(c.similarity(d), 3) == 1.0
+    assert round(d.similarity(c), 3) == 1.0
 
 
 def test_mh_merge_empty_scaled(track_abundance):
@@ -795,8 +800,8 @@ def test_mh_merge_empty_scaled(track_abundance):
     assert len(c)
     assert len(c) == len(d)
     assert c.get_mins() == d.get_mins()
-    assert c.compare(d) == 1.0
-    assert d.compare(c) == 1.0
+    assert round(c.similarity(d), 3) == 1.0
+    assert round(d.similarity(c), 3) == 1.0
 
 
 def test_mh_merge_check_length(track_abundance):
@@ -846,17 +851,19 @@ def test_mh_asymmetric_merge(track_abundance):
     assert len(c) == len(a)
     assert len(d) == len(b)
 
-    # can't compare different sizes without downsampling
+    # can't use jaccard on different nums without downsampling
     with pytest.raises(TypeError):
-        d.compare(a)
+        d.jaccard(a)
 
     a = a.downsample_n(d.num)
     print(a.get_mins())
     print(d.get_mins())
-    assert d.compare(a) == 1.0
+    # @CTB
+    #assert round(d.similarity(a), 3) == 1.0
 
     c = c.downsample_n(b.num)
-    assert c.compare(b) == 1.0
+    # @CTB
+    #assert c.similarity(b) == 1.0
 
 
 def test_mh_inplace_concat_asymmetric(track_abundance):
@@ -882,15 +889,17 @@ def test_mh_inplace_concat_asymmetric(track_abundance):
     assert len(d) == len(b)
 
     try:
-        d.compare(a)
+        d.similarity(a)
     except TypeError as exc:
         assert 'must have same num' in str(exc)
 
     a = a.downsample_n(d.num)
-    assert d.compare(a) == 1.0 # see: d += a, above.
+    # @CTB
+    #assert d.similarity(a) == 1.0 # see: d += a, above.
 
     c = c.downsample_n(b.num)
-    assert c.compare(b) == 0.5
+    # @CTB
+    # assert c.similarity(b) == 0.5
 
 
 def test_mh_inplace_concat(track_abundance):
@@ -910,8 +919,8 @@ def test_mh_inplace_concat(track_abundance):
 
     assert len(c) == len(d)
     assert c.get_mins() == d.get_mins()
-    assert c.compare(d) == 1.0
-    assert d.compare(c) == 1.0
+    assert round(c.similarity(d), 3) == 1.0
+    assert round(d.similarity(c), 3) == 1.0
 
 
 def test_mh_merge_diff_protein(track_abundance):
@@ -930,36 +939,36 @@ def test_mh_merge_diff_ksize(track_abundance):
         a.merge(b)
 
 
-def test_mh_compare_diff_protein(track_abundance):
+def test_mh_similarity_diff_protein(track_abundance):
     a = MinHash(20, 5, False, track_abundance=track_abundance)
     b = MinHash(20, 5, True, track_abundance=track_abundance)
 
     with pytest.raises(ValueError):
-        a.compare(b)
+        a.similarity(b)
 
 
-def test_mh_compare_diff_ksize(track_abundance):
+def test_mh_similarity_diff_ksize(track_abundance):
     a = MinHash(20, 5, track_abundance=track_abundance)
     b = MinHash(20, 6, track_abundance=track_abundance)
 
     with pytest.raises(ValueError):
-        a.compare(b)
+        a.similarity(b)
 
 
-def test_mh_compare_diff_seed(track_abundance):
+def test_mh_similarity_diff_seed(track_abundance):
     a = MinHash(20, 5, track_abundance=track_abundance, seed=1)
     b = MinHash(20, 5, track_abundance=track_abundance, seed=2)
 
     with pytest.raises(ValueError):
-        a.compare(b)
+        a.similarity(b)
 
 
-def test_mh_compare_diff_max_hash(track_abundance):
+def test_mh_similarity_diff_max_hash(track_abundance):
     a = MinHash(0, 5, track_abundance=track_abundance, max_hash=5)
     b = MinHash(0, 5, track_abundance=track_abundance, max_hash=10)
 
     with pytest.raises(ValueError):
-        a.compare(b)
+        a.similarity(b)
 
 
 def test_mh_concat_diff_protein(track_abundance):
@@ -1117,33 +1126,33 @@ def test_abundance_count_common():
                                                10798773792509008305]
 
 
-def test_abundance_compare():
+def test_abundance_similarity():
     a = MinHash(20, 10, track_abundance=True)
     b = MinHash(20, 10, track_abundance=False)
 
     a.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
     b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
 
-    assert a.compare(b) == 1.0
-    assert b.compare(b) == 1.0
-    assert b.compare(a) == 1.0
-    assert a.compare(a) == 1.0
+    assert round(a.similarity(b), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
 
     # add same sequence again
     b.add_sequence('TGCCGCCCAGCACCGGGTGACTAGGTTGAGCCATGATTAACCTGCAATGA')
-    assert a.compare(b) == 1.0
-    assert b.compare(b) == 1.0
-    assert b.compare(a) == 1.0
-    assert a.compare(a) == 1.0
+    assert round(a.similarity(b), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
+    assert round(b.similarity(a), 3) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
 
     b.add_sequence('GATTGGTGCACACTTAACTGGGTGCCGCGCTGGTGCTGATCCATGAAGTT')
-    x = a.compare(b)
+    x = a.similarity(b)
     assert x >= 0.3, x
 
-    x = b.compare(a)
+    x = b.similarity(a)
     assert x >= 0.3, x
-    assert a.compare(a) == 1.0
-    assert b.compare(b) == 1.0
+    assert round(a.similarity(a), 3) == 1.0
+    assert round(b.similarity(b), 3) == 1.0
 
 
 def test_set_abundance():

From f97bf396f2635fec3789751b7305ac04acc79e4a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 26 Jul 2020 08:40:06 -0700
Subject: [PATCH 11/50] fix remaining tests in test__minhash

---
 tests/test__minhash.py | 35 ++++++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 11 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 98cb015cd7..62bf6acbac 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -763,9 +763,13 @@ def test_mh_merge(track_abundance):
 
     assert len(c) == len(d)
     assert c.get_mins() == d.get_mins()
-    # @CTB
-    #assert round(c.similarity(d), 3) == 1.0
-    #assert round(d.similarity(c), 3) == 1.0
+
+    if track_abundance:
+        assert round(c.similarity(d), 3) == 0.91
+        assert round(d.similarity(c), 3) == 0.91
+    else:
+        assert round(c.similarity(d), 3) == 1.0
+        assert round(d.similarity(c), 3) == 1.0
 
 
 def test_mh_merge_empty_num(track_abundance):
@@ -858,12 +862,17 @@ def test_mh_asymmetric_merge(track_abundance):
     a = a.downsample_n(d.num)
     print(a.get_mins())
     print(d.get_mins())
-    # @CTB
-    #assert round(d.similarity(a), 3) == 1.0
+
+    if track_abundance:
+        assert round(d.similarity(a), 3) == 0.91
+    else:
+        assert round(d.similarity(a), 3) == 1.0
 
     c = c.downsample_n(b.num)
-    # @CTB
-    #assert c.similarity(b) == 1.0
+    if track_abundance:
+        assert round(c.similarity(b), 3) == 0.91
+    else:
+        assert c.similarity(b) == 1.0
 
 
 def test_mh_inplace_concat_asymmetric(track_abundance):
@@ -894,12 +903,16 @@ def test_mh_inplace_concat_asymmetric(track_abundance):
         assert 'must have same num' in str(exc)
 
     a = a.downsample_n(d.num)
-    # @CTB
-    #assert d.similarity(a) == 1.0 # see: d += a, above.
+    if track_abundance:
+        assert round(d.similarity(a), 3) == 0.795 # see: d += a, above.
+    else:
+        assert d.similarity(a) == 1.0 # see: d += a, above.
 
     c = c.downsample_n(b.num)
-    # @CTB
-    # assert c.similarity(b) == 0.5
+    if track_abundance:
+        assert round(c.similarity(b), 3) == 0.436
+    else:
+        assert c.similarity(b) == 0.5
 
 
 def test_mh_inplace_concat(track_abundance):

From 1bd9df82b5ba3f0c66ed30d3ff8ca0719e5adeec Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 26 Jul 2020 09:02:48 -0700
Subject: [PATCH 12/50] fix compat message

---
 tests/test__minhash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 62bf6acbac..e4fe15d311 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -737,7 +737,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance):
         a.jaccard(b)
 
     a = a.downsample_n(10)
-    # CTB note: this used to be 'compare', is now 'jaccard'; @CTB check compat
+    # CTB note: this used to be 'compare', is now 'jaccard'
     assert a.jaccard(b) == 0.5
     assert b.jaccard(a) == 0.5
 

From 7743a95152f8eeffa1a740c30df2bb4e8c03d14d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 27 Jul 2020 07:57:15 -0700
Subject: [PATCH 13/50] restore removed functions, sigh :)

---
 sourmash/minhash.py    | 43 +++++++++++++++++++++++++++++++++++++-----
 tests/test__minhash.py | 12 ++++++------
 tests/test_jaccard.py  | 28 +++++++++++++--------------
 3 files changed, 58 insertions(+), 25 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index eb49cde095..48cee4ae5d 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -301,6 +301,14 @@ def get_hashes(self):
     def hashes(self):
         return self.get_mins(with_abundance=True)
 
+    def subtract_mins(self, other):
+        """Get the list of mins in this MinHash, after removing the ones in
+        ``other``.
+        """
+        a = set(self.get_mins())
+        b = set(other.get_mins())
+        return a - b
+
     @property
     def seed(self):
         return self._methodcall(lib.kmerminhash_seed)
@@ -309,28 +317,28 @@ def seed(self):
     def num(self):
         return self._methodcall(lib.kmerminhash_num)
 
-    @property
-    def max_hash(self):
-        return self._methodcall(lib.kmerminhash_max_hash)
-
     @property
     def scaled(self):
         if self.max_hash:
             return _get_scaled_for_max_hash(self.max_hash)
         return 0
 
+    # @CTB
     @property
     def is_dna(self):
         return not (self.is_protein or self.dayhoff or self.hp)
 
+    # @CTB
     @property
     def is_protein(self):
         return self._methodcall(lib.kmerminhash_is_protein)
 
+    # @CTB
     @property
     def dayhoff(self):
         return self._methodcall(lib.kmerminhash_dayhoff)
 
+    # @CTB
     @property
     def hp(self):
         return self._methodcall(lib.kmerminhash_hp)
@@ -339,6 +347,10 @@ def hp(self):
     def ksize(self):
         return self._methodcall(lib.kmerminhash_ksize)
 
+    @property
+    def max_hash(self):
+        return self._methodcall(lib.kmerminhash_max_hash)
+
     @property
     def track_abundance(self):
         return self._methodcall(lib.kmerminhash_track_abundance)
@@ -391,7 +403,7 @@ def count_common(self, other, downsample=False):
             raise TypeError("Must be a MinHash!")
         return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample)
 
-    def downsample_num(self, new_num):
+    def downsample_n(self, new_num):
         "Copy this object and downsample new object to num=``new_num``."
         if self.num and self.num < new_num:
             raise ValueError("new sample n is higher than current sample n")
@@ -406,6 +418,17 @@ def downsample_num(self, new_num):
 
         return a
 
+    def downsample_max_hash(self, *others):
+        """Copy this object and downsample new object to min of ``*others``.
+
+        Here, ``*others`` is one or more MinHash objects.
+        """
+        max_hashes = [x.max_hash for x in others]
+        new_max_hash = min(self.max_hash, *max_hashes)
+        new_scaled = _get_scaled_for_max_hash(new_max_hash)
+
+        return self.downsample_scaled(new_scaled)
+
     def downsample_scaled(self, new_scaled):
         """Copy this object and downsample new object to scaled=``new_scaled``.
         """
@@ -566,6 +589,16 @@ def add_protein(self, sequence):
         "Add a protein sequence."
         self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence))
 
+    def is_molecule_type(self, molecule):
+        """Check if this MinHash is a particular human-readable molecule type.
+
+        Supports 'protein', 'dayhoff', 'hp', 'DNA'.
+        @CTB deprecate for 4.0?
+        """
+        if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
+            raise ValueError("unknown moltype in query, '{}'".format(molecule))
+        return molecule == self.moltype
+
     @property
     def moltype(self):                    # TODO: test in minhash tests
         if self.is_protein:
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 81f951672e..06293883ac 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -701,7 +701,7 @@ def test_mh_count_common_notmh(track_abundance):
 def test_mh_downsample_num_error(track_abundance):
     a = MinHash(20, 10, track_abundance=track_abundance)
     with pytest.raises(ValueError):
-        a.downsample_num(30)
+        a.downsample_n(30)
 
 
 def test_mh_jaccard_asymmetric_num(track_abundance):
@@ -720,7 +720,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance):
     with pytest.raises(TypeError):
         a.compare(b)
 
-    a = a.downsample_num(10)
+    a = a.downsample_n(10)
     assert a.compare(b) == 0.5
     assert b.compare(a) == 0.5
 
@@ -837,12 +837,12 @@ def test_mh_asymmetric_merge(track_abundance):
     with pytest.raises(TypeError):
         d.compare(a)
 
-    a = a.downsample_num(d.num)
+    a = a.downsample_n(d.num)
     print(a.hashes)
     print(d.hashes)
     assert d.compare(a) == 1.0
 
-    c = c.downsample_num(b.num)
+    c = c.downsample_n(b.num)
     assert c.compare(b) == 1.0
 
 
@@ -873,10 +873,10 @@ def test_mh_inplace_concat_asymmetric(track_abundance):
     except TypeError as exc:
         assert 'must have same num' in str(exc)
 
-    a = a.downsample_num(d.num)
+    a = a.downsample_n(d.num)
     assert d.compare(a) == 1.0 # see: d += a, above.
 
-    c = c.downsample_num(b.num)
+    c = c.downsample_n(b.num)
     assert c.compare(b) == 0.5
 
 
diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index 679e0723f4..43a4c355a4 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -189,18 +189,18 @@ def test_jaccard_on_real_data():
     assert mh1.compare(mh2) == 0.0183
     assert mh2.compare(mh1) == 0.0183
 
-    mh1 = mh1.downsample_num(1000)
-    mh2 = mh2.downsample_num(1000)
+    mh1 = mh1.downsample_n(1000)
+    mh2 = mh2.downsample_n(1000)
     assert mh1.compare(mh2) == 0.011
     assert mh2.compare(mh1) == 0.011
 
-    mh1 = mh1.downsample_num(100)
-    mh2 = mh2.downsample_num(100)
+    mh1 = mh1.downsample_n(100)
+    mh2 = mh2.downsample_n(100)
     assert mh1.compare(mh2) == 0.01
     assert mh2.compare(mh1) == 0.01
 
-    mh1 = mh1.downsample_num(10)
-    mh2 = mh2.downsample_num(10)
+    mh1 = mh1.downsample_n(10)
+    mh2 = mh2.downsample_n(10)
     assert mh1.compare(mh2) == 0.0
     assert mh2.compare(mh1) == 0.0
 
@@ -221,24 +221,24 @@ def test_scaled_on_real_data():
     assert round(mh1.compare(mh2), 5) == 0.01644
     assert round(mh2.compare(mh1), 5) == 0.01644
 
-    mh1 = mh1.downsample_num(10000)
-    mh2 = mh2.downsample_num(10000)
+    mh1 = mh1.downsample_n(10000)
+    mh2 = mh2.downsample_n(10000)
 
     assert mh1.compare(mh2) == 0.0183
     assert mh2.compare(mh1) == 0.0183
 
-    mh1 = mh1.downsample_num(1000)
-    mh2 = mh2.downsample_num(1000)
+    mh1 = mh1.downsample_n(1000)
+    mh2 = mh2.downsample_n(1000)
     assert mh1.compare(mh2) == 0.011
     assert mh2.compare(mh1) == 0.011
 
-    mh1 = mh1.downsample_num(100)
-    mh2 = mh2.downsample_num(100)
+    mh1 = mh1.downsample_n(100)
+    mh2 = mh2.downsample_n(100)
     assert mh1.compare(mh2) == 0.01
     assert mh2.compare(mh1) == 0.01
 
-    mh1 = mh1.downsample_num(10)
-    mh2 = mh2.downsample_num(10)
+    mh1 = mh1.downsample_n(10)
+    mh2 = mh2.downsample_n(10)
     assert mh1.compare(mh2) == 0.0
     assert mh2.compare(mh1) == 0.0
 

From 70edc789cea406dc680d3665e1a45aaa677d86a3 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Mon, 27 Jul 2020 07:58:46 -0700
Subject: [PATCH 14/50] minor update: rename update_xxx to update

---
 sourmash/minhash.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 48cee4ae5d..ffea1e6ab4 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -262,7 +262,7 @@ def remove_many(self, hashes):
         "Remove many hashes at once; ``hashes`` must be an iterable."
         self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
 
-    def update_xxx(self, other):
+    def update(self, other):
         "Update this sketch from all the hashes in the other."
         self.add_many(other)
 
@@ -609,4 +609,3 @@ def moltype(self):                    # TODO: test in minhash tests
             return 'hp'
         else:
             return 'DNA'
-

From eb6b971ed2b62cdda0481d9aea5ecf3fdbf637e9 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 28 Jul 2020 06:12:43 -0700
Subject: [PATCH 15/50] add deprecations

---
 sourmash/minhash.py | 38 ++++++++++++++++++++++++++++++++++----
 1 file changed, 34 insertions(+), 4 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index ffea1e6ab4..21b2cba1b1 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -243,6 +243,14 @@ def add_sequence(self, sequence, force=False):
         self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence),
                          force)
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use add_kmer instead.')
+    def add(self, kmer):
+        "Add a kmer into the sketch."
+        self.add_sequence(kmer)
+
+
     def add_kmer(self, kmer):
         "Add a kmer into the sketch."
         self.add_sequence(kmer)
@@ -262,6 +270,9 @@ def remove_many(self, hashes):
         "Remove many hashes at once; ``hashes`` must be an iterable."
         self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use add_many instead.')
     def update(self, other):
         "Update this sketch from all the hashes in the other."
         self.add_many(other)
@@ -270,6 +281,9 @@ def __len__(self):
         "Number of hashes."
         return self._methodcall(lib.kmerminhash_get_mins_size)
 
+    @deprecated(deprecated_in="3.5", removed_in="5.0",
+                current_version=VERSION,
+                details='Use .hashes property instead.')
     def get_mins(self, with_abundance=False):
         """Return list of hashes or if ``with_abundance`` a list
         of (hash, abund).
@@ -293,6 +307,9 @@ def get_mins(self, with_abundance=False):
 
         return result
 
+    @deprecated(deprecated_in="3.5", removed_in="5.0",
+                current_version=VERSION,
+                details='Use .hashes property instead.')
     def get_hashes(self):
         "Return the list of hashes."
         return self.get_mins()
@@ -301,6 +318,8 @@ def get_hashes(self):
     def hashes(self):
         return self.get_mins(with_abundance=True)
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION)
     def subtract_mins(self, other):
         """Get the list of mins in this MinHash, after removing the ones in
         ``other``.
@@ -323,22 +342,18 @@ def scaled(self):
             return _get_scaled_for_max_hash(self.max_hash)
         return 0
 
-    # @CTB
     @property
     def is_dna(self):
         return not (self.is_protein or self.dayhoff or self.hp)
 
-    # @CTB
     @property
     def is_protein(self):
         return self._methodcall(lib.kmerminhash_is_protein)
 
-    # @CTB
     @property
     def dayhoff(self):
         return self._methodcall(lib.kmerminhash_dayhoff)
 
-    # @CTB
     @property
     def hp(self):
         return self._methodcall(lib.kmerminhash_hp)
@@ -348,6 +363,9 @@ def ksize(self):
         return self._methodcall(lib.kmerminhash_ksize)
 
     @property
+    @deprecated(deprecated_in="3.5", removed_in="5.0",
+                current_version=VERSION,
+                details='Use scaled instead.')
     def max_hash(self):
         return self._methodcall(lib.kmerminhash_max_hash)
 
@@ -403,6 +421,9 @@ def count_common(self, other, downsample=False):
             raise TypeError("Must be a MinHash!")
         return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample)
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use downsample(num=...) instead.')
     def downsample_n(self, new_num):
         "Copy this object and downsample new object to num=``new_num``."
         if self.num and self.num < new_num:
@@ -418,6 +439,9 @@ def downsample_n(self, new_num):
 
         return a
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use scaled instead.')
     def downsample_max_hash(self, *others):
         """Copy this object and downsample new object to min of ``*others``.
 
@@ -429,6 +453,9 @@ def downsample_max_hash(self, *others):
 
         return self.downsample_scaled(new_scaled)
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use downsample(scaled=...) instead.')
     def downsample_scaled(self, new_scaled):
         """Copy this object and downsample new object to scaled=``new_scaled``.
         """
@@ -589,6 +616,9 @@ def add_protein(self, sequence):
         "Add a protein sequence."
         self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence))
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use the moltype property instead.')
     def is_molecule_type(self, molecule):
         """Check if this MinHash is a particular human-readable molecule type.
 

From 464dcca6551f77122916a1383fe9ac45c9bb577c Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 28 Jul 2020 16:42:46 -0700
Subject: [PATCH 16/50] use a wrapper object for .hashes and make it read-only

---
 sourmash/minhash.py    | 31 ++++++++++++++++++++++++++++++-
 tests/test__minhash.py | 20 ++++++++++++++++----
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 21b2cba1b1..0381dec07e 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -3,6 +3,7 @@
 
 import math
 import copy
+import collections
 
 from . import VERSION
 from ._compat import string_types, range_type
@@ -73,6 +74,30 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
     return lib.hash_murmur(to_bytes(kmer), seed)
 
 
+class _HashesWrapper(collections.Mapping):
+    def __init__(self, h):
+        self._data = h
+
+    def __getitem__(self, key):
+        print(key, self._data)
+        return self._data[key]
+
+    def __repr__(self):
+        return repr(self._data)
+
+    def __len__(self):
+        return len(self._data)
+
+    def __iter__(self):
+        return iter(self._data)
+
+    def __eq__(self, other):
+        return list(self.items()) == list(other.items())
+
+    def __setitem__(self, k, v):
+        raise RuntimeError("cannot modify hashes directly; use 'add' methods")
+
+
 class MinHash(RustObject):
     """\
     The core sketch object for sourmash.
@@ -316,7 +341,11 @@ def get_hashes(self):
 
     @property
     def hashes(self):
-        return self.get_mins(with_abundance=True)
+        if self.track_abundance:
+            return _HashesWrapper(self.get_mins(with_abundance=True))
+        else:
+            d = self.get_mins()
+            return _HashesWrapper({ k : 1 for k in d })
 
     @deprecated(deprecated_in="3.5", removed_in="4.0",
                 current_version=VERSION)
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 5dc518e6f5..2608dd5317 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -200,6 +200,8 @@ def test_dayhoff(track_abundance):
     mh_protein.add_sequence('ACTGAC')
 
     assert len(mh_protein.hashes) == 2
+    print(mh_protein.hashes)
+    print(mh_dayhoff.hashes)
     assert mh_protein.hashes != mh_dayhoff.hashes
 
 
@@ -1127,8 +1129,7 @@ def test_abundance_count_common():
     assert a.count_common(b) == 1
     assert a.count_common(b) == b.count_common(a)
 
-    assert b.hashes == [2110480117637990133,
-                                               10798773792509008305]
+    assert list(b.hashes) == [2110480117637990133, 10798773792509008305]
 
 
 def test_abundance_similarity():
@@ -1235,7 +1236,7 @@ def test_reset_abundance_initialized():
     # Convert from Abundance to Regular MinHash
     a.track_abundance = False
 
-    assert a.hashes == [12415348535738636339]
+    assert list(a.hashes) == [12415348535738636339]
 
 
 def test_set_abundance_initialized():
@@ -1462,7 +1463,7 @@ def test_add_many(track_abundance):
     b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
 
     a.add_many(list(range(0, 100, 2)))
-    a.add_many(list(range(0, 100, 2)))
+    a.add_many(list(range(0, 100, 2)))    # => abundance = 2
 
     assert len(a) == 50
     assert all(c % 2 == 0 for c in a.hashes)
@@ -1484,3 +1485,14 @@ def test_set_abundances_huge():
     abundances = itertools.repeat(2)
 
     a.set_abundances(dict(zip(hashes, abundances)))
+
+
+def test_try_change_hashes(track_abundance):
+    a = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+    b = MinHash(0, 10, track_abundance=track_abundance, scaled=scaled5000)
+
+    a.add_many(list(range(0, 100, 2)))
+
+    h = a.hashes
+    with pytest.raises(RuntimeError):
+        h[5] = 10

From 23171d9e5cf015d79c9ae020313645431534ca63 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 28 Jul 2020 16:58:56 -0700
Subject: [PATCH 17/50] refactor to use downsample(num=...) / downsample(scaled=...)

---
 sourmash/minhash.py         | 82 ++++++++++++++++++-------------------
 sourmash/sig/__main__.py    |  4 +-
 tests/test_cmd_signature.py |  4 +-
 3 files changed, 44 insertions(+), 46 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 0381dec07e..dbc4cf7172 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -79,7 +79,6 @@ def __init__(self, h):
         self._data = h
 
     def __getitem__(self, key):
-        print(key, self._data)
         return self._data[key]
 
     def __repr__(self):
@@ -450,17 +449,41 @@ def count_common(self, other, downsample=False):
             raise TypeError("Must be a MinHash!")
         return self._methodcall(lib.kmerminhash_count_common, other._get_objptr(), downsample)
 
-    @deprecated(deprecated_in="3.5", removed_in="4.0",
-                current_version=VERSION,
-                details='Use downsample(num=...) instead.')
-    def downsample_n(self, new_num):
-        "Copy this object and downsample new object to num=``new_num``."
-        if self.num and self.num < new_num:
-            raise ValueError("new sample n is higher than current sample n")
+    def downsample(self, num=None, scaled=None):
+        """Copy this object and downsample new object to either `num` or
+        `scaled`.
+        """
+        if num is None and scaled is None:
+            raise ValueError('must specify either num or scaled to downsample')
+        elif num is not None:
+            if self.num and self.num < num:
+                raise ValueError("new sample num is higher than current sample num")
+            max_hash=0
+        elif scaled is not None:
+            if self.num:
+                raise ValueError("num != 0 - cannot downsample a standard MinHash")
+            max_hash = self.max_hash
+            if max_hash is None:
+                raise ValueError("no max_hash available - cannot downsample")
+
+            old_scaled = _get_scaled_for_max_hash(self.max_hash)
+            if old_scaled > scaled:
+                raise ValueError(
+                    "new scaled {} is lower than current sample scaled {}".format(
+                        scaled, old_scaled
+                    )
+                )
+
+            max_hash = _get_max_hash_for_scaled(scaled)
+            num = 0
+        ###
 
+        # create new object:
         a = MinHash(
-            new_num, self.ksize, self.is_protein, self.dayhoff, self.hp, self.track_abundance, self.seed, 0
+            num, self.ksize, self.is_protein, self.dayhoff, self.hp,
+            self.track_abundance, self.seed, max_hash
         )
+        # copy over hashes:
         if self.track_abundance:
             a.set_abundances(self.get_mins(with_abundance=True))
         else:
@@ -468,6 +491,13 @@ def downsample_n(self, new_num):
 
         return a
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use downsample(num=...) instead.')
+    def downsample_n(self, new_num):
+        "Copy this object and downsample new object to num=``new_num``."
+        return self.downsample(num=new_num)
+
     @deprecated(deprecated_in="3.5", removed_in="4.0",
                 current_version=VERSION,
                 details='Use scaled instead.')
@@ -488,39 +518,7 @@ def downsample_max_hash(self, *others):
     def downsample_scaled(self, new_scaled):
         """Copy this object and downsample new object to scaled=``new_scaled``.
         """
-        if self.num:
-            raise ValueError("num != 0 - cannot downsample a standard MinHash")
-
-        max_hash = self.max_hash
-        if max_hash is None:
-            raise ValueError("no max_hash available - cannot downsample")
-
-        old_scaled = _get_scaled_for_max_hash(self.max_hash)
-        if old_scaled > new_scaled:
-            raise ValueError(
-                "new scaled {} is lower than current sample scaled {}".format(
-                    new_scaled, old_scaled
-                )
-            )
-
-        new_max_hash = _get_max_hash_for_scaled(new_scaled)
-
-        a = MinHash(
-            0,
-            self.ksize,
-            self.is_protein,
-            self.dayhoff,
-            self.hp,
-            self.track_abundance,
-            self.seed,
-            new_max_hash,
-        )
-        if self.track_abundance:
-            a.set_abundances(self.get_mins(with_abundance=True))
-        else:
-            a.add_many(self)
-
-        return a
+        return self.downsample(scaled=new_scaled)
 
     @deprecated(deprecated_in="3.3", removed_in="4.0",
                 current_version=VERSION,
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index da575394de..ae7aa6d5d6 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -727,7 +727,7 @@ def downsample(args):
             total_loaded += 1
             if args.scaled:
                 if mh.scaled:
-                    mh_new = mh.downsample_scaled(args.scaled)
+                    mh_new = mh.downsample(scaled=args.scaled)
                 else:                         # try to turn a num into a scaled
                     # first check: can we?
                     max_hash = _get_max_hash_for_scaled(args.scaled)
@@ -739,7 +739,7 @@ def downsample(args):
                     _set_num_scaled(mh_new, 0, args.scaled)
             elif args.num:
                 if mh.num:
-                    mh_new = mh.downsample_num(args.num)
+                    mh_new = mh.downsample(num=args.num)
                 else:                         # try to turn a scaled into a num
                     # first check: can we?
                     if len(mh) < args.num:
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 41bda45fb8..c2dff1f758 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -1042,7 +1042,7 @@ def test_sig_downsample_1_scaled(c):
     test_downsample_sig = sourmash.load_one_signature(sig47)
     actual_downsample_sig = sourmash.load_one_signature(out)
 
-    test_mh = test_downsample_sig.minhash.downsample_scaled(10000)
+    test_mh = test_downsample_sig.minhash.downsample(scaled=10000)
 
     assert actual_downsample_sig.minhash == test_mh
 
@@ -1114,7 +1114,7 @@ def test_sig_downsample_2_num(c):
     test_downsample_sig = sourmash.load_one_signature(sigs11, ksize=21,
                                                       select_moltype='DNA')
     actual_downsample_sig = sourmash.load_one_signature(out)
-    test_mh = test_downsample_sig.minhash.downsample_num(500)
+    test_mh = test_downsample_sig.minhash.downsample(num=500)
 
     assert actual_downsample_sig.minhash == test_mh
 

From 02239d977ffb958aaa622504712b5b58085657b4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 06:39:37 -0700
Subject: [PATCH 18/50] refactor to use downsample(scaled=...)

---
 tests/test__minhash.py | 14 +++++++-------
 tests/test_jaccard.py  | 12 ++++++------
 tests/test_lca.py      | 20 ++++++++++----------
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 2608dd5317..9d33c17cff 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -289,7 +289,7 @@ def test_no_downsample_scaled_if_n(track_abundance):
     # make sure you can't set max_n and then downsample scaled
     mh = MinHash(2, 4, track_abundance=track_abundance)
     with pytest.raises(ValueError) as excinfo:
-        mh.downsample_scaled(100000000)
+        mh.downsample(scaled=100000000)
 
     assert 'cannot downsample a standard MinHash' in str(excinfo.value)
 
@@ -705,7 +705,7 @@ def test_mh_count_common_notmh(track_abundance):
 def test_mh_downsample_num_error(track_abundance):
     a = MinHash(20, 10, track_abundance=track_abundance)
     with pytest.raises(ValueError):
-        a.downsample_n(30)
+        a.downsample(num=30)
 
 
 def test_mh_jaccard_asymmetric_num(track_abundance):
@@ -725,7 +725,7 @@ def test_mh_jaccard_asymmetric_num(track_abundance):
     with pytest.raises(TypeError):
         a.jaccard(b)
 
-    a = a.downsample_n(10)
+    a = a.downsample(num=10)
     # CTB note: this used to be 'compare', is now 'jaccard'
     assert a.jaccard(b) == 0.5
     assert b.jaccard(a) == 0.5
@@ -850,14 +850,14 @@ def test_mh_asymmetric_merge(track_abundance):
     with pytest.raises(TypeError):
         d.jaccard(a)
 
-    a = a.downsample_n(d.num)
+    a = a.downsample(num=d.num)
 
     if track_abundance:
         assert round(d.similarity(a), 3) == 0.91
     else:
         assert round(d.similarity(a), 3) == 1.0
 
-    c = c.downsample_n(b.num)
+    c = c.downsample(num=b.num)
     if track_abundance:
         assert round(c.similarity(b), 3) == 0.91
     else:
@@ -891,13 +891,13 @@ def test_mh_inplace_concat_asymmetric(track_abundance):
     except TypeError as exc:
         assert 'must have same num' in str(exc)
 
-    a = a.downsample_n(d.num)
+    a = a.downsample(num=d.num)
     if track_abundance:
         assert round(d.similarity(a), 3) == 0.795 # see: d += a, above.
     else:
         assert d.similarity(a) == 1.0 # see: d += a, above.
 
-    c = c.downsample_n(b.num)
+    c = c.downsample(num=b.num)
     if track_abundance:
         assert round(c.similarity(b), 3) == 0.436
     else:
diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index eede4d1110..34987a0c18 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -259,18 +259,18 @@ def test_scaled_on_real_data_2():
     assert round(mh1.similarity(mh2), 5) == 0.01644
     assert round(mh2.similarity(mh1), 5) == 0.01644
 
-    mh1 = mh1.downsample_scaled(1000)
-    mh2 = mh2.downsample_scaled(1000)
+    mh1 = mh1.downsample(scaled=1000)
+    mh2 = mh2.downsample(scaled=1000)
 
     assert round(mh1.similarity(mh2), 4) == 0.0187
     assert round(mh2.similarity(mh1), 4) == 0.0187
 
-    mh1 = mh1.downsample_scaled(10000)
-    mh2 = mh2.downsample_scaled(10000)
+    mh1 = mh1.downsample(scaled=10000)
+    mh2 = mh2.downsample(scaled=10000)
     assert round(mh1.similarity(mh2), 3) == 0.01
     assert round(mh2.similarity(mh1), 3) == 0.01
 
-    mh1 = mh1.downsample_scaled(100000)
-    mh2 = mh2.downsample_scaled(100000)
+    mh1 = mh1.downsample(scaled=100000)
+    mh2 = mh2.downsample(scaled=100000)
     assert round(mh1.similarity(mh2), 2) == 0.01
     assert round(mh2.similarity(mh1), 2) == 0.01
diff --git a/tests/test_lca.py b/tests/test_lca.py
index ff8312d798..568661fd8e 100644
--- a/tests/test_lca.py
+++ b/tests/test_lca.py
@@ -322,8 +322,8 @@ def test_api_create_insert_two_then_scale():
     # downsample everything to 5000
     lca_db.downsample_scaled(5000)
 
-    ss.minhash = ss.minhash.downsample_scaled(5000)
-    ss2.minhash = ss2.minhash.downsample_scaled(5000)
+    ss.minhash = ss.minhash.downsample(scaled=5000)
+    ss2.minhash = ss2.minhash.downsample(scaled=5000)
 
     # & check...
     combined_mins = set(ss.minhash.get_mins())
@@ -342,12 +342,12 @@ def test_api_create_insert_scale_two():
     lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=5000)
     count = lca_db.insert(ss)
     assert count == 1037
-    assert count == len(ss.minhash.downsample_scaled(5000))
+    assert count == len(ss.minhash.downsample(scaled=5000))
     lca_db.insert(ss2)
 
     # downsample sigs to 5000
-    ss.minhash = ss.minhash.downsample_scaled(5000)
-    ss2.minhash = ss2.minhash.downsample_scaled(5000)
+    ss.minhash = ss.minhash.downsample(scaled=5000)
+    ss2.minhash = ss2.minhash.downsample(scaled=5000)
 
     # & check...
     combined_mins = set(ss.minhash.get_mins())
@@ -431,7 +431,7 @@ def test_search_db_scaled_gt_sig_scaled():
     results = db.search(sig, threshold=.01, ignore_abundance=True)
     match_sig = results[0][1]
 
-    sig.minhash = sig.minhash.downsample_scaled(10000)
+    sig.minhash = sig.minhash.downsample(scaled=10000)
     assert sig.minhash == match_sig.minhash
 
 
@@ -439,7 +439,7 @@ def test_search_db_scaled_lt_sig_scaled():
     dbfile = utils.get_test_data('lca/47+63.lca.json')
     db, ksize, scaled = lca_utils.load_single_database(dbfile)
     sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig.minhash = sig.minhash.downsample_scaled(100000)
+    sig.minhash = sig.minhash.downsample(scaled=100000)
 
     with pytest.raises(ValueError) as e:
         results = db.search(sig, threshold=.01, ignore_abundance=True)
@@ -453,7 +453,7 @@ def test_gather_db_scaled_gt_sig_scaled():
     results = db.gather(sig, threshold=.01, ignore_abundance=True)
     match_sig = results[0][1]
 
-    sig.minhash = sig.minhash.downsample_scaled(10000)
+    sig.minhash = sig.minhash.downsample(scaled=10000)
     assert sig.minhash == match_sig.minhash
 
 
@@ -461,12 +461,12 @@ def test_gather_db_scaled_lt_sig_scaled():
     dbfile = utils.get_test_data('lca/47+63.lca.json')
     db, ksize, scaled = lca_utils.load_single_database(dbfile)
     sig = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'))
-    sig.minhash = sig.minhash.downsample_scaled(100000)
+    sig.minhash = sig.minhash.downsample(scaled=100000)
 
     results = db.gather(sig, threshold=.01, ignore_abundance=True)
     match_sig = results[0][1]
 
-    match_sig.minhash = match_sig.minhash.downsample_scaled(100000)
+    match_sig.minhash = match_sig.minhash.downsample(scaled=100000)
     assert sig.minhash == match_sig.minhash
 
 

From 07cb47491b735664f9c225aab3738e73d5473406 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 06:42:09 -0700
Subject: [PATCH 19/50] return two deleted tests

---
 tests/test__minhash.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 9d33c17cff..98bbe13bcb 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -278,6 +278,13 @@ def test_max_hash_and_scaled_zero():
     assert max_hash == 0
 
 
+def test_max_hash_and_scaled_error(track_abundance):
+    # test behavior when supplying both max_hash and scaled
+    with pytest.raises(ValueError):
+        mh = MinHash(0, 4, track_abundance=track_abundance, max_hash=35,
+                     scaled=5)
+
+
 def test_max_hash_cannot_limit(track_abundance):
     # make sure you can't set both n and scaled.
     with pytest.raises(ValueError):
@@ -1312,6 +1319,19 @@ def test_scaled_property(track_abundance):
     assert a.scaled == scaled
 
 
+def test_mh_subtract(track_abundance):
+    # test subtracting two identically configured minhashes
+    a = MinHash(20, 10, track_abundance=track_abundance)
+    for i in range(0, 40, 2):
+        a.add_hash(i)
+
+    b = MinHash(20, 10, track_abundance=track_abundance)
+    for i in range(0, 80, 4):
+        b.add_hash(i)
+
+    assert a.subtract_mins(b) == set(range(2, 40, 4))
+
+
 def test_pickle_max_hash(track_abundance):
     a = MinHash(0, 10, track_abundance=track_abundance,
                 scaled=_get_scaled_for_max_hash(20))

From 9d178c950fb45a53b9274903b2c118cd577aea58 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 06:47:08 -0700
Subject: [PATCH 20/50] fixed test that was masked by another test

---
 sourmash/minhash.py    | 1 +
 tests/test__minhash.py | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index dbc4cf7172..14390a0465 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -75,6 +75,7 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
 
 
 class _HashesWrapper(collections.Mapping):
+    "A read-only view of the hashes contained by a MinHash object."
     def __init__(self, h):
         self._data = h
 
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 98bbe13bcb..b6a7d0cf7d 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -251,11 +251,12 @@ def test_scaled(track_abundance):
     mh.add_hash(10)
     mh.add_hash(20)
     mh.add_hash(30)
-    assert mh.hashes == [10, 20, 30]
+
+    assert list(mh.hashes) == [10, 20, 30]
     mh.add_hash(40)
-    assert mh.hashes == [10, 20, 30]
+    assert list(mh.hashes) == [10, 20, 30]
     mh.add_hash(36)
-    assert mh.hashes == [10, 20, 30]
+    assert list(mh.hashes) == [10, 20, 30]
 
 
 def test_no_scaled(track_abundance):
@@ -301,7 +302,7 @@ def test_no_downsample_scaled_if_n(track_abundance):
     assert 'cannot downsample a standard MinHash' in str(excinfo.value)
 
 
-def test_scaled(track_abundance):
+def test_scaled_num_both(track_abundance):
     # make sure you can't set both max_n and scaled.
     with pytest.raises(ValueError):
         mh = MinHash(2, 4, track_abundance=track_abundance, scaled=2)

From 3b2b35bb83e43de6cae5b4a59263970263680753 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 07:04:06 -0700
Subject: [PATCH 21/50] add explicit check for length of kmer in add_kmer

---
 sourmash/minhash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 14390a0465..3f2e9cb9d7 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -275,9 +275,9 @@ def add(self, kmer):
         "Add a kmer into the sketch."
         self.add_sequence(kmer)
 
-
     def add_kmer(self, kmer):
         "Add a kmer into the sketch."
+        assert len(kmer) == self.ksize
         self.add_sequence(kmer)
 
     def add_many(self, hashes):

From f6faf89736cff787c53dcc464a8d96b5a0976e82 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 09:22:53 -0700
Subject: [PATCH 22/50] fix ordering in hash retrieval

---
 tests/test__minhash.py | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index b6a7d0cf7d..8e7fe40438 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -236,9 +236,9 @@ def test_size_limit(track_abundance):
     mh.add_hash(10)
     mh.add_hash(20)
     mh.add_hash(30)
-    assert list(mh.hashes) == [10, 20, 30]
+    assert list(sorted(mh.hashes)) == [10, 20, 30]
     mh.add_hash(5) # -> should push 30 off end
-    assert list(mh.hashes) == [5, 10, 20]
+    assert list(sorted(mh.hashes)) == [5, 10, 20]
 
 
 def test_scaled(track_abundance):
@@ -252,11 +252,11 @@ def test_scaled(track_abundance):
     mh.add_hash(20)
     mh.add_hash(30)
 
-    assert list(mh.hashes) == [10, 20, 30]
+    assert list(sorted(mh.hashes)) == [10, 20, 30]
     mh.add_hash(40)
-    assert list(mh.hashes) == [10, 20, 30]
+    assert list(sorted(mh.hashes)) == [10, 20, 30]
     mh.add_hash(36)
-    assert list(mh.hashes) == [10, 20, 30]
+    assert list(sorted(mh.hashes)) == [10, 20, 30]
 
 
 def test_no_scaled(track_abundance):
@@ -646,7 +646,7 @@ def test_mh_len(track_abundance):
     for i in range(0, 40, 2):
         a.add_hash(i)
 
-    assert list(a.hashes) == list(range(0, 40, 2))
+    assert list(sorted(a.hashes)) == list(range(0, 40, 2))
 
 
 def test_mh_unsigned_long_long(track_abundance):
@@ -759,7 +759,7 @@ def test_mh_merge(track_abundance):
     d = b.merge(a)
 
     assert len(c) == len(d)
-    assert list(c.hashes) == list(d.hashes)
+    assert list(sorted(c.hashes)) == list(d.hashes)
 
     if track_abundance:
         assert round(c.similarity(d), 3) == 0.91
@@ -783,7 +783,7 @@ def test_mh_merge_empty_num(track_abundance):
     assert len(c)
     assert len(c) == len(d)
 
-    assert list(c.hashes) == list(d.hashes)
+    assert list(sorted(c.hashes)) == list(sorted(d.hashes))
     assert round(c.similarity(d), 3) == 1.0
     assert round(d.similarity(c), 3) == 1.0
 
@@ -802,7 +802,7 @@ def test_mh_merge_empty_scaled(track_abundance):
     assert len(c)
     assert len(c) == len(d)
 
-    assert list(c.hashes) == list(d.hashes)
+    assert list(sorted(c.hashes)) == list(sorted(d.hashes))
     assert round(c.similarity(d), 3) == 1.0
     assert round(d.similarity(c), 3) == 1.0
 
@@ -1137,7 +1137,7 @@ def test_abundance_count_common():
     assert a.count_common(b) == 1
     assert a.count_common(b) == b.count_common(a)
 
-    assert list(b.hashes) == [2110480117637990133, 10798773792509008305]
+    assert list(sorted(b.hashes)) == [2110480117637990133, 10798773792509008305]
 
 
 def test_abundance_similarity():
@@ -1200,7 +1200,7 @@ def test_set_abundance_clear():
     a.set_abundances({1: 3, 2: 4}, clear=True)
     b.set_abundances({1: 3, 2: 4}, clear=False)
 
-    assert list(a.hashes) == list(b.hashes)
+    assert list(sorted(a.hashes)) == list(sorted(b.hashes))
 
 
 def test_set_abundance_clear_2():

From aa5441f37530d7269035e5be8c7f80d3bc88dd2a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 09:25:31 -0700
Subject: [PATCH 23/50] fix more tests for py2 <khaaaaaaaaaan>

---
 tests/test__minhash.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 8e7fe40438..e5f8541662 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -759,7 +759,7 @@ def test_mh_merge(track_abundance):
     d = b.merge(a)
 
     assert len(c) == len(d)
-    assert list(sorted(c.hashes)) == list(d.hashes)
+    assert list(sorted(c.hashes)) == list(sorted(d.hashes))
 
     if track_abundance:
         assert round(c.similarity(d), 3) == 0.91
@@ -1189,7 +1189,7 @@ def test_set_abundance_2():
     new_mh.track_abundance = True
     new_mh.set_abundances(mins)
 
-    assert new_mh.hashes == mins
+    assert set(new_mh.hashes) == set(mins)
 
 
 def test_set_abundance_clear():

From 64f99e34b28f74cc8708d1efadf3bc49d357ae89 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 17:05:23 -0700
Subject: [PATCH 24/50] add 'flatten' method to MinHash

---
 sourmash/minhash.py      | 11 +++++++++++
 sourmash/sig/__main__.py |  6 +-----
 2 files changed, 12 insertions(+), 5 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 3f2e9cb9d7..2c8c37c19a 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -555,6 +555,17 @@ def intersection(self, other, in_common=False):
 
         return common, max(size, 1)
 
+    def flatten(self):
+        """Return a new MinHash with track_abundance=False."""
+        # create new object:
+        a = MinHash(
+            self.num, self.ksize, self.is_protein, self.dayhoff, self.hp,
+            False, self.seed, self.max_hash
+        )
+        a.add_many(self)
+
+        return a
+
     def jaccard(self, other, downsample=False):
         "Calculate Jaccard similarity of two MinHash objects."
         if self.num != other.num:
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index ae7aa6d5d6..5c892668a1 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -677,11 +677,7 @@ def flatten(args):
             siglist = [ ss for ss in siglist if args.name in ss.name() ]
 
         for ss in siglist:
-            flattened_mh = ss.minhash.copy_and_clear()
-            flattened_mh.track_abundance = False
-            flattened_mh.add_many(ss.minhash.hashes)
-
-            ss.minhash = flattened_mh
+            ss.minhash = ss.minhash.flatten()
 
         outlist.extend(siglist)
 

From d6222b6e3cdfe9a9266c88e7958c19de6fd1b005 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 29 Jul 2020 17:10:09 -0700
Subject: [PATCH 25/50] add test for MinHash.flatten

---
 tests/test__minhash.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index e5f8541662..c14233432e 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1517,3 +1517,30 @@ def test_try_change_hashes(track_abundance):
     h = a.hashes
     with pytest.raises(RuntimeError):
         h[5] = 10
+
+
+def test_flatten():
+    # test behavior with scaled
+    scaled = _get_scaled_for_max_hash(35)
+    mh = MinHash(0, 4, track_abundance=True, scaled=scaled)
+    assert mh.max_hash == 35
+
+    mh.add_hash(10)
+    mh.add_hash(10)
+    mh.add_hash(10)
+    mh.add_hash(20)
+    mh.add_hash(20)
+    mh.add_hash(30)
+    mh.add_hash(30)
+    mh.add_hash(30)
+
+    assert mh.hashes[10] == 3
+    assert mh.hashes[20] == 2
+    assert mh.hashes[30] == 3
+
+    mh2 = mh.flatten()
+
+    assert mh2.hashes[10] == 1
+    assert mh2.hashes[20] == 1
+    assert mh2.hashes[30] == 1
+    assert len(mh2) == 3

From 372f4ec23469f7865902d43d584580698e6215a4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 06:09:40 -0700
Subject: [PATCH 26/50] add tests for add and add_kmer

---
 sourmash/minhash.py    |  3 ++-
 tests/test__minhash.py | 44 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 1 deletion(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 2c8c37c19a..5630b37f70 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -277,7 +277,8 @@ def add(self, kmer):
 
     def add_kmer(self, kmer):
         "Add a kmer into the sketch."
-        assert len(kmer) == self.ksize
+        if len(kmer) != self.ksize:
+            raise ValueError("kmer to add is not {} in length".format(self.ksize))
         self.add_sequence(kmer)
 
     def add_many(self, hashes):
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index c14233432e..4eedf87bd2 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1544,3 +1544,47 @@ def test_flatten():
     assert mh2.hashes[20] == 1
     assert mh2.hashes[30] == 1
     assert len(mh2) == 3
+
+
+def test_add_kmer(track_abundance):
+    # test add_kmer method
+    mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance)
+    mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance)
+
+    mh1.add_sequence('ATGCGTGC')
+    a = mh1.hashes
+
+    mh2.add_kmer('ATGC')
+    mh2.add_kmer('TGCG')
+    mh2.add_kmer('GCGT')
+    mh2.add_kmer('CGTG')
+    mh2.add_kmer('GTGC')
+    b = mh2.hashes
+
+    assert set(a.items()) == set(b.items())
+
+
+def test_add_kmer_too_long(track_abundance):
+    # test add_kmer method - should only take length k
+    mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance)
+
+    with pytest.raises(ValueError):
+        mh1.add_kmer('ATGCGTGC')
+
+
+def test_add_deprecated(track_abundance):
+    # test 'add' method, now deprecated
+    mh1 = MinHash(0, 4, scaled=1, track_abundance=track_abundance)
+    mh2 = MinHash(0, 4, scaled=1, track_abundance=track_abundance)
+
+    mh1.add_sequence('ATGCGTGC')
+    a = mh1.hashes
+
+    mh2.add('ATGC')
+    mh2.add('TGCG')
+    mh2.add('GCGT')
+    mh2.add('CGTG')
+    mh2.add('GTGC')
+    b = mh2.hashes
+
+    assert set(a.items()) == set(b.items())

From 4ef2505d59e5c328343f2f27dfdcfc2fcf0fae5f Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 06:21:34 -0700
Subject: [PATCH 27/50] remove nonsense test

---
 tests/test__minhash.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 4eedf87bd2..9fd4b27a07 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1257,18 +1257,6 @@ def test_set_abundance_initialized():
     assert "Can only set track_abundance=True if the MinHash is empty" in e.value.args[0]
 
 
-def test_reviving_minhash():
-    # simulate reading a MinHash from disk
-    scaled = _get_max_hash_for_scaled(184467440737095520)
-    mh = MinHash(0, 21, scaled=scaled, seed=42, track_abundance=False)
-    mins = (28945103950853965, 74690756200987412, 82962372765557409,
-            93503551367950366, 106923350319729608, 135116761470196737,
-            160165359281648267, 162390811417732001, 177939655451276972)
-
-    for m in mins:
-        mh.add_hash(m)
-
-
 def test_set_abundance_num():
     a = MinHash(2, 10, track_abundance=True)
 

From 7b77b770f7a5dc5ac22e35f4d8938de046af9efe Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 06:24:00 -0700
Subject: [PATCH 28/50] test the (now deprecated) get_mins function

---
 tests/test__minhash.py | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 9fd4b27a07..b9b15bf19b 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1576,3 +1576,20 @@ def test_add_deprecated(track_abundance):
     b = mh2.hashes
 
     assert set(a.items()) == set(b.items())
+
+
+def test_get_mins_deprecated(track_abundance):
+    mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
+    mins = (28945103950853965, 74690756200987412, 82962372765557409)
+
+    mh.add_many(mins)
+    mh.add_many(mins)
+    mh.add_many(mins)
+    mh.add_many(mins)
+
+    assert set(mh.get_mins()) == set(mins)
+    if track_abundance:
+        d = mh.get_mins(with_abundance=True)
+        for k in mins:
+            assert d[k] == 4
+        assert len(d) == len(mins)

From 1cb391ca2f50b35e85fc62f2589104900fd0644d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 06:25:00 -0700
Subject: [PATCH 29/50] test (deprecated) get_hashes

---
 tests/test__minhash.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index b9b15bf19b..67e01a5ba6 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1593,3 +1593,15 @@ def test_get_mins_deprecated(track_abundance):
         for k in mins:
             assert d[k] == 4
         assert len(d) == len(mins)
+
+
+def test_get_hashes_deprecated(track_abundance):
+    mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
+    mins = (28945103950853965, 74690756200987412, 82962372765557409)
+
+    mh.add_many(mins)
+    mh.add_many(mins)
+    mh.add_many(mins)
+    mh.add_many(mins)
+
+    assert set(mh.get_hashes()) == set(mins)

From 899ec4c65f3d6524d1a031a080add8da94039272 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 06:37:38 -0700
Subject: [PATCH 30/50] add tests for downsample and is_molecule_type

---
 tests/test__minhash.py | 94 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 94 insertions(+)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 67e01a5ba6..3779179d83 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1605,3 +1605,97 @@ def test_get_hashes_deprecated(track_abundance):
     mh.add_many(mins)
 
     assert set(mh.get_hashes()) == set(mins)
+
+
+def test_downsample_num(track_abundance):
+    # test downsample(num=...) function
+    mh = MinHash(10, 21, track_abundance=track_abundance)
+    for i in range(20):
+        mh.add_hash(i)
+
+    assert mh.num == 10
+    assert len(mh) == 10
+
+    assert list(sorted(mh.hashes)) == list(range(10))
+
+    mh2 = mh.downsample(num=5)
+    assert mh2.num == 5
+    assert len(mh2) == 5
+
+    assert list(sorted(mh2.hashes)) == list(range(5))
+
+
+def test_downsample_n_deprecated(track_abundance):
+    # test downsample_n(...) function, now deprecated
+    mh = MinHash(10, 21, track_abundance=track_abundance)
+    for i in range(20):
+        mh.add_hash(i)
+
+    assert mh.num == 10
+    assert len(mh) == 10
+
+    assert list(sorted(mh.hashes)) == list(range(10))
+
+    mh2 = mh.downsample_n(5)
+    assert mh2.num == 5
+    assert len(mh2) == 5
+
+    assert list(sorted(mh2.hashes)) == list(range(5))
+
+
+def test_downsample_scaled(track_abundance):
+    # test downsample(scaled...) method
+    mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
+
+    mins = (1, 2, 3,
+            9223372036854775808 + 1, 9223372036854775808 + 2,
+            9223372036854775808 + 3)
+    mh.add_many(mins)
+
+    assert len(mh) == 6
+    assert list(mh.hashes) == list(mins)
+
+    mh2 = mh.downsample(scaled=2)
+    print(mh.max_hash, mh2.max_hash)
+
+    assert len(mh2) == 3
+    assert list(mh2.hashes) == list(mins[:3])
+
+
+def test_downsample_scaled_deprecated(track_abundance):
+    # test downsample_scaled(...) method, now deprecated
+    mh = MinHash(0, 21, scaled=1, track_abundance=track_abundance)
+
+    mins = (1, 2, 3,
+            9223372036854775808 + 1, 9223372036854775808 + 2,
+            9223372036854775808 + 3)
+    mh.add_many(mins)
+
+    assert len(mh) == 6
+    assert list(mh.hashes) == list(mins)
+
+    mh2 = mh.downsample_scaled(2)
+    print(mh.max_hash, mh2.max_hash)
+
+    assert len(mh2) == 3
+    assert list(mh2.hashes) == list(mins[:3])
+
+
+def test_is_molecule_type_1_deprecated(track_abundance):
+    mh = MinHash(1, 21, track_abundance=track_abundance)
+    assert mh.is_molecule_type('DNA')
+
+
+def test_is_molecule_type_2_deprecated(track_abundance):
+    mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True)
+    assert mh.is_molecule_type('protein')
+
+
+def test_is_molecule_type_3_deprecated(track_abundance):
+    mh = MinHash(1, 21, track_abundance=track_abundance, hp=True)
+    assert mh.is_molecule_type('hp')
+
+
+def test_is_molecule_type_4_deprecated(track_abundance):
+    mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True)
+    assert mh.is_molecule_type('dayhoff')

From 6b33685bda3de62e335e9631b24a9ccde4fbdba2 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 10:35:02 -0700
Subject: [PATCH 31/50] test moltype properties more explicitly

---
 tests/test__minhash.py | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 3779179d83..7a97dae309 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1681,21 +1681,42 @@ def test_downsample_scaled_deprecated(track_abundance):
     assert list(mh2.hashes) == list(mins[:3])
 
 
-def test_is_molecule_type_1_deprecated(track_abundance):
+def test_is_molecule_type_1(track_abundance):
     mh = MinHash(1, 21, track_abundance=track_abundance)
     assert mh.is_molecule_type('DNA')
+    assert mh.moltype == 'DNA'
+    assert mh.is_dna
+    assert not mh.is_protein
+    assert not mh.hp
+    assert not mh.dayhoff
 
 
-def test_is_molecule_type_2_deprecated(track_abundance):
+def test_is_molecule_type_2(track_abundance):
     mh = MinHash(1, 21, track_abundance=track_abundance, is_protein=True)
     assert mh.is_molecule_type('protein')
+    assert mh.moltype == 'protein'
+    assert not mh.is_dna
+    assert mh.is_protein
+    assert not mh.hp
+    assert not mh.dayhoff
 
 
-def test_is_molecule_type_3_deprecated(track_abundance):
+def test_is_molecule_type_3(track_abundance):
     mh = MinHash(1, 21, track_abundance=track_abundance, hp=True)
     assert mh.is_molecule_type('hp')
+    assert mh.moltype == 'hp'
+    assert not mh.is_dna
+    assert not mh.is_protein
+    assert mh.hp
+    assert not mh.dayhoff
+
 
 
-def test_is_molecule_type_4_deprecated(track_abundance):
+def test_is_molecule_type_4(track_abundance):
     mh = MinHash(1, 21, track_abundance=track_abundance, dayhoff=True)
     assert mh.is_molecule_type('dayhoff')
+    assert mh.moltype == 'dayhoff'
+    assert not mh.is_dna
+    assert not mh.is_protein
+    assert not mh.hp
+    assert mh.dayhoff

From 09068a6756ee795377ac712dd6c67ec8a4bad819 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 30 Jul 2020 12:20:51 -0700
Subject: [PATCH 32/50] fix py27

---
 tests/test__minhash.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 7a97dae309..2c6e551aef 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1653,13 +1653,13 @@ def test_downsample_scaled(track_abundance):
     mh.add_many(mins)
 
     assert len(mh) == 6
-    assert list(mh.hashes) == list(mins)
+    assert list(sorted(mh.hashes)) == list(mins)
 
     mh2 = mh.downsample(scaled=2)
     print(mh.max_hash, mh2.max_hash)
 
     assert len(mh2) == 3
-    assert list(mh2.hashes) == list(mins[:3])
+    assert list(sorted(mh2.hashes)) == list(mins[:3])
 
 
 def test_downsample_scaled_deprecated(track_abundance):
@@ -1672,13 +1672,13 @@ def test_downsample_scaled_deprecated(track_abundance):
     mh.add_many(mins)
 
     assert len(mh) == 6
-    assert list(mh.hashes) == list(mins)
+    assert list(sorted(mh.hashes)) == list(mins)
 
     mh2 = mh.downsample_scaled(2)
     print(mh.max_hash, mh2.max_hash)
 
     assert len(mh2) == 3
-    assert list(mh2.hashes) == list(mins[:3])
+    assert list(sorted(mh2.hashes)) == list(mins[:3])
 
 
 def test_is_molecule_type_1(track_abundance):

From 2f6909c450d65082cee1a8235bca1cbe9fdd061b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 1 Aug 2020 14:08:36 -0700
Subject: [PATCH 33/50] move translate_codon to module level

---
 sourmash/minhash.py    | 12 ++++++++++++
 tests/test__minhash.py | 16 ++++++++++++++--
 2 files changed, 26 insertions(+), 2 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 5630b37f70..5ddd64aaf6 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -74,6 +74,15 @@ def hash_murmur(kmer, seed=MINHASH_DEFAULT_SEED):
     return lib.hash_murmur(to_bytes(kmer), seed)
 
 
+def translate_codon(codon):
+    "Translate a codon into an amino acid."
+    try:
+        return rustcall(lib.sourmash_translate_codon,
+                        to_bytes(codon)).decode('utf-8')
+    except SourmashError as e:
+        raise ValueError(e.message)
+
+
 class _HashesWrapper(collections.Mapping):
     "A read-only view of the hashes contained by a MinHash object."
     def __init__(self, h):
@@ -433,6 +442,9 @@ def clear(self):
         "Clears all hashes and abundances."
         return self._methodcall(lib.kmerminhash_clear)
 
+    @deprecated(deprecated_in="3.5", removed_in="4.0",
+                current_version=VERSION,
+                details='Use translate_codon function at module level instead.')
     def translate_codon(self, codon):
         "Translate a codon into an amino acid."
         try:
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 2c6e551aef..077a639ff9 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -48,6 +48,7 @@
     hash_murmur,
     _get_scaled_for_max_hash,
     _get_max_hash_for_scaled,
+    translate_codon
 )
 from sourmash import signature
 
@@ -174,8 +175,8 @@ def test_protein_hp(track_abundance, hp):
         assert len(mh.hashes) == 4
 
 
-def test_translate_codon(track_abundance):
-    # Ensure that translation occurs properly
+def test_translate_codon_method_deprecated(track_abundance):
+    # Ensure that translation occurs properly - deprecated => module function
     mh = MinHash(10, 6, is_protein=True)
     assert mh.moltype == 'protein'
 
@@ -188,6 +189,17 @@ def test_translate_codon(track_abundance):
         mh.translate_codon("TCTA")
 
 
+def test_module_translate_codon(track_abundance):
+    # Ensure that translation occurs properly - module level function tests
+    assert "S" == translate_codon('TCT')
+    assert "S" == translate_codon('TC')
+    assert "X" == translate_codon("T")
+
+    with pytest.raises(ValueError):
+        translate_codon("")
+        translate_codon("TCTA")
+
+
 def test_dayhoff(track_abundance):
     # verify that we can hash to dayhoff-encoded protein/aa sequences
     mh_dayhoff = MinHash(10, 6, is_protein=True,

From 8d3c083af123e824baf5960b397a575193bdabd9 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 2 Aug 2020 08:40:58 -0700
Subject: [PATCH 34/50] put a stub in place of _minhash with a FutureWarning

---
 sourmash/_minhash.py   |  6 ++++++
 tests/test__minhash.py | 10 ++++++++++
 2 files changed, 16 insertions(+)
 create mode 100644 sourmash/_minhash.py

diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
new file mode 100644
index 0000000000..8e41fe9149
--- /dev/null
+++ b/sourmash/_minhash.py
@@ -0,0 +1,6 @@
+"Legacy / deprecated; will be removed in sourmash 4.0."
+import warnings
+
+warnings.warn("Please import from the top level sourmash module instead of using _minhash, which will be renamed in 4.x", FutureWarning)
+
+from .minhash import *
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 077a639ff9..6818f6f49b 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1732,3 +1732,13 @@ def test_is_molecule_type_4(track_abundance):
     assert not mh.is_protein
     assert not mh.hp
     assert mh.dayhoff
+
+
+def test__minhash_import():
+    from sourmash._minhash import (
+        MinHash,
+        hash_murmur,
+        _get_scaled_for_max_hash,
+        _get_max_hash_for_scaled,
+        translate_codon
+    )

From 9a8cf64d07127249e590ca24bce0a23e9aee79ef Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sun, 2 Aug 2020 09:01:52 -0700
Subject: [PATCH 35/50] adjust import req

---
 tests/test__minhash.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index 6818f6f49b..899690c941 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -1738,7 +1738,5 @@ def test__minhash_import():
     from sourmash._minhash import (
         MinHash,
         hash_murmur,
-        _get_scaled_for_max_hash,
-        _get_max_hash_for_scaled,
         translate_codon
     )

From f8c9c008b07569773ff65119614ba2fb83dcaddd Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 4 Aug 2020 11:28:03 -0700
Subject: [PATCH 36/50] remove __future__ imports

---
 benchmarks/benchmarks.py            | 1 -
 setup.py                            | 1 -
 sourmash/__init__.py                | 1 -
 sourmash/__main__.py                | 1 -
 sourmash/_minhash.py                | 2 --
 sourmash/command_compute.py         | 2 --
 sourmash/commands.py                | 2 --
 sourmash/index.py                   | 1 -
 sourmash/lca/command_classify.py    | 1 -
 sourmash/lca/command_compare_csv.py | 1 -
 sourmash/lca/command_gather.py      | 1 -
 sourmash/lca/command_index.py       | 1 -
 sourmash/lca/command_rankinfo.py    | 1 -
 sourmash/lca/command_summarize.py   | 1 -
 sourmash/lca/lca_db.py              | 1 -
 sourmash/lca/lca_utils.py           | 1 -
 sourmash/logging.py                 | 1 -
 sourmash/nodegraph.py               | 1 -
 sourmash/sbt.py                     | 1 -
 sourmash/sbt_storage.py             | 2 --
 sourmash/sbtmh.py                   | 3 ---
 sourmash/search.py                  | 1 -
 sourmash/sig/__main__.py            | 1 -
 sourmash/signature.py               | 2 --
 tests/sourmash_tst_utils.py         | 1 -
 tests/test__minhash.py              | 3 ---
 tests/test_api.py                   | 1 -
 tests/test_bugs.py                  | 1 -
 tests/test_cmd_signature.py         | 1 -
 tests/test_index.py                 | 2 --
 tests/test_jaccard.py               | 2 --
 tests/test_lca.py                   | 1 -
 tests/test_sbt.py                   | 2 --
 tests/test_signature.py             | 2 --
 tests/test_sourmash.py              | 1 -
 tests/test_sourmash_compute.py      | 1 -
 36 files changed, 49 deletions(-)

diff --git a/benchmarks/benchmarks.py b/benchmarks/benchmarks.py
index cb2ae91ddf..4cbde86b39 100644
--- a/benchmarks/benchmarks.py
+++ b/benchmarks/benchmarks.py
@@ -1,4 +1,3 @@
-from __future__ import unicode_literals
 import random
 
 
diff --git a/setup.py b/setup.py
index 290325174d..c748f3eef3 100644
--- a/setup.py
+++ b/setup.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import os
 from setuptools import setup, find_packages
 import sys
diff --git a/sourmash/__init__.py b/sourmash/__init__.py
index dca58d86d2..d95234a3ae 100644
--- a/sourmash/__init__.py
+++ b/sourmash/__init__.py
@@ -2,7 +2,6 @@
 """
 An implementation of a MinHash bottom sketch, applied to k-mers in DNA.
 """
-from __future__ import print_function
 import re
 import math
 import os
diff --git a/sourmash/__main__.py b/sourmash/__main__.py
index 70428bb349..ef6b8665c4 100644
--- a/sourmash/__main__.py
+++ b/sourmash/__main__.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import sourmash
 
 
diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
index eee206db88..68760e296b 100644
--- a/sourmash/_minhash.py
+++ b/sourmash/_minhash.py
@@ -1,6 +1,4 @@
 # -*- coding: UTF-8 -*-
-from __future__ import unicode_literals, division
-
 import math
 import copy
 
diff --git a/sourmash/command_compute.py b/sourmash/command_compute.py
index b3ebc199e4..bd63dcf3d0 100644
--- a/sourmash/command_compute.py
+++ b/sourmash/command_compute.py
@@ -1,8 +1,6 @@
 """
 Functions implementing the 'compute' command and related functions.
 """
-from __future__ import print_function, division, absolute_import
-
 import os
 import os.path
 import sys
diff --git a/sourmash/commands.py b/sourmash/commands.py
index 7ae42f32fb..7adccc7152 100644
--- a/sourmash/commands.py
+++ b/sourmash/commands.py
@@ -1,8 +1,6 @@
 """
 Functions implementing the main command-line subcommands.
 """
-from __future__ import print_function, division, absolute_import
-
 import csv
 import os
 import os.path
diff --git a/sourmash/index.py b/sourmash/index.py
index 2a9eb8faef..9b33fe5899 100644
--- a/sourmash/index.py
+++ b/sourmash/index.py
@@ -1,6 +1,5 @@
 "An Abstract Base Class for collections of signatures."
 
-from __future__ import division
 from abc import abstractmethod
 from collections import namedtuple
 
diff --git a/sourmash/lca/command_classify.py b/sourmash/lca/command_classify.py
index 568040c56d..817ddcaf2d 100644
--- a/sourmash/lca/command_classify.py
+++ b/sourmash/lca/command_classify.py
@@ -2,7 +2,6 @@
 """
 Classify individual signature files down to deepest possible node.
 """
-from __future__ import print_function
 import sys
 import csv
 
diff --git a/sourmash/lca/command_compare_csv.py b/sourmash/lca/command_compare_csv.py
index 70e4780fba..3182cba9ef 100644
--- a/sourmash/lca/command_compare_csv.py
+++ b/sourmash/lca/command_compare_csv.py
@@ -2,7 +2,6 @@
 """
 Compare two taxonomy spreadsheets.
 """
-from __future__ import print_function
 import sys
 from collections import defaultdict
 
diff --git a/sourmash/lca/command_gather.py b/sourmash/lca/command_gather.py
index 812c6b3b41..3ae54fc81e 100644
--- a/sourmash/lca/command_gather.py
+++ b/sourmash/lca/command_gather.py
@@ -4,7 +4,6 @@
 
 Mimics `sourmash gather` but provides taxonomic information.
 """
-from __future__ import print_function, division
 import sys
 import csv
 from collections import Counter, defaultdict, namedtuple
diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py
index 78b6a0663f..6735f6c290 100644
--- a/sourmash/lca/command_index.py
+++ b/sourmash/lca/command_index.py
@@ -2,7 +2,6 @@
 """
 Build a lowest-common-ancestor database with given taxonomy and genome sigs.
 """
-from __future__ import print_function
 import sys
 import csv
 from collections import defaultdict
diff --git a/sourmash/lca/command_rankinfo.py b/sourmash/lca/command_rankinfo.py
index cb1454ba7e..081f1bf481 100644
--- a/sourmash/lca/command_rankinfo.py
+++ b/sourmash/lca/command_rankinfo.py
@@ -2,7 +2,6 @@
 """
 Summarize rank-specific information from LCAs in one or more databases.
 """
-from __future__ import print_function
 import sys
 from collections import defaultdict
 
diff --git a/sourmash/lca/command_summarize.py b/sourmash/lca/command_summarize.py
index 74155a2d6c..efbb6560df 100644
--- a/sourmash/lca/command_summarize.py
+++ b/sourmash/lca/command_summarize.py
@@ -2,7 +2,6 @@
 """
 Summarize the taxonomic content of the given signatures, combined.
 """
-from __future__ import print_function
 import sys
 import csv
 from collections import defaultdict
diff --git a/sourmash/lca/lca_db.py b/sourmash/lca/lca_db.py
index 6c9ecea3cd..b7d9958a75 100644
--- a/sourmash/lca/lca_db.py
+++ b/sourmash/lca/lca_db.py
@@ -1,6 +1,5 @@
 "LCA database class and utilities."
 
-from __future__ import print_function, division
 import json
 import gzip
 from collections import OrderedDict, defaultdict, Counter
diff --git a/sourmash/lca/lca_utils.py b/sourmash/lca/lca_utils.py
index 98186fc603..3f8478e421 100644
--- a/sourmash/lca/lca_utils.py
+++ b/sourmash/lca/lca_utils.py
@@ -1,7 +1,6 @@
 """
 Utility functions for lowest-common-ancestor analysis tools.
 """
-from __future__ import print_function, division
 from os.path import exists
 from collections import namedtuple, defaultdict, Counter
 
diff --git a/sourmash/logging.py b/sourmash/logging.py
index 2c1de32d2a..49c3dc26b3 100644
--- a/sourmash/logging.py
+++ b/sourmash/logging.py
@@ -1,4 +1,3 @@
-from __future__ import print_function
 import sys
 from io import StringIO
 
diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py
index c865a3c7c3..ccaf35697c 100644
--- a/sourmash/nodegraph.py
+++ b/sourmash/nodegraph.py
@@ -1,5 +1,4 @@
 # -*- coding: UTF-8 -*-
-from __future__ import unicode_literals, division, print_function
 
 from struct import pack, unpack
 import sys
diff --git a/sourmash/sbt.py b/sourmash/sbt.py
index 25226402ee..3b7a08de6b 100644
--- a/sourmash/sbt.py
+++ b/sourmash/sbt.py
@@ -41,7 +41,6 @@ def search_transcript(node, seq, threshold):
         return 0
 """
 
-from __future__ import print_function, unicode_literals, division
 
 from collections import namedtuple
 try:
diff --git a/sourmash/sbt_storage.py b/sourmash/sbt_storage.py
index 88d79e4886..4cc9b9baaf 100644
--- a/sourmash/sbt_storage.py
+++ b/sourmash/sbt_storage.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals, division
-
 import abc
 from io import BytesIO
 import os
diff --git a/sourmash/sbtmh.py b/sourmash/sbtmh.py
index 751be6cdcd..4dd6cc8d00 100644
--- a/sourmash/sbtmh.py
+++ b/sourmash/sbtmh.py
@@ -1,6 +1,3 @@
-from __future__ import print_function
-from __future__ import division
-
 from io import BytesIO
 import sys
 
diff --git a/sourmash/search.py b/sourmash/search.py
index ad2da3a92b..dbd08c873b 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -1,4 +1,3 @@
-from __future__ import division
 from collections import namedtuple
 import sys
 
diff --git a/sourmash/sig/__main__.py b/sourmash/sig/__main__.py
index 0d1cd0a258..f4cc183fc9 100644
--- a/sourmash/sig/__main__.py
+++ b/sourmash/sig/__main__.py
@@ -1,7 +1,6 @@
 """
 Command-line entry point for 'python -m sourmash.sig'
 """
-from __future__ import print_function, unicode_literals
 import sys
 import csv
 import json
diff --git a/sourmash/signature.py b/sourmash/signature.py
index 4bcd9293cc..9ef3c466a7 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -2,8 +2,6 @@
 """
 Save and load MinHash sketches in a JSON format, along with some metadata.
 """
-from __future__ import print_function
-
 import sys
 import os
 import weakref
diff --git a/tests/sourmash_tst_utils.py b/tests/sourmash_tst_utils.py
index 34ce47513f..cf33c89b49 100644
--- a/tests/sourmash_tst_utils.py
+++ b/tests/sourmash_tst_utils.py
@@ -1,6 +1,5 @@
 "Various utilities used by sourmash tests."
 
-from __future__ import print_function
 import sys
 import os
 import tempfile
diff --git a/tests/test__minhash.py b/tests/test__minhash.py
index e4fe15d311..3cca807a90 100644
--- a/tests/test__minhash.py
+++ b/tests/test__minhash.py
@@ -33,9 +33,6 @@
 # Contact: titus@idyll.org
 # pylint: disable=missing-docstring,protected-access
 
-from __future__ import print_function
-from __future__ import absolute_import, unicode_literals
-
 import itertools
 import pickle
 import math
diff --git a/tests/test_api.py b/tests/test_api.py
index 243cc7ca62..fd8a66cabd 100644
--- a/tests/test_api.py
+++ b/tests/test_api.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
 import pytest
 import sourmash
 
diff --git a/tests/test_bugs.py b/tests/test_bugs.py
index d1276cfe28..20608f0a0d 100644
--- a/tests/test_bugs.py
+++ b/tests/test_bugs.py
@@ -1,4 +1,3 @@
-from __future__ import print_function, unicode_literals
 from . import sourmash_tst_utils as utils
 
 def test_bug_781():
diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 73a8aa0706..9d06624477 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -1,7 +1,6 @@
 """
 Tests for the 'sourmash signature' command line.
 """
-from __future__ import print_function, unicode_literals
 import csv
 import shutil
 import os
diff --git a/tests/test_index.py b/tests/test_index.py
index 3904030663..8a335d24f0 100644
--- a/tests/test_index.py
+++ b/tests/test_index.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import glob
 import os
 import zipfile
diff --git a/tests/test_jaccard.py b/tests/test_jaccard.py
index 99716d496e..f48c014949 100644
--- a/tests/test_jaccard.py
+++ b/tests/test_jaccard.py
@@ -3,8 +3,6 @@
 objects.
 """
 
-from __future__ import print_function, unicode_literals
-
 import pytest
 from sourmash import MinHash
 from . import sourmash_tst_utils as utils
diff --git a/tests/test_lca.py b/tests/test_lca.py
index ff8312d798..7285952441 100644
--- a/tests/test_lca.py
+++ b/tests/test_lca.py
@@ -1,7 +1,6 @@
 """
 Tests for the 'sourmash lca' command line and high level API.
 """
-from __future__ import print_function, unicode_literals
 import os
 import shutil
 import csv
diff --git a/tests/test_sbt.py b/tests/test_sbt.py
index 05f654584a..827bcd5e4b 100644
--- a/tests/test_sbt.py
+++ b/tests/test_sbt.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import json
 import shutil
 import os
diff --git a/tests/test_signature.py b/tests/test_signature.py
index 94ef3770e0..d1ea5062f3 100644
--- a/tests/test_signature.py
+++ b/tests/test_signature.py
@@ -1,5 +1,3 @@
-from __future__ import print_function, unicode_literals
-
 import os
 
 import pytest
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 251ed2b2b0..3735f4b6b7 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -1,7 +1,6 @@
 """
 Tests for the 'sourmash' command line.
 """
-from __future__ import print_function, unicode_literals
 import os
 import gzip
 import shutil
diff --git a/tests/test_sourmash_compute.py b/tests/test_sourmash_compute.py
index bc2ee59ef1..efa67d53a0 100644
--- a/tests/test_sourmash_compute.py
+++ b/tests/test_sourmash_compute.py
@@ -1,7 +1,6 @@
 """
 Tests for sourmash compute command-line functionality.
 """
-from __future__ import print_function, unicode_literals
 import os
 import gzip
 import shutil

From 5d8602096c67b182d97affbb95ee18e3b289dbc5 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 4 Aug 2020 11:29:45 -0700
Subject: [PATCH 37/50] remove sys.version checks for py 2

---
 sourmash/lca/command_index.py | 2 --
 sourmash/signature.py         | 3 +--
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/sourmash/lca/command_index.py b/sourmash/lca/command_index.py
index 6735f6c290..f43a20755f 100644
--- a/sourmash/lca/command_index.py
+++ b/sourmash/lca/command_index.py
@@ -24,8 +24,6 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2,
     lineage tuples.
     """
     mode = 'rt'
-    if sys.version_info < (3, ):
-        mode = 'rtU'
 
     # parse spreadsheet!
     fp = open(filename, mode)
diff --git a/sourmash/signature.py b/sourmash/signature.py
index 9ef3c466a7..421a5c67ce 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -268,8 +268,7 @@ def load_signatures(
     try:
         if input_type == SigInput.FILE_LIKE:
             if hasattr(data, "mode") and "t" in data.mode:  # need to reopen handler as binary
-                if sys.version_info >= (3,):
-                    data = data.buffer
+                data = data.buffer
 
             buf = data.read()
             data.close()

From 274be2e69791498426281cd06809a88e365c4ba8 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 4 Aug 2020 11:30:30 -0700
Subject: [PATCH 38/50] remove requirement for enum34

---
 setup.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/setup.py b/setup.py
index c748f3eef3..399afafd7e 100644
--- a/setup.py
+++ b/setup.py
@@ -62,7 +62,6 @@ def build_native(spec):
         ]
     },
     "install_requires": ['screed>=0.9', 'cffi>=1.14.0', 'numpy',
-                         'enum34; python_version < "3.4"',
                          'matplotlib', 'scipy', 'deprecation>=2.0.6'],
     "setup_requires": [
         "setuptools>=38.6.0",

From cace054e4b532815fcb91112d913a15dd71c0766 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 4 Aug 2020 13:44:52 -0700
Subject: [PATCH 39/50] remove __reduce__ from MinHash class (#1144)

---
 sourmash/_minhash.py | 18 ------------------
 1 file changed, 18 deletions(-)

diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
index eee206db88..cbdaa92eac 100644
--- a/sourmash/_minhash.py
+++ b/sourmash/_minhash.py
@@ -202,24 +202,6 @@ def __setstate__(self, tup):
         else:
             self.add_many(mins)
 
-    def __reduce__(self):
-        "alternative pickling protocol."
-        return (
-            MinHash,
-            (
-                self.num,
-                self.ksize,
-                self.is_protein,
-                self.dayhoff,
-                self.hp,
-                self.track_abundance,
-                self.seed,
-                self.max_hash,
-                self.get_mins(with_abundance=self.track_abundance),
-                0,
-            ),
-        )
-
     def __eq__(self, other):
         "equality testing via =="
         return self.__getstate__() == other.__getstate__()

From f10d632e89567bd5bcf567e3fa3fe0ba95f3a20f Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:25:49 -0700
Subject: [PATCH 40/50] avoid the DeprecationWarning

---
 sourmash/minhash.py    | 5 +++--
 tests/test_sourmash.py | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index 1bba1db6f9..66750ca4e9 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -359,8 +359,9 @@ def num(self):
 
     @property
     def scaled(self):
-        if self.max_hash:
-            return _get_scaled_for_max_hash(self.max_hash)
+        mx = self._methodcall(lib.kmerminhash_max_hash)
+        if mx:
+            return _get_scaled_for_max_hash(mx)
         return 0
 
     @property
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 251ed2b2b0..f64927cc42 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -4002,7 +4002,7 @@ def test_do_sourmash_index_zipfile_append(c):
         c.run_sourmash('index', '-k', '31', 'zzz.sbt.zip',
                        *first_half)
     # UserWarning is raised when there are duplicated entries in the zipfile
-    assert not record
+    assert not record, record
 
     outfile = c.output('zzz.sbt.zip')
     assert os.path.exists(outfile)

From 57679d215ff058e7a4eacbc92adf9c8dca7fbc0d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:42:16 -0700
Subject: [PATCH 41/50] update docs: only python 3.7 and 3.8

---
 README.md           | 2 +-
 doc/developer.md    | 2 +-
 doc/requirements.md | 5 ++---
 setup.py            | 5 ++---
 4 files changed, 6 insertions(+), 8 deletions(-)

diff --git a/README.md b/README.md
index 93ced683f9..852567db76 100644
--- a/README.md
+++ b/README.md
@@ -54,7 +54,7 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t
 
 ### Requirements
 
-sourmash runs under both Python 2.7.x and Python 3.5+.  The base
+sourmash runs under Python 3.7 and later.  The base
 requirements are screed, cffi, numpy, matplotlib, and scipy.  Conda
 (see below) will install everything necessary, and is our recommended
 installation method.
diff --git a/doc/developer.md b/doc/developer.md
index 52d80eac1e..3d4cb35189 100644
--- a/doc/developer.md
+++ b/doc/developer.md
@@ -7,7 +7,7 @@ You can get the latest development master branch with:
 ```
 git clone https://github.com/dib-lab/sourmash.git
 ```
-sourmash runs under both Python 2.7.x and Python 3.5+.  The base
+sourmash runs under Python 3.7 and later.  The base
 requirements are screed and cffi, together with a Rust environment (for the
 extension code). We suggest using `rustup` to install the Rust environment:
 
diff --git a/doc/requirements.md b/doc/requirements.md
index dd95ea8dd7..60d545a3d9 100644
--- a/doc/requirements.md
+++ b/doc/requirements.md
@@ -1,6 +1,5 @@
 # Computational requirements
 
-
 sourmash has no particular memory requirements; it will need to hold
 the largest single sequence you have in memory, but the individual
 signatures are quite small and we do no special buffer allocation.
@@ -11,8 +10,8 @@ in a second or so on a rather slow 2016 Mac laptop.
 
 MinHash sketches and signatures are quite small on disk.
 
-sourmash should run with little modification on Linux and Mac OS X,
-under Python 2.7.11 and Python 3.5.  Please see [the development repository README][0]
+sourmash should run with no modification on Linux and Mac OS X,
+under Python 3.7 and later.  Please see [the development repository README][0]
 for
 information on source code, tests, and continuous integration.
 [0]:https://github.com/dib-lab/sourmash/blob/master/README.md
diff --git a/setup.py b/setup.py
index 399afafd7e..0a1cbc8da2 100644
--- a/setup.py
+++ b/setup.py
@@ -36,9 +36,8 @@ def build_native(spec):
     "Operating System :: POSIX :: Linux",
     "Operating System :: MacOS :: MacOS X",
     "Programming Language :: Rust",
-    "Programming Language :: Python :: 2.7",
-    "Programming Language :: Python :: 3.5",
-    "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
     "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
 

From d00e77d881ee14ccc0a3599b1c85ec24b5427a2f Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:43:48 -0700
Subject: [PATCH 42/50] remove 2.7 from travis

---
 .travis.yml | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index 5c6a228bcf..ae968cea90 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -110,8 +110,6 @@ jobs:
       python: 3.7
       env:
         - TOXENV=docs
-    - <<: *test
-      python: 2.7
 
     - &wheel
       stage: build wheel and send to github releases

From fef2c6464977a9e4c5b75d937a11d6d08a4e5330 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:54:06 -0700
Subject: [PATCH 43/50] remove _compat from signature.py

---
 sourmash/signature.py | 9 +--------
 1 file changed, 1 insertion(+), 8 deletions(-)

diff --git a/sourmash/signature.py b/sourmash/signature.py
index 421a5c67ce..695ba74769 100644
--- a/sourmash/signature.py
+++ b/sourmash/signature.py
@@ -12,7 +12,6 @@
 from ._minhash import to_bytes
 from ._lowlevel import ffi, lib
 from .utils import RustObject, rustcall, decode_str
-from ._compat import PY2
 
 
 SIGNATURE_VERSION = 0.4
@@ -208,12 +207,6 @@ def _detect_input_type(data):
         try:
             if data.find("sourmash_signature") > 0:
                 return SigInput.BUFFER
-            elif PY2:
-                try:
-                    if data.startswith(b'\x1F\x8B'):  # gzip compressed
-                        return SigInput.BUFFER
-                except UnicodeDecodeError:
-                    pass
         except TypeError:
             if data.find(b"sourmash_signature") > 0:
                 return SigInput.BUFFER
@@ -286,7 +279,7 @@ def load_signatures(
             )
 
         if input_type == SigInput.BUFFER:
-            if hasattr(data, "encode") and not PY2:
+            if hasattr(data, "encode"):
                 data = data.encode("utf-8")
 
             sigs_ptr = rustcall(

From 5745db13f8a8bf0e494b1c7fdacf243c25ee5dc6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:56:33 -0700
Subject: [PATCH 44/50] remove _compat from exceptions.py

---
 sourmash/exceptions.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/sourmash/exceptions.py b/sourmash/exceptions.py
index 6f73a59593..4895c50947 100644
--- a/sourmash/exceptions.py
+++ b/sourmash/exceptions.py
@@ -1,4 +1,3 @@
-from ._compat import implements_to_string
 from ._lowlevel import lib
 
 
@@ -6,7 +5,6 @@
 exceptions_by_code = {}
 
 
-@implements_to_string
 class SourmashError(Exception):
     code = None
 

From 1fe766865a1086a83847a40685dfd10dcbf55d46 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 05:58:45 -0700
Subject: [PATCH 45/50] remove _compat from index and sbt_storage

---
 sourmash/index.py       | 4 +---
 sourmash/sbt_storage.py | 3 +--
 2 files changed, 2 insertions(+), 5 deletions(-)

diff --git a/sourmash/index.py b/sourmash/index.py
index 9b33fe5899..d8fb3ce8ca 100644
--- a/sourmash/index.py
+++ b/sourmash/index.py
@@ -1,10 +1,8 @@
 "An Abstract Base Class for collections of signatures."
 
-from abc import abstractmethod
+from abc import abstractmethod, ABC
 from collections import namedtuple
 
-from ._compat import ABC
-
 
 class Index(ABC):
     @abstractmethod
diff --git a/sourmash/sbt_storage.py b/sourmash/sbt_storage.py
index 4cc9b9baaf..4cd4cccc5f 100644
--- a/sourmash/sbt_storage.py
+++ b/sourmash/sbt_storage.py
@@ -6,8 +6,7 @@
 import tarfile
 from tempfile import NamedTemporaryFile
 import zipfile
-
-from ._compat import ABC
+from abc import ABC
 
 
 class Storage(ABC):

From 26e4c0d8f6c845d616e2377109f2d184bf3f130a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 06:01:11 -0700
Subject: [PATCH 46/50] remove _compat from nodegraph

---
 sourmash/nodegraph.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/sourmash/nodegraph.py b/sourmash/nodegraph.py
index ccaf35697c..b3270ab62d 100644
--- a/sourmash/nodegraph.py
+++ b/sourmash/nodegraph.py
@@ -4,7 +4,6 @@
 import sys
 from tempfile import NamedTemporaryFile
 
-from ._compat import string_types, range_type
 from ._lowlevel import ffi, lib
 from ._minhash import to_bytes, MinHash
 from .utils import RustObject, rustcall, decode_str
@@ -51,12 +50,12 @@ def update(self, other):
             raise TypeError("Must be a Nodegraph or MinHash")
 
     def count(self, h):
-        if isinstance(h, string_types):
+        if isinstance(h, str):
             return self._methodcall(lib.nodegraph_count_kmer, to_bytes(h))
         return self._methodcall(lib.nodegraph_count, h)
 
     def get(self, h):
-        if isinstance(h, string_types):
+        if isinstance(h, str):
             return self._methodcall(lib.nodegraph_get_kmer, to_bytes(h))
         return self._methodcall(lib.nodegraph_get, h)
 

From c5f1c4395b8ebf8a5eba302cead2bc46dd751ed0 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 06:02:49 -0700
Subject: [PATCH 47/50] remove _compat completely

---
 sourmash/_compat.py  | 27 ---------------------------
 sourmash/_minhash.py |  5 ++---
 2 files changed, 2 insertions(+), 30 deletions(-)
 delete mode 100644 sourmash/_compat.py

diff --git a/sourmash/_compat.py b/sourmash/_compat.py
deleted file mode 100644
index 90f7afabf2..0000000000
--- a/sourmash/_compat.py
+++ /dev/null
@@ -1,27 +0,0 @@
-import abc
-import sys
-
-
-PY2 = sys.version_info[0] == 2
-
-if PY2:
-    text_type = unicode
-    int_types = (int, long)
-    string_types = (str, unicode)
-    range_type = xrange
-    itervalues = lambda x: x.itervalues()
-    NUL = '\x00'
-    def implements_to_string(cls):
-        cls.__unicode__ = cls.__str__
-        cls.__str__ = lambda x: x.__unicode__().encode('utf-8')
-        return cls
-    ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()})
-else:
-    text_type = str
-    int_types = (int,)
-    string_types = (str,)
-    range_type = range
-    itervalues = lambda x: x.values()
-    NUL = 0
-    implements_to_string = lambda x: x
-    from abc import ABC
diff --git a/sourmash/_minhash.py b/sourmash/_minhash.py
index 68760e296b..f528389cd5 100644
--- a/sourmash/_minhash.py
+++ b/sourmash/_minhash.py
@@ -3,7 +3,6 @@
 import copy
 
 from . import VERSION
-from ._compat import string_types, range_type
 from ._lowlevel import ffi, lib
 from .utils import RustObject, rustcall, decode_str
 from .exceptions import SourmashError
@@ -52,10 +51,10 @@ def to_bytes(s):
     if isinstance(s, bytes):
         return s
 
-    if not isinstance(s, string_types + (bytes, int)):
+    if not isinstance(s, (str, bytes, int)):
         raise TypeError("Requires a string-like sequence")
 
-    if isinstance(s, string_types):
+    if isinstance(s, str):
         s = s.encode("utf-8")
     elif isinstance(s, int):
         s = bytes([s])

From ac6e2fcfc8fbff120a618d5ce7ed6f5e7210951d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 06:11:44 -0700
Subject: [PATCH 48/50] make signature -> sig in CLI using py3 'aliases'

---
 sourmash/cli/__init__.py           |  2 +-
 sourmash/cli/sig/__init__.py       |  2 +-
 sourmash/cli/signature/__init__.py | 45 ------------------------------
 3 files changed, 2 insertions(+), 47 deletions(-)
 delete mode 100644 sourmash/cli/signature/__init__.py

diff --git a/sourmash/cli/__init__.py b/sourmash/cli/__init__.py
index 083d097f7e..124427a55d 100644
--- a/sourmash/cli/__init__.py
+++ b/sourmash/cli/__init__.py
@@ -34,7 +34,7 @@
 # Subcommand groups
 from . import lca
 from . import sig
-from . import signature
+from . import sig as signature
 from . import storage
 
 
diff --git a/sourmash/cli/sig/__init__.py b/sourmash/cli/sig/__init__.py
index 5d7d1dc4fa..36a224ef86 100644
--- a/sourmash/cli/sig/__init__.py
+++ b/sourmash/cli/sig/__init__.py
@@ -25,7 +25,7 @@
 
 
 def subparser(subparsers):
-    subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS)
+    subparser = subparsers.add_parser('sig', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['signature'])
     desc = 'Operations\n'
     clidir = os.path.dirname(__file__)
     ops = command_list(clidir)
diff --git a/sourmash/cli/signature/__init__.py b/sourmash/cli/signature/__init__.py
deleted file mode 100644
index 7bc2d0ab7f..0000000000
--- a/sourmash/cli/signature/__init__.py
+++ /dev/null
@@ -1,45 +0,0 @@
-"""Define the command line interface for sourmash signature.
-
-Copy commands over from 'sourmash sig'.
-
-This can be removed once Python 2.7 is no longer supported, in favor of an
-'aliases' argument to add_subparser in ../sig/__init__.py.
-"""
-
-from ..sig import cat
-from ..sig import split
-from ..sig import describe
-from ..sig import downsample
-from ..sig import extract
-from ..sig import filter
-from ..sig import flatten
-from ..sig import intersect
-from ..sig import merge
-from ..sig import rename
-from ..sig import subtract
-from ..sig import ingest
-from ..sig import export
-from ..sig import overlap
-from ..utils import command_list
-from argparse import SUPPRESS, RawDescriptionHelpFormatter
-import os
-import sys
-
-
-def subparser(subparsers):
-    subparser = subparsers.add_parser('signature', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS)
-    desc = 'Operations\n'
-    clidir = os.path.join(os.path.dirname(__file__), '../sig/')
-    ops = command_list(clidir)
-    for subcmd in ops:
-        docstring = getattr(sys.modules[__name__], subcmd).__doc__
-        helpstring = 'sourmash signature {op:s} --help'.format(op=subcmd)
-        desc += '        {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring)
-    s = subparser.add_subparsers(
-        title='Manipulate signature files', dest='subcmd', metavar='subcmd', help=SUPPRESS,
-        description=desc
-    )
-    for subcmd in ops:
-        getattr(sys.modules[__name__], subcmd).subparser(s)
-    subparser._action_groups.reverse()
-    subparser._optionals.title = 'Options'

From 310b2676aef046934deafbb9b5ee8239ed047500 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 06:12:54 -0700
Subject: [PATCH 49/50] put back assert that didn't work in py2

---
 tests/test_cmd_signature.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py
index 9d06624477..bd4a4b2db7 100644
--- a/tests/test_cmd_signature.py
+++ b/tests/test_cmd_signature.py
@@ -16,16 +16,14 @@
 def test_run_sourmash_signature_cmd():
     status, out, err = utils.runscript('sourmash', ['signature'], fail_ok=True)
     assert not 'sourmash: error: argument cmd: invalid choice:' in err
-    # doesn't work in py2.7
-    # assert 'Manipulate signature files:' in out
+    assert 'Manipulate signature files:' in out
     assert status != 0                    # no args provided, ok ;)
 
 
 def test_run_sourmash_sig_cmd():
     status, out, err = utils.runscript('sourmash', ['sig'], fail_ok=True)
     assert not 'sourmash: error: argument cmd: invalid choice:' in err
-    # doesn't work in py2.7
-    # assert 'Manipulate signature files:' in out
+    assert 'Manipulate signature files:' in out
     assert status != 0                    # no args provided, ok ;)
 
 

From 91de874456bd50e7f68430bf16987224fb6a7051 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 5 Aug 2020 15:46:30 -0700
Subject: [PATCH 50/50] Update sourmash/minhash.py

Co-authored-by: Luiz Irber <luizirber@users.noreply.github.com>
---
 sourmash/minhash.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/sourmash/minhash.py b/sourmash/minhash.py
index d334d854ac..915a32c996 100644
--- a/sourmash/minhash.py
+++ b/sourmash/minhash.py
@@ -504,7 +504,7 @@ def downsample_max_hash(self, *others):
         """
         max_hashes = [x.max_hash for x in others]
         new_max_hash = min(self.max_hash, *max_hashes)
-        new_scaled = get_scaled_for_max_hash(new_max_hash)
+        new_scaled = _get_scaled_for_max_hash(new_max_hash)
 
         return self.downsample_scaled(new_scaled)