[MRG] substantial refactoring of CounterGather and related Index code. #2116

Merged
merged 45 commits on Jul 16, 2022
241dbc5
move most CounterGather tests over to index protocol tests
ctb Jul 8, 2022
66490a4
add LinearIndex wrapper
ctb Jul 8, 2022
ebb00ea
getting closer
ctb Jul 8, 2022
a8a4dd9
fix a bunch of the tests
ctb Jul 8, 2022
fdc8d4f
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Jul 8, 2022
ba114dd
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Jul 9, 2022
b444a68
fix call to 'peek'
ctb Jul 9, 2022
f87c9d4
adjust 'counter.add' call signature
ctb Jul 9, 2022
68458cf
add CounterGather_LCA
ctb Jul 9, 2022
b835c96
move CounterGather.calc_threshold into search.py
ctb Jul 9, 2022
1903920
minor refactoring
ctb Jul 9, 2022
5099d5a
resolve downsampling for linear index wrapper
ctb Jul 9, 2022
a8125b4
fix downsampling for LCA-based CounterGather
ctb Jul 9, 2022
1760ada
fix location foo
ctb Jul 9, 2022
5c9748a
fix remaining test
ctb Jul 9, 2022
c2d2637
minor cleanup
ctb Jul 10, 2022
6f9eb78
add doc
ctb Jul 10, 2022
f82e1d7
test multiple identical matches
ctb Jul 10, 2022
d9472ed
adjust LinearIndex implementation to skip identical matches
ctb Jul 10, 2022
3e1c1ae
switch to dictionaries for CounterGather
ctb Jul 11, 2022
4c14e01
cleanup protocol tests
ctb Jul 11, 2022
3df8c66
revert LCA_Database fix
ctb Jul 11, 2022
36d4c2c
Merge branch 'latest' into refactor/counter_gather_tests
ctb Jul 11, 2022
846c0ba
Merge branch 'refactor/counter_gather_tests' into update/counter_gather
ctb Jul 11, 2022
39835fc
restore CounterGather_LCA
ctb Jul 11, 2022
1a4e01b
cleanup
ctb Jul 11, 2022
ee0fd18
Merge branch 'refactor/counter_gather_tests' of https://github.com/so…
ctb Jul 11, 2022
dbabfe9
Merge branch 'latest' into refactor/counter_gather_tests
ctb Jul 11, 2022
b7c37bd
Merge branch 'refactor/counter_gather_tests' into update/counter_gather
ctb Jul 11, 2022
a676a69
Merge branch 'refactor/counter_gather_tests' into update/counter_gather
ctb Jul 11, 2022
402dbc6
fix or ignore most errors ;)
ctb Jul 11, 2022
0e4ca95
rename make_gather_query to make_containment_query
ctb Jul 12, 2022
9f7a20e
rename Index.gather to Index.best_containment
ctb Jul 12, 2022
cb2efd7
consolidate threshold_bp => threshold calc code
ctb Jul 12, 2022
22aa74c
change best_containment to return None or a result object, not a list
ctb Jul 12, 2022
e9022c7
add flatten_and_* utility functions
ctb Jul 13, 2022
2ace44f
add .signatures() method to CounterGather class
ctb Jul 13, 2022
430ef9d
change CounterGather to take SourmashSignature instead of Minhash
ctb Jul 13, 2022
f97b8e8
fix test_index tests for counter
ctb Jul 13, 2022
c6078a6
Merge branch 'latest' into refactor/counter_gather_tests
ctb Jul 13, 2022
db87d5e
lightly clean up LCA_Database based counter
ctb Jul 13, 2022
889e731
Merge branch 'refactor/counter_gather_tests' into update/counter_gather
ctb Jul 13, 2022
b5e497d
Merge branch 'latest' of https://github.com/sourmash-bio/sourmash int…
ctb Jul 16, 2022
f8e2edc
add comment and test re duplicate signatures, per @bluegenes
ctb Jul 16, 2022
61624fc
fix typo
ctb Jul 16, 2022
6 changes: 3 additions & 3 deletions src/sourmash/commands.py
@@ -733,9 +733,9 @@ def gather(args):
else:
raise # re-raise other errors, if no picklist.

save_prefetch.add_many(counter.siglist)
save_prefetch.add_many(counter.signatures())
# subtract found hashes as we can.
for found_sig in counter.siglist:
for found_sig in counter.signatures():
noident_mh.remove_many(found_sig.minhash)

# optionally calculate and save prefetch csv
@@ -935,7 +935,7 @@ def multigather(args):
counters = []
for db in databases:
counter = db.counter_gather(prefetch_query, args.threshold_bp)
for found_sig in counter.siglist:
for found_sig in counter.signatures():
noident_mh.remove_many(found_sig.minhash)
counters.append(counter)

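The commands.py hunks above replace direct reads of `counter.siglist` with the new `counter.signatures()` accessor, which is backed by an md5-keyed dict rather than a list. A minimal runnable sketch of why that collapses duplicate matches (the stub classes and names are illustrative only, not sourmash's real internals):

```python
import hashlib

class StubSignature:
    """Minimal stand-in for a SourmashSignature (illustrative only)."""
    def __init__(self, name, hashes):
        self.name = name
        self.hashes = frozenset(hashes)

    def md5sum(self):
        # hash only the sketch content, so identical sketches share an md5
        data = ",".join(str(h) for h in sorted(self.hashes))
        return hashlib.md5(data.encode()).hexdigest()

class StubCounter:
    """Sketch of the dict-backed match storage used after this PR."""
    def __init__(self):
        self.siglist = {}                    # md5 -> signature (was: a list)

    def add(self, ss):
        self.siglist[ss.md5sum()] = ss       # duplicate md5s collapse here

    def signatures(self):
        "Return all signatures."
        yield from self.siglist.values()

counter = StubCounter()
a = StubSignature("genome_a", [1, 2, 3])
b = StubSignature("genome_a_copy", [1, 2, 3])    # identical sketch content
counter.add(a)
counter.add(b)
matches = list(counter.signatures())
print(len(matches))    # the two identical sketches yield a single entry
```

Because `signatures()` is a generator, callers like `save_prefetch.add_many(...)` can stream matches without materializing a list.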
142 changes: 63 additions & 79 deletions src/sourmash/index/__init__.py
@@ -39,11 +39,15 @@
from abc import abstractmethod, ABC
from collections import namedtuple, Counter

from sourmash.search import (make_jaccard_search_query, make_gather_query,
from sourmash.search import (make_jaccard_search_query,
make_containment_query,
calc_threshold_from_bp)
from sourmash.manifest import CollectionManifest
from sourmash.logging import debug_literal
from sourmash.signature import load_signatures, save_signatures
from sourmash.minhash import (flatten_and_downsample_scaled,
flatten_and_downsample_num,
flatten_and_intersect_scaled)

# generic return tuple for Index.search and Index.gather
IndexSearchResult = namedtuple('Result', 'score, signature, location')
@@ -108,7 +112,7 @@ def find(self, search_fn, query, **kwargs):

search_fn follows the protocol in JaccardSearch objects.

Returns a list.
Generator. Returns 0 or more IndexSearchResult objects.
"""
# first: is this query compatible with this search?
search_fn.check_is_compatible(query)
@@ -124,50 +128,19 @@ def find(self, search_fn, query, **kwargs):
query_scaled = query_mh.scaled

def prepare_subject(subj_mh):
assert subj_mh.scaled
if subj_mh.track_abundance:
subj_mh = subj_mh.flatten()

# downsample subject to highest scaled
subj_scaled = subj_mh.scaled
if subj_scaled < query_scaled:
return subj_mh.downsample(scaled=query_scaled)
else:
return subj_mh
return flatten_and_downsample_scaled(subj_mh, query_scaled)

def prepare_query(query_mh, subj_mh):
assert subj_mh.scaled

# downsample query to highest scaled
subj_scaled = subj_mh.scaled
if subj_scaled > query_scaled:
return query_mh.downsample(scaled=subj_scaled)
else:
return query_mh
return flatten_and_downsample_scaled(query_mh, subj_mh.scaled)

else: # num
query_num = query_mh.num

def prepare_subject(subj_mh):
assert subj_mh.num
if subj_mh.track_abundance:
subj_mh = subj_mh.flatten()

# downsample subject to smallest num
subj_num = subj_mh.num
if subj_num > query_num:
return subj_mh.downsample(num=query_num)
else:
return subj_mh
return flatten_and_downsample_num(subj_mh, query_num)

def prepare_query(query_mh, subj_mh):
assert subj_mh.num
# downsample query to smallest num
subj_num = subj_mh.num
if subj_num < query_num:
return query_mh.downsample(num=subj_num)
else:
return query_mh
return flatten_and_downsample_num(query_mh, subj_mh.num)

# now, do the search!
for subj, location in self.signatures_with_location():
@@ -195,7 +168,7 @@ def prepare_query(query_mh, subj_mh):
yield IndexSearchResult(score, subj, location)

def search_abund(self, query, *, threshold=None, **kwargs):
"""Return set of matches with angular similarity above 'threshold'.
"""Return list of IndexSearchResult with angular similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.
"""
@@ -223,7 +196,7 @@ def search_abund(self, query, *, threshold=None, **kwargs):
def search(self, query, *, threshold=None,
do_containment=False, do_max_containment=False,
best_only=False, **kwargs):
"""Return set of matches with similarity above 'threshold'.
"""Return list of IndexSearchResult with similarity above 'threshold'.

Results will be sorted by similarity, highest to lowest.

@@ -239,50 +212,55 @@ def search(self, query, *, threshold=None,
threshold = float(threshold)

search_obj = make_jaccard_search_query(do_containment=do_containment,
do_max_containment=do_max_containment,
do_max_containment=do_max_containment,
best_only=best_only,
threshold=threshold)

# do the actual search:
matches = []

for sr in self.find(search_obj, query, **kwargs):
matches.append(sr)
matches = list(self.find(search_obj, query, **kwargs))

# sort!
matches.sort(key=lambda x: -x.score)
return matches

def prefetch(self, query, threshold_bp, **kwargs):
"Return all matches with minimum overlap."
"""Return all matches with minimum overlap.

Generator. Returns 0 or more IndexSearchResult namedtuples.
"""
if not self: # empty database? quit.
raise ValueError("no signatures to search")

search_fn = make_gather_query(query.minhash, threshold_bp,
best_only=False)
# default best_only to False
best_only = kwargs.get('best_only', False)

search_fn = make_containment_query(query.minhash, threshold_bp,
best_only=best_only)

for sr in self.find(search_fn, query, **kwargs):
yield sr

def gather(self, query, threshold_bp=None, **kwargs):
"Return the match with the best Jaccard containment in the Index."
def best_containment(self, query, threshold_bp=None, **kwargs):
"""Return the match with the best Jaccard containment in the Index.

results = []
for result in self.prefetch(query, threshold_bp, **kwargs):
results.append(result)
Returns an IndexSearchResult namedtuple or None.
"""

# sort results by best score.
results.sort(reverse=True,
key=lambda x: (x.score, x.signature.md5sum()))
results = self.prefetch(query, threshold_bp, best_only=True, **kwargs)
results = sorted(results,
key=lambda x: (-x.score, x.signature.md5sum()))

return results[:1]
try:
return next(iter(results))
except StopIteration:
return None

def peek(self, query_mh, *, threshold_bp=0):
"""Mimic CounterGather.peek() on top of Index.

This is implemented for situations where we don't want to use
'prefetch' functionality. It is a light wrapper around the
'gather'/search-by-containment method.
'best_containment(...)' method.
"""
from sourmash import SourmashSignature

@@ -291,22 +269,18 @@ def peek(self, query_mh, *, threshold_bp=0):

# run query!
try:
result = self.gather(query_ss, threshold_bp=threshold_bp)
result = self.best_containment(query_ss, threshold_bp=threshold_bp)
except ValueError:
result = None

if not result:
return []

# if matches, calculate intersection & return.
sr = result[0]
match_mh = sr.signature.minhash
scaled = max(query_mh.scaled, match_mh.scaled)
match_mh = match_mh.downsample(scaled=scaled).flatten()
query_mh = query_mh.downsample(scaled=scaled)
intersect_mh = match_mh & query_mh
intersect_mh = flatten_and_intersect_scaled(result.signature.minhash,
query_mh)

return [sr, intersect_mh]
return [result, intersect_mh]

def consume(self, intersect_mh):
"Mimic CounterGather.consume on top of Index. Yes, this is backwards."
@@ -326,7 +300,7 @@ def counter_gather(self, query, threshold_bp, **kwargs):
prefetch_query.minhash = prefetch_query.minhash.flatten()

# find all matches and construct a CounterGather object.
counter = CounterGather(prefetch_query.minhash)
counter = CounterGather(prefetch_query)
for result in self.prefetch(prefetch_query, threshold_bp, **kwargs):
counter.add(result.signature, location=result.location)

@@ -721,9 +695,14 @@ class CounterGather:
This particular implementation maintains a collections.Counter that
is used to quickly find the best match when 'peek' is called, but
other implementations are possible ;).

Note that redundant matches (SourmashSignature objects) with
duplicate md5s are collapsed inside the class, because we use the
md5sum as a key into the dictionary used to store matches.
"""
def __init__(self, query_mh):
"Constructor - takes a query FracMinHash."
def __init__(self, query):
"Constructor - takes a query SourmashSignature."
query_mh = query.minhash
if not query_mh.scaled:
raise ValueError('gather requires scaled signatures')

@@ -732,8 +711,8 @@ def __init__(self, query_mh):
self.scaled = query_mh.scaled

# use these to track loaded matches & their locations
self.siglist = []
self.locations = []
self.siglist = {}
self.locations = {}

# ...and also track overlaps with the progressive query
self.counter = Counter()
@@ -749,11 +728,11 @@ def add(self, ss, *, location=None, require_overlap=True):
# upon insertion, count & track overlap with the specific query.
overlap = self.orig_query_mh.count_common(ss.minhash, True)
if overlap:
i = len(self.siglist)
md5 = ss.md5sum()

self.counter[i] = overlap
self.siglist.append(ss)
self.locations.append(location)
self.counter[md5] = overlap
self.siglist[md5] = ss
self.locations[md5] = location

# note: scaled will be max of all matches.
self.downsample(ss.minhash.scaled)
@@ -766,6 +745,11 @@ def downsample(self, scaled):
self.scaled = scaled
return self.scaled

def signatures(self):
"Return all signatures."
for ss in self.siglist.values():
yield ss

def peek(self, cur_query_mh, *, threshold_bp=0):
"Get next 'gather' result for this database, w/o changing counters."
self.query_started = 1
@@ -789,11 +773,11 @@ def peek(self, cur_query_mh, *, threshold_bp=0):
raise ValueError("current query not a subset of original query")

# are we setting a threshold?
threshold, n_threshold_hashes = calc_threshold_from_bp(threshold_bp,
scaled,
len(cur_query_mh))
# is it too high to ever match? if so, exit.
if threshold > 1.0:
try:
x = calc_threshold_from_bp(threshold_bp, scaled, len(cur_query_mh))
threshold, n_threshold_hashes = x
except ValueError:
# too high to ever match => exit
return []

# Find the best match using the internal Counter.
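The index/__init__.py hunks above switch `CounterGather` from parallel lists with integer indices to dicts keyed by each match's md5sum, with a `collections.Counter` tracking overlap per md5 so `peek` can find the best remaining match cheaply. A runnable sketch of that bookkeeping using hypothetical md5 strings and overlap counts in place of real FracMinHash comparisons:

```python
from collections import Counter

# md5-keyed bookkeeping, mirroring CounterGather.add() after this PR
overlaps = Counter()     # md5 -> hashes shared with the original query
siglist = {}             # md5 -> signature object
locations = {}           # md5 -> where the match was loaded from

# (md5, location, overlap) triples; values here are made up for illustration
matches = [
    ("md5_a", "genome_a.sig", 150),
    ("md5_b", "genome_b.sig", 300),
    ("md5_c", "genome_c.sig", 75),
]

for md5, location, overlap in matches:
    if overlap:                      # only track matches that overlap the query
        overlaps[md5] = overlap
        siglist[md5] = f"<signature {md5}>"
        locations[md5] = location

# peek(): Counter.most_common(1) yields the best match without mutating state;
# consume() would then decrement counts as query hashes are subtracted.
best_md5, best_overlap = overlaps.most_common(1)[0]
print(best_md5, best_overlap)
```

Keying on md5 is what makes redundant signatures collapse, as the new class docstring notes; the list-based version would have stored both copies under different integer indices.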
33 changes: 33 additions & 0 deletions src/sourmash/minhash.py
@@ -103,6 +103,39 @@ def translate_codon(codon):
raise ValueError(e.message)


def flatten_and_downsample_scaled(mh, *scaled_vals):
"Flatten MinHash object and downsample to max of scaled values."
assert mh.scaled
assert all( (x > 0 for x in scaled_vals) )

mh = mh.flatten()
scaled = max(scaled_vals)
if scaled > mh.scaled:
return mh.downsample(scaled=scaled)
return mh


def flatten_and_downsample_num(mh, *num_vals):
"Flatten MinHash object and downsample to min of num values."
assert mh.num
assert all( (x > 0 for x in num_vals) )

mh = mh.flatten()
num = min(num_vals)
if num < mh.num:
return mh.downsample(num=num)
return mh


def flatten_and_intersect_scaled(mh1, mh2):
"Flatten and downsample two scaled MinHash objs, then return intersection."
scaled = max(mh1.scaled, mh2.scaled)
mh1 = mh1.flatten().downsample(scaled=scaled)
mh2 = mh2.flatten().downsample(scaled=scaled)

return mh1 & mh2


class _HashesWrapper(Mapping):
"A read-only view of the hashes contained by a MinHash object."
def __init__(self, h):
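The new minhash.py helpers centralize the flatten-then-downsample pattern that `Index.find` previously spelled out inline for both query and subject. Here is the scaled variant made runnable against a stub, since constructing a real sourmash MinHash needs more setup; `StubMinHash` is purely illustrative:

```python
class StubMinHash:
    """Tiny stand-in for a scaled MinHash sketch (illustrative only)."""
    def __init__(self, scaled, track_abundance=False):
        self.scaled = scaled
        self.track_abundance = track_abundance

    def flatten(self):
        # drop abundance tracking, keep the same resolution
        return StubMinHash(self.scaled, track_abundance=False)

    def downsample(self, *, scaled):
        # scaled sketches can only be downsampled to a coarser resolution
        assert scaled >= self.scaled
        return StubMinHash(scaled, self.track_abundance)

def flatten_and_downsample_scaled(mh, *scaled_vals):
    "Flatten MinHash object and downsample to max of scaled values."
    assert mh.scaled
    assert all(x > 0 for x in scaled_vals)

    mh = mh.flatten()
    scaled = max(scaled_vals)
    if scaled > mh.scaled:
        return mh.downsample(scaled=scaled)
    return mh

subj = StubMinHash(scaled=1000, track_abundance=True)
out = flatten_and_downsample_scaled(subj, 2000)
print(out.scaled, out.track_abundance)    # coarsened to 2000, abundance dropped
```

Taking the max of the scaled values is the key invariant: two scaled sketches are only comparable at the coarser (larger-scaled) resolution, which is why `flatten_and_intersect_scaled` does the same before intersecting.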
26 changes: 9 additions & 17 deletions src/sourmash/search.py
@@ -20,13 +20,19 @@ def calc_threshold_from_bp(threshold_bp, scaled, query_size):
n_threshold_hashes = 0

if threshold_bp:
if threshold_bp < 0:
raise TypeError("threshold_bp must be non-negative")

# if we have a threshold_bp of N, then that amounts to N/scaled
# hashes:
n_threshold_hashes = float(threshold_bp) / scaled

# that then requires the following containment:
threshold = n_threshold_hashes / query_size

# is it too high to ever match?
if threshold > 1.0:
raise ValueError("requested threshold_bp is unattainable with this query")
return threshold, n_threshold_hashes


@@ -62,8 +68,8 @@ def make_jaccard_search_query(*,
return search_obj


def make_gather_query(query_mh, threshold_bp, *, best_only=True):
"Make a search object for gather."
def make_containment_query(query_mh, threshold_bp, *, best_only=True):
"Make a search object for containment, with threshold_bp."
if not query_mh:
raise ValueError("query is empty!?")

@@ -72,21 +78,7 @@ def make_gather_query(query_mh, threshold_bp, *, best_only=True):
raise TypeError("query signature must be calculated with scaled")

# are we setting a threshold?
threshold = 0
if threshold_bp:
if threshold_bp < 0:
raise TypeError("threshold_bp must be non-negative")

# if we have a threshold_bp of N, then that amounts to N/scaled
# hashes:
n_threshold_hashes = threshold_bp / scaled

# that then requires the following containment:
threshold = n_threshold_hashes / len(query_mh)

# is it too high to ever match? if so, exit.
if threshold > 1.0:
raise ValueError("requested threshold_bp is unattainable with this query")
threshold, _ = calc_threshold_from_bp(threshold_bp, scaled, len(query_mh))

if best_only:
search_obj = JaccardSearchBestOnly(SearchType.CONTAINMENT,
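The search.py hunks consolidate the threshold arithmetic into `calc_threshold_from_bp`, which now raises `ValueError` when the requested `threshold_bp` is unattainable instead of each caller checking `threshold > 1.0` separately. A self-contained copy of the arithmetic following the function body shown above, with a worked example:

```python
def calc_threshold_from_bp(threshold_bp, scaled, query_size):
    "Convert a threshold in bp into a containment threshold for this query."
    threshold = 0.0
    n_threshold_hashes = 0

    if threshold_bp:
        if threshold_bp < 0:
            raise TypeError("threshold_bp must be non-negative")

        # a threshold of N bp amounts to N/scaled hashes:
        n_threshold_hashes = float(threshold_bp) / scaled

        # ...which requires this containment fraction of the query:
        threshold = n_threshold_hashes / query_size

        # is it too high to ever match? if so, error out here,
        # rather than in every caller:
        if threshold > 1.0:
            raise ValueError(
                "requested threshold_bp is unattainable with this query")
    return threshold, n_threshold_hashes

# worked example: 50 kb at scaled=1000 => 50 hashes; against a
# 100-hash query that requires containment of 0.5
print(calc_threshold_from_bp(50_000, 1000, 100))    # (0.5, 50.0)
```

This is why the new `CounterGather.peek` wraps the call in `try/except ValueError` and returns `[]`: the exception now encodes "no possible match at this threshold".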