Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Remove deprecated minhash functions #1149

Merged
merged 5 commits on Aug 6, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
4 changes: 2 additions & 2 deletions doc/api-example.md
Original file line number Diff line number Diff line change
Expand Up @@ -290,7 +290,7 @@ We can downsample this to 500 by extracting the hashes and using
Also note that there's a convenience function that does the same thing,
faster!
```
>>> smaller2 = larger.downsample_n(500)
>>> smaller2 = larger.downsample(num=500)
>>> smaller2 == smaller
True

Expand All @@ -312,7 +312,7 @@ The same can be done with scaled MinHashes:

And, again, there's a convenience function that you can use:
```
>>> small_scaled2 = large_scaled.downsample_scaled(500)
>>> small_scaled2 = large_scaled.downsample(scaled=500)
>>> small_scaled == small_scaled2
True

Expand Down
6 changes: 0 additions & 6 deletions sourmash/_minhash.py

This file was deleted.

10 changes: 5 additions & 5 deletions sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ def compare(args):
if not printed_scaled_msg:
notify('downsampling to scaled value of {}'.format(max_scaled))
printed_scaled_msg = True
s.minhash = s.minhash.downsample_scaled(max_scaled)
s.minhash = s.minhash.downsample(scaled=max_scaled)

if len(siglist) == 0:
error('no signatures!')
Expand Down Expand Up @@ -389,7 +389,7 @@ def index(args):
nums.add(ss.minhash.num)

if args.scaled:
ss.minhash = ss.minhash.downsample_scaled(args.scaled)
ss.minhash = ss.minhash.downsample(scaled=args.scaled)
scaleds.add(ss.minhash.scaled)

tree.insert(ss)
Expand Down Expand Up @@ -450,7 +450,7 @@ def search(args):
if args.scaled != query.minhash.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)
query.minhash = query.minhash.downsample(scaled=args.scaled)

# set up the search databases
databases = sourmash_args.load_dbs_and_sigs(args.databases, query,
Expand Down Expand Up @@ -610,7 +610,7 @@ def gather(args):
if args.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)
query.minhash = query.minhash.downsample(scaled=args.scaled)

# empty?
if not len(query.minhash):
Expand Down Expand Up @@ -762,7 +762,7 @@ def multigather(args):
if args.scaled:
notify('downsampling query from scaled={} to {}',
query.minhash.scaled, int(args.scaled))
query.minhash = query.minhash.downsample_scaled(args.scaled)
query.minhash = query.minhash.downsample(scaled=args.scaled)

# empty?
if not len(query.minhash):
Expand Down
2 changes: 1 addition & 1 deletion sourmash/lca/command_classify.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,7 @@ def classify(args):
total_count += 1

# make sure we're looking at the same scaled value as database
query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)
query_sig.minhash = query_sig.minhash.downsample(scaled=scaled)

# do the classification
lineage, status = classify_signature(query_sig, dblist,
Expand Down
2 changes: 1 addition & 1 deletion sourmash/lca/command_gather.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def gather_main(args):
debug('classifying', query_sig.name())

# make sure we're looking at the same scaled value as database
query_sig.minhash = query_sig.minhash.downsample_scaled(scaled)
query_sig.minhash = query_sig.minhash.downsample(scaled=scaled)

# do the classification, output results
found = []
Expand Down
2 changes: 1 addition & 1 deletion sourmash/lca/command_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def load_singletons_and_count(filenames, ksize, scaled, with_abundance, traverse

def count_signature(sig, scaled, hashvals):
"Downsample sig to given scaled, count hashvalues."
mh = sig.minhash.downsample_scaled(scaled)
mh = sig.minhash.downsample(scaled=scaled)

if mh.track_abundance:
abunds = mh.hashes
Expand Down
4 changes: 2 additions & 2 deletions sourmash/lca/lca_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def insert(self, sig, ident=None, lineage=None):
# downsample to specified scaled; this has the side effect of
# making sure they're all at the same scaled value!
try:
minhash = minhash.downsample_scaled(self.scaled)
minhash = minhash.downsample(scaled=self.scaled)
except ValueError:
raise ValueError("cannot downsample signature; is it a scaled signature?")

Expand Down Expand Up @@ -456,7 +456,7 @@ def _find_signatures(self, minhash, threshold, containment=False,
"""
# make sure we're looking at the same scaled value as database
if self.scaled > minhash.scaled:
minhash = minhash.downsample_scaled(self.scaled)
minhash = minhash.downsample(scaled=self.scaled)
elif self.scaled < minhash.scaled and not ignore_scaled:
# note that containment can be calculated w/o matching scaled.
raise ValueError("lca db scaled is {} vs query {}; must downsample".format(self.scaled, minhash.scaled))
Expand Down
127 changes: 0 additions & 127 deletions sourmash/minhash.py
Original file line number Diff line number Diff line change
Expand Up @@ -258,13 +258,6 @@ def add_sequence(self, sequence, force=False):
self._methodcall(lib.kmerminhash_add_sequence, to_bytes(sequence),
force)

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use add_kmer instead.')
def add(self, kmer):
"Add a kmer into the sketch."
self.add_sequence(kmer)

def add_kmer(self, kmer):
"Add a kmer into the sketch."
if len(kmer) != self.ksize:
Expand All @@ -286,13 +279,6 @@ def remove_many(self, hashes):
"Remove many hashes at once; ``hashes`` must be an iterable."
self._methodcall(lib.kmerminhash_remove_many, list(hashes), len(hashes))

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use add_many instead.')
def update(self, other):
"Update this sketch from all the hashes in the other."
self.add_many(other)

def __len__(self):
"Number of hashes."
return self._methodcall(lib.kmerminhash_get_mins_size)
Expand Down Expand Up @@ -338,16 +324,6 @@ def hashes(self):
d = self.get_mins()
return _HashesWrapper({ k : 1 for k in d })

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION)
def subtract_mins(self, other):
"""Get the list of mins in this MinHash, after removing the ones in
``other``.
"""
a = set(self.get_mins())
b = set(other.get_mins())
return a - b

@property
def seed(self):
return self._methodcall(lib.kmerminhash_seed)
Expand Down Expand Up @@ -424,17 +400,6 @@ def clear(self):
"Clears all hashes and abundances."
return self._methodcall(lib.kmerminhash_clear)

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use translate_codon function at module level instead.')
def translate_codon(self, codon):
"Translate a codon into an amino acid."
try:
return rustcall(lib.sourmash_translate_codon,
to_bytes(codon)).decode('utf-8')
except SourmashError as e:
raise ValueError(e.message)

def count_common(self, other, downsample=False):
"""\
Return the number of hashes in common between ``self`` and ``other``.
Expand Down Expand Up @@ -487,69 +452,6 @@ def downsample(self, num=None, scaled=None):

return a

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use downsample(num=...) instead.')
def downsample_n(self, new_num):
"Copy this object and downsample new object to num=``new_num``."
return self.downsample(num=new_num)

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use scaled instead.')
def downsample_max_hash(self, *others):
"""Copy this object and downsample new object to min of ``*others``.

Here, ``*others`` is one or more MinHash objects.
"""
max_hashes = [x.max_hash for x in others]
new_max_hash = min(self.max_hash, *max_hashes)
new_scaled = _get_scaled_for_max_hash(new_max_hash)

return self.downsample_scaled(new_scaled)

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use downsample(scaled=...) instead.')
def downsample_scaled(self, new_scaled):
"""Copy this object and downsample new object to scaled=``new_scaled``.
"""
return self.downsample(scaled=new_scaled)

@deprecated(deprecated_in="3.3", removed_in="4.0",
current_version=VERSION,
details='Use count_common or set methods instead.')
def intersection(self, other, in_common=False):
"""Calculate the intersection between ``self`` and ``other``, and
return ``(mins, size)`` where ``mins`` are the hashes in common, and
``size`` is the number of hashes.

if ``in_common``, return the actual hashes. Otherwise, mins will be
empty.
"""
if not isinstance(other, MinHash):
raise TypeError("Must be a MinHash!")

if self.num != other.num:
err = "must have same num: {} != {}".format(self.num, other.num)
raise TypeError(err)

if in_common:
# TODO: copy from buffer to Python land instead,
# this way involves more moving data around.
combined_mh = self.copy_and_clear()
combined_mh.merge(self)
combined_mh.merge(other)

size = len(combined_mh)
common = set(self.get_mins())
common.intersection_update(other.get_mins())
else:
size = self._methodcall(lib.kmerminhash_intersection, other._get_objptr())
common = set()

return common, max(size, 1)

def flatten(self):
"""Return a new MinHash with track_abundance=False."""
# create new object:
Expand All @@ -568,14 +470,6 @@ def jaccard(self, other, downsample=False):
raise TypeError(err)
return self._methodcall(lib.kmerminhash_similarity, other._get_objptr(), True, downsample)

@deprecated(deprecated_in="3.3", removed_in="4.0",
current_version=VERSION,
details="Use 'similarity' instead of compare.")
def compare(self, other, downsample=False):
"Calculate Jaccard similarity of two sketches."
return self.jaccard(other, downsample=downsample)


def similarity(self, other, ignore_abundance=False, downsample=False):
"""Calculate similarity of two sketches.

Expand Down Expand Up @@ -611,14 +505,6 @@ def contained_by(self, other, downsample=False):

return self.count_common(other, downsample) / len(self)

@deprecated(deprecated_in="3.3", removed_in="4.0",
current_version=VERSION,
details="Use 'contained_by' with downsample=True instead.")
def containment_ignore_maxhash(self, other):
"""Calculate contained_by, with downsampling.
"""
return self.contained_by(other, downsample=True)

def __iadd__(self, other):
if not isinstance(other, MinHash):
raise TypeError("Must be a MinHash!")
Expand Down Expand Up @@ -650,19 +536,6 @@ def add_protein(self, sequence):
"Add a protein sequence."
self._methodcall(lib.kmerminhash_add_protein, to_bytes(sequence))

@deprecated(deprecated_in="3.5", removed_in="4.0",
current_version=VERSION,
details='Use the moltype property instead.')
def is_molecule_type(self, molecule):
"""Check if this MinHash is a particular human-readable molecule type.

Supports 'protein', 'dayhoff', 'hp', 'DNA'.
@CTB deprecate for 4.0?
"""
if molecule.lower() not in ('protein', 'dayhoff', 'hp', 'dna'):
raise ValueError("unknown moltype in query, '{}'".format(molecule))
return molecule == self.moltype

@property
def moltype(self): # TODO: test in minhash tests
if self.is_protein:
Expand Down
2 changes: 1 addition & 1 deletion sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -308,7 +308,7 @@ def search(self, query, *args, **kwargs):
if tree_mh.scaled and query.minhash.scaled and \
tree_mh.scaled > query.minhash.scaled:
resampled_query_mh = tree_query.minhash
resampled_query_mh = resampled_query_mh.downsample_scaled(tree_mh.scaled)
resampled_query_mh = resampled_query_mh.downsample(scaled=tree_mh.scaled)
tree_query = SourmashSignature(resampled_query_mh)

# define both search function and post-search calculation function
Expand Down
4 changes: 2 additions & 2 deletions sourmash/search.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ def search_databases(query, databases, threshold, do_containment, best_only,
# build a new query object, subtracting found mins and downsampling
def _subtract_and_downsample(to_remove, old_query, scaled=None):
mh = old_query.minhash
mh = mh.downsample_scaled(scaled)
mh = mh.downsample(scaled=scaled)
mh.remove_many(to_remove)

return SourmashSignature(mh)
Expand Down Expand Up @@ -171,7 +171,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
float(len(orig_query_mins))

# calculate fractions wrt second denominator - metagenome size
orig_query_mh = orig_query_mh.downsample_scaled(cmp_scaled)
orig_query_mh = orig_query_mh.downsample(scaled=cmp_scaled)
query_n_mins = len(orig_query_mh)
f_unique_to_query = len(intersect_mins) / float(query_n_mins)

Expand Down
15 changes: 0 additions & 15 deletions sourmash_lib/__init__.py

This file was deleted.

Loading