diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 7a95fd2064..10e311c33b 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -59,11 +59,12 @@ genome; it then subtracts that match from the metagenome, and repeats.
 At the end it reports how much of the metagenome remains unknown. The
 [basic sourmash tutorial](http://sourmash.readthedocs.io/en/latest/tutorials.html#what-s-in-my-metagenome)
-has some sample output from using gather with GenBank.
+has some sample output from using gather with GenBank. See the appendix at
+the bottom of this page for more technical details.
 
-Our preliminary benchmarking suggests that `gather` is the most accurate
-method available for doing strain-level resolution of genomes. More on that
-as we move forward!
+Some benchmarking on CAMI suggests that `gather` is a very accurate
+method for doing strain-level resolution of genomes. More on
+that as we move forward!
 
 ## To do taxonomy, or not to do taxonomy?
 
@@ -116,3 +117,48 @@ We suggest the following approach:
 This helps us figure out what people are actually interested in doing,
 and any help we provide via the issue tracker will eventually be added
 into the documentation.
+
+## Appendix: how `sourmash gather` works.
+
+The sourmash gather algorithm works as follows:
+
+* find the best match in the database, based on containment;
+* subtract that match from the query;
+* repeat.
+
+The output below is the CSV output for a fictional metagenome.
+
+The first column, `f_unique_to_query`, is the fraction of the database
+match that is _unique_ with respect to the original query. It will
+always decrease as you get more matches.
+
+The second column, `f_match_orig`, is how much of the match is in the
+_original_ query. For this fictional metagenome, each match is
+entirely contained in the original query. This is the number you would
+get by running `sourmash search --containment `.
+
+The third column, `f_match`, is how much of the match is in the remaining
+query metagenome, after all of the previous matches have been removed.
+
+The fourth column, `f_orig_query`, is how much of the original query
+belongs to the match. This is the number you'd get by running
+`sourmash search --containment `.
+
+```
+f_unique_to_query      f_match_orig   f_match             f_orig_query
+0.3321964529331514     1.0            1.0                 0.3321964529331514
+0.13096862210095497    1.0            1.0                 0.13096862210095497
+0.11527967257844475    1.0            0.898936170212766   0.12824010914051842
+0.10709413369713507    1.0            1.0                 0.10709413369713507
+0.10368349249658936    1.0            0.3134020618556701  0.33083219645293316
+```
+
+A few quick notes for the algorithmic folk out there --
+
+* the key innovation for gather is that it looks for **groups** of
+  k-mers in the databases, and picks the best group (based on
+  containment). It does not treat k-mers individually.
+* because of this, gather does not saturate as databases grow in size,
+  and in fact should only become more sensitive and specific as we
+  increase database size. (Although of course it may get a lot
+  slower...)
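The appendix above defines gather as a greedy loop over containment and gives four reported fractions. The sketch below restates that loop with plain Python sets, to make the column definitions concrete; it is an illustration only, not sourmash's implementation. `toy_gather` and its dict-of-sets "database" are invented for this example: real gather operates on scaled MinHash sketches, applies a detection threshold (`threshold_bp`), and also reports abundance-weighted variants of these numbers.

```python
# Toy model of the gather loop described in the appendix (hypothetical helper,
# not part of the sourmash API). `database` maps names to sets of hashes.
def toy_gather(query_hashes, database):
    """Yield (name, f_unique_to_query, f_match_orig, f_match, f_orig_query)."""
    orig = set(query_hashes)         # the original query; never modified
    remaining = set(query_hashes)    # hashes not yet assigned to any match

    while remaining:
        # 1. find the best match, based on containment of the *remaining* query
        best_name, best_hashes, best_common = None, None, 0
        for name, match_hashes in database.items():
            common = len(match_hashes & remaining)
            if common > best_common:
                best_name, best_hashes, best_common = name, match_hashes, common
        if best_name is None:        # nothing left overlaps the query
            break

        # the four fractions reported in the gather CSV
        f_unique_to_query = len(best_hashes & remaining) / len(orig)
        f_match_orig = len(best_hashes & orig) / len(best_hashes)
        f_match = len(best_hashes & remaining) / len(best_hashes)
        f_orig_query = len(best_hashes & orig) / len(orig)
        yield best_name, f_unique_to_query, f_match_orig, f_match, f_orig_query

        # 2. subtract the match from the query, and 3. repeat
        remaining -= best_hashes
```

Because each match is subtracted from `remaining` before the next iteration, the numerator of `f_unique_to_query` can only shrink, which is why that column decreases down the CSV, while `f_match_orig` and `f_orig_query` are always computed against the unchanged original query.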
diff --git a/sourmash/commands.py b/sourmash/commands.py
index 1186560fb5..5c21849de5 100644
--- a/sourmash/commands.py
+++ b/sourmash/commands.py
@@ -662,7 +662,8 @@ def gather(args):
     if found and args.output:
         fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
                       'f_unique_to_query', 'f_unique_weighted',
-                      'average_abund', 'median_abund', 'std_abund', 'name', 'filename', 'md5']
+                      'average_abund', 'median_abund', 'std_abund', 'name',
+                      'filename', 'md5', 'f_match_orig']
 
         with FileOutput(args.output, 'wt') as fp:
             w = csv.DictWriter(fp, fieldnames=fieldnames)
@@ -798,8 +799,9 @@ def multigather(args):
         output_csv = output_base + '.csv'
 
         fieldnames = ['intersect_bp', 'f_orig_query', 'f_match',
-                        'f_unique_to_query', 'f_unique_weighted',
-                        'average_abund', 'median_abund', 'std_abund', 'name', 'filename', 'md5']
+                      'f_unique_to_query', 'f_unique_weighted',
+                      'average_abund', 'median_abund', 'std_abund', 'name',
+                      'filename', 'md5', 'f_match_orig']
         with open(output_csv, 'wt') as fp:
             w = csv.DictWriter(fp, fieldnames=fieldnames)
             w.writeheader()
diff --git a/sourmash/search.py b/sourmash/search.py
index 066f033dfb..d012171966 100644
--- a/sourmash/search.py
+++ b/sourmash/search.py
@@ -59,7 +59,7 @@ def search_databases(query, databases, threshold, do_containment, best_only,
 ###
 
 GatherResult = namedtuple('GatherResult',
-                          'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match')
+                          'intersect_bp, f_orig_query, f_match, f_unique_to_query, f_unique_weighted, average_abund, median_abund, std_abund, filename, name, md5, match,f_match_orig')
 
 
 # build a new query object, subtracting found mins and downsampling
@@ -101,6 +101,12 @@
     return best_cont, best_match, best_filename
 
 
+def _filter_max_hash(values, max_hash):
+    for v in values:
+        if v < max_hash:
+            yield v
+
+
 def gather_databases(query, databases, threshold_bp, ignore_abundance):
     """
     Iteratively find the best containment of `query` in all the `databases`,
@@ -108,14 +114,14 @@
     # track original query information for later usage.
     track_abundance = query.minhash.track_abundance and not ignore_abundance
-    orig_mh = query.minhash
-    orig_mins = orig_mh.get_hashes()
-    orig_abunds = { k: 1 for k in orig_mins }
+    orig_query_mh = query.minhash
+    orig_query_mins = orig_query_mh.get_hashes()
 
     # do we pay attention to abundances?
+    orig_query_abunds = { k: 1 for k in orig_query_mins }
     if track_abundance:
         import numpy as np
-        orig_abunds = orig_mh.get_mins(with_abundance=True)
+        orig_query_abunds = orig_query_mh.get_mins(with_abundance=True)
 
     cmp_scaled = query.minhash.scaled    # initialize with resolution of query
     while query.minhash:
@@ -142,15 +148,15 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         # (CTB note: this means that if a high scaled/low res signature is
         # found early on, resolution will be low from then on.)
         new_max_hash = get_max_hash_for_scaled(cmp_scaled)
-        query_mins = set([ i for i in query_mins if i < new_max_hash ])
-        found_mins = set([ i for i in found_mins if i < new_max_hash ])
-        orig_mins = set([ i for i in orig_mins if i < new_max_hash ])
-        sum_abunds = sum([ v for (k,v) in orig_abunds.items() if k < new_max_hash ])
+        query_mins = set(_filter_max_hash(query_mins, new_max_hash))
+        found_mins = set(_filter_max_hash(found_mins, new_max_hash))
+        orig_query_mins = set(_filter_max_hash(orig_query_mins, new_max_hash))
+        sum_abunds = sum(( v for (k,v) in orig_query_abunds.items() if k < new_max_hash ))
 
-        # calculate intersection:
+        # calculate intersection with query mins:
         intersect_mins = query_mins.intersection(found_mins)
-        intersect_orig_mins = orig_mins.intersection(found_mins)
-        intersect_bp = cmp_scaled * len(intersect_orig_mins)
+        intersect_orig_query_mins = orig_query_mins.intersection(found_mins)
+        intersect_bp = cmp_scaled * len(intersect_orig_query_mins)
 
         if intersect_bp < threshold_bp:          # hard cutoff for now
             notify('found less than {} in common. => exiting',
@@ -160,21 +166,28 @@
         # calculate fractions wrt first denominator - genome size
         genome_n_mins = len(found_mins)
         f_match = len(intersect_mins) / float(genome_n_mins)
-        f_orig_query = len(intersect_orig_mins) / float(len(orig_mins))
+        f_orig_query = len(intersect_orig_query_mins) / \
+            float(len(orig_query_mins))
 
         # calculate fractions wrt second denominator - metagenome size
-        orig_mh = orig_mh.downsample_scaled(cmp_scaled)
-        query_n_mins = len(orig_mh)
+        orig_query_mh = orig_query_mh.downsample_scaled(cmp_scaled)
+        query_n_mins = len(orig_query_mh)
         f_unique_to_query = len(intersect_mins) / float(query_n_mins)
 
+        # calculate fraction of subject match with orig query
+        f_match_orig = best_match.minhash.contained_by(orig_query_mh,
+                                                       downsample=True)
+
         # calculate scores weighted by abundances
-        f_unique_weighted = sum((orig_abunds[k] for k in intersect_mins)) \
-            / sum_abunds
+        f_unique_weighted = sum((orig_query_abunds[k] for k in intersect_mins))
+        f_unique_weighted /= sum_abunds
 
         # calculate stats on abundances, if desired.
         average_abund, median_abund, std_abund = 0, 0, 0
         if track_abundance:
-            intersect_abunds = list((orig_abunds[k] for k in intersect_mins))
+            intersect_abunds = (orig_query_abunds[k] for k in intersect_mins)
+            intersect_abunds = list(intersect_abunds)
+
             average_abund = np.mean(intersect_abunds)
             median_abund = np.median(intersect_abunds)
             std_abund = np.std(intersect_abunds)
@@ -183,6 +196,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
         result = GatherResult(intersect_bp=intersect_bp,
                               f_orig_query=f_orig_query,
                               f_match=f_match,
+                              f_match_orig=f_match_orig,
                               f_unique_to_query=f_unique_to_query,
                               f_unique_weighted=f_unique_weighted,
                               average_abund=average_abund,
@@ -198,7 +212,7 @@ def gather_databases(query, databases, threshold_bp, ignore_abundance):
 
         # compute weighted_missed:
         query_mins -= set(found_mins)
-        weighted_missed = sum((orig_abunds[k] for k in query_mins)) \
-            / sum_abunds
+        weighted_missed = sum((orig_query_abunds[k] for k in query_mins)) \
+            / sum_abunds
 
         yield result, weighted_missed, new_max_hash, query
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 0f308eabf8..28646d914f 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -2567,6 +2567,67 @@ def test_gather_file_output():
         assert '910,1.0,1.0' in output
 
 
+@utils.in_tempdir
+def test_gather_f_match_orig(c):
+    import copy
+
+    testdata_combined = utils.get_test_data('gather/combined.sig')
+    testdata_glob = utils.get_test_data('gather/GCF*.sig')
+    testdata_sigs = glob.glob(testdata_glob)
+
+    c.run_sourmash('gather', testdata_combined, '-o', 'out.csv',
+                   *testdata_sigs)
+
+    combined_sig = sourmash.load_one_signature(testdata_combined, ksize=21)
+    remaining_mh = copy.copy(combined_sig.minhash)
+
+    def approx_equal(a, b, n=5):
+        return round(a, n) == round(b, n)
+
+    with open(c.output('out.csv'), 'rt') as fp:
+        r = csv.DictReader(fp)
+        for n, row in enumerate(r):
+            print(n, row['f_match'], row['f_match_orig'])
+
+            # each match is completely in the original query
+            assert row['f_match_orig'] == "1.0"
+
+            # double check -- should match 'search --containment'.
+            # (this is kind of useless for a 1.0 contained_by, I guess)
+            filename = row['filename']
+            match = sourmash.load_one_signature(filename, ksize=21)
+            assert match.contained_by(combined_sig) == 1.0
+
+            # check other fields, too.
+            f_orig_query = float(row['f_orig_query'])
+            f_match_orig = float(row['f_match_orig'])
+            f_match = float(row['f_match'])
+            f_unique_to_query = float(row['f_unique_to_query'])
+
+            # f_orig_query is the containment of the query by the match.
+            # (note, this only works because containment is 100% in combined).
+            assert approx_equal(combined_sig.contained_by(match), f_orig_query)
+
+            # just redoing above, for completeness; this is always 1.0 for
+            # this data set.
+            assert approx_equal(match.contained_by(combined_sig), f_match_orig)
+
+            # f_match is how much of the match is in the unallocated hashes
+            assert approx_equal(match.minhash.contained_by(remaining_mh),
+                                f_match)
+
+            # f_unique_to_query is how much of the match is unique wrt
+            # the original query.
+            a = set(remaining_mh.get_mins())
+            b = set(match.minhash.get_mins())
+            n_intersect = len(a.intersection(b))
+            f_intersect = n_intersect / float(len(combined_sig.minhash))
+            assert approx_equal(f_unique_to_query, f_intersect)
+
+            # now, subtract current match from remaining... and iterate!
+            remaining_mh.remove_many(match.minhash.get_mins())
+
+
 def test_gather_nomatch():
     with utils.TempDirectory() as location:
         testdata_query = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
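The new `f_match_orig` value computed in search.py above (`best_match.minhash.contained_by(orig_query_mh, downsample=True)`) is the containment of the match in the *original* query, which the appendix says should agree with `sourmash search --containment`; the new test verifies exactly that on the `combined.sig` test data. Below is a simplified, standalone version of the same cross-check against an arbitrary gather run. The paths `metagenome.sig` and `out.csv` are placeholders, and the comparison assumes the query and all matches were sketched with the same ksize and scaled value.

```python
# Cross-check a gather CSV against containment computed directly from the
# signatures (placeholder paths; adjust ksize to match your sketches).
import csv

import sourmash

query = sourmash.load_one_signature('metagenome.sig', ksize=21)

with open('out.csv', 'rt') as fp:
    for row in csv.DictReader(fp):
        match = sourmash.load_one_signature(row['filename'], ksize=21)

        # f_match_orig: containment of the match in the original query.
        assert abs(match.contained_by(query) - float(row['f_match_orig'])) < 1e-5

        # f_orig_query: containment of the original query in the match.
        assert abs(query.contained_by(match) - float(row['f_orig_query'])) < 1e-5
```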