From b510e8e1620e4dc273513cf1632cbaa6f020cdac Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 2 Oct 2024 06:05:27 -0700 Subject: [PATCH 01/10] add support for ignoring abundance --- src/lib.rs | 7 ++++++- src/manysearch.rs | 3 ++- src/python/sourmash_plugin_branchwater/__init__.py | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 40789191..26c62f09 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -24,7 +24,7 @@ mod singlesketch; use camino::Utf8PathBuf as PathBuf; #[pyfunction] -#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None))] +#[pyo3(signature = (querylist_path, siglist_path, threshold, ksize, scaled, moltype, output_path=None, ignore_abundance=false))] fn do_manysearch( querylist_path: String, siglist_path: String, @@ -33,14 +33,18 @@ fn do_manysearch( scaled: usize, moltype: String, output_path: Option, + ignore_abundance: Option ) -> anyhow::Result { let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); eprintln!("selection scaled: {:?}", selection.scaled()); let allow_failed_sigpaths = true; + let ignore_abundance = ignore_abundance.unwrap_or(false); + // if siglist_path is revindex, run mastiff_manysearch; otherwise run manysearch if is_revindex_database(&againstfile_path) { + // note: mastiff_manysearch ignores abundance automatically. match mastiff_manysearch::mastiff_manysearch( querylist_path, againstfile_path, @@ -63,6 +67,7 @@ fn do_manysearch( threshold, output_path, allow_failed_sigpaths, + ignore_abundance, ) { Ok(_) => Ok(0), Err(e) => { diff --git a/src/manysearch.rs b/src/manysearch.rs index a200b52d..e7703fdd 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -21,6 +21,7 @@ pub fn manysearch( threshold: f64, output: Option, allow_failed_sigpaths: bool, + ignore_abundance: bool, ) -> Result<()> { // Load query collection let query_collection = load_collection( @@ -72,7 +73,7 @@ pub fn manysearch( if let Some(against_mh) = against_sig.minhash() { for query in query_sketchlist.iter() { // to do - let user choose? - let calc_abund_stats = against_mh.track_abundance(); + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).unwrap(); let overlap = diff --git a/src/python/sourmash_plugin_branchwater/__init__.py b/src/python/sourmash_plugin_branchwater/__init__.py index 4280a257..2efc0bc6 100755 --- a/src/python/sourmash_plugin_branchwater/__init__.py +++ b/src/python/sourmash_plugin_branchwater/__init__.py @@ -65,6 +65,8 @@ def __init__(self, p): p.add_argument('-N', '--no-pretty-print', action='store_false', dest='pretty_print', help="do not display results (e.g. for large output)") + p.add_argument('--ignore-abundance', action='store_true', + help="do not do expensive abundance calculations") def main(self, args): print_version() @@ -80,7 +82,8 @@ def main(self, args): args.ksize, args.scaled, args.moltype, - args.output) + args.output, + args.ignore_abundance) if status == 0: notify(f"...manysearch is done! results in '{args.output}'") From 0993b39c6377f289c0ec199a3aecc53cc326928d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 2 Oct 2024 06:14:22 -0700 Subject: [PATCH 02/10] cargo fmt --- src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib.rs b/src/lib.rs index 26c62f09..1c7379d8 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -33,7 +33,7 @@ fn do_manysearch( scaled: usize, moltype: String, output_path: Option, - ignore_abundance: Option + ignore_abundance: Option, ) -> anyhow::Result { let againstfile_path: PathBuf = siglist_path.clone().into(); let selection = build_selection(ksize, scaled, &moltype); From ac82fb355a9aab5315da66c9b573a482fd6d6de7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Oct 2024 06:22:54 -0700 Subject: [PATCH 03/10] avoid downsampling until we know there is overlap --- src/lib.rs | 1 - src/manysearch.rs | 11 +++++------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 1c7379d8..0f653337 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ /// Python interface Rust code for sourmash_plugin_branchwater. use pyo3::prelude::*; -use singlesketch::singlesketch; #[macro_use] extern crate simple_error; diff --git a/src/manysearch.rs b/src/manysearch.rs index e7703fdd..725124a8 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -72,15 +72,14 @@ pub fn manysearch( Ok(against_sig) => { if let Some(against_mh) = against_sig.minhash() { for query in query_sketchlist.iter() { - // to do - let user choose? - let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; - - let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).unwrap(); - let overlap = - query.minhash.count_common(&against_mh_ds, false).unwrap() as f64; + // avoid calculating details unless there is overlap + let overlap = query.minhash.count_common(against_mh, false).expect("incompatible sketches") as f64; // only calculate results if we have shared hashes if overlap > 0.0 { + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; + + let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); let query_size = query.minhash.size() as f64; let containment_query_in_target = overlap / query_size; if containment_query_in_target > threshold { From 7ea9a402674d0d5ee68c1be58f9a5757dc30b5df Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 11:44:46 -0700 Subject: [PATCH 04/10] change downsample to true; add panic assertion --- src/manysearch.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 725124a8..d73bd998 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -73,7 +73,7 @@ pub fn manysearch( if let Some(against_mh) = against_sig.minhash() { for query in query_sketchlist.iter() { // avoid calculating details unless there is overlap - let overlap = query.minhash.count_common(against_mh, false).expect("incompatible sketches") as f64; + let overlap = query.minhash.count_common(against_mh, true).expect("incompatible sketches") as f64; // only calculate results if we have shared hashes if overlap > 0.0 { @@ -104,6 +104,7 @@ pub fn manysearch( let max_containment_ani = Some(f64::max(qani, mani)); let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { + panic!("should not be reached."); match query.minhash.inflated_abundances(&against_mh_ds) { Ok((abunds, sum_weighted_overlap)) => { let sum_all_abunds = against_mh_ds.sum_abunds() as usize; From 03b9da0f84a3298760ce984ec5f3b9638f2c8d04 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 12:42:11 -0700 Subject: [PATCH 05/10] move downsampling side guard --- src/manysearch.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index d73bd998..dd43ee65 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -79,7 +79,6 @@ pub fn manysearch( if overlap > 0.0 { let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; - let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); let query_size = query.minhash.size() as f64; let containment_query_in_target = overlap / query_size; if containment_query_in_target > threshold { @@ -105,6 +104,8 @@ pub fn manysearch( let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { panic!("should not be reached."); + let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); + match query.minhash.inflated_abundances(&against_mh_ds) { Ok((abunds, sum_weighted_overlap)) => { let sum_all_abunds = against_mh_ds.sum_abunds() as usize; From b954daabea0d0d9879a2d9764ff56f0cc1fd48cd Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 13:39:03 -0700 Subject: [PATCH 06/10] eliminate redundant overlap check --- src/manysearch.rs | 128 ++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 66 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index dd43ee65..324f91eb 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -75,74 +75,70 @@ pub fn manysearch( // avoid calculating details unless there is overlap let overlap = query.minhash.count_common(against_mh, true).expect("incompatible sketches") as f64; + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; + let query_size = query.minhash.size() as f64; + let containment_query_in_target = overlap / query_size; // only calculate results if we have shared hashes - if overlap > 0.0 { - let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; - - let query_size = query.minhash.size() as f64; - let containment_query_in_target = overlap / query_size; - if containment_query_in_target > threshold { - let target_size = against_mh.size() as f64; - let containment_target_in_query = overlap / target_size; - - let max_containment = - containment_query_in_target.max(containment_target_in_query); - let jaccard = overlap / (target_size + query_size - overlap); - - let qani = ani_from_containment( - containment_query_in_target, - against_mh.ksize() as f64, - ); - let mani = ani_from_containment( - containment_target_in_query, - against_mh.ksize() as f64, - ); - let query_containment_ani = Some(qani); - let match_containment_ani = Some(mani); - let average_containment_ani = Some((qani + mani) / 2.); - let max_containment_ani = Some(f64::max(qani, mani)); - - let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { - panic!("should not be reached."); - let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); - - match query.minhash.inflated_abundances(&against_mh_ds) { - Ok((abunds, sum_weighted_overlap)) => { - let sum_all_abunds = against_mh_ds.sum_abunds() as usize; - let average_abund = sum_weighted_overlap as f64 / abunds.len() as f64; - let median_abund = median(abunds.iter().cloned()).unwrap(); - let std_abund = stddev(abunds.iter().cloned()); - (Some(sum_all_abunds), Some(sum_weighted_overlap as usize), Some(average_abund), Some(median_abund), Some(std_abund)) - } - Err(e) => { - eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e); - continue; - } + if containment_query_in_target > threshold { + let target_size = against_mh.size() as f64; + let containment_target_in_query = overlap / target_size; + + let max_containment = + containment_query_in_target.max(containment_target_in_query); + let jaccard = overlap / (target_size + query_size - overlap); + + let qani = ani_from_containment( + containment_query_in_target, + against_mh.ksize() as f64, + ); + let mani = ani_from_containment( + containment_target_in_query, + against_mh.ksize() as f64, + ); + let query_containment_ani = Some(qani); + let match_containment_ani = Some(mani); + let average_containment_ani = Some((qani + mani) / 2.); + let max_containment_ani = Some(f64::max(qani, mani)); + + let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { + let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); + + match query.minhash.inflated_abundances(&against_mh_ds) { + Ok((abunds, sum_weighted_overlap)) => { + let sum_all_abunds = against_mh_ds.sum_abunds() as usize; + let average_abund = sum_weighted_overlap as f64 / abunds.len() as f64; + let median_abund = median(abunds.iter().cloned()).unwrap(); + let std_abund = stddev(abunds.iter().cloned()); + (Some(sum_all_abunds), Some(sum_weighted_overlap as usize), Some(average_abund), Some(median_abund), Some(std_abund)) } - } else { - (None, None, None, None, None) - }; - - results.push(SearchResult { - query_name: query.name.clone(), - query_md5: query.md5sum.clone(), - match_name: against_sig.name(), - containment: containment_query_in_target, - intersect_hashes: overlap as usize, - match_md5: Some(against_sig.md5sum()), - jaccard: Some(jaccard), - max_containment: Some(max_containment), - average_abund, - median_abund, - std_abund, - query_containment_ani, - match_containment_ani, - average_containment_ani, - max_containment_ani, - n_weighted_found, - total_weighted_hashes, - }); - } + Err(e) => { + eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e); + continue; + } + } + } else { + (None, None, None, None, None) + }; + + results.push(SearchResult { + query_name: query.name.clone(), + query_md5: query.md5sum.clone(), + match_name: against_sig.name(), + containment: containment_query_in_target, + intersect_hashes: overlap as usize, + match_md5: Some(against_sig.md5sum()), + jaccard: Some(jaccard), + max_containment: Some(max_containment), + average_abund, + median_abund, + std_abund, + query_containment_ani, + match_containment_ani, + average_containment_ani, + max_containment_ani, + n_weighted_found, + total_weighted_hashes, + }); } } } else { From b0bcc660b2b191ad820e5ba0ee63de376f37ff71 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 13:39:43 -0700 Subject: [PATCH 07/10] move calc_abund_stats --- src/manysearch.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 324f91eb..82b7f5a3 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -75,7 +75,6 @@ pub fn manysearch( // avoid calculating details unless there is overlap let overlap = query.minhash.count_common(against_mh, true).expect("incompatible sketches") as f64; - let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; let query_size = query.minhash.size() as f64; let containment_query_in_target = overlap / query_size; // only calculate results if we have shared hashes @@ -100,6 +99,7 @@ pub fn manysearch( let average_containment_ani = Some((qani + mani) / 2.); let max_containment_ani = Some(f64::max(qani, mani)); + let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); From a2871c0d58d4d3bddde224479d0e49376d100ad2 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 14:34:46 -0700 Subject: [PATCH 08/10] extract abundance code into own function; avoid downsampling if poss --- src/manysearch.rs | 49 ++++++++++++++++++++++++++++++++--------------- 1 file changed, 34 insertions(+), 15 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index 82b7f5a3..d93b3eb6 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -13,6 +13,9 @@ use crate::utils::{csvwriter_thread, load_collection, load_sketches, ReportType, use sourmash::ani_utils::ani_from_containment; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; +use sourmash::sketch::minhash::KmerMinHash; +use sourmash::errors::SourmashError; + pub fn manysearch( query_filepath: String, @@ -101,21 +104,11 @@ pub fn manysearch( let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { - let against_mh_ds = against_mh.downsample_scaled(query.minhash.scaled()).expect("cannot downsample sketch"); - - match query.minhash.inflated_abundances(&against_mh_ds) { - Ok((abunds, sum_weighted_overlap)) => { - let sum_all_abunds = against_mh_ds.sum_abunds() as usize; - let average_abund = sum_weighted_overlap as f64 / abunds.len() as f64; - let median_abund = median(abunds.iter().cloned()).unwrap(); - let std_abund = stddev(abunds.iter().cloned()); - (Some(sum_all_abunds), Some(sum_weighted_overlap as usize), Some(average_abund), Some(median_abund), Some(std_abund)) - } - Err(e) => { - eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e); - continue; - } - } + downsample_and_inflate_abundances(&query.minhash, against_mh).ok()? +// Err(e) => { +// eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e); +// continue; +// } } else { (None, None, None, None, None) }; @@ -195,3 +188,29 @@ pub fn manysearch( Ok(()) } + + +fn downsample_and_inflate_abundances(query: &KmerMinHash, against: &KmerMinHash) -> Result<(Option, Option, Option, Option, Option), SourmashError> { + let query_scaled = query.scaled(); + let against_scaled = against.scaled(); + + let abunds: Vec; + let sum_weighted: u64; + let sum_all_abunds : usize; + + // avoid downsampling if we can + if against_scaled != query_scaled { + let against_ds = against.downsample_scaled(query.scaled()).expect("cannot downsample sketch"); + (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; + sum_all_abunds = against_ds.sum_abunds() as usize; + } else { + (abunds, sum_weighted) = query.inflated_abundances(against)?; + sum_all_abunds = against.sum_abunds() as usize; + } + + let average_abund = sum_weighted as f64 / abunds.len() as f64; + let median_abund = median(abunds.iter().cloned()).expect("error"); + let std_abund = stddev(abunds.iter().cloned()); + + Ok((Some(sum_all_abunds), Some(sum_weighted as usize), Some(average_abund), Some(median_abund), Some(std_abund))) +} From d853ef38a4367b475608592bed972d10f44d0c1c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 14:35:34 -0700 Subject: [PATCH 09/10] cleanup --- src/manysearch.rs | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index d93b3eb6..d453c857 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -105,10 +105,6 @@ pub fn manysearch( let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { downsample_and_inflate_abundances(&query.minhash, against_mh).ok()? -// Err(e) => { -// eprintln!("Error calculating abundances for query: {}, against: {}; Error: {}", query.name, against_sig.name(), e); -// continue; -// } } else { (None, None, None, None, None) }; From 453f943351c6c702235e1b085cd04d3616b1a09a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 5 Oct 2024 19:06:43 -0400 Subject: [PATCH 10/10] fmt --- src/manysearch.rs | 52 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 41 insertions(+), 11 deletions(-) diff --git a/src/manysearch.rs b/src/manysearch.rs index d453c857..d343493d 100644 --- a/src/manysearch.rs +++ b/src/manysearch.rs @@ -11,11 +11,10 @@ use std::sync::atomic::AtomicUsize; use crate::utils::{csvwriter_thread, load_collection, load_sketches, ReportType, SearchResult}; use sourmash::ani_utils::ani_from_containment; +use sourmash::errors::SourmashError; use sourmash::selection::Selection; use sourmash::signature::SigsTrait; use sourmash::sketch::minhash::KmerMinHash; -use sourmash::errors::SourmashError; - pub fn manysearch( query_filepath: String, @@ -76,7 +75,11 @@ pub fn manysearch( if let Some(against_mh) = against_sig.minhash() { for query in query_sketchlist.iter() { // avoid calculating details unless there is overlap - let overlap = query.minhash.count_common(against_mh, true).expect("incompatible sketches") as f64; + let overlap = query + .minhash + .count_common(against_mh, true) + .expect("incompatible sketches") + as f64; let query_size = query.minhash.size() as f64; let containment_query_in_target = overlap / query_size; @@ -102,9 +105,17 @@ pub fn manysearch( let average_containment_ani = Some((qani + mani) / 2.); let max_containment_ani = Some(f64::max(qani, mani)); - let calc_abund_stats = against_mh.track_abundance() && !ignore_abundance; - let (total_weighted_hashes, n_weighted_found, average_abund, median_abund, std_abund) = if calc_abund_stats { - downsample_and_inflate_abundances(&query.minhash, against_mh).ok()? + let calc_abund_stats = + against_mh.track_abundance() && !ignore_abundance; + let ( + total_weighted_hashes, + n_weighted_found, + average_abund, + median_abund, + std_abund, + ) = if calc_abund_stats { + downsample_and_inflate_abundances(&query.minhash, against_mh) + .ok()? } else { (None, None, None, None, None) }; @@ -185,18 +196,31 @@ pub fn manysearch( Ok(()) } - -fn downsample_and_inflate_abundances(query: &KmerMinHash, against: &KmerMinHash) -> Result<(Option, Option, Option, Option, Option), SourmashError> { +fn downsample_and_inflate_abundances( + query: &KmerMinHash, + against: &KmerMinHash, +) -> Result< + ( + Option, + Option, + Option, + Option, + Option, + ), + SourmashError, +> { let query_scaled = query.scaled(); let against_scaled = against.scaled(); let abunds: Vec; let sum_weighted: u64; - let sum_all_abunds : usize; + let sum_all_abunds: usize; // avoid downsampling if we can if against_scaled != query_scaled { - let against_ds = against.downsample_scaled(query.scaled()).expect("cannot downsample sketch"); + let against_ds = against + .downsample_scaled(query.scaled()) + .expect("cannot downsample sketch"); (abunds, sum_weighted) = query.inflated_abundances(&against_ds)?; sum_all_abunds = against_ds.sum_abunds() as usize; } else { @@ -208,5 +232,11 @@ fn downsample_and_inflate_abundances(query: &KmerMinHash, against: &KmerMinHash) let median_abund = median(abunds.iter().cloned()).expect("error"); let std_abund = stddev(abunds.iter().cloned()); - Ok((Some(sum_all_abunds), Some(sum_weighted as usize), Some(average_abund), Some(median_abund), Some(std_abund))) + Ok(( + Some(sum_all_abunds), + Some(sum_weighted as usize), + Some(average_abund), + Some(median_abund), + Some(std_abund), + )) }