Skip to content

Commit

Permalink
MRG: change sig_from_record to use scaled from Record to downsamp…
Browse files Browse the repository at this point in the history
…le (#3387)

Fixes #3384.

This PR changes `Manifest::select` so that if `scaled` is set in the
selection, all matching `Record`s have their scaled value updated. It
also updates `Selection::from_record` to set `scaled` to match the
`Record` scaled value. In turn, this allows
`Collection::sig_from_record` to respect the specified `scaled` value
when loading `Signature`.

---------

Co-authored-by: Luiz Irber <luizirber@users.noreply.github.com>
  • Loading branch information
ctb and luizirber authored Nov 13, 2024
1 parent c6cadb4 commit 97e7808
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 45 deletions.
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
[package]
name = "sourmash"
version = "0.17.1"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>", "C. Titus Brown <titus@idyll.org>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
keywords = ["minhash", "bioinformatics"]
Expand Down
11 changes: 5 additions & 6 deletions src/core/src/collection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -409,8 +409,8 @@ mod test {
// no sigs should remain
assert_eq!(cl.len(), 6);
for (_idx, rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_from_record(rec).unwrap().select(&selection).unwrap();
dbg!("record scaled is: {}", rec.scaled());
let this_sig = cl.sig_from_record(rec).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 2000);
}
Expand Down Expand Up @@ -440,7 +440,7 @@ mod test {
filename.push("../../tests/test-data/prot/hp.zip");
// create Selection object
let mut selection = Selection::default();
selection.set_scaled(100);
selection.set_scaled(200);
selection.set_moltype(HashFunctions::Murmur64Hp);
// load sigs into collection + select compatible signatures
let cl = Collection::from_zipfile(&filename)
Expand All @@ -450,10 +450,9 @@ mod test {
// count collection length
assert_eq!(cl.len(), 2);
for (idx, _rec) in cl.iter() {
// need to pass select again here so we actually downsample
let this_sig = cl.sig_for_dataset(idx).unwrap().select(&selection).unwrap();
let this_sig = cl.sig_for_dataset(idx).unwrap();
let this_mh = this_sig.minhash().unwrap();
assert_eq!(this_mh.scaled(), 100);
assert_eq!(this_mh.scaled(), 200);
}
}

Expand Down
63 changes: 26 additions & 37 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,8 +259,13 @@ impl Manifest {
}

impl Select for Manifest {
// select only records that satisfy selection conditions; also update
// scaled value to match.
fn select(self, selection: &Selection) -> Result<Self> {
let rows = self.records.iter().filter(|row| {
let Manifest { mut records } = self;

// TODO: with num as well?
records.retain_mut(|row| {
let mut valid = true;
valid = if let Some(ksize) = selection.ksize() {
row.ksize == ksize
Expand All @@ -279,7 +284,12 @@ impl Select for Manifest {
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled
let v = valid && row.scaled != 0 && row.scaled <= scaled;
// if scaled is set, update!
if v {
row.scaled = scaled
};
v
} else {
valid
};
Expand All @@ -291,41 +301,7 @@ impl Select for Manifest {
valid
});

Ok(Manifest {
records: rows.cloned().collect(),
})

/*
matching_rows = self.rows
if ksize:
matching_rows = ( row for row in matching_rows
if row['ksize'] == ksize )
if moltype:
matching_rows = ( row for row in matching_rows
if row['moltype'] == moltype )
if scaled or containment:
if containment and not scaled:
raise ValueError("'containment' requires 'scaled' in Index.select'")
matching_rows = ( row for row in matching_rows
if row['scaled'] and not row['num'] )
if num:
matching_rows = ( row for row in matching_rows
if row['num'] and not row['scaled'] )
if abund:
# only need to concern ourselves if abundance is _required_
matching_rows = ( row for row in matching_rows
if row['with_abundance'] )
if picklist:
matching_rows = ( row for row in matching_rows
if picklist.matches_manifest_row(row) )
# return only the internal filenames!
for row in matching_rows:
yield row
*/
Ok(Manifest { records })
}
}

Expand Down Expand Up @@ -567,6 +543,19 @@ mod test {
selection.set_scaled(100);
let scaled100 = manifest.select(&selection).unwrap();
assert_eq!(scaled100.len(), 6);

// check that 'scaled' is updated
let manifest = collection.manifest().clone();
selection = Selection::default();
selection.set_scaled(400);
let scaled400 = manifest.select(&selection).unwrap();
assert_eq!(scaled400.len(), 6);
let max_scaled = scaled400
.iter()
.map(|r| r.scaled())
.max()
.expect("no records?!");
assert_eq!(*max_scaled, 400);
}

#[test]
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ impl Selection {
abund: Some(row.with_abundance()),
moltype: Some(row.moltype()),
num: None,
scaled: None,
scaled: Some(*row.scaled()),
containment: None,
picklist: None,
})
Expand Down

0 comments on commit 97e7808

Please sign in to comment.