Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MRG: standardize on u32 for scaled, and introduce ScaledType #3364

Merged
merged 22 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 6 additions & 4 deletions include/sourmash.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,8 @@ typedef struct SourmashSignature SourmashSignature;

typedef struct SourmashZipStorage SourmashZipStorage;

typedef uint32_t ScaledType;

/**
* Represents a string.
*/
Expand Down Expand Up @@ -104,7 +106,7 @@ uint32_t computeparams_num_hashes(const SourmashComputeParameters *ptr);

bool computeparams_protein(const SourmashComputeParameters *ptr);

uint64_t computeparams_scaled(const SourmashComputeParameters *ptr);
ScaledType computeparams_scaled(const SourmashComputeParameters *ptr);

uint64_t computeparams_seed(const SourmashComputeParameters *ptr);

Expand All @@ -122,7 +124,7 @@ void computeparams_set_num_hashes(SourmashComputeParameters *ptr, uint32_t num);

void computeparams_set_protein(SourmashComputeParameters *ptr, bool v);

void computeparams_set_scaled(SourmashComputeParameters *ptr, uint64_t scaled);
void computeparams_set_scaled(SourmashComputeParameters *ptr, uint32_t scaled);

void computeparams_set_seed(SourmashComputeParameters *ptr, uint64_t new_seed);

Expand Down Expand Up @@ -230,7 +232,7 @@ SourmashStr kmerminhash_md5sum(const SourmashKmerMinHash *ptr);

void kmerminhash_merge(SourmashKmerMinHash *ptr, const SourmashKmerMinHash *other);

SourmashKmerMinHash *kmerminhash_new(uint64_t scaled,
SourmashKmerMinHash *kmerminhash_new(uint32_t scaled,
uint32_t k,
HashFunctions hash_function,
uint64_t seed,
Expand Down Expand Up @@ -342,7 +344,7 @@ SourmashRevIndex *revindex_new_with_sigs(const SourmashSignature *const *search_
const SourmashKmerMinHash *const *queries_ptr,
uintptr_t inqueries);

uint64_t revindex_scaled(const SourmashRevIndex *ptr);
ScaledType revindex_scaled(const SourmashRevIndex *ptr);

const SourmashSearchResult *const *revindex_search(const SourmashRevIndex *ptr,
const SourmashSignature *sig_ptr,
Expand Down
2 changes: 1 addition & 1 deletion src/core/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "sourmash"
version = "0.16.0"
version = "0.17.0"
authors = ["Luiz Irber <luiz@sourmash.bio>", "N. Tessa Pierce-Ward <tessa@sourmash.bio>"]
description = "tools for comparing biological sequences with k-mer sketches"
repository = "https://github.com/sourmash-bio/sourmash"
Expand Down
4 changes: 2 additions & 2 deletions src/core/src/ani_utils.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
use roots::{find_root_brent, SimpleConvergency};
use statrs::distribution::{ContinuousCDF, Normal};

use crate::Error;
use crate::{Error, ScaledType};

fn exp_n_mutated(l: f64, k: f64, r1: f64) -> f64 {
let q = r1_to_q(k, r1);
Expand Down Expand Up @@ -93,7 +93,7 @@ pub fn ani_from_containment(containment: f64, ksize: f64) -> f64 {
pub fn ani_ci_from_containment(
containment: f64,
ksize: f64,
scaled: u64,
scaled: ScaledType,
n_unique_kmers: u64,
confidence: Option<f64>,
) -> Result<(f64, f64), Error> {
Expand Down
4 changes: 2 additions & 2 deletions src/core/src/cmd.rs
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ pub struct ComputeParameters {
singleton: bool,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = 0u64)]
scaled: u64,
#[builder(default = 0u32)]
scaled: u32,

#[getset(get_copy = "pub", set = "pub")]
#[builder(default = false)]
Expand Down
5 changes: 3 additions & 2 deletions src/core/src/ffi/cmd/compute.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use std::slice;

use crate::cmd::ComputeParameters;
use crate::ScaledType;

use crate::ffi::utils::ForeignObject;

Expand Down Expand Up @@ -155,15 +156,15 @@ pub unsafe extern "C" fn computeparams_set_num_hashes(
}

#[no_mangle]
pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> u64 {
pub unsafe extern "C" fn computeparams_scaled(ptr: *const SourmashComputeParameters) -> ScaledType {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

k! was wondering where to do the conversion ;)

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ScaledType is just an alias, so it gets resolved to u32 in include/sourmash.h. So it doesn't change much now, but easier to play with different ScaledType values in the future =]

let cp = SourmashComputeParameters::as_rust(ptr);
cp.scaled()
}

#[no_mangle]
pub unsafe extern "C" fn computeparams_set_scaled(
ptr: *mut SourmashComputeParameters,
scaled: u64,
scaled: u32,
) {
let cp = SourmashComputeParameters::as_rust_mut(ptr);
cp.set_scaled(scaled);
Expand Down
7 changes: 4 additions & 3 deletions src/core/src/ffi/index/revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
use crate::signature::{Signature, SigsTrait};
use crate::sketch::minhash::KmerMinHash;
use crate::sketch::Sketch;
use crate::ScaledType;

pub struct SourmashRevIndex;

Expand All @@ -22,8 +23,8 @@
// TODO: remove this when it is possible to pass Selection thru the FFI
fn from_template(template: &Sketch) -> Selection {
let (num, scaled) = match template {
Sketch::MinHash(mh) => (mh.num(), mh.scaled() as u32),
Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled() as u32),
Sketch::MinHash(mh) => (mh.num(), mh.scaled()),
Sketch::LargeMinHash(mh) => (mh.num(), mh.scaled()),
_ => unimplemented!(),
};

Expand Down Expand Up @@ -233,7 +234,7 @@
}

#[no_mangle]
pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> u64 {
pub unsafe extern "C" fn revindex_scaled(ptr: *const SourmashRevIndex) -> ScaledType {

Check warning on line 237 in src/core/src/ffi/index/revindex.rs

View check run for this annotation

Codecov / codecov/patch

src/core/src/ffi/index/revindex.rs#L237

Added line #L237 was not covered by tests
let revindex = SourmashRevIndex::as_rust(ptr);
if let Sketch::MinHash(mh) = revindex.template() {
mh.scaled()
Expand Down
2 changes: 1 addition & 1 deletion src/core/src/ffi/minhash.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ impl ForeignObject for SourmashKmerMinHash {

#[no_mangle]
pub unsafe extern "C" fn kmerminhash_new(
scaled: u64,
scaled: u32,
k: u32,
hash_function: HashFunctions,
seed: u64,
Expand Down
9 changes: 5 additions & 4 deletions src/core/src/index/linear.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,8 @@ impl LinearIndex {
.internal_location()
.into();
let match_sig = self.collection.sig_for_dataset(dataset_id)?;
let result = self.stats_for_match(match_sig, query, match_size, match_path, round)?;
let result =
self.stats_for_match(match_sig, query, match_size, match_path, round as u32)?;
Ok(result)
}

Expand All @@ -161,7 +162,7 @@ impl LinearIndex {
query: &KmerMinHash,
match_size: usize,
match_path: PathBuf,
gather_result_rank: usize,
gather_result_rank: u32,
) -> Result<GatherResult> {
let template = self.template();

Expand All @@ -176,10 +177,10 @@ impl LinearIndex {
let f_match = match_size as f64 / match_mh.size() as f64;
let filename = match_path.into_string();
let name = match_sig.name();
let unique_intersect_bp = match_mh.scaled() as usize * match_size;
let unique_intersect_bp = (match_mh.scaled() as usize * match_size) as u64;

let (intersect_orig, _) = match_mh.intersection_size(query)?;
let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
let intersect_bp: u64 = match_mh.scaled() as u64 * intersect_orig;

let f_unique_to_query = intersect_orig as f64 / query.size() as f64;
let match_ = match_sig;
Expand Down
29 changes: 15 additions & 14 deletions src/core/src/index/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ use crate::Result;
#[derive(TypedBuilder, CopyGetters, Getters, Setters, Serialize, Deserialize, Debug, PartialEq)]
pub struct GatherResult {
#[getset(get_copy = "pub")]
intersect_bp: usize,
intersect_bp: u64,

#[getset(get_copy = "pub")]
f_orig_query: f64,
Expand Down Expand Up @@ -72,22 +72,22 @@ pub struct GatherResult {
f_match_orig: f64,

#[getset(get_copy = "pub")]
unique_intersect_bp: usize,
unique_intersect_bp: u64,

#[getset(get_copy = "pub")]
gather_result_rank: usize,
gather_result_rank: u32,

#[getset(get_copy = "pub")]
remaining_bp: usize,
remaining_bp: u64,

#[getset(get_copy = "pub")]
n_unique_weighted_found: usize,
n_unique_weighted_found: u64,

#[getset(get_copy = "pub")]
total_weighted_hashes: usize,
total_weighted_hashes: u64,

#[getset(get_copy = "pub")]
sum_weighted_found: usize,
sum_weighted_found: u64,

#[getset(get_copy = "pub")]
query_containment_ani: f64,
Expand Down Expand Up @@ -212,9 +212,9 @@ pub fn calculate_gather_stats(
remaining_query: KmerMinHash,
match_sig: SigStore,
match_size: usize,
gather_result_rank: usize,
sum_weighted_found: usize,
total_weighted_hashes: usize,
gather_result_rank: u32,
sum_weighted_found: u64,
total_weighted_hashes: u64,
calc_abund_stats: bool,
calc_ani_ci: bool,
confidence: Option<f64>,
Expand Down Expand Up @@ -242,17 +242,18 @@ pub fn calculate_gather_stats(
trace!("query.size: {}", remaining_query.size());

//bp remaining in subtracted query
let remaining_bp = (remaining_query.size() - isect_size) * remaining_query.scaled() as usize;
let remaining_bp =
(remaining_query.size() - isect_size) as u64 * remaining_query.scaled() as u64;

// stats for this match vs original query
let (intersect_orig, _) = match_mh.intersection_size(orig_query).unwrap();
let intersect_bp = (match_mh.scaled() * intersect_orig) as usize;
let intersect_bp = match_mh.scaled() as u64 * intersect_orig;
let f_orig_query = intersect_orig as f64 / orig_query.size() as f64;
let f_match_orig = intersect_orig as f64 / match_mh.size() as f64;

// stats for this match vs current (subtracted) query
let f_match = match_size as f64 / match_mh.size() as f64;
let unique_intersect_bp = match_mh.scaled() as usize * isect_size;
let unique_intersect_bp = match_mh.scaled() as u64 * isect_size as u64;
let f_unique_to_query = isect_size as f64 / orig_query.size() as f64;

// // get ANI values
Expand Down Expand Up @@ -309,7 +310,7 @@ pub fn calculate_gather_stats(
}
};

n_unique_weighted_found = unique_weighted_found as usize;
n_unique_weighted_found = unique_weighted_found;
sum_total_weighted_found = sum_weighted_found + n_unique_weighted_found;
f_unique_weighted = n_unique_weighted_found as f64 / total_weighted_hashes as f64;

Expand Down
4 changes: 2 additions & 2 deletions src/core/src/index/revindex/disk_revindex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -411,7 +411,7 @@ impl RevIndexOps for RevIndex {
let query_mh = KmerMinHash::from(query.clone());

// just calculate essentials here
let gather_result_rank = matches.len();
let gather_result_rank = matches.len() as u32;

// grab the specific intersection:
// Calculate stats
Expand All @@ -422,7 +422,7 @@ impl RevIndexOps for RevIndex {
match_size,
gather_result_rank,
sum_weighted_found,
total_weighted_hashes.try_into().unwrap(),
total_weighted_hashes,
calc_abund_stats,
calc_ani_ci,
ani_confidence_interval_fraction,
Expand Down
1 change: 1 addition & 0 deletions src/core/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ cfg_if! {
}

type HashIntoType = u64;
pub type ScaledType = u32;

pub fn _hash_murmur(kmer: &[u8], seed: u64) -> u64 {
murmurhash3_x64_128(kmer, seed).0
Expand Down
6 changes: 3 additions & 3 deletions src/core/src/manifest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ use crate::encodings::HashFunctions;
use crate::prelude::*;
use crate::signature::SigsTrait;
use crate::sketch::Sketch;
use crate::Result;
use crate::{Result, ScaledType};

/// Individual manifest record, containing information about sketches.

Expand All @@ -38,7 +38,7 @@ pub struct Record {
num: u32,

#[getset(get = "pub")]
scaled: u64,
scaled: ScaledType,

#[getset(get = "pub")]
n_hashes: usize,
Expand Down Expand Up @@ -279,7 +279,7 @@ impl Select for Manifest {
};
valid = if let Some(scaled) = selection.scaled() {
// num sigs have row.scaled = 0, don't include them
valid && row.scaled != 0 && row.scaled <= scaled as u64
valid && row.scaled != 0 && row.scaled <= scaled
} else {
valid
};
Expand Down
8 changes: 4 additions & 4 deletions src/core/src/selection.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ use typed_builder::TypedBuilder;

use crate::encodings::HashFunctions;
use crate::manifest::Record;
use crate::Result;
use crate::{Result, ScaledType};

#[derive(Default, Debug, TypedBuilder, Clone)]
pub struct Selection {
Expand All @@ -17,7 +17,7 @@ pub struct Selection {
num: Option<u32>,

#[builder(default, setter(strip_option))]
scaled: Option<u32>,
scaled: Option<ScaledType>,

#[builder(default, setter(strip_option))]
containment: Option<bool>,
Expand Down Expand Up @@ -87,11 +87,11 @@ impl Selection {
self.num = Some(num);
}

pub fn scaled(&self) -> Option<u32> {
pub fn scaled(&self) -> Option<ScaledType> {
self.scaled
}

pub fn set_scaled(&mut self, scaled: u32) {
pub fn set_scaled(&mut self, scaled: ScaledType) {
self.scaled = Some(scaled);
}

Expand Down
6 changes: 3 additions & 3 deletions src/core/src/signature.rs
Original file line number Diff line number Diff line change
Expand Up @@ -812,7 +812,7 @@ impl Select for Signature {
// keep compatible scaled if applicable
valid = if let Some(sel_scaled) = selection.scaled() {
match s {
Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled as u64,
Sketch::MinHash(mh) => valid && mh.scaled() <= sel_scaled,
// TODO: test LargeMinHash
// Sketch::LargeMinHash(lmh) => valid && lmh.scaled() <= sel_scaled as u64,
_ => valid, // other sketch types or invalid cases
Expand Down Expand Up @@ -841,8 +841,8 @@ impl Select for Signature {
for sketch in self.signatures.iter_mut() {
// TODO: also account for LargeMinHash
if let Sketch::MinHash(mh) = sketch {
if (mh.scaled() as u32) < sel_scaled {
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled as u64)?);
if mh.scaled() < sel_scaled {
*sketch = Sketch::MinHash(mh.clone().downsample_scaled(sel_scaled)?);
}
}
}
Expand Down
Loading
Loading