From 1d2d6b89b08b6a5c74993ea8b6c1aaae3b79fc68 Mon Sep 17 00:00:00 2001 From: Max Bachmann Date: Sun, 31 Dec 2023 23:08:25 +0100 Subject: [PATCH] only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 --- CHANGELOG.md | 2 ++ src/lib.rs | 22 +++++++++++++--------- 2 files changed, 15 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index de12bd4..d65999b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,6 +17,8 @@ This project attempts to adhere to [Semantic Versioning](http://semver.org). - reduce runtime in our own benchmark by more than `70%` - reduce binary size by more than `25%` +- only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 + ### Fixed - Fix transposition counting in Jaro and Jaro-Winkler. diff --git a/src/lib.rs b/src/lib.rs index 0d0c78c..f761e3e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -179,16 +179,20 @@ where &'b Iter2: IntoIterator, Elem1: PartialEq, { - let jaro_distance = generic_jaro(a, b); + let sim = generic_jaro(a, b); - let prefix_length = a - .into_iter() - .take(4) - .zip(b.into_iter()) - .take_while(|(a_elem, b_elem)| a_elem == b_elem) - .count(); + if sim > 0.7 { + let prefix_length = a + .into_iter() + .take(4) + .zip(b.into_iter()) + .take_while(|(a_elem, b_elem)| a_elem == b_elem) + .count(); - return jaro_distance + 0.1 * prefix_length as f64 * (1.0 - jaro_distance); + sim + 0.1 * prefix_length as f64 * (1.0 - sim) + } else { + sim + } } /// Like Jaro but gives a boost to strings that have a common prefix. @@ -918,7 +922,7 @@ mod tests { #[test] fn jaro_winkler_names() { - assert!((0.562 - jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre")).abs() < 0.001); + assert!((0.452 - jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre")).abs() < 0.001); } #[test]