From f5540d264631e725deb5de76a5b1ccce7e740f2c Mon Sep 17 00:00:00 2001 From: Cornelius Roemer Date: Mon, 23 Jan 2023 22:36:02 +0100 Subject: [PATCH] fix(suggestions): Replace wrong Jaro-Winkler Implementation of Jaro-Winkler similarity in the dguo/strsim-rs crate is wrong, causing strings with common prefix >=10 to all be considered perfect matches Using Jaro instead from the same crate fixes this issue Benefit of favoring long prefixes exists for matching common names But not for typo detection Hence use of Jaro instead of Jaro-Winkler is acceptable Confidence threshold adjusted so that `bar` is still suggested for `baz` since Jaro is strictly < Jaro-Winkler such an adjustment is expected. This is acceptable. While exact suggestions may change, the net change will be positive Suggestions are purely decorative and should thus not breaking change Fixes #4660 Also see https://github.com/dguo/strsim-rs/issues/53 --- src/parser/features/suggestions.rs | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/src/parser/features/suggestions.rs b/src/parser/features/suggestions.rs index b64fcce3971..b8bb7adfa15 100644 --- a/src/parser/features/suggestions.rs +++ b/src/parser/features/suggestions.rs @@ -4,10 +4,9 @@ use std::cmp::Ordering; // Internal use crate::builder::Command; -/// Produces multiple strings from a given list of possible values which are similar -/// to the passed in value `v` within a certain confidence by least confidence. -/// Thus in a list of possible values like ["foo", "bar"], the value "fop" will yield -/// `Some("foo")`, whereas "blark" would yield `None`. +/// Find strings from an iterable of `possible_values` similar to a given value `v` +/// Returns a Vec of all possible values that exceed a similarity threshold +/// sorted by ascending similarity, most similar comes last #[cfg(feature = "suggestions")] pub(crate) fn did_you_mean(v: &str, possible_values: I) -> Vec where @@ -16,8 +15,11 @@ where { let mut candidates: Vec<(f64, String)> = possible_values .into_iter() - .map(|pv| (strsim::jaro_winkler(v, pv.as_ref()), pv.as_ref().to_owned())) - .filter(|(confidence, _)| *confidence > 0.8) + // GH #4660: using `jaro` because `jaro_winkler` implementation in `strsim-rs` is wrong + // causing strings with common prefix >=10 to be considered perfectly similar + .map(|pv| (strsim::jaro(v, pv.as_ref()), pv.as_ref().to_owned())) + // Confidence of 0.7 so that bar -> baz is suggested + .filter(|(confidence, _)| *confidence > 0.7) .collect(); candidates.sort_by(|a, b| a.0.partial_cmp(&b.0).unwrap_or(Ordering::Equal)); candidates.into_iter().map(|(_, pv)| pv).collect() @@ -112,6 +114,15 @@ mod test { ); } + #[test] + fn best_fit_long_common_prefix_issue_4660() { + let p_vals = ["alignmentScore", "alignmentStart"]; + assert_eq!( + did_you_mean("alignmentScorr", p_vals.iter()), + vec!["alignmentStart", "alignmentScore"] + ); + } + #[test] fn flag_missing_letter() { let p_vals = ["test", "possible", "values"];