@@ -46,6 +46,62 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
46
46
( dcol[ m] <= limit) . then_some ( dcol[ m] )
47
47
}
48
48
49
+ /// Provides a word similarity score between two words that accounts for substrings being more
50
+ /// meaningful than a typical Levenshtein distance. The lower the score, the closer the match.
51
+ /// 0 is an identical match.
52
+ ///
53
+ /// Uses the Levenshtein distance between the two strings and removes the cost of the length
54
+ /// difference. If this is 0 then it is either a substring match or a full word match, in the
55
+ /// substring match case we detect this and return `1`. To prevent finding meaningless substrings,
56
+ /// eg. "in" in "shrink", we only perform this subtraction of length difference if one of the words
57
+ /// is not greater than twice the length of the other. For cases where the words are close in size
58
+ /// but not an exact substring then the cost of the length difference is discounted by half.
59
+ ///
60
+ /// Returns `None` if the distance exceeds the limit.
61
+ pub fn lev_distance_with_substrings ( a : & str , b : & str , limit : usize ) -> Option < usize > {
62
+ let n = a. chars ( ) . count ( ) ;
63
+ let m = b. chars ( ) . count ( ) ;
64
+
65
+ // Check one isn't less than half the length of the other. If this is true then there is a
66
+ // big difference in length.
67
+ let big_len_diff = ( n * 2 ) < m || ( m * 2 ) < n;
68
+ let len_diff = if n < m { m - n } else { n - m } ;
69
+ let lev = lev_distance ( a, b, limit + len_diff) ?;
70
+
71
+ // This is the crux, subtracting length difference means exact substring matches will now be 0
72
+ let score = lev - len_diff;
73
+
74
+ // If the score is 0 but the words have different lengths then it's a substring match not a full
75
+ // word match
76
+ let score = if score == 0 && len_diff > 0 && !big_len_diff {
77
+ 1 // Exact substring match, but not a total word match so return non-zero
78
+ } else if !big_len_diff {
79
+ // Not a big difference in length, discount cost of length difference
80
+ score + ( len_diff + 1 ) / 2
81
+ } else {
82
+ // A big difference in length, add back the difference in length to the score
83
+ score + len_diff
84
+ } ;
85
+
86
+ ( score <= limit) . then_some ( score)
87
+ }
88
+
89
+ /// Finds the best match for given word in the given iterator where substrings are meaningful.
90
+ ///
91
+ /// A version of [`find_best_match_for_name`] that uses [`lev_distance_with_substrings`] as the score
92
+ /// for word similarity. This takes an optional distance limit which defaults to one-third of the
93
+ /// given word.
94
+ ///
95
+ /// Besides the modified Levenshtein, we use case insensitive comparison to improve accuracy
96
+ /// on an edge case with a lower(upper)case letters mismatch.
97
+ pub fn find_best_match_for_name_with_substrings (
98
+ candidates : & [ Symbol ] ,
99
+ lookup : Symbol ,
100
+ dist : Option < usize > ,
101
+ ) -> Option < Symbol > {
102
+ find_best_match_for_name_impl ( true , candidates, lookup, dist)
103
+ }
104
+
49
105
/// Finds the best match for a given word in the given iterator.
50
106
///
51
107
/// As a loose rule to avoid the obviously incorrect suggestions, it takes
@@ -54,11 +110,20 @@ pub fn lev_distance(a: &str, b: &str, limit: usize) -> Option<usize> {
54
110
///
55
111
/// Besides Levenshtein, we use case insensitive comparison to improve accuracy
56
112
/// on an edge case with a lower(upper)case letters mismatch.
57
- #[ cold]
58
113
pub fn find_best_match_for_name (
59
114
candidates : & [ Symbol ] ,
60
115
lookup : Symbol ,
61
116
dist : Option < usize > ,
117
+ ) -> Option < Symbol > {
118
+ find_best_match_for_name_impl ( false , candidates, lookup, dist)
119
+ }
120
+
121
+ #[ cold]
122
+ fn find_best_match_for_name_impl (
123
+ use_substring_score : bool ,
124
+ candidates : & [ Symbol ] ,
125
+ lookup : Symbol ,
126
+ dist : Option < usize > ,
62
127
) -> Option < Symbol > {
63
128
let lookup = lookup. as_str ( ) ;
64
129
let lookup_uppercase = lookup. to_uppercase ( ) ;
@@ -74,7 +139,11 @@ pub fn find_best_match_for_name(
74
139
let mut dist = dist. unwrap_or_else ( || cmp:: max ( lookup. len ( ) , 3 ) / 3 ) ;
75
140
let mut best = None ;
76
141
for c in candidates {
77
- match lev_distance ( lookup, c. as_str ( ) , dist) {
142
+ match if use_substring_score {
143
+ lev_distance_with_substrings ( lookup, c. as_str ( ) , dist)
144
+ } else {
145
+ lev_distance ( lookup, c. as_str ( ) , dist)
146
+ } {
78
147
Some ( 0 ) => return Some ( * c) ,
79
148
Some ( d) => {
80
149
dist = d - 1 ;
0 commit comments