1#![forbid(unsafe_code)]
4#![allow(
5 clippy::cast_possible_wrap,
9 clippy::cast_sign_loss,
10 clippy::cast_precision_loss,
11 clippy::needless_pass_by_value,
13 clippy::similar_names,
14 clippy::missing_errors_doc,
16 clippy::missing_panics_doc,
17 clippy::must_use_candidate,
18 clippy::range_plus_one
20)]
21
22use std::char;
23use std::cmp::{max, min};
24use std::collections::HashMap;
25use std::convert::TryFrom;
26use std::error::Error;
27use std::fmt::{self, Display, Formatter};
28use std::hash::Hash;
29use std::mem;
30use std::str::Chars;
31
/// Errors that can be returned by the distance functions in this crate.
#[derive(Debug, PartialEq)]
pub enum StrSimError {
    /// The requested metric requires both arguments to have the same length.
    DifferentLengthArgs,
}
36
37impl Display for StrSimError {
38 fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> {
39 let text = match self {
40 StrSimError::DifferentLengthArgs => "Differing length arguments provided",
41 };
42
43 write!(fmt, "{}", text)
44 }
45}
46
// Marker impl: the Debug + Display impls above satisfy everything `Error` needs.
impl Error for StrSimError {}
48
/// Result of a Hamming distance computation; fails when the inputs differ in length.
pub type HammingResult = Result<usize, StrSimError>;
50
51pub fn generic_hamming<Iter1, Iter2, Elem1, Elem2>(a: Iter1, b: Iter2) -> HammingResult
54where
55 Iter1: IntoIterator<Item = Elem1>,
56 Iter2: IntoIterator<Item = Elem2>,
57 Elem1: PartialEq<Elem2>,
58{
59 let (mut ita, mut itb) = (a.into_iter(), b.into_iter());
60 let mut count = 0;
61 loop {
62 match (ita.next(), itb.next()) {
63 (Some(x), Some(y)) => {
64 if x != y {
65 count += 1;
66 }
67 }
68 (None, None) => return Ok(count),
69 _ => return Err(StrSimError::DifferentLengthArgs),
70 }
71 }
72}
73
74pub fn hamming(a: &str, b: &str) -> HammingResult {
85 generic_hamming(a.chars(), b.chars())
86}
87
/// Jaro similarity between two generic sequences, in the range `[0, 1]`
/// (1.0 = identical, 0.0 = no matching elements).
///
/// The sequences are iterated multiple times, hence the
/// `&Self: IntoIterator` bounds rather than consuming iterators.
pub fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let a_len = a.into_iter().count();
    let b_len = b.into_iter().count();

    // Both empty: identical. Exactly one empty: nothing can match.
    if a_len == 0 && b_len == 0 {
        return 1.0;
    } else if a_len == 0 || b_len == 0 {
        return 0.0;
    }

    // Elements only "match" when within floor(max_len / 2) - 1 positions
    // of each other (saturating at 0).
    let mut search_range = max(a_len, b_len) / 2;
    search_range = search_range.saturating_sub(1);

    // Single allocation split into the two flag slices; a_flags[i] /
    // b_flags[j] record whether that element has been matched.
    let mut flags_memory = vec![false; a_len + b_len];
    let (a_flags, b_flags) = flags_memory.split_at_mut(a_len);

    let mut matches = 0_usize;

    for (i, a_elem) in a.into_iter().enumerate() {
        // Comparison window [min_bound, max_bound) clamped to b's bounds.
        let min_bound = if i > search_range {
            i - search_range
        } else {
            0
        };

        let max_bound = min(b_len, i + search_range + 1);

        for (j, b_elem) in b.into_iter().enumerate().take(max_bound) {
            // Each element of b may be consumed by at most one element of a.
            if min_bound <= j && a_elem == b_elem && !b_flags[j] {
                a_flags[i] = true;
                b_flags[j] = true;
                matches += 1;
                break;
            }
        }
    }

    // Walk the matched elements of a and b in order; every matched pair
    // that disagrees in value contributes to the transposition count.
    // Each swapped pair is seen twice, hence the division by 2 below.
    let mut transpositions = 0_usize;
    if matches != 0 {
        let mut b_iter = b_flags.iter().zip(b);
        for (a_flag, ch1) in a_flags.iter().zip(a) {
            if *a_flag {
                // Advance to b's next matched element. This cannot run off
                // the end: a and b carry the same number of set flags
                // (both are set together when `matches` is incremented).
                loop {
                    if let Some((b_flag, ch2)) = b_iter.next() {
                        if !*b_flag {
                            continue;
                        }

                        if ch1 != ch2 {
                            transpositions += 1;
                        }
                        break;
                    }
                }
            }
        }
    }
    transpositions /= 2;

    if matches == 0 {
        0.0
    } else {
        // Standard Jaro formula: mean of the two match ratios and the
        // in-order match ratio.
        ((matches as f64 / a_len as f64)
            + (matches as f64 / b_len as f64)
            + ((matches - transpositions) as f64 / matches as f64))
            / 3.0
    }
}
165
/// Newtype over `&str` so strings can be fed to the `generic_*` functions,
/// which iterate through `&Self: IntoIterator`.
struct StringWrapper<'a>(&'a str);
167
// Iterating a borrowed StringWrapper yields the chars of the wrapped
// string; the iterator borrows the string ('b), not the wrapper ('a).
impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> {
    type Item = char;
    type IntoIter = Chars<'b>;

    fn into_iter(self) -> Self::IntoIter {
        self.0.chars()
    }
}
176
177pub fn jaro(a: &str, b: &str) -> f64 {
187 generic_jaro(&StringWrapper(a), &StringWrapper(b))
188}
189
190pub fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64
192where
193 &'a Iter1: IntoIterator<Item = Elem1>,
194 &'b Iter2: IntoIterator<Item = Elem2>,
195 Elem1: PartialEq<Elem2>,
196{
197 let sim = generic_jaro(a, b);
198
199 if sim > 0.7 {
200 let prefix_length = a
201 .into_iter()
202 .take(4)
203 .zip(b)
204 .take_while(|(a_elem, b_elem)| a_elem == b_elem)
205 .count();
206
207 sim + 0.1 * prefix_length as f64 * (1.0 - sim)
208 } else {
209 sim
210 }
211}
212
213pub fn jaro_winkler(a: &str, b: &str) -> f64 {
222 generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b))
223}
224
/// Levenshtein (edit) distance between two generic sequences, computed
/// with a single rolling row (O(|b|) memory).
pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize
where
    &'a Iter1: IntoIterator<Item = Elem1>,
    &'b Iter2: IntoIterator<Item = Elem2>,
    Elem1: PartialEq<Elem2>,
{
    let b_len = b.into_iter().count();

    // cache[j] = distance between the processed prefix of `a` and b[..=j];
    // initialised to the first DP row 1..=b_len.
    let mut cache: Vec<usize> = (1..b_len + 1).collect();

    // If `a` is empty the loop never runs and the answer is b_len.
    let mut result = b_len;

    for (i, a_elem) in a.into_iter().enumerate() {
        // Column 0 of row i+1: delete all i+1 processed elements of `a`.
        result = i + 1;
        let mut distance_b = i;

        for (j, b_elem) in b.into_iter().enumerate() {
            // 0 when the elements agree, 1 otherwise (substitution cost).
            let cost = usize::from(a_elem != b_elem);
            // distance_b holds the diagonal (previous row, column j-1) here.
            let distance_a = distance_b + cost;
            distance_b = cache[j];
            // min of: left + 1 (insertion), diagonal + cost (substitution),
            // above + 1 (deletion).
            result = min(result + 1, min(distance_a, distance_b + 1));
            cache[j] = result;
        }
    }

    result
}
260
261pub fn levenshtein(a: &str, b: &str) -> usize {
270 generic_levenshtein(&StringWrapper(a), &StringWrapper(b))
271}
272
273pub fn normalized_levenshtein(a: &str, b: &str) -> f64 {
286 if a.is_empty() && b.is_empty() {
287 return 1.0;
288 }
289 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64)
290}
291
/// Optimal-string-alignment distance: Levenshtein plus transposition of
/// two adjacent chars, with the restriction that no substring is edited
/// more than once.
pub fn osa_distance(a: &str, b: &str) -> usize {
    let b_len = b.chars().count();
    // Three rolling DP rows: two rows back (for transpositions), the
    // previous row, and the row under construction.
    let mut row_two_back: Vec<usize> = (0..b_len + 1).collect();
    let mut row_back: Vec<usize> = (0..b_len + 1).collect();
    let mut row: Vec<usize> = vec![0; b_len + 1];

    // Sentinels that cannot equal any real char on the first pass.
    let mut a_prev = char::MAX;
    let mut b_prev = char::MAX;

    for (i, a_ch) in a.chars().enumerate() {
        row[0] = i + 1;

        for (j, b_ch) in b.chars().enumerate() {
            let substitution = row_back[j] + usize::from(a_ch != b_ch);
            let insertion = row[j] + 1;
            let deletion = row_back[j + 1] + 1;
            let mut best = min(substitution, min(insertion, deletion));

            // Adjacent transposition: current pair is the previous pair swapped.
            let transposable =
                i > 0 && j > 0 && a_ch != b_ch && a_ch == b_prev && b_ch == a_prev;
            if transposable {
                best = min(best, row_two_back[j - 1] + 1);
            }
            row[j + 1] = best;

            b_prev = b_ch;
        }

        // Rotate the rows: back -> two back, current -> back.
        mem::swap(&mut row_two_back, &mut row_back);
        mem::swap(&mut row_back, &mut row);
        a_prev = a_ch;
    }

    // After the final rotation the answer sits in row_back.
    row_back[b_len]
}
338
/// Maps 2-D coordinates (column `i`, row `j`) onto a flat row-major
/// buffer whose rows are `width` elements wide.
fn flat_index(i: usize, j: usize, width: usize) -> usize {
    i + j * width
}
344
/// Unrestricted Damerau-Levenshtein distance between two slices:
/// insertions, deletions, substitutions, and transpositions of any two
/// (not necessarily adjacent in the edited result) matching elements.
pub fn generic_damerau_levenshtein<Elem>(a_elems: &[Elem], b_elems: &[Elem]) -> usize
where
    Elem: Eq + Hash + Clone,
{
    let a_len = a_elems.len();
    let b_len = b_elems.len();

    // An empty side means the distance is just the other side's length.
    if a_len == 0 {
        return b_len;
    }
    if b_len == 0 {
        return a_len;
    }

    // Full (a_len + 2) x (b_len + 2) matrix, flattened row-major with
    // `idx` (column i, row j). The extra border rows/columns hold the
    // `max_distance` sentinel used by the transposition formula.
    let width = a_len + 2;
    let idx = |i: usize, j: usize| j * width + i;

    let max_distance = a_len + b_len;
    let mut matrix = vec![0_usize; width * (b_len + 2)];
    matrix[idx(0, 0)] = max_distance;

    for i in 0..(a_len + 1) {
        matrix[idx(i + 1, 0)] = max_distance;
        matrix[idx(i + 1, 1)] = i;
    }
    for j in 0..(b_len + 1) {
        matrix[idx(0, j + 1)] = max_distance;
        matrix[idx(1, j + 1)] = j;
    }

    // last_row[elem] = 1-based index of the last row of `a` containing elem.
    let mut last_row: HashMap<Elem, usize> = HashMap::with_capacity(64);

    for i in 1..(a_len + 1) {
        // 1-based column of the last match within the current row; 0 = none.
        let mut last_match_col = 0;

        for j in 1..(b_len + 1) {
            let last_match_row = match last_row.get(&b_elems[j - 1]) {
                Some(&row) => row,
                None => 0,
            };

            let insertion = matrix[idx(i, j + 1)] + 1;
            let deletion = matrix[idx(i + 1, j)] + 1;
            // Transpose around the most recent (row, col) match pair; the
            // two subtractions count the elements skipped over in between.
            let transposition = matrix[idx(last_match_row, last_match_col)]
                + (i - last_match_row - 1)
                + 1
                + (j - last_match_col - 1);

            let mut substitution = matrix[idx(i, j)] + 1;
            if a_elems[i - 1] == b_elems[j - 1] {
                last_match_col = j;
                substitution -= 1;
            }

            matrix[idx(i + 1, j + 1)] = min(
                substitution,
                min(insertion, min(deletion, transposition)),
            );
        }

        last_row.insert(a_elems[i - 1].clone(), i);
    }

    matrix[idx(a_len + 1, b_len + 1)]
}
415
/// Row index wrapper stored in `HybridGrowingHashmapChar` by
/// `damerau_levenshtein_impl`.
#[derive(Clone, Copy, PartialEq, Eq)]
struct RowId {
    // 1-based row index; -1 (the Default) means "character not seen yet".
    val: isize,
}
420
impl Default for RowId {
    /// -1 marks a character that has not occurred in any row yet.
    fn default() -> Self {
        Self { val: -1 }
    }
}
426
/// One slot of `GrowingHashmapChar`: the stored key plus its value.
/// A slot whose value equals `ValueType::default()` is treated as empty.
#[derive(Default, Clone)]
struct GrowingHashmapMapElemChar<ValueType> {
    key: u32,
    value: ValueType,
}
432
/// Specialized open-addressing hash map with `u32` keys, grown on demand.
///
/// NOTE(review): values equal to `ValueType::default()` are
/// indistinguishable from empty slots, so the map can only usefully
/// store non-default values.
struct GrowingHashmapChar<ValueType> {
    // Number of live (non-default) entries.
    used: i32,
    // Number of occupied slots, tracked for the load-factor check.
    fill: i32,
    // Capacity - 1 (capacity is a power of two); -1 means unallocated.
    mask: i32,
    // Backing storage; `None` until the first `get_mut`.
    map: Option<Vec<GrowingHashmapMapElemChar<ValueType>>>,
}
446
impl<ValueType> Default for GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq,
{
    /// Starts with no backing storage; `mask == -1` flags "unallocated".
    fn default() -> Self {
        Self {
            used: 0,
            fill: 0,
            mask: -1,
            map: None,
        }
    }
}
460
impl<ValueType> GrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Eq + Copy,
{
    /// Returns the value stored under `key`, or `ValueType::default()`
    /// when the map is unallocated (or the probe lands on an empty slot).
    fn get(&self, key: u32) -> ValueType {
        self.map
            .as_ref()
            .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value)
    }

    /// Returns a mutable reference to the slot for `key`, allocating
    /// and/or growing the table as needed. A slot whose value equals
    /// `ValueType::default()` counts as empty.
    fn get_mut(&mut self, key: u32) -> &mut ValueType {
        if self.map.is_none() {
            self.allocate();
        }

        let mut i = self.lookup(key);
        if self
            .map
            .as_ref()
            .expect("map should have been created above")[i]
            .value
            == Default::default()
        {
            // Occupying a fresh slot: grow once fill reaches 2/3 of
            // capacity, then re-probe because slots have moved.
            self.fill += 1;
            if self.fill * 3 >= (self.mask + 1) * 2 {
                self.grow((self.used + 1) * 2);
                i = self.lookup(key);
            }

            self.used += 1;
        }

        let elem = &mut self
            .map
            .as_mut()
            .expect("map should have been created above")[i];
        elem.key = key;
        &mut elem.value
    }

    /// Creates the initial 8-slot table (mask = capacity - 1).
    fn allocate(&mut self) {
        self.mask = 8 - 1;
        self.map = Some(vec![GrowingHashmapMapElemChar::default(); 8]);
    }

    /// Open-addressing probe (perturbation scheme as in CPython's dict):
    /// returns the index of the slot holding `key`, or the first empty
    /// slot on its probe chain. Callers must have allocated the map; the
    /// load-factor bound guarantees an empty slot exists, so this
    /// terminates.
    fn lookup(&self, key: u32) -> usize {
        let hash = key;
        let mut i = hash as usize & self.mask as usize;

        let map = self
            .map
            .as_ref()
            .expect("callers have to ensure map is allocated");

        if map[i].value == Default::default() || map[i].key == key {
            return i;
        }

        let mut perturb = key;
        loop {
            i = (i * 5 + perturb as usize + 1) & self.mask as usize;

            if map[i].value == Default::default() || map[i].key == key {
                return i;
            }

            perturb >>= 5;
        }
    }

    /// Rehashes every occupied slot into a table of at least `min_used`
    /// slots, rounded up to the next power of two.
    fn grow(&mut self, min_used: i32) {
        let mut new_size = self.mask + 1;
        while new_size <= min_used {
            new_size <<= 1;
        }

        self.fill = self.used;
        self.mask = new_size - 1;

        let old_map = std::mem::replace(
            self.map
                .as_mut()
                .expect("callers have to ensure map is allocated"),
            vec![GrowingHashmapMapElemChar::<ValueType>::default(); new_size as usize],
        );

        for elem in old_map {
            if elem.value != Default::default() {
                let j = self.lookup(elem.key);
                let new_elem = &mut self.map.as_mut().expect("map created above")[j];
                new_elem.key = elem.key;
                new_elem.value = elem.value;
                // `used` doubles as a countdown of entries left to move.
                self.used -= 1;
                if self.used == 0 {
                    break;
                }
            }
        }

        self.used = self.fill;
    }
}
566
/// Map from `char` to `ValueType` with a fast path: code points up to
/// U+00FF use a fixed 256-entry array, everything else falls back to
/// the growable hash map.
struct HybridGrowingHashmapChar<ValueType> {
    // Fallback storage for chars above U+00FF.
    map: GrowingHashmapChar<ValueType>,
    // Direct-indexed storage for the Latin-1 range.
    extended_ascii: [ValueType; 256],
}
571
572impl<ValueType> HybridGrowingHashmapChar<ValueType>
573where
574 ValueType: Default + Clone + Copy + Eq,
575{
576 fn get(&self, key: char) -> ValueType {
577 let value = key as u32;
578 if value <= 255 {
579 let val_u8 = u8::try_from(value).expect("we check the bounds above");
580 self.extended_ascii[usize::from(val_u8)]
581 } else {
582 self.map.get(value)
583 }
584 }
585
586 fn get_mut(&mut self, key: char) -> &mut ValueType {
587 let value = key as u32;
588 if value <= 255 {
589 let val_u8 = u8::try_from(value).expect("we check the bounds above");
590 &mut self.extended_ascii[usize::from(val_u8)]
591 } else {
592 self.map.get_mut(value)
593 }
594 }
595}
596
impl<ValueType> Default for HybridGrowingHashmapChar<ValueType>
where
    ValueType: Default + Clone + Copy + Eq,
{
    fn default() -> Self {
        HybridGrowingHashmapChar {
            map: GrowingHashmapChar::default(),
            // Fixed Latin-1 table, every entry default-initialised.
            extended_ascii: [Default::default(); 256],
        }
    }
}
608
/// Unrestricted Damerau-Levenshtein distance between two char iterators
/// whose lengths are supplied by the caller.
///
/// Rolling-row variant of the Lowrance-Wagner algorithm: two rows of
/// the DP table plus bookkeeping for the last row/column where each
/// character occurred. NOTE(review): deliberately index-heavy; the
/// statement order inside the loops is significant throughout.
fn damerau_levenshtein_impl<Iter1, Iter2>(s1: Iter1, len1: usize, s2: Iter2, len2: usize) -> usize
where
    Iter1: Iterator<Item = char> + Clone,
    Iter2: Iterator<Item = char> + Clone,
{
    // Sentinel strictly larger than any achievable distance.
    let max_val = max(len1, len2) as isize + 1;

    // last_row_id[ch] = 1-based row where `ch` last occurred in s1
    // (-1, via RowId::default, when it has not occurred yet).
    let mut last_row_id = HybridGrowingHashmapChar::<RowId>::default();

    let size = len2 + 2;
    // fr[j + 1]: DP value remembered when column j last held a match.
    let mut fr = vec![max_val; size];
    // r1: previous row; r: current row, seeded [max_val, 0, 1, 2, ...].
    let mut r1 = vec![max_val; size];
    let mut r: Vec<isize> = (max_val..max_val + 1)
        .chain(0..(size - 1) as isize)
        .collect();

    for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) {
        // The old current row becomes the previous row.
        mem::swap(&mut r, &mut r1);
        // Last 1-based column in this row where ch1 matched; -1 = none.
        let mut last_col_id: isize = -1;
        let mut last_i2l1 = r[1];
        r[1] = i as isize;
        let mut t = max_val;

        for (j, ch2) in s2.clone().enumerate().map(|(j, ch2)| (j + 1, ch2)) {
            let diag = r1[j] + isize::from(ch1 != ch2);
            let left = r[j] + 1;
            let up = r1[j + 1] + 1;
            let mut temp = min(diag, min(left, up));

            if ch1 == ch2 {
                // Record the match and stash the values a later
                // transposition in this row/column would restart from.
                last_col_id = j as isize;
                fr[j + 1] = r1[j - 1];
                t = last_i2l1;
            } else {
                // k: last row where ch2 appeared; l: last column where
                // ch1 matched — the candidate transposition partners.
                let k = last_row_id.get(ch2).val;
                let l = last_col_id;

                if j as isize - l == 1 {
                    let transpose = fr[j + 1] + (i as isize - k);
                    temp = min(temp, transpose);
                } else if i as isize - k == 1 {
                    let transpose = t + (j as isize - l);
                    temp = min(temp, transpose);
                }
            }

            last_i2l1 = r[j + 1];
            r[j + 1] = temp;
        }
        last_row_id.get_mut(ch1).val = i as isize;
    }

    r[len2 + 1] as usize
}
668
669pub fn damerau_levenshtein(a: &str, b: &str) -> usize {
678 damerau_levenshtein_impl(a.chars(), a.chars().count(), b.chars(), b.chars().count())
679}
680
681pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 {
694 if a.is_empty() && b.is_empty() {
695 return 1.0;
696 }
697
698 let len1 = a.chars().count();
699 let len2 = b.chars().count();
700 let dist = damerau_levenshtein_impl(a.chars(), len1, b.chars(), len2);
701 1.0 - (dist as f64) / (max(len1, len2) as f64)
702}
703
/// Yields every pair of adjacent chars in `s`; empty for strings
/// shorter than two chars.
fn bigrams(s: &str) -> impl Iterator<Item = (char, char)> + '_ {
    let leading = s.chars();
    let trailing = s.chars().skip(1);
    leading.zip(trailing)
}
708
/// Sørensen-Dice coefficient of two strings in `[0, 1]`: twice the
/// number of shared adjacent-char bigrams divided by the total number
/// of bigrams, after stripping all whitespace.
///
/// Fix: the guard and the denominator previously used byte lengths
/// (`str::len`), while bigrams are built from `char`s. For non-ASCII
/// input the denominator was inflated (e.g. each 2-byte char counted
/// twice), skewing the coefficient; char counts are used throughout now.
pub fn sorensen_dice(a: &str, b: &str) -> f64 {
    // Whitespace does not participate in the bigram multisets.
    let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect();
    let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect();

    if a == b {
        return 1.0;
    }

    let a_char_count = a.chars().count();
    let b_char_count = b.chars().count();

    // Fewer than two chars on either side means no bigrams to compare.
    if a_char_count < 2 || b_char_count < 2 {
        return 0.0;
    }

    // Multiset of a's bigrams (equivalent to iterating `bigrams(&a)`).
    let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new();

    for bigram in a.chars().zip(a.chars().skip(1)) {
        *a_bigrams.entry(bigram).or_insert(0) += 1;
    }

    // Multiset intersection: each of a's bigrams can be consumed once.
    let mut intersection_size = 0_usize;

    for bigram in b.chars().zip(b.chars().skip(1)) {
        a_bigrams.entry(bigram).and_modify(|bi| {
            if *bi > 0 {
                *bi -= 1;
                intersection_size += 1;
            }
        });
    }

    // |bigrams(a)| + |bigrams(b)| = (chars_a - 1) + (chars_b - 1).
    (2 * intersection_size) as f64 / (a_char_count + b_char_count - 2) as f64
}
755
#[cfg(test)]
mod tests {
    use super::*;

    /// Panics unless `$x` and `$y` differ by at most `$d`
    /// (tolerance defaults to 1e-5 in the two-argument form).
    macro_rules! assert_delta {
        ($x:expr, $y:expr) => {
            assert_delta!($x, $y, 1e-5);
        };
        ($x:expr, $y:expr, $d:expr) => {
            if ($x - $y).abs() > $d {
                panic!(
                    "assertion failed: actual: `{}`, expected: `{}`: \
                     actual not within < {} of expected",
                    $x, $y, $d
                );
            }
        };
    }

    #[test]
    fn bigrams_iterator() {
        let mut bi = bigrams("abcde");

        assert_eq!(Some(('a', 'b')), bi.next());
        assert_eq!(Some(('b', 'c')), bi.next());
        assert_eq!(Some(('c', 'd')), bi.next());
        assert_eq!(Some(('d', 'e')), bi.next());
        assert_eq!(None, bi.next());
    }

    /// Asserts `hamming(str1, str2) == Ok(dist)`.
    fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) {
        assert_eq!(Ok(dist), hamming(str1, str2));
    }

    #[test]
    fn hamming_empty() {
        assert_hamming_dist(0, "", "")
    }

    #[test]
    fn hamming_same() {
        assert_hamming_dist(0, "hamming", "hamming")
    }

    #[test]
    fn hamming_numbers() {
        assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3]));
    }

    #[test]
    fn hamming_diff() {
        assert_hamming_dist(3, "hamming", "hammers")
    }

    #[test]
    fn hamming_diff_multibyte() {
        assert_hamming_dist(2, "hamming", "h香mmüng");
    }

    #[test]
    fn hamming_unequal_length() {
        assert_eq!(
            Err(StrSimError::DifferentLengthArgs),
            generic_hamming("ham".chars(), "hamming".chars())
        );
    }

    #[test]
    fn hamming_names() {
        assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre")
    }

    #[test]
    fn jaro_both_empty() {
        assert_eq!(1.0, jaro("", ""));
    }

    #[test]
    fn jaro_first_empty() {
        assert_eq!(0.0, jaro("", "jaro"));
    }

    #[test]
    fn jaro_second_empty() {
        assert_eq!(0.0, jaro("distance", ""));
    }

    #[test]
    fn jaro_same() {
        assert_eq!(1.0, jaro("jaro", "jaro"));
    }

    #[test]
    fn jaro_multibyte() {
        assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_diff_short() {
        assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001);
    }

    #[test]
    fn jaro_diff_one_character() {
        assert_eq!(0.0, jaro("a", "b"));
    }

    #[test]
    fn jaro_same_one_character() {
        assert_eq!(1.0, jaro("a", "a"));
    }

    #[test]
    fn generic_jaro_diff() {
        assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4]));
    }

    #[test]
    fn jaro_diff_one_and_two() {
        assert_delta!(0.83, jaro("a", "ab"), 0.01);
    }

    #[test]
    fn jaro_diff_two_and_one() {
        assert_delta!(0.83, jaro("ab", "a"), 0.01);
    }

    #[test]
    fn jaro_diff_no_transposition() {
        assert_delta!(0.822, jaro("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_diff_with_transposition() {
        assert_delta!(0.944, jaro("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_names() {
        assert_delta!(
            0.392,
            jaro("Friedrich Nietzsche", "Jean-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_both_empty() {
        assert_eq!(1.0, jaro_winkler("", ""));
    }

    #[test]
    fn jaro_winkler_first_empty() {
        assert_eq!(0.0, jaro_winkler("", "jaro-winkler"));
    }

    #[test]
    fn jaro_winkler_second_empty() {
        assert_eq!(0.0, jaro_winkler("distance", ""));
    }

    #[test]
    fn jaro_winkler_same() {
        assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler"));
    }

    #[test]
    fn jaro_winkler_multibyte() {
        assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001);
        assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_short() {
        assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001);
        assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_one_character() {
        assert_eq!(0.0, jaro_winkler("a", "b"));
    }

    #[test]
    fn jaro_winkler_same_one_character() {
        assert_eq!(1.0, jaro_winkler("a", "a"));
    }

    #[test]
    fn jaro_winkler_diff_no_transposition() {
        assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001);
    }

    #[test]
    fn jaro_winkler_diff_with_transposition() {
        assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001);
        assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001);
    }

    #[test]
    fn jaro_winkler_names() {
        assert_delta!(
            0.452,
            jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"),
            0.001
        );
    }

    #[test]
    fn jaro_winkler_long_prefix() {
        assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001);
    }

    #[test]
    fn jaro_winkler_more_names() {
        assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001);
    }

    #[test]
    fn jaro_winkler_length_of_one() {
        assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001);
    }

    #[test]
    fn jaro_winkler_very_long_prefix() {
        assert_delta!(
            0.98519,
            jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery")
        );
    }

    #[test]
    fn levenshtein_empty() {
        assert_eq!(0, levenshtein("", ""));
    }

    #[test]
    fn levenshtein_same() {
        assert_eq!(0, levenshtein("levenshtein", "levenshtein"));
    }

    #[test]
    fn levenshtein_diff_short() {
        assert_eq!(3, levenshtein("kitten", "sitting"));
    }

    #[test]
    fn levenshtein_diff_with_space() {
        assert_eq!(5, levenshtein("hello, world", "bye, world"));
    }

    #[test]
    fn levenshtein_diff_multibyte() {
        assert_eq!(3, levenshtein("öঙ香", "abc"));
        assert_eq!(3, levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lorem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(37, levenshtein(a, b));
    }

    #[test]
    fn levenshtein_first_empty() {
        assert_eq!(7, levenshtein("", "sitting"));
    }

    #[test]
    fn levenshtein_second_empty() {
        assert_eq!(6, levenshtein("kitten", ""));
    }

    #[test]
    fn normalized_levenshtein_diff_short() {
        assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting"));
    }

    #[test]
    fn normalized_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_levenshtein("", ""));
    }

    #[test]
    fn normalized_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_levenshtein("", "second"));
    }

    #[test]
    fn normalized_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_levenshtein("first", ""));
    }

    #[test]
    fn normalized_levenshtein_identical_strings() {
        assert_delta!(1.0, normalized_levenshtein("identical", "identical"));
    }

    #[test]
    fn osa_distance_empty() {
        assert_eq!(0, osa_distance("", ""));
    }

    #[test]
    fn osa_distance_same() {
        assert_eq!(0, osa_distance("damerau", "damerau"));
    }

    #[test]
    fn osa_distance_first_empty() {
        assert_eq!(7, osa_distance("", "damerau"));
    }

    #[test]
    fn osa_distance_second_empty() {
        assert_eq!(7, osa_distance("damerau", ""));
    }

    #[test]
    fn osa_distance_diff() {
        assert_eq!(3, osa_distance("ca", "abc"));
    }

    #[test]
    fn osa_distance_diff_short() {
        assert_eq!(3, osa_distance("damerau", "aderua"));
    }

    #[test]
    fn osa_distance_diff_reversed() {
        assert_eq!(3, osa_distance("aderua", "damerau"));
    }

    #[test]
    fn osa_distance_diff_multibyte() {
        assert_eq!(3, osa_distance("öঙ香", "abc"));
        assert_eq!(3, osa_distance("abc", "öঙ香"));
    }

    #[test]
    fn osa_distance_diff_unequal_length() {
        assert_eq!(6, osa_distance("damerau", "aderuaxyz"));
    }

    #[test]
    fn osa_distance_diff_unequal_length_reversed() {
        assert_eq!(6, osa_distance("aderuaxyz", "damerau"));
    }

    #[test]
    fn osa_distance_diff_comedians() {
        assert_eq!(5, osa_distance("Stewart", "Colbert"));
    }

    #[test]
    fn osa_distance_many_transpositions() {
        assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn osa_distance_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, osa_distance(a, b));
    }

    #[test]
    fn osa_distance_beginning_transposition() {
        assert_eq!(1, osa_distance("foobar", "ofobar"));
    }

    #[test]
    fn osa_distance_end_transposition() {
        assert_eq!(1, osa_distance("specter", "spectre"));
    }

    // OSA never edits a substring twice, so it differs from unrestricted
    // Damerau-Levenshtein here (compare damerau_levenshtein_unrestricted_edit).
    #[test]
    fn osa_distance_restricted_edit() {
        assert_eq!(4, osa_distance("a cat", "an abct"));
    }

    #[test]
    fn damerau_levenshtein_empty() {
        assert_eq!(0, damerau_levenshtein("", ""));
    }

    #[test]
    fn damerau_levenshtein_same() {
        assert_eq!(0, damerau_levenshtein("damerau", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_first_empty() {
        assert_eq!(7, damerau_levenshtein("", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_second_empty() {
        assert_eq!(7, damerau_levenshtein("damerau", ""));
    }

    #[test]
    fn damerau_levenshtein_diff() {
        assert_eq!(2, damerau_levenshtein("ca", "abc"));
    }

    #[test]
    fn damerau_levenshtein_diff_short() {
        assert_eq!(3, damerau_levenshtein("damerau", "aderua"));
    }

    #[test]
    fn damerau_levenshtein_diff_reversed() {
        assert_eq!(3, damerau_levenshtein("aderua", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_multibyte() {
        assert_eq!(3, damerau_levenshtein("öঙ香", "abc"));
        assert_eq!(3, damerau_levenshtein("abc", "öঙ香"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length() {
        assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz"));
    }

    #[test]
    fn damerau_levenshtein_diff_unequal_length_reversed() {
        assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau"));
    }

    #[test]
    fn damerau_levenshtein_diff_comedians() {
        assert_eq!(5, damerau_levenshtein("Stewart", "Colbert"));
    }

    #[test]
    fn damerau_levenshtein_many_transpositions() {
        assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk"));
    }

    #[test]
    fn damerau_levenshtein_diff_longer() {
        let a = "The quick brown fox jumped over the angry dog.";
        let b = "Lehem ipsum dolor sit amet, dicta latine an eam.";
        assert_eq!(36, damerau_levenshtein(a, b));
    }

    #[test]
    fn damerau_levenshtein_beginning_transposition() {
        assert_eq!(1, damerau_levenshtein("foobar", "ofobar"));
    }

    #[test]
    fn damerau_levenshtein_end_transposition() {
        assert_eq!(1, damerau_levenshtein("specter", "spectre"));
    }

    // Unrestricted variant: may edit between the transposed chars,
    // so this is cheaper than the OSA result for the same inputs.
    #[test]
    fn damerau_levenshtein_unrestricted_edit() {
        assert_eq!(3, damerau_levenshtein("a cat", "an abct"));
    }

    #[test]
    fn normalized_damerau_levenshtein_diff_short() {
        assert_delta!(
            0.27272,
            normalized_damerau_levenshtein("levenshtein", "löwenbräu")
        );
    }

    #[test]
    fn normalized_damerau_levenshtein_for_empty_strings() {
        assert_delta!(1.0, normalized_damerau_levenshtein("", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_first_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("", "flower"));
    }

    #[test]
    fn normalized_damerau_levenshtein_second_empty() {
        assert_delta!(0.0, normalized_damerau_levenshtein("tree", ""));
    }

    #[test]
    fn normalized_damerau_levenshtein_identical_strings() {
        assert_delta!(
            1.0,
            normalized_damerau_levenshtein("sunglasses", "sunglasses")
        );
    }

    #[test]
    fn sorensen_dice_all() {
        assert_delta!(1.0, sorensen_dice("a", "a"));
        assert_delta!(0.0, sorensen_dice("a", "b"));
        assert_delta!(1.0, sorensen_dice("", ""));
        assert_delta!(0.0, sorensen_dice("a", ""));
        assert_delta!(0.0, sorensen_dice("", "a"));
        assert_delta!(1.0, sorensen_dice("apple event", "apple event"));
        assert_delta!(0.90909, sorensen_dice("iphone", "iphone x"));
        assert_delta!(0.0, sorensen_dice("french", "quebec"));
        assert_delta!(1.0, sorensen_dice("france", "france"));
        assert_delta!(0.2, sorensen_dice("fRaNce", "france"));
        assert_delta!(0.8, sorensen_dice("healed", "sealed"));
        assert_delta!(
            0.78788,
            sorensen_dice("web applications", "applications of the web")
        );
        assert_delta!(
            0.92,
            sorensen_dice(
                "this will have a typo somewhere",
                "this will huve a typo somewhere"
            )
        );
        assert_delta!(
            0.60606,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: table in very good condition, olive green in colour."
            )
        );
        assert_delta!(
            0.25581,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "For sale: green Subaru Impreza, 210,000 miles"
            )
        );
        assert_delta!(
            0.14118,
            sorensen_dice(
                "Olive-green table for sale, in extremely good condition.",
                "Wanted: mountain bike with at least 21 gears."
            )
        );
        assert_delta!(
            0.77419,
            sorensen_dice("this has one extra word", "this has one word")
        );
    }
}