fuzzt-0.3.1/.cargo_vcs_info.json0000644000000001360000000000100122170ustar { "git": { "sha1": "e5af243e1fa319bfbec60f689808d344f8778fd8" }, "path_in_vcs": "" }fuzzt-0.3.1/.gitignore000064400000000000000000000002131046102023000127730ustar 00000000000000# Generated by Cargo # will have compiled files and executables /target/ # These are backup files generated by rustfmt **/*.rs.bk .vscode fuzzt-0.3.1/CHANGELOG.md000064400000000000000000000135261046102023000126270ustar 00000000000000# Change Log This project attempts to adhere to [Semantic Versioning](http://semver.org). ## [Unreleased] ## [0.11.0] - (2024-01-07) ### Changed - improve OSA implementation - reduce runtime - reduce binary size by more than `25%` - reduce binary size of Levenshtein distance - improve Damerau-Levenshtein implementation - reduce memory usage from `O(N*M)` to `O(N+M)` - reduce runtime in our own benchmark by more than `70%` - reduce binary size by more than `25%` - only boost similarity in Jaro-Winkler once the Jaro similarity exceeds 0.7 ### Fixed - Fix transposition counting in Jaro and Jaro-Winkler. - Limit common prefix in Jaro-Winkler to 4 characters ## [0.10.0] - (2020-01-31) ### Added - Sørensen-Dice implementation (thanks [@robjtede](https://github.com/robjtede)) ## [0.9.3] - (2019-12-12) ### Fixed - Fix Jaro and Jaro-Winkler when the arguments have lengths of 1 and are equal. Previously, the functions would erroneously return 0 instead of 1. Thanks to [@vvrably](https://github.com/vvrably) for pointing out the issue. ## [0.9.2] - (2019-05-09) ### Changed - Revert back to the standard library hashmap because it will use hashbrown very soon - Remove ndarray in favor of using a single vector to represent the 2d grid in Damerau-Levenshtein ## [0.9.1] - (2019-04-08) ### Changed - Faster Damerau-Levenshtein implementation (thanks [@lovasoa](https://github.com/lovasoa)) ## [0.9.0] - (2019-04-06) ### Added - Generic distance functions (thanks [@lovasoa](https://github.com/lovasoa)) ## [0.8.0] - (2018-08-19) ### Added - Normalized versions of Levenshtein and Damerau-Levenshtein (thanks [@gentoid](https://github.com/gentoid)) ## [0.7.0] - (2018-01-17) ### Changed - Faster Levenshtein implementation (thanks [@wdv4758h](https://github.com/wdv4758h)) ### Removed - Remove the "against_vec" functions. They are one-liners now, so they don't seem to add enough value to justify making the API larger. I didn't find anybody using them when I skimmed through a GitHub search. If you do use them, you can change the calls to something like: ```rust let distances = strings.iter().map(|a| jaro(target, a)).collect(); ``` ## [0.6.0] - (2016-12-26) ### Added - Add optimal string alignment distance ### Fixed - Fix Damerau-Levenshtein implementation (previous implementation was actually optimal string alignment; see this [Damerau-Levenshtein explanation]) ## [0.5.2] - (2016-11-21) ### Changed - Remove Cargo generated documentation in favor of a [docs.rs] link ## [0.5.1] - (2016-08-23) ### Added - Add Cargo generated documentation ### Fixed - Fix panic when Jaro or Jaro-Winkler are given strings both with a length of one ## [0.5.0] - (2016-08-11) ### Changed - Make Hamming faster (thanks @IBUzPE9) when the two strings have the same length but slower when they have different lengths ## [0.4.1] - (2016-04-18) ### Added - Add Vagrant setup for development - Add AppVeyor configuration for Windows CI ### Fixed - Fix metrics when given strings with multibyte characters (thanks @WanzenBug) ## [0.4.0] - (2015-06-10) ### Added - For each metric, add a function that takes a vector of strings and returns a vector of results (thanks @ovarene) ## [0.3.0] - (2015-04-30) ### Changed - Remove usage of unstable Rust features ## [0.2.5] - (2015-04-24) ### Fixed - Remove unnecessary `Float` import from doc tests ## [0.2.4] - (2015-04-15) ### Fixed - Remove unused `core` feature flag ## [0.2.3] - (2015-04-01) ### Fixed - Remove now unnecessary `Float` import ## [0.2.2] - (2015-03-29) ### Fixed - Remove usage of `char_at` (marked as unstable) ## [0.2.1] - (2015-02-20) ### Fixed - Update bit vector import to match Rust update ## [0.2.0] - (2015-02-19) ### Added - Implement Damerau-Levenshtein - Add tests in docs ## [0.1.1] - (2015-02-10) ### Added - Configure Travis for CI - Add rustdoc comments ### Fixed - Limit Jaro-Winkler return value to a maximum of 1.0 - Fix float comparisons in tests ## [0.1.0] - (2015-02-09) ### Added - Implement Hamming, Jaro, Jaro-Winkler, and Levenshtein [Unreleased]: https://github.com/rapidfuzz/strsim-rs/compare/0.11.0...HEAD [0.11.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.10.0...0.11.0 [0.10.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.3...0.10.0 [0.9.3]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.2...0.9.3 [0.9.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.1...0.9.2 [0.9.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.9.0...0.9.1 [0.9.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.8.0...0.9.0 [0.8.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.7.0...0.8.0 [0.7.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.6.0...0.7.0 [0.6.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.2...0.6.0 [0.5.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.1...0.5.2 [0.5.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.5.0...0.5.1 [0.5.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.4.1...0.5.0 [0.4.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.4.0...0.4.1 [0.4.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.3.0...0.4.0 [0.3.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.5...0.3.0 [0.2.5]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.4...0.2.5 [0.2.4]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.3...0.2.4 [0.2.3]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.2...0.2.3 [0.2.2]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.1...0.2.2 [0.2.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.2.0...0.2.1 [0.2.0]: https://github.com/rapidfuzz/strsim-rs/compare/0.1.1...0.2.0 [0.1.1]: https://github.com/rapidfuzz/strsim-rs/compare/0.1.0...0.1.1 [0.1.0]: https://github.com/rapidfuzz/strsim-rs/compare/fabad4...0.1.0 [docs.rs]: https://docs.rs/strsim/ [Damerau-Levenshtein explanation]: http://scarcitycomputing.blogspot.com/2013/04/damerau-levenshtein-edit-distance.html fuzzt-0.3.1/Cargo.toml0000644000000025620000000000100102220ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2021" rust-version = "1.70.0" name = "fuzzt" version = "0.3.1" authors = ["Luiz Otavio Vilas Boas Oliveira "] exclude = [ "/.github", "/dev", ] description = """ Implementations of string similarity metrics. Includes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice. """ homepage = "https://github.com/luizvbo/fuzzt" readme = "README.md" keywords = [ "string", "similarity", "Hamming", "Levenshtein", "Jaro", ] categories = ["text-processing"] license = "MIT" repository = "https://github.com/luizvbo/fuzzt" [dev-dependencies.rstest] version = "0.18.2" [features] damerau_levenshtein = [] default = [ "damerau_levenshtein", "hamming", "jaro", "levenshtein", "optimal_string_alignment", "sorensen_dice", ] hamming = [] jaro = [] levenshtein = [] optimal_string_alignment = [] sorensen_dice = [] fuzzt-0.3.1/Cargo.toml.orig000064400000000000000000000015401046102023000136760ustar 00000000000000[package] name = "fuzzt" version = "0.3.1" edition = "2021" authors = ["Luiz Otavio Vilas Boas Oliveira "] description = """ Implementations of string similarity metrics. Includes Hamming, Levenshtein, OSA, Damerau-Levenshtein, Jaro, Jaro-Winkler, and Sørensen-Dice. """ license = "MIT" readme = "README.md" keywords = ["string", "similarity", "Hamming", "Levenshtein", "Jaro"] homepage = "https://github.com/luizvbo/fuzzt" repository = "https://github.com/luizvbo/fuzzt" exclude = ["/.github", "/dev"] categories = ["text-processing"] rust-version = "1.70.0" [features] default = [ "damerau_levenshtein", "hamming", "jaro", "levenshtein", "optimal_string_alignment", "sorensen_dice" ] damerau_levenshtein = [] hamming = [] jaro = [] levenshtein = [] optimal_string_alignment = [] sorensen_dice = [] [dev-dependencies] rstest = "0.18.2" fuzzt-0.3.1/LICENSE000064400000000000000000000022671046102023000120230ustar 00000000000000MIT License Copyright (c) 2015 Danny Guo Copyright (c) 2016 Titus Wormer Copyright (c) 2018 Akash Kurdekar Copyright (c) 2024 Luiz Otavio Vilas Boas Oliveira Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. fuzzt-0.3.1/README.md000064400000000000000000000172031046102023000122710ustar 00000000000000# Fuzzt [Rust](https://www.rust-lang.org) implementations of [string similarity metrics]: - [Hamming](#hamming) - [Levenshtein](#levenshtein) (distance & normalized) - [Optimal string alignment](#optimal-string-alignment) - [Damerau-Levenshtein](#damerau-levenshtein) (distance & normalized) - [Jaro and Jaro-Winkler](#jaro-and-jaro-winkler) - [Sørensen-Dice](#sørensen-dice) - [Gestalt pattern matching](#gestalt-pattern-matching) The normalized versions return values between `0.0` and `1.0`, where `1.0` means an exact match. There are also generic versions of the functions for non-string inputs. ## What is new? This crate is heavily based on the [strsim-rs](https://github.com/rapidfuzz/strsim-rs) crate, with some nice additions: - [Gestalt pattern matching](#gestalt-pattern-matching), the algorithm used by python difflib SequenceMatcher - [Top-N matching](#top-n-matching), a method to retrieve the best N matches from a collection of choices. - [Feature selection](#feature-selection), allows you to select only the features (metrics) you want to use, reducing the memory footprint of your application. ### Top-N Matching The method `get_top_n` gets a list of the best matches from a collection of choices. This feature is inspired by the `extractBests` method from the Python [fuzzywuzzy](https://github.com/seatgeek/fuzzywuzzy) package (now [thefuzz](https://github.com/seatgeek/thefuzz)). The `get_top_n` method takes a query string, an array of choice strings, a cutoff similarity score, an optional number of top matches to return, an optional string processor, and an optional similarity metric. It processes each choice and the query using the provided or default string processor, computes the similarity between the processed query and each processed choice using the provided or default similarity metric, and returns the top-N matches that have a similarity score greater than or equal to the cutoff. Here's the signature of the `get_top_n` method: ```rust extern crate fuzzt; use fuzzt::{algorithms::NormalizedLevenshtein, get_top_n, processors::NullStringProcessor}; fn main() { let matches = get_top_n( "apple", &["apply", "apples", "ape", "applet", "applesauce"], Some(0.8), Some(3), Some(&NullStringProcessor), Some(&NormalizedLevenshtein), ); assert_eq!(matches, ["apples", "applet", "apply"]); } ``` ### Feature selection `fuzzt` is designed with flexibility in mind, allowing you to select only the features you need for your specific use case. This can help to reduce the footprint of your application and optimize performance. The crate includes the following features: - damerau_levenshtein - gestalt - hamming - jaro - levenshtein - optimal_string_alignment - sorensen_dice By default, all features are included when you add `fuzzt` as a dependency. However, you can choose to include only specific features by listing them under the `features` key in your `Cargo.toml` file. For example: ```toml [dependencies] fuzzt = { version = "*", default-features = false, features = ["levenshtein", "jaro"] } ``` ## Installation `Fuzzt` is available on [crates.io](https://crates.io/crates/fuzzt). Add it to your project: ```sh cargo add fuzzt ``` ## Usage Go to [Docs.rs](https://docs.rs/fuzzt/) for the full documentation. You can also clone the repo, and run `$ cargo doc --open`. ### Examples ```rust extern crate fuzzt; use fuzzt::{ damerau_levenshtein, hamming, jaro, jaro_winkler, levenshtein, normalized_damerau_levenshtein, normalized_levenshtein, osa_distance, sequence_matcher, sorensen_dice, }; fn main() { match hamming("hamming", "hammers") { Ok(distance) => assert_eq!(3, distance), Err(why) => panic!("{:?}", why), } assert_eq!(levenshtein("kitten", "sitting"), 3); assert!((normalized_levenshtein("kitten", "sitting") - 0.571).abs() < 0.001); assert_eq!(osa_distance("ac", "cba"), 3); assert_eq!(damerau_levenshtein("ac", "cba"), 2); assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.272).abs() < 0.001); assert_eq!(jaro("Friedrich Nietzsche", "Jean-Paul Sartre"), 0.3918859649122807); assert_eq!( jaro_winkler("cheeseburger", "cheese fries"), 0.8666666666666666 ); assert_eq!( sorensen_dice("web applications", "applications of the web"), 0.7878787878787878 ); assert_eq!( sequence_matcher("this is a test", "this is a test!"), 0.9655172413793104 ); } ``` Using the generic versions of the functions: ```rust extern crate fuzzt; use fuzzt::generic_levenshtein; fn main() { assert_eq!(2, generic_levenshtein(&[1, 2, 3], &[0, 2, 5])); } ``` ## Algorithms ### Hamming The Hamming distance between two strings of equal length is the number of positions at which the corresponding symbols are different. It measures the minimum number of substitutions required to change one string into the other. ### Levenshtein The Levenshtein distance is a string metric for measuring the difference between two sequences. It quantifies how many edits (insertions, deletions, or substitutions) you need to make to change one string into another. The normalized version of this metric gives you a proportion between 0 and 1, where 1 means the strings are identical. ### Optimal String Alignment The Optimal String Alignment (OSA), also known as the restricted Damerau-Levenshtein distance, computes the shortest distance considering only adjacent transpositions. This means it doesn't allow substrings to move as a block, unlike the Damerau-Levenshtein distance. ### Damerau-Levenshtein Damerau-Levenshtein distance is an extension of the Levenshtein distance, allowing for transpositions of two adjacent characters along with insertions, deletions, and substitutions. The normalized version gives a proportion between 0 and 1, where 1 means the strings are identical. ### Jaro and Jaro-Winkler The Jaro distance allows for transpositions and takes into account the number and order of common characters between two strings. The Jaro-Winkler distance is a modification of the Jaro distance that gives more favorable ratings to strings that match from the beginning. ### Sørensen-Dice This coefficient is a statistic used to gauge the similarity of two samples. It's calculated as twice the size of the intersection of the sets, divided by the sum of the sizes of the two sets. ### Gestalt Pattern Matching This is the algorithm used by Python's `difflib.SequenceMatcher`. It uses a heuristic called "Ratcliff/Obershelp" that computes the doubled number of matching characters divided by the total number of characters in the two strings. It's particularly good at detecting close matches and some types of typos. ## Contributing If you don't want to install Rust itself, you can run `$ ./dev` for a development CLI if you have [Docker] installed. Benchmarks require a Nightly toolchain. Run `$ cargo +nightly bench`. ## License [MIT](https://github.com/luizvbo/fuzzt/blob/main/LICENSE) [string similarity metrics]: http://en.wikipedia.org/wiki/String_metric [Damerau-Levenshtein]: http://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance [Jaro and Jaro-Winkler]: http://en.wikipedia.org/wiki/Jaro%E2%80%93Winkler_distance [Levenshtein]: http://en.wikipedia.org/wiki/Levenshtein_distance [Hamming]: http://en.wikipedia.org/wiki/Hamming_distance [Optimal string alignment]: https://en.wikipedia.org/wiki/Damerau%E2%80%93Levenshtein_distance#Optimal_string_alignment_distance [Sørensen-Dice]: http://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient [Gestalt pattern matching]: https://en.wikipedia.org/wiki/Gestalt_pattern_matching [Docker]: https://docs.docker.com/engine/installation/ fuzzt-0.3.1/benches/benches.rs000064400000000000000000000045501046102023000143770ustar 00000000000000//! Benchmarks for strsim. #![feature(test)] extern crate fuzzt; extern crate test; use self::test::Bencher; #[bench] fn bench_hamming(bencher: &mut Bencher) { let a = "ACAAGATGCCATTGTCCCCCGGCCTCCTGCTGCTGCTGCTCTCCGGGG"; let b = "CCTGGAGGGTGGCCCCACCGGCCGAGACAGCGAGCATATGCAGGAAGC"; bencher.iter(|| { fuzzt::algorithms::hamming(a, b).unwrap(); }) } #[bench] fn bench_jaro(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::jaro(a, b); }) } #[bench] fn bench_jaro_winkler(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::jaro_winkler(a, b); }) } #[bench] fn bench_levenshtein(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::levenshtein(a, b); }) } #[bench] fn bench_levenshtein_on_u8(bencher: &mut Bencher) { bencher.iter(|| { fuzzt::algorithms::generic_levenshtein(&vec![0u8; 30], &vec![7u8; 31]); }) } #[bench] fn bench_normalized_levenshtein(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::normalized_levenshtein(a, b); }) } #[bench] fn bench_osa_distance(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::osa_distance(a, b); }) } #[bench] fn bench_damerau_levenshtein(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::damerau_levenshtein(a, b); }) } #[bench] fn bench_normalized_damerau_levenshtein(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::normalized_damerau_levenshtein(a, b); }) } #[bench] fn bench_sorensen_dice(bencher: &mut Bencher) { let a = "Philosopher Friedrich Nietzsche"; let b = "Philosopher Jean-Paul Sartre"; bencher.iter(|| { fuzzt::algorithms::sorensen_dice(a, b); }) } fuzzt-0.3.1/src/algorithms/damerau_levenshtein.rs000064400000000000000000000222601046102023000203410ustar 00000000000000use crate::algorithms::{Similarity, SimilarityMetric}; use crate::utils::{flat_index, HybridGrowingHashmapChar, RowId}; use std::cmp::{max, min}; use std::collections::HashMap; use std::hash::Hash; use std::mem; /// Like optimal string alignment, but substrings can be edited an unlimited /// number of times, and the triangle inequality holds. /// /// ``` /// use fuzzt::algorithms::generic_damerau_levenshtein; /// /// assert_eq!(2, generic_damerau_levenshtein(&[1,2], &[2,3,1])); /// ``` pub fn generic_damerau_levenshtein(a_elems: &[Elem], b_elems: &[Elem]) -> usize where Elem: Eq + Hash + Clone, { let a_len = a_elems.len(); let b_len = b_elems.len(); if a_len == 0 { return b_len; } if b_len == 0 { return a_len; } let width = a_len + 2; let mut distances = vec![0; (a_len + 2) * (b_len + 2)]; let max_distance = a_len + b_len; distances[0] = max_distance; for i in 0..(a_len + 1) { distances[flat_index(i + 1, 0, width)] = max_distance; distances[flat_index(i + 1, 1, width)] = i; } for j in 0..(b_len + 1) { distances[flat_index(0, j + 1, width)] = max_distance; distances[flat_index(1, j + 1, width)] = j; } let mut elems: HashMap = HashMap::with_capacity(64); for i in 1..(a_len + 1) { let mut db = 0; for j in 1..(b_len + 1) { let k = match elems.get(&b_elems[j - 1]) { Some(&value) => value, None => 0, }; let insertion_cost = distances[flat_index(i, j + 1, width)] + 1; let deletion_cost = distances[flat_index(i + 1, j, width)] + 1; let transposition_cost = distances[flat_index(k, db, width)] + (i - k - 1) + 1 + (j - db - 1); let mut substitution_cost = distances[flat_index(i, j, width)] + 1; if a_elems[i - 1] == b_elems[j - 1] { db = j; substitution_cost -= 1; } distances[flat_index(i + 1, j + 1, width)] = min( substitution_cost, min(insertion_cost, min(deletion_cost, transposition_cost)), ); } elems.insert(a_elems[i - 1].clone(), i); } distances[flat_index(a_len + 1, b_len + 1, width)] } fn damerau_levenshtein_impl(s1: Iter1, len1: usize, s2: Iter2, len2: usize) -> usize where Iter1: Iterator + Clone, Iter2: Iterator + Clone, { // The implementations is based on the paper // `Linear space string correction algorithm using the Damerau-Levenshtein distance` // from Chunchun Zhao and Sartaj Sahni // // It has a runtime complexity of `O(N*M)` and a memory usage of `O(N+M)`. let max_val = max(len1, len2) as isize + 1; let mut last_row_id = HybridGrowingHashmapChar::::default(); let size = len2 + 2; let mut fr = vec![max_val; size]; let mut r1 = vec![max_val; size]; let mut r: Vec = (max_val..max_val + 1) .chain(0..(size - 1) as isize) .collect(); for (i, ch1) in s1.enumerate().map(|(i, ch1)| (i + 1, ch1)) { mem::swap(&mut r, &mut r1); let mut last_col_id: isize = -1; let mut last_i2l1 = r[1]; r[1] = i as isize; let mut t = max_val; for (j, ch2) in s2.clone().enumerate().map(|(j, ch2)| (j + 1, ch2)) { let diag = r1[j] + isize::from(ch1 != ch2); let left = r[j] + 1; let up = r1[j + 1] + 1; let mut temp = min(diag, min(left, up)); if ch1 == ch2 { last_col_id = j as isize; // last occurence of s1_i fr[j + 1] = r1[j - 1]; // save H_k-1,j-2 t = last_i2l1; // save H_i-2,l-1 } else { let k = last_row_id.get(ch2).val; let l = last_col_id; if j as isize - l == 1 { let transpose = fr[j + 1] + (i as isize - k); temp = min(temp, transpose); } else if i as isize - k == 1 { let transpose = t + (j as isize - l); temp = min(temp, transpose); } } last_i2l1 = r[j + 1]; r[j + 1] = temp; } last_row_id.get_mut(ch1).val = i as isize; } r[len2 + 1] as usize } /// Like optimal string alignment, but substrings can be edited an unlimited /// number of times, and the triangle inequality holds. /// /// ``` /// use fuzzt::algorithms::damerau_levenshtein; /// /// assert_eq!(2, damerau_levenshtein("ab", "bca")); /// ``` pub fn damerau_levenshtein(a: &str, b: &str) -> usize { damerau_levenshtein_impl(a.chars(), a.chars().count(), b.chars(), b.chars().count()) } /// Calculates a normalized score of the Damerau–Levenshtein algorithm between /// 0.0 and 1.0 (inclusive), where 1.0 means the strings are the same. /// /// ``` /// use fuzzt::algorithms::normalized_damerau_levenshtein; /// /// assert!((normalized_damerau_levenshtein("levenshtein", "löwenbräu") - 0.27272).abs() < 0.00001); /// assert!((normalized_damerau_levenshtein("", "") - 1.0).abs() < 0.00001); /// assert!(normalized_damerau_levenshtein("", "flower").abs() < 0.00001); /// assert!(normalized_damerau_levenshtein("tree", "").abs() < 0.00001); /// assert!((normalized_damerau_levenshtein("sunglasses", "sunglasses") - 1.0).abs() < 0.00001); /// ``` pub fn normalized_damerau_levenshtein(a: &str, b: &str) -> f64 { if a.is_empty() && b.is_empty() { return 1.0; } let len1 = a.chars().count(); let len2 = b.chars().count(); let dist = damerau_levenshtein_impl(a.chars(), len1, b.chars(), len2); 1.0 - (dist as f64) / (max(len1, len2) as f64) } pub struct DamerauLevenshtein; pub struct NormalizedDamerauLevenshtein; impl SimilarityMetric for DamerauLevenshtein { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Usize(damerau_levenshtein(a, b)) } } impl SimilarityMetric for NormalizedDamerauLevenshtein { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(normalized_damerau_levenshtein(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn damerau_levenshtein_empty() { assert_eq!(0, damerau_levenshtein("", "")); } #[test] fn damerau_levenshtein_same() { assert_eq!(0, damerau_levenshtein("damerau", "damerau")); } #[test] fn damerau_levenshtein_first_empty() { assert_eq!(7, damerau_levenshtein("", "damerau")); } #[test] fn damerau_levenshtein_second_empty() { assert_eq!(7, damerau_levenshtein("damerau", "")); } #[test] fn damerau_levenshtein_diff() { assert_eq!(2, damerau_levenshtein("ca", "abc")); } #[test] fn damerau_levenshtein_diff_short() { assert_eq!(3, damerau_levenshtein("damerau", "aderua")); } #[test] fn damerau_levenshtein_diff_reversed() { assert_eq!(3, damerau_levenshtein("aderua", "damerau")); } #[test] fn damerau_levenshtein_diff_multibyte() { assert_eq!(3, damerau_levenshtein("öঙ香", "abc")); assert_eq!(3, damerau_levenshtein("abc", "öঙ香")); } #[test] fn damerau_levenshtein_diff_unequal_length() { assert_eq!(6, damerau_levenshtein("damerau", "aderuaxyz")); } #[test] fn damerau_levenshtein_diff_unequal_length_reversed() { assert_eq!(6, damerau_levenshtein("aderuaxyz", "damerau")); } #[test] fn damerau_levenshtein_diff_comedians() { assert_eq!(5, damerau_levenshtein("Stewart", "Colbert")); } #[test] fn damerau_levenshtein_many_transpositions() { assert_eq!(4, damerau_levenshtein("abcdefghijkl", "bacedfgihjlk")); } #[test] fn damerau_levenshtein_diff_longer() { let a = "The quick brown fox jumped over the angry dog."; let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; assert_eq!(36, damerau_levenshtein(a, b)); } #[test] fn damerau_levenshtein_beginning_transposition() { assert_eq!(1, damerau_levenshtein("foobar", "ofobar")); } #[test] fn damerau_levenshtein_end_transposition() { assert_eq!(1, damerau_levenshtein("specter", "spectre")); } #[test] fn damerau_levenshtein_unrestricted_edit() { assert_eq!(3, damerau_levenshtein("a cat", "an abct")); } #[test] fn normalized_damerau_levenshtein_diff_short() { assert_delta!( 0.27272, normalized_damerau_levenshtein("levenshtein", "löwenbräu") ); } #[test] fn normalized_damerau_levenshtein_for_empty_strings() { assert_delta!(1.0, normalized_damerau_levenshtein("", "")); } #[test] fn normalized_damerau_levenshtein_first_empty() { assert_delta!(0.0, normalized_damerau_levenshtein("", "flower")); } #[test] fn normalized_damerau_levenshtein_second_empty() { assert_delta!(0.0, normalized_damerau_levenshtein("tree", "")); } #[test] fn normalized_damerau_levenshtein_identical_strings() { assert_delta!( 1.0, normalized_damerau_levenshtein("sunglasses", "sunglasses") ); } } fuzzt-0.3.1/src/algorithms/gestalt.rs000064400000000000000000000047621046102023000157710ustar 00000000000000use crate::algorithms::{Similarity, SimilarityMetric}; use std::collections::HashMap; /// Compares two strings `s1` and `s2` and returns a measure of their similarity as a float in the range [0, 1]. /// /// The returned measure is computed as follows: /// 1. If the total length of the two strings is 0, the function returns 1.0. /// 2. Otherwise, it computes the intersection of the character counts of the two strings, /// sums up the counts in the intersection, and returns the ratio of twice the sum of the counts to the total length. /// /// # Arguments /// /// * `s1` - The first string to compare. /// * `s2` - The second string to compare. /// /// # Returns /// /// * A float between 0 and 1 representing the similarity of the two strings. /// ``` /// use fuzzt::algorithms::sequence_matcher; /// /// assert_eq!(sequence_matcher("test", "test"), 1.0); /// assert_eq!(sequence_matcher("test", "tent"), 0.75); /// assert_eq!(sequence_matcher("kitten", "sitting"), 0.6153846153846154); /// assert_eq!(sequence_matcher("", ""), 1.0); /// assert_eq!(sequence_matcher("test", ""), 0.0); /// assert_eq!(sequence_matcher("", "test"), 0.0); /// ``` pub fn sequence_matcher(s1: &str, s2: &str) -> f64 { let length = s1.len() + s2.len(); if length == 0 { return 1.0; } let intersect = intersect(&counter(s1), &counter(s2)); let matches: usize = intersect.values().sum(); 2.0 * (matches as f64) / (length as f64) } fn counter(s: &str) -> HashMap { let mut count = HashMap::new(); for c in s.chars() { *count.entry(c).or_insert(0) += 1; } count } fn intersect(map1: &HashMap, map2: &HashMap) -> HashMap { let mut intersect = HashMap::new(); for (k, v) in map1 { if let Some(v2) = map2.get(k) { intersect.insert(*k, *v.min(v2)); } } intersect } pub struct SequenceMatcher; impl SimilarityMetric for SequenceMatcher { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(sequence_matcher(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn test_quick_ratio() { assert_eq!(sequence_matcher("test", "test"), 1.0); assert_eq!(sequence_matcher("test", "tent"), 0.75); assert_eq!(sequence_matcher("kitten", "sitting"), 0.6153846153846154); assert_eq!(sequence_matcher("", ""), 1.0); assert_eq!(sequence_matcher("test", ""), 0.0); assert_eq!(sequence_matcher("", "test"), 0.0); } } fuzzt-0.3.1/src/algorithms/hamming.rs000064400000000000000000000047261046102023000157460ustar 00000000000000use crate::algorithms::{Similarity, SimilarityMetric}; use crate::utils::FuzztError; pub type HammingResult = Result; /// Calculates the number of positions in the two sequences where the elements /// differ. Returns an error if the sequences have different lengths. fn generic_hamming(a: Iter1, b: Iter2) -> HammingResult where Iter1: IntoIterator, Iter2: IntoIterator, Elem1: PartialEq, { let (mut ita, mut itb) = (a.into_iter(), b.into_iter()); let mut count = 0; loop { match (ita.next(), itb.next()) { (Some(x), Some(y)) => { if x != y { count += 1; } } (None, None) => return Ok(count), _ => return Err(FuzztError::DifferentLengthArgs), } } } /// Calculates the number of positions in the two strings where the characters /// differ. Returns an error if the strings have different lengths. /// /// ``` /// use fuzzt::{FuzztError::DifferentLengthArgs}; /// use fuzzt::algorithms::hamming; /// /// assert_eq!(Ok(3), hamming("hamming", "hammers")); /// /// assert_eq!(Err(DifferentLengthArgs), hamming("hamming", "ham")); /// ``` pub fn hamming(a: &str, b: &str) -> HammingResult { generic_hamming(a.chars(), b.chars()) } pub struct Hamming; impl SimilarityMetric for Hamming { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Usize(hamming(a, b).unwrap()) } } #[cfg(test)] mod tests { use super::*; fn assert_hamming_dist(dist: usize, str1: &str, str2: &str) { assert_eq!(Ok(dist), hamming(str1, str2)); } #[test] fn hamming_empty() { assert_hamming_dist(0, "", "") } #[test] fn hamming_same() { assert_hamming_dist(0, "hamming", "hamming") } #[test] fn hamming_numbers() { assert_eq!(Ok(1), generic_hamming(&[1, 2, 4], &[1, 2, 3])); } #[test] fn hamming_diff() { assert_hamming_dist(3, "hamming", "hammers") } #[test] fn hamming_diff_multibyte() { assert_hamming_dist(2, "hamming", "h香mmüng"); } #[test] fn hamming_unequal_length() { assert_eq!( Err(FuzztError::DifferentLengthArgs), generic_hamming("ham".chars(), "hamming".chars()) ); } #[test] fn hamming_names() { assert_hamming_dist(14, "Friedrich Nietzs", "Jean-Paul Sartre") } } fuzzt-0.3.1/src/algorithms/jaro.rs000064400000000000000000000174361046102023000152630ustar 00000000000000use crate::algorithms::{Similarity, SimilarityMetric}; use crate::utils::StringWrapper; use std::cmp::{max, min}; /// Calculates the Jaro similarity between two sequences. The returned value /// is between 0.0 and 1.0 (higher value means more similar). fn generic_jaro<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 where &'a Iter1: IntoIterator, &'b Iter2: IntoIterator, Elem1: PartialEq, { let a_len = a.into_iter().count(); let b_len = b.into_iter().count(); if a_len == 0 && b_len == 0 { return 1.0; } else if a_len == 0 || b_len == 0 { return 0.0; } let mut search_range = max(a_len, b_len) / 2; search_range = search_range.saturating_sub(1); // combine memory allocations to reduce runtime let mut flags_memory = vec![false; a_len + b_len]; let (a_flags, b_flags) = flags_memory.split_at_mut(a_len); let mut matches = 0_usize; for (i, a_elem) in a.into_iter().enumerate() { // prevent integer wrapping let min_bound = if i > search_range { i - search_range } else { 0 }; let max_bound = min(b_len, i + search_range + 1); for (j, b_elem) in b.into_iter().enumerate().take(max_bound) { if min_bound <= j && a_elem == b_elem && !b_flags[j] { a_flags[i] = true; b_flags[j] = true; matches += 1; break; } } } let mut transpositions = 0_usize; if matches != 0 { let mut b_iter = b_flags.iter().zip(b); for (a_flag, ch1) in a_flags.iter().zip(a) { if *a_flag { loop { if let Some((b_flag, ch2)) = b_iter.next() { if !*b_flag { continue; } if ch1 != ch2 { transpositions += 1; } break; } } } } } transpositions /= 2; if matches == 0 { 0.0 } else { ((matches as f64 / a_len as f64) + (matches as f64 / b_len as f64) + ((matches - transpositions) as f64 / matches as f64)) / 3.0 } } /// Like Jaro but gives a boost to sequences that have a common prefix. fn generic_jaro_winkler<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> f64 where &'a Iter1: IntoIterator, &'b Iter2: IntoIterator, Elem1: PartialEq, { let sim = generic_jaro(a, b); if sim > 0.7 { let prefix_length = a .into_iter() .take(4) .zip(b) .take_while(|(a_elem, b_elem)| a_elem == b_elem) .count(); sim + 0.1 * prefix_length as f64 * (1.0 - sim) } else { sim } } /// Calculates the Jaro similarity between two strings. The returned value /// is between 0.0 and 1.0 (higher value means more similar). /// /// ``` /// use fuzzt::algorithms::jaro; /// /// assert!((0.392 - jaro("Friedrich Nietzsche", "Jean-Paul Sartre")).abs() < /// 0.001); /// ``` pub fn jaro(a: &str, b: &str) -> f64 { generic_jaro(&StringWrapper(a), &StringWrapper(b)) } /// Like Jaro but gives a boost to strings that have a common prefix. /// /// ``` /// use fuzzt::algorithms::jaro_winkler; /// /// assert!((0.866 - jaro_winkler("cheeseburger", "cheese fries")).abs() < /// 0.001); /// ``` pub fn jaro_winkler(a: &str, b: &str) -> f64 { generic_jaro_winkler(&StringWrapper(a), &StringWrapper(b)) } pub struct Jaro; pub struct JaroWinkler; impl SimilarityMetric for Jaro { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(jaro(a, b)) } } impl SimilarityMetric for JaroWinkler { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(jaro_winkler(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn jaro_both_empty() { assert_eq!(1.0, jaro("", "")); } #[test] fn jaro_first_empty() { assert_eq!(0.0, jaro("", "jaro")); } #[test] fn jaro_second_empty() { assert_eq!(0.0, jaro("distance", "")); } #[test] fn jaro_same() { assert_eq!(1.0, jaro("jaro", "jaro")); } #[test] fn jaro_multibyte() { assert_delta!(0.818, jaro("testabctest", "testöঙ香test"), 0.001); assert_delta!(0.818, jaro("testöঙ香test", "testabctest"), 0.001); } #[test] fn jaro_diff_short() { assert_delta!(0.767, jaro("dixon", "dicksonx"), 0.001); } #[test] fn jaro_diff_one_character() { assert_eq!(0.0, jaro("a", "b")); } #[test] fn jaro_same_one_character() { assert_eq!(1.0, jaro("a", "a")); } #[test] fn generic_jaro_diff() { assert_eq!(0.0, generic_jaro(&[1, 2], &[3, 4])); } #[test] fn jaro_diff_one_and_two() { assert_delta!(0.83, jaro("a", "ab"), 0.01); } #[test] fn jaro_diff_two_and_one() { assert_delta!(0.83, jaro("ab", "a"), 0.01); } #[test] fn jaro_diff_no_transposition() { assert_delta!(0.822, jaro("dwayne", "duane"), 0.001); } #[test] fn jaro_diff_with_transposition() { assert_delta!(0.944, jaro("martha", "marhta"), 0.001); assert_delta!(0.6, jaro("a jke", "jane a k"), 0.001); } #[test] fn jaro_names() { assert_delta!( 0.392, jaro("Friedrich Nietzsche", "Jean-Paul Sartre"), 0.001 ); } #[test] fn jaro_winkler_both_empty() { assert_eq!(1.0, jaro_winkler("", "")); } #[test] fn jaro_winkler_first_empty() { assert_eq!(0.0, jaro_winkler("", "jaro-winkler")); } #[test] fn jaro_winkler_second_empty() { assert_eq!(0.0, jaro_winkler("distance", "")); } #[test] fn jaro_winkler_same() { assert_eq!(1.0, jaro_winkler("Jaro-Winkler", "Jaro-Winkler")); } #[test] fn jaro_winkler_multibyte() { assert_delta!(0.89, jaro_winkler("testabctest", "testöঙ香test"), 0.001); assert_delta!(0.89, jaro_winkler("testöঙ香test", "testabctest"), 0.001); } #[test] fn jaro_winkler_diff_short() { assert_delta!(0.813, jaro_winkler("dixon", "dicksonx"), 0.001); assert_delta!(0.813, jaro_winkler("dicksonx", "dixon"), 0.001); } #[test] fn jaro_winkler_diff_one_character() { assert_eq!(0.0, jaro_winkler("a", "b")); } #[test] fn jaro_winkler_same_one_character() { assert_eq!(1.0, jaro_winkler("a", "a")); } #[test] fn jaro_winkler_diff_no_transposition() { assert_delta!(0.84, jaro_winkler("dwayne", "duane"), 0.001); } #[test] fn jaro_winkler_diff_with_transposition() { assert_delta!(0.961, jaro_winkler("martha", "marhta"), 0.001); assert_delta!(0.6, jaro_winkler("a jke", "jane a k"), 0.001); } #[test] fn jaro_winkler_names() { assert_delta!( 0.452, jaro_winkler("Friedrich Nietzsche", "Fran-Paul Sartre"), 0.001 ); } #[test] fn jaro_winkler_long_prefix() { assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); } #[test] fn jaro_winkler_more_names() { assert_delta!(0.868, jaro_winkler("Thorkel", "Thorgier"), 0.001); } #[test] fn jaro_winkler_length_of_one() { assert_delta!(0.738, jaro_winkler("Dinsdale", "D"), 0.001); } #[test] fn jaro_winkler_very_long_prefix() { assert_delta!( 0.98519, jaro_winkler("thequickbrownfoxjumpedoverx", "thequickbrownfoxjumpedovery") ); } } fuzzt-0.3.1/src/algorithms/levenshtein.rs000064400000000000000000000106071046102023000166450ustar 00000000000000use crate::utils::StringWrapper; use crate::algorithms::{Similarity, SimilarityMetric}; use std::cmp::min; /// Calculates the minimum number of insertions, deletions, and substitutions /// required to change one sequence into the other. /// /// ``` /// use fuzzt::algorithms::generic_levenshtein; /// /// assert_eq!(3, generic_levenshtein(&[1,2,3], &[1,2,3,4,5,6])); /// ``` pub fn generic_levenshtein<'a, 'b, Iter1, Iter2, Elem1, Elem2>(a: &'a Iter1, b: &'b Iter2) -> usize where &'a Iter1: IntoIterator, &'b Iter2: IntoIterator, Elem1: PartialEq, { let b_len = b.into_iter().count(); let mut cache: Vec = (1..b_len + 1).collect(); let mut result = b_len; for (i, a_elem) in a.into_iter().enumerate() { result = i + 1; let mut distance_b = i; for (j, b_elem) in b.into_iter().enumerate() { let cost = usize::from(a_elem != b_elem); let distance_a = distance_b + cost; distance_b = cache[j]; result = min(result + 1, min(distance_a, distance_b + 1)); cache[j] = result; } } result } /// Calculates the minimum number of insertions, deletions, and substitutions /// required to change one string into the other. /// /// ``` /// use fuzzt::algorithms::levenshtein; /// /// assert_eq!(3, levenshtein("kitten", "sitting")); /// ``` pub fn levenshtein(a: &str, b: &str) -> usize { generic_levenshtein(&StringWrapper(a), &StringWrapper(b)) } /// Calculates a normalized score of the Levenshtein algorithm between 0.0 and /// 1.0 (inclusive), where 1.0 means the strings are the same. /// /// ``` /// use fuzzt::algorithms::normalized_levenshtein; /// /// assert!((normalized_levenshtein("kitten", "sitting") - 0.57142).abs() < 0.00001); /// assert!((normalized_levenshtein("", "") - 1.0).abs() < 0.00001); /// assert!(normalized_levenshtein("", "second").abs() < 0.00001); /// assert!(normalized_levenshtein("first", "").abs() < 0.00001); /// assert!((normalized_levenshtein("string", "string") - 1.0).abs() < 0.00001); /// ``` pub fn normalized_levenshtein(a: &str, b: &str) -> f64 { if a.is_empty() && b.is_empty() { return 1.0; } 1.0 - (levenshtein(a, b) as f64) / (a.chars().count().max(b.chars().count()) as f64) } pub struct Levenshtein; pub struct NormalizedLevenshtein; impl SimilarityMetric for Levenshtein { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Usize(levenshtein(a, b)) } } impl SimilarityMetric for NormalizedLevenshtein { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(normalized_levenshtein(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn levenshtein_empty() { assert_eq!(0, levenshtein("", "")); } #[test] fn levenshtein_same() { assert_eq!(0, levenshtein("levenshtein", "levenshtein")); } #[test] fn levenshtein_diff_short() { assert_eq!(3, levenshtein("kitten", "sitting")); } #[test] fn levenshtein_diff_with_space() { assert_eq!(5, levenshtein("hello, world", "bye, world")); } #[test] fn levenshtein_diff_multibyte() { assert_eq!(3, levenshtein("öঙ香", "abc")); assert_eq!(3, levenshtein("abc", "öঙ香")); } #[test] fn levenshtein_diff_longer() { let a = "The quick brown fox jumped over the angry dog."; let b = "Lorem ipsum dolor sit amet, dicta latine an eam."; assert_eq!(37, levenshtein(a, b)); } #[test] fn levenshtein_first_empty() { assert_eq!(7, levenshtein("", "sitting")); } #[test] fn levenshtein_second_empty() { assert_eq!(6, levenshtein("kitten", "")); } #[test] fn normalized_levenshtein_diff_short() { assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting")); } #[test] fn normalized_levenshtein_for_empty_strings() { assert_delta!(1.0, normalized_levenshtein("", "")); } #[test] fn normalized_levenshtein_first_empty() { assert_delta!(0.0, normalized_levenshtein("", "second")); } #[test] fn normalized_levenshtein_second_empty() { assert_delta!(0.0, normalized_levenshtein("first", "")); } #[test] fn normalized_levenshtein_identical_strings() { assert_delta!(1.0, normalized_levenshtein("identical", "identical")); } } fuzzt-0.3.1/src/algorithms/mod.rs000064400000000000000000000033001046102023000150700ustar 00000000000000#[macro_export] macro_rules! assert_delta { ($x:expr, $y:expr) => { assert_delta!($x, $y, 1e-5); }; ($x:expr, $y:expr, $d:expr) => { if ($x - $y).abs() > $d { panic!( "assertion failed: actual: `{}`, expected: `{}`: \ actual not within < {} of expected", $x, $y, $d ); } }; } #[cfg(feature = "damerau_levenshtein")] pub mod damerau_levenshtein; #[cfg(feature = "damerau_levenshtein")] pub use damerau_levenshtein::{ damerau_levenshtein, generic_damerau_levenshtein, normalized_damerau_levenshtein, DamerauLevenshtein, NormalizedDamerauLevenshtein, }; pub mod gestalt; pub use gestalt::{sequence_matcher, SequenceMatcher}; #[cfg(feature = "hamming")] pub mod hamming; #[cfg(feature = "hamming")] pub use hamming::{hamming, Hamming}; #[cfg(feature = "jaro")] pub mod jaro; #[cfg(feature = "jaro")] pub use jaro::{jaro, jaro_winkler, Jaro, JaroWinkler}; #[cfg(feature = "levenshtein")] pub mod levenshtein; #[cfg(feature = "levenshtein")] pub use levenshtein::{ generic_levenshtein, levenshtein, normalized_levenshtein, Levenshtein, NormalizedLevenshtein, }; #[cfg(feature = "optimal_string_alignment")] pub mod optimal_string_alignment; #[cfg(feature = "optimal_string_alignment")] pub use optimal_string_alignment::{osa_distance, OSADistance}; #[cfg(feature = "sorensen_dice")] pub mod sorensen_dice; #[cfg(feature = "sorensen_dice")] pub use sorensen_dice::{sorensen_dice, SorensenDice}; pub enum Similarity { Usize(usize), Float(f64), } pub trait SimilarityMetric { // The smaller, the more similar 2 strings are. fn compute_metric(&self, a: &str, b: &str) -> Similarity; } fuzzt-0.3.1/src/algorithms/optimal_string_alignment.rs000064400000000000000000000100031046102023000214000ustar 00000000000000use std::cmp::min; use std::mem; use crate::algorithms::{Similarity, SimilarityMetric}; /// Like Levenshtein but allows for adjacent transpositions. Each substring can /// only be edited once. /// /// ``` /// use fuzzt::algorithms::osa_distance; /// /// assert_eq!(3, osa_distance("ab", "bca")); /// ``` pub fn osa_distance(a: &str, b: &str) -> usize { let b_len = b.chars().count(); // 0..=b_len behaves like 0..b_len.saturating_add(1) which could be a different size // this leads to significantly worse code gen when swapping the vectors below let mut prev_two_distances: Vec = (0..b_len + 1).collect(); let mut prev_distances: Vec = (0..b_len + 1).collect(); let mut curr_distances: Vec = vec![0; b_len + 1]; let mut prev_a_char = char::MAX; let mut prev_b_char = char::MAX; for (i, a_char) in a.chars().enumerate() { curr_distances[0] = i + 1; for (j, b_char) in b.chars().enumerate() { let cost = usize::from(a_char != b_char); curr_distances[j + 1] = min( curr_distances[j] + 1, min(prev_distances[j + 1] + 1, prev_distances[j] + cost), ); if i > 0 && j > 0 && a_char != b_char && a_char == prev_b_char && b_char == prev_a_char { curr_distances[j + 1] = min(curr_distances[j + 1], prev_two_distances[j - 1] + 1); } prev_b_char = b_char; } mem::swap(&mut prev_two_distances, &mut prev_distances); mem::swap(&mut prev_distances, &mut curr_distances); prev_a_char = a_char; } // access prev_distances instead of curr_distances since we swapped // them above. In case a is empty this would still contain the correct value // from initializing the last element to b_len prev_distances[b_len] } pub struct OSADistance; impl SimilarityMetric for OSADistance { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Usize(osa_distance(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn osa_distance_empty() { assert_eq!(0, osa_distance("", "")); } #[test] fn osa_distance_same() { assert_eq!(0, osa_distance("damerau", "damerau")); } #[test] fn osa_distance_first_empty() { assert_eq!(7, osa_distance("", "damerau")); } #[test] fn osa_distance_second_empty() { assert_eq!(7, osa_distance("damerau", "")); } #[test] fn osa_distance_diff() { assert_eq!(3, osa_distance("ca", "abc")); } #[test] fn osa_distance_diff_short() { assert_eq!(3, osa_distance("damerau", "aderua")); } #[test] fn osa_distance_diff_reversed() { assert_eq!(3, osa_distance("aderua", "damerau")); } #[test] fn osa_distance_diff_multibyte() { assert_eq!(3, osa_distance("öঙ香", "abc")); assert_eq!(3, osa_distance("abc", "öঙ香")); } #[test] fn osa_distance_diff_unequal_length() { assert_eq!(6, osa_distance("damerau", "aderuaxyz")); } #[test] fn osa_distance_diff_unequal_length_reversed() { assert_eq!(6, osa_distance("aderuaxyz", "damerau")); } #[test] fn osa_distance_diff_comedians() { assert_eq!(5, osa_distance("Stewart", "Colbert")); } #[test] fn osa_distance_many_transpositions() { assert_eq!(4, osa_distance("abcdefghijkl", "bacedfgihjlk")); } #[test] fn osa_distance_diff_longer() { let a = "The quick brown fox jumped over the angry dog."; let b = "Lehem ipsum dolor sit amet, dicta latine an eam."; assert_eq!(36, osa_distance(a, b)); } #[test] fn osa_distance_beginning_transposition() { assert_eq!(1, osa_distance("foobar", "ofobar")); } #[test] fn osa_distance_end_transposition() { assert_eq!(1, osa_distance("specter", "spectre")); } #[test] fn osa_distance_restricted_edit() { assert_eq!(4, osa_distance("a cat", "an abct")); } } fuzzt-0.3.1/src/algorithms/sorensen_dice.rs000064400000000000000000000073471046102023000171500ustar 00000000000000use crate::algorithms::{Similarity, SimilarityMetric}; use crate::utils::bigrams; use std::collections::HashMap; /// Calculates a Sørensen-Dice similarity distance using bigrams. /// See . /// /// ``` /// use fuzzt::algorithms::sorensen_dice; /// /// assert_eq!(1.0, sorensen_dice("", "")); /// assert_eq!(0.0, sorensen_dice("", "a")); /// assert_eq!(0.0, sorensen_dice("french", "quebec")); /// assert_eq!(1.0, sorensen_dice("ferris", "ferris")); /// assert_eq!(0.8888888888888888, sorensen_dice("feris", "ferris")); /// ``` pub fn sorensen_dice(a: &str, b: &str) -> f64 { // implementation guided by // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/index.js#L6 let a: String = a.chars().filter(|&x| !char::is_whitespace(x)).collect(); let b: String = b.chars().filter(|&x| !char::is_whitespace(x)).collect(); if a == b { return 1.0; } if a.len() < 2 || b.len() < 2 { return 0.0; } let mut a_bigrams: HashMap<(char, char), usize> = HashMap::new(); for bigram in bigrams(&a) { *a_bigrams.entry(bigram).or_insert(0) += 1; } let mut intersection_size = 0_usize; for bigram in bigrams(&b) { a_bigrams.entry(bigram).and_modify(|bi| { if *bi > 0 { *bi -= 1; intersection_size += 1; } }); } (2 * intersection_size) as f64 / (a.len() + b.len() - 2) as f64 } pub struct SorensenDice; impl SimilarityMetric for SorensenDice { fn compute_metric(&self, a: &str, b: &str) -> Similarity { Similarity::Float(sorensen_dice(a, b)) } } #[cfg(test)] mod tests { use super::*; #[test] fn sorensen_dice_all() { // test cases taken from // https://github.com/aceakash/string-similarity/blob/f83ba3cd7bae874c20c429774e911ae8cff8bced/src/spec/index.spec.js#L11 assert_delta!(1.0, sorensen_dice("a", "a")); assert_delta!(0.0, sorensen_dice("a", "b")); assert_delta!(1.0, sorensen_dice("", "")); assert_delta!(0.0, sorensen_dice("a", "")); assert_delta!(0.0, sorensen_dice("", "a")); assert_delta!(1.0, sorensen_dice("apple event", "apple event")); assert_delta!(0.90909, sorensen_dice("iphone", "iphone x")); assert_delta!(0.0, sorensen_dice("french", "quebec")); assert_delta!(1.0, sorensen_dice("france", "france")); assert_delta!(0.2, sorensen_dice("fRaNce", "france")); assert_delta!(0.8, sorensen_dice("healed", "sealed")); assert_delta!( 0.78788, sorensen_dice("web applications", "applications of the web") ); assert_delta!( 0.92, sorensen_dice( "this will have a typo somewhere", "this will huve a typo somewhere" ) ); assert_delta!( 0.60606, sorensen_dice( "Olive-green table for sale, in extremely good condition.", "For sale: table in very good condition, olive green in colour." ) ); assert_delta!( 0.25581, sorensen_dice( "Olive-green table for sale, in extremely good condition.", "For sale: green Subaru Impreza, 210,000 miles" ) ); assert_delta!( 0.14118, sorensen_dice( "Olive-green table for sale, in extremely good condition.", "Wanted: mountain bike with at least 21 gears." ) ); assert_delta!( 0.77419, sorensen_dice("this has one extra word", "this has one word") ); } } fuzzt-0.3.1/src/lib.rs000064400000000000000000000014071046102023000127140ustar 00000000000000//! This library implements string similarity metrics. #![forbid(unsafe_code)] #![allow( // these casts are sometimes needed. They restrict the length of input iterators // but there isn't really any way around this except for always working with // 128 bit types clippy::cast_possible_wrap, clippy::cast_sign_loss, clippy::cast_precision_loss, // not practical clippy::needless_pass_by_value, clippy::similar_names, // noisy clippy::missing_errors_doc, clippy::missing_panics_doc, clippy::must_use_candidate, // todo https://github.com/rapidfuzz/strsim-rs/issues/59 clippy::range_plus_one )] pub mod algorithms; pub mod processors; pub use utils::FuzztError; mod matcher; mod utils; pub use matcher::get_top_n; fuzzt-0.3.1/src/matcher.rs000064400000000000000000000100151046102023000135640ustar 00000000000000use crate::{ algorithms::{SequenceMatcher, Similarity, SimilarityMetric}, processors::{NullStringProcessor, StringProcessor}, }; use std::cmp::Reverse; use std::collections::BinaryHeap; /// Returns a list of the best matches to a collection of choices. /// /// This is a convenience function for getting the choices with the highest scores. /// /// # Arguments /// /// * `query` - A string to match against. /// * `choices` - A list of choices to compare against the query. /// * `cutoff` - A score threshold. No matches with a score less than this number will be returned. Defaults to 0.7. /// * `n` - Optional maximum for the number of elements returned. Defaults to 3. /// * `processor` - Optional function for transforming choices before matching. If not provided, `NullStringProcessor` is used. /// * `scorer` - Optional scoring function for extract(). If not provided, `SequenceMatcher` is used. /// /// # Returns /// /// * A vector of the top 'n' matches from the given choices. /// /// # Example /// /// ``` /// extern crate fuzzt; /// use fuzzt::{algorithms::NormalizedLevenshtein, get_top_n, processors::NullStringProcessor}; /// /// let matches = get_top_n( /// "apple", /// &["apply", "apples", "ape", "applet", "applesauce"], /// Some(0.8), /// Some(3), /// Some(&NullStringProcessor), /// Some(&NormalizedLevenshtein), /// ); /// assert_eq!(matches, ["apples", "applet", "apply"]); /// ``` pub fn get_top_n<'a>( query: &str, choices: &[&'a str], cutoff: Option, n: Option, processor: Option<&dyn StringProcessor>, scorer: Option<&dyn SimilarityMetric>, ) -> Vec<&'a str> { let mut matches = BinaryHeap::new(); let n = n.unwrap_or(3); let cutoff = cutoff.unwrap_or(0.7); let scorer = match scorer { Some(scorer_trait) => scorer_trait, None => &SequenceMatcher, }; let processor = match processor { Some(some_processor) => some_processor, None => &NullStringProcessor, }; let processed_query = processor.process(query); for &choice in choices { let processed_choice = processor.process(choice); let raw_ratio = scorer.compute_metric(processed_query.as_str(), processed_choice.as_str()); let ratio = match raw_ratio { Similarity::Usize(r) => r as f64, Similarity::Float(r) => r, }; if ratio >= cutoff { let int_ratio = match raw_ratio { Similarity::Usize(r) => r as i64, Similarity::Float(r) => (r * std::u32::MAX as f64) as i64, }; // we're putting the word itself in reverse in so that matches with // the same ratio are ordered lexicographically. matches.push((int_ratio, Reverse(choice))); } } let mut rv = vec![]; for _ in 0..n { if let Some((_, elt)) = matches.pop() { rv.push(elt.0); } else { break; } } rv } #[cfg(test)] mod tests { use super::get_top_n; use crate::algorithms::jaro::JaroWinkler; use crate::algorithms::SimilarityMetric; use crate::processors::{LowerAlphaNumStringProcessor, StringProcessor}; use rstest::rstest; #[rstest] #[case(Some(0.7), Some(3), None, None, &["brazil", "braziu", "trazil"])] #[case(Some(0.9), Some(5), None, None, &["brazil"])] #[case(Some(0.7), Some(2), None, Some(&JaroWinkler as &dyn SimilarityMetric), &["brazil", "braziu"])] #[case(Some(0.7), Some(2), Some(&LowerAlphaNumStringProcessor as &dyn StringProcessor), None, &["brazil", "BRA ZIL"])] fn test_get_top_n<'a>( #[case] cutoff: Option, #[case] n: Option, #[case] processor: Option<&dyn StringProcessor>, #[case] scorer: Option<&dyn SimilarityMetric>, #[case] expected: &[&'a str], ) { let choices = &["trazil", "BRA ZIL", "brazil", "spain", "braziu"][..]; let query = "brazil"; let matches = get_top_n(query, choices, cutoff, n, processor, scorer); assert_eq!(matches, expected); } } fuzzt-0.3.1/src/processors/mod.rs000064400000000000000000000002601046102023000151230ustar 00000000000000mod simple_processors; pub use simple_processors::{LowerAlphaNumStringProcessor, NullStringProcessor}; pub trait StringProcessor { fn process(&self, s: &str) -> String; } fuzzt-0.3.1/src/processors/simple_processors.rs000064400000000000000000000010671046102023000201250ustar 00000000000000use super::StringProcessor; pub struct LowerAlphaNumStringProcessor; pub struct NullStringProcessor; impl StringProcessor for LowerAlphaNumStringProcessor { fn process(&self, input: &str) -> String { let processed: String = input .chars() .filter(|c| c.is_alphanumeric() || c.is_whitespace()) .collect::() .trim() .to_lowercase(); processed } } impl StringProcessor for NullStringProcessor { fn process(&self, input: &str) -> String { input.to_owned() } } fuzzt-0.3.1/src/utils.rs000064400000000000000000000147521046102023000133150ustar 00000000000000use std::char; use std::convert::TryFrom; use std::error::Error; use std::fmt::{self, Display, Formatter}; use std::str::Chars; #[derive(Debug, PartialEq)] pub enum FuzztError { DifferentLengthArgs, } impl Display for FuzztError { fn fmt(&self, fmt: &mut Formatter) -> Result<(), fmt::Error> { let text = match self { FuzztError::DifferentLengthArgs => "Differing length arguments provided", }; write!(fmt, "{text}") } } impl Error for FuzztError {} pub struct StringWrapper<'a>(pub &'a str); impl<'a, 'b> IntoIterator for &'a StringWrapper<'b> { type Item = char; type IntoIter = Chars<'b>; fn into_iter(self) -> Self::IntoIter { self.0.chars() } } #[derive(Default, Clone)] struct GrowingHashmapMapElemChar { key: u32, value: ValueType, } /// specialized hashmap to store user provided types /// this implementation relies on a couple of base assumptions in order to simplify the implementation /// - the hashmap does not have an upper limit of included items /// - the default value for the `ValueType` can be used as a dummy value to indicate an empty cell /// - elements can't be removed /// - only allocates memory on first write access. /// This improves performance for hashmaps that are never written to struct GrowingHashmapChar { used: i32, fill: i32, mask: i32, map: Option>>, } impl Default for GrowingHashmapChar where ValueType: Default + Clone + Eq, { fn default() -> Self { Self { used: 0, fill: 0, mask: -1, map: None, } } } impl GrowingHashmapChar where ValueType: Default + Clone + Eq + Copy, { fn get(&self, key: u32) -> ValueType { self.map .as_ref() .map_or_else(|| Default::default(), |map| map[self.lookup(key)].value) } fn get_mut(&mut self, key: u32) -> &mut ValueType { if self.map.is_none() { self.allocate(); } let mut i = self.lookup(key); if self .map .as_ref() .expect("map should have been created above")[i] .value == Default::default() { self.fill += 1; // resize when 2/3 full if self.fill * 3 >= (self.mask + 1) * 2 { self.grow((self.used + 1) * 2); i = self.lookup(key); } self.used += 1; } let elem = &mut self .map .as_mut() .expect("map should have been created above")[i]; elem.key = key; &mut elem.value } fn allocate(&mut self) { self.mask = 8 - 1; self.map = Some(vec![GrowingHashmapMapElemChar::default(); 8]); } /// lookup key inside the hashmap using a similar collision resolution /// strategy to `CPython` and `Ruby` fn lookup(&self, key: u32) -> usize { let hash = key; let mut i = hash as usize & self.mask as usize; let map = self .map .as_ref() .expect("callers have to ensure map is allocated"); if map[i].value == Default::default() || map[i].key == key { return i; } let mut perturb = key; loop { i = (i * 5 + perturb as usize + 1) & self.mask as usize; if map[i].value == Default::default() || map[i].key == key { return i; } perturb >>= 5; } } fn grow(&mut self, min_used: i32) { let mut new_size = self.mask + 1; while new_size <= min_used { new_size <<= 1; } self.fill = self.used; self.mask = new_size - 1; let old_map = std::mem::replace( self.map .as_mut() .expect("callers have to ensure map is allocated"), vec![GrowingHashmapMapElemChar::::default(); new_size as usize], ); for elem in old_map { if elem.value != Default::default() { let j = self.lookup(elem.key); let new_elem = &mut self.map.as_mut().expect("map created above")[j]; new_elem.key = elem.key; new_elem.value = elem.value; self.used -= 1; if self.used == 0 { break; } } } self.used = self.fill; } } pub struct HybridGrowingHashmapChar { map: GrowingHashmapChar, extended_ascii: [ValueType; 256], } impl HybridGrowingHashmapChar where ValueType: Default + Clone + Copy + Eq, { pub fn get(&self, key: char) -> ValueType { let value = key as u32; if value <= 255 { let val_u8 = u8::try_from(value).expect("we check the bounds above"); self.extended_ascii[usize::from(val_u8)] } else { self.map.get(value) } } pub fn get_mut(&mut self, key: char) -> &mut ValueType { let value = key as u32; if value <= 255 { let val_u8 = u8::try_from(value).expect("we check the bounds above"); &mut self.extended_ascii[usize::from(val_u8)] } else { self.map.get_mut(value) } } } impl Default for HybridGrowingHashmapChar where ValueType: Default + Clone + Copy + Eq, { fn default() -> Self { HybridGrowingHashmapChar { map: GrowingHashmapChar::default(), extended_ascii: [Default::default(); 256], } } } #[derive(Clone, Copy, PartialEq, Eq)] pub struct RowId { pub val: isize, } impl Default for RowId { fn default() -> Self { Self { val: -1 } } } /// Returns an Iterator of char tuples. pub fn bigrams(s: &str) -> impl Iterator + '_ { s.chars().zip(s.chars().skip(1)) } /* Returns the final index for a value in a single vector that represents a fixed 2d grid */ pub fn flat_index(i: usize, j: usize, width: usize) -> usize { j * width + i } #[cfg(test)] mod tests { use super::*; #[test] fn bigrams_iterator() { let mut bi = bigrams("abcde"); assert_eq!(Some(('a', 'b')), bi.next()); assert_eq!(Some(('b', 'c')), bi.next()); assert_eq!(Some(('c', 'd')), bi.next()); assert_eq!(Some(('d', 'e')), bi.next()); assert_eq!(None, bi.next()); } } fuzzt-0.3.1/tests/lib.rs000064400000000000000000000024771046102023000132770ustar 00000000000000extern crate fuzzt; use fuzzt::{ algorithms::{ damerau_levenshtein, hamming, jaro, jaro_winkler, levenshtein, normalized_damerau_levenshtein, normalized_levenshtein, osa_distance, sequence_matcher, }, assert_delta, }; #[test] fn hamming_works() { match hamming("hamming", "hammers") { Ok(distance) => assert_eq!(3, distance), Err(why) => panic!("{:?}", why), } } #[test] fn levenshtein_works() { assert_eq!(3, levenshtein("kitten", "sitting")); } #[test] fn normalized_levenshtein_works() { assert_delta!(0.57142, normalized_levenshtein("kitten", "sitting")); } #[test] fn osa_distance_works() { assert_eq!(3, osa_distance("ac", "cba")); } #[test] fn damerau_levenshtein_works() { assert_eq!(2, damerau_levenshtein("ac", "cba")); } #[test] fn normalized_damerau_levenshtein_works() { assert_delta!( 0.27272, normalized_damerau_levenshtein("levenshtein", "löwenbräu") ); } #[test] fn jaro_works() { assert_delta!( 0.392, jaro("Friedrich Nietzsche", "Jean-Paul Sartre"), 0.001 ); } #[test] fn jaro_winkler_works() { assert_delta!(0.866, jaro_winkler("cheeseburger", "cheese fries"), 0.001); } #[test] fn sequence_matcher_works() { assert_delta!(0.615, sequence_matcher("kitten", "sitting"), 0.001); }