finl_unicode-1.2.0/.cargo_vcs_info.json0000644000000001360000000000100134720ustar { "git": { "sha1": "7abd8d97e6dfe313c027688ec667dbc1aeb9467b" }, "path_in_vcs": "" }finl_unicode-1.2.0/.github/FUNDING.yml000064400000000000000000000014651046102023000154450ustar 00000000000000# These are supported funding model platforms github: dahosek # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2] #patreon: # Replace with a single Patreon username #open_collective: # Replace with a single Open Collective username #ko_fi: # Replace with a single Ko-fi username #tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel #community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry #liberapay: # Replace with a single Liberapay username #issuehunt: # Replace with a single IssueHunt username #otechie: # Replace with a single Otechie username #lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry #custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] finl_unicode-1.2.0/.gitignore000064400000000000000000000000731046102023000142520ustar 00000000000000/target Cargo.lock .idea .DS_Store /generate-sources/targetfinl_unicode-1.2.0/Cargo.toml0000644000000026010000000000100114670ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. 
[package] edition = "2021" name = "finl_unicode" version = "1.2.0" description = "Library for handling Unicode functionality for finl (categories and grapheme segmentation)" homepage = "https://finl.xyz" readme = "README.md" keywords = [ "unicode", "segmentation", "graphemes", ] categories = [ "text-processing", "internationalization", ] license = "MIT OR Apache-2.0" repository = "https://github.com/dahosek/finl_unicode" resolver = "2" [[bench]] name = "categories" harness = false [[bench]] name = "grapheme_clusters" harness = false [dependencies] [dev-dependencies.bstr] version = "1.0.0" [dev-dependencies.criterion] version = "0.3.5" features = ["html_reports"] [dev-dependencies.unicode-segmentation] version = "1.9.0" [dev-dependencies.unicode_categories] version = "0.1.1" [build-dependencies] [features] categories = [] default = [ "categories", "grapheme_clusters", ] grapheme_clusters = [] finl_unicode-1.2.0/Cargo.toml.orig0000644000000015340000000000100124320ustar [package] name = "finl_unicode" version = "1.2.0" edition = "2021" license = "MIT OR Apache-2.0" keywords = ["unicode", "segmentation", "graphemes"] categories = ["text-processing", "internationalization"] description = "Library for handling Unicode functionality for finl (categories and grapheme segmentation)" homepage = "https://finl.xyz" repository = "https://github.com/dahosek/finl_unicode" [dependencies] [dev-dependencies] criterion = { version = "0.3.5", features=["html_reports"]} unicode_categories = "0.1.1" finl_unicode = {path=".", features=["grapheme_clusters", "categories"]} unicode-segmentation = "1.9.0" bstr = "1.0.0" [features] default = ["categories", "grapheme_clusters"] categories = [] grapheme_clusters = [] [build-dependencies] [[bench]] name = "categories" harness = false [[bench]] name = "grapheme_clusters" harness = falsefinl_unicode-1.2.0/Cargo.toml.orig000064400000000000000000000015341046102023000151540ustar 00000000000000[package] name = "finl_unicode" version = "1.2.0" edition 
= "2021" license = "MIT OR Apache-2.0" keywords = ["unicode", "segmentation", "graphemes"] categories = ["text-processing", "internationalization"] description = "Library for handling Unicode functionality for finl (categories and grapheme segmentation)" homepage = "https://finl.xyz" repository = "https://github.com/dahosek/finl_unicode" [dependencies] [dev-dependencies] criterion = { version = "0.3.5", features=["html_reports"]} unicode_categories = "0.1.1" finl_unicode = {path=".", features=["grapheme_clusters", "categories"]} unicode-segmentation = "1.9.0" bstr = "1.0.0" [features] default = ["categories", "grapheme_clusters"] categories = [] grapheme_clusters = [] [build-dependencies] [[bench]] name = "categories" harness = false [[bench]] name = "grapheme_clusters" harness = falsefinl_unicode-1.2.0/LICENSE-APACHE000064400000000000000000000227731046102023000142210ustar 00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. 
"Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." 
"Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. 
You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. 
Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. 
END OF TERMS AND CONDITIONS finl_unicode-1.2.0/LICENSE-MIT000064400000000000000000000017771046102023000137320ustar 00000000000000Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. finl_unicode-1.2.0/README.md000064400000000000000000000205751046102023000135520ustar 00000000000000# finl Unicode support This crate is designed for the Unicode needs of the finl project, but is designed to be usable by other software as well. In the current release (1.0.x), support is provided for character code identification and grapheme segmentation and Unicode14.0.0. ## Overview ### Category identification Loading the `finl_unicode` crate with the `categories` feature will add methods onto the char type to test the category of a character or identify its category. See the rustdoc for detail. ### Grapheme clusters Loading the `finl_unicode` crate with the `grapheme_clusters` feature will extend `Peekable` to have a `next_cluster()` method which will return the next grapheme cluster from the iterator. 
There is also a pure cluster iterator available by calling `Graphemes::new(s)` on a `&str`. I don’t use this in finl, but wrote it using the same algorithm as the extension of `Peekable` for the purposes of benchmarking.¹ ## Why? There *are* existing crates for these purposes, but segmentation lacked the interface for segmentation that I wanted (which was to be able to extend `Peekable` with a method to fetch the next grapheme cluster if it existed). I incorrectly assumed that this would require character code identification, which turned out to be incorrect, but it turned out that the crate I was using was outdated and possibly abandoned and had an inefficient algorithm so it turned out to be a good thing that I wrote it. I did benchmarks comparing my code against existing crates and discovered that I had managed to eke out performance gains against all of them, so that’s an added bonus. ### Benchmark results All benchmarks are generated using Criterion. You can replicate them by running `cargo bench` from the project directory. Three numbers are given for all results: low/mean/high, all from the output of Criterion. The mean value is given in **bold**. #### Unicode categories I ran four benchmarks to compare the performance of the crates. The Japanese text benchmark reads the Project Gutenberg EBook of *Kumogata monsho* by John Falkner and counts the characters in it which are Unicode letters. The Czech text benchmark reads the Project Gutenberg EBook of *Cítanka pro skoly obecné* by Jan Stastný and Jan Lepar and Josef Sokol (this was to exercise testing against a Latin-alphabet text with lots of diacriticals). All letters and lowercase letters are counted. The English text benchmark reads the Project Gutenberg eBook of *Frankenstein* by Mary Wollstonecraft Shelley (to run against a text which is pure ASCII). All letters and lowercase letters are counted. The source code check is from neovim. Again, letters and lowercase letters are counted in the sample.
I compared against [unicode_categories](https://docs.rs/unicode_categories/latest/unicode_categories/) 0.1.1. All times are in ms. Smaller is better. | Benchmark | `finl_unicode` | `unicode_categories` | |--------------------------|-----------------------------|--------------------------| | Japanese text | 0.62484/**0.64200**/0.66311 | 15.382/**15.719**/16.092 | | Czech text | 0.18248/**0.19137**/0.19975 | 3.2322/**3.3329**/3.4435 | | Czech text (lowercase) | 0.20361/**0.20529**/0.20724 | 1.8496/**1.8742**/1.9026 | | English text | 0.52260/**0.54461**/0.56682 | 13.038/**13.330**/13.655 | | English text (lowercase) | 0.72885/**0.74219**/0.75668 | 8.3998/**8.5037**/8.6233 | | Source code | 0.05544/**0.05785**/0.06046 | 1.6512/**1.7063**/1.7656 | | Source code (lowercase) | 0.07506/**0.07673**/0.07895 | 0.7285/**0.7536**/0.7821 | As you can see, this is a clear win (the difference is the choice of algorithm. `finl_unicode` uses two-step table lookup to be able to store categories compactly while `unicode_categories` uses a combination of range checks and binary searches on tables). #### Grapheme clusters I compared against [unicode_segmentation](https://docs.rs/unicode-segmentation/latest/unicode_segmentation/) 1.9.0 (part of the unicode-rs project) and [bstr](https://docs.rs/bstr/latest/bstr/) 1.0.0. Comparisons are run against graphemes.txt, derived from the Unicode test suite, plus several language texts that were part of the `unicode_segmentation` benchmark suite. All times are in µs, smaller is better. 
| Benchmark | `finl_unicode` | `unicode_segmentation` | `bstr` | |------------------|--------------------------|--------------------------|--------------------------| | Unicode graphemes | 130.34/**133.31**/137.00 | 209.51/**217.50**/225.53 | 337.68/**354.59**/372.75 | | Arabic text | 262.05/**268.78**/273.65 | 443.11/**463.19**/482.25 | 842.78/**872.47**/906.84 | | English text | 387.88/**395.08**/404.00 | 527.29/**552.92**/586.04 | 424.73/**437.04**/449.23 | | Hindi text | 204.88/**216.04**/228.14 | 489.75/**500.55**/512.20 | 638.01/**641.28**/644.87 | | Japanese text | 181.65/**190.87**/202.92 | 437.98/**451.51**/467.17 | 855.04/**880.48**/904.88 | | Korean text | 298.19/**304.42**/312.47 | 813.45/**844.54**/880.53 | 1259.2/**1304.7**/1350.6 | | Mandarin text | 154.55/**159.33**/164.22 | 284.59/**293.63**/306.59 | 679.67/**704.13**/730.46 | | Russian text | 300.56/**312.86**/327.44 | 372.59/**392.12**/419.40 | 783.41/**838.96**/896.44 | | Source code | 424.39/**443.88**/463.77 | 501.16/**506.81**/513.27 | 513.79/**531.82**/551.31 | Adding some additional tests reveals some interesting contrasts in performance. On text with minimal clustering (English and source code), my code is faster than `unicode_segmentation` and `bstr` (but not dramatically so) and it's interesting to see that `bstr` is slightly faster than `unicode_segmentation` on the English text benchmark, but where grapheme clusters become more common (Arabic and Hindi), the performance is dramatically better with my crate. I wouldn’t expect clusters in the Japanese, but it and Korean show the most dramatic differences in performance. ## Why not? You may want to avoid this if you need `no_std` (maybe I’ll cover that in a future version, but probably not). If you need other clustering algorithms, I have no near future plans to implement them (but I would do it for money). There is no equivalent to `unicode_segmentation`’s `GraphemeCursor` as I don’t need that functionality for finl.
Reverse iteration over graphemes is not supported, nor do I have plans to support it. I do not support legacy clustering algorithms which are supported by `unicode-segmentation`. However, the Unicode specification discourages the use of legacy clustering which is only documented for backwards compatibility with very old versions of the Unicode standard.² ## Unicode copyright notice This package incorporates data from Unicode Inc. Copyright © 1991–2022 Unicode, Inc. All rights reserved. ## Support I’ve released this under an MIT/Apache license. Do what you like with it. I wouldn’t mind contributions to the ongoing support of developing finl, but they’re not necessary (although if you’re Microsoft or Google and you use my code, surely you can throw some dollars in my bank account). I guarantee no warranty or support, although if you care to throw some money my way, I can prioritize your requests. ## Version history - **1.0.0** Initial release - **1.0.1** Build-process changes to make docs.rs documentation build - **1.0.2** More changes because the first round apparently weren’t enough - **1.1.0** Add support for Unicode 15.0.0, added new benchmark comparisons. - **1.2.0** Allow grapheme clustering to work on any `Peekable` iterator over `char` or `(usize,char)`. --- 1. For technical reasons, the iterator extension returns `Option<String>` rather than `Option<&str>` and thus will automatically underperform other implementations which are returning *all* the grapheme clusters. For finl, however, I would need an owned value for the string containing the cluster anyway and since I only occasionally need a cluster, I decided it was acceptable to take the performance hit. But see the benchmark results for the fact that I apparently managed to implement a faster algorithm anyway when doing an apples-to-apples comparison of speeds. 2.
Pure speculation, but I think that this might be the entire reason for the difference in performance between `finl_unicode` and `unicode_segmentation`. However, I have not looked at the source code to confirm my suspicion.finl_unicode-1.2.0/src/data/mod.rs000064400000000000000000000001221046102023000151020ustar 00000000000000pub mod characters; pub mod grapheme_property; #[cfg(test)] pub mod grapheme_test;finl_unicode-1.2.0/src/grapheme_clusters.rs000064400000000000000000000410641046102023000171400ustar 00000000000000//! This module provides two interfaces for accessing clusters from an underlying string. The //! `GraphemeCluster` trait extends the `Peekable` iterators over `Chars` or `CharIndices` //! to add a `next_cluster` method which returns `Option` with the next //! cluster if one exists. This is the best method for getting individual clusters from a stream which is normally //! only getting `char`s but is not recommended if you wish to iterate over clusters. //! ``` //! # use crate::finl_unicode::grapheme_clusters::GraphemeCluster; //! let mut char_iterator = "A\u{301}✋🏽🇦🇹!".chars().peekable(); //! assert_eq!(char_iterator.next_cluster(), Some("A\u{301}".to_string())); //! assert_eq!(char_iterator.next_cluster(), Some("✋🏽".to_string())); //! assert_eq!(char_iterator.next_cluster(), Some("🇦🇹".to_string())); //! assert_eq!(char_iterator.next_cluster(), Some("!".to_string())); //! assert_eq!(char_iterator.next_cluster(), None); //! ``` //! //! For the iterating over clusters case there is a struct `Graphemes` which implements `iterator` //! and can be constructed from a `&str`. This returns references to substrings of the original //! `&str` and is more performant for that case than the extended iterator provided through //! `GraphemeCluster` which allocates a new `String` for each cluster found. //! ``` //! # use crate::finl_unicode::grapheme_clusters::Graphemes; //! let graphemes = Graphemes::new("A\u{301}✋🏽🇦🇹!"); //! 
assert_eq!(graphemes.collect::>(), ["A\u{301}", "✋🏽", "🇦🇹", "!"]) //! ``` use std::iter::Peekable; use std::str::CharIndices; use std::str::Chars; use crate::data::grapheme_property::{GP_PAGES,GP_TABLE}; /// `Graphemes` provides an iterator over the grapheme clusters of a string. pub struct Graphemes<'a> { input: &'a str, iter: Peekable>, } impl<'a> Graphemes<'a> { /// A new instance of graphemes can be constructed from a string using `Graphemes::new` /// ``` /// # use crate::finl_unicode::grapheme_clusters::Graphemes; /// let graphemes = Graphemes::new("some string"); /// ``` pub fn new(input: &'a str) -> Graphemes<'a> { let iter = input.char_indices().peekable(); Graphemes { input, iter } } } impl<'a> Iterator for Graphemes<'a> { type Item = &'a str; #[inline] /// Return a slice of the underlying /// string corresponding to the next cluster if one exists, or `None` if the end of the string /// has been reached. fn next(&mut self) -> Option { if let Some(&(start, _)) = self.iter.peek() { let mut cluster_machine = ClusterMachine::new(); loop { if let Some(&(curr_loc, ch)) = self.iter.peek() { match cluster_machine.find_cluster(ch) { Break::None => { self.iter.next(); } Break::Before => { return Some(&self.input[start..curr_loc]); } Break::After => { self.iter.next(); return Some( if let Some(&(curr_loc, _)) = self.iter.peek() { &self.input[start..curr_loc] } else { &self.input[start..] }); } } } else { return Some(&self.input[start..]); } } } else { None } } } /// Get the next grapheme cluster from a stream of characters or char indices /// This trait is implemented for any `Peekable` iterator over either `char` or `(usize, char)` (so /// it will work on `Peekable` and `Peekable` as well as any other peekable iterator /// which meets this requirement. pub trait GraphemeCluster { fn next_cluster(&mut self) -> Option; } impl GraphemeCluster for T where T: PeekChar { /// Returns the next cluster if there is one in an `Option`. 
Since this has a heap allocation /// it is *not* recommended for iterating over all the clusters in a string. In that case, use /// `Graphemes` instead. #[inline] fn next_cluster(&mut self) -> Option { if self.has_next() { let mut cluster_machine = ClusterMachine::new(); let mut rv = String::new(); loop { if let Some(ch) = self.peek_char() { let state = cluster_machine.find_cluster(ch); match state { Break::None => { rv.push(ch); self.next(); } Break::Before => { return Some(rv); } Break::After => { rv.push(ch); self.next(); return Some(rv); } } } else { break; } } Some(rv) } else { None } } } /// This trait exists primarily to allow a single implementation to be used for both `Peekable` /// and `Peekable`. You could implement this for some other iterator if you like as /// long as you can implement the two methods below. pub trait PeekChar: Iterator { /// Returns the next character (if it exists) or `None` otherwise. fn peek_char(&mut self) -> Option; /// Returns `true` if there is another character available on the iterator, `false` otherwise. 
fn has_next(&mut self) -> bool; } trait HasChar { fn get_char(& self) -> char; } impl HasChar for char { fn get_char(& self) -> char { *self } } impl HasChar for (usize, char) { fn get_char(&self) -> char { self.1 } } impl PeekChar for Peekable where CharIter: Iterator { #[inline] fn peek_char(&mut self) -> Option { self.peek().map(|c| c.get_char()) } #[inline] fn has_next(&mut self) -> bool { self.peek().is_some() } } // ------------------------ // Private implementation details follow #[derive(PartialEq)] enum ClusterMachineState { Start, Precore, CcsBase, CrLf, HangulSyllableL, HangulSyllableV, HangulSyllableT, CcsExtend, Flag, Emoji, EmojiZWJ, Other, } #[derive(Debug, PartialEq)] enum Break { None, Before, After, } struct ClusterMachine { state: ClusterMachineState, } impl ClusterMachine { #[inline] pub fn new() -> ClusterMachine { ClusterMachine { state: ClusterMachineState::Start, } } /// If we have a cluster, we return the cluster in a `String` in an `Option` long with a `bool` /// If the `bool` is true, it means that we are also consuming the character in the cluster. 
#[inline] pub fn find_cluster(&mut self, c: char) -> Break { if self.state == ClusterMachineState::Start { return self.first_character(c); } let property = get_property(c); if property == GraphemeProperty::CONTROL { return if self.state == ClusterMachineState::CrLf && c == '\n' { self.state = ClusterMachineState::Start; Break::After } else { if c == '\r' { self.state = ClusterMachineState::CrLf; } else { self.state = ClusterMachineState::Start; } Break::Before } } match self.state { ClusterMachineState::Start => self.first_character(c), ClusterMachineState::Precore => { self.first_character(c); Break::None } ClusterMachineState::HangulSyllableL => { match property { GraphemeProperty::L => Break::None, GraphemeProperty::V | GraphemeProperty::LV => { self.state = ClusterMachineState::HangulSyllableV; Break::None } GraphemeProperty::LVT => { self.state = ClusterMachineState::HangulSyllableT; Break::None } GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK | GraphemeProperty::ZWJ => { self.state = ClusterMachineState::CcsBase; Break::None } _ => { self.first_character(c); Break::Before } } } ClusterMachineState::HangulSyllableV => { match property { GraphemeProperty::V => Break::None, GraphemeProperty::T => { self.state = ClusterMachineState::HangulSyllableT; Break::None } GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK | GraphemeProperty::ZWJ => { self.state = ClusterMachineState::CcsBase; Break::None } _ => { self.first_character(c); Break::Before } } } ClusterMachineState::HangulSyllableT => { match property { GraphemeProperty::T => Break::None, GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK | GraphemeProperty::ZWJ => { self.state = ClusterMachineState::CcsBase; Break::None } _ => { self.first_character(c); Break::Before } } } ClusterMachineState::CcsExtend => { match property { GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK | GraphemeProperty::ZWJ => Break::None, _ => Break::Before } } ClusterMachineState::Flag => { self.state 
= ClusterMachineState::Start; match property { GraphemeProperty::REGIONAL_INDICATOR => { self.state = ClusterMachineState::Other; Break::None } GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK | GraphemeProperty::ZWJ => { self.state = ClusterMachineState::CcsExtend; Break::None } _ => { self.first_character(c); Break::Before } } } ClusterMachineState::Emoji => { match property { GraphemeProperty::ZWJ => { self.state = ClusterMachineState::EmojiZWJ; Break::None } GraphemeProperty::EXTEND | GraphemeProperty::SPACING_MARK => { self.state = ClusterMachineState::Emoji; Break::None } _ => { self.first_character(c); Break::Before } } } ClusterMachineState::EmojiZWJ => { if property == GraphemeProperty::EXTENDED_GRAPHEME { self.state = ClusterMachineState::Emoji; Break::None } else { Break::Before } } ClusterMachineState::CrLf => Break::Before, _ => { if is_continuation(property) { Break::None } else { self.first_character(c); Break::Before } } } } #[inline] fn first_character(&mut self, c: char) -> Break { if c == '\r' { self.state = ClusterMachineState::CrLf; return Break::None; } let property = get_property(c); if property == GraphemeProperty::CONTROL { self.state = ClusterMachineState::Start; return Break::After; } match property { GraphemeProperty::PREPEND => { self.state = ClusterMachineState::Precore; } GraphemeProperty::EXTEND => { self.state = ClusterMachineState::CcsExtend; } GraphemeProperty::SPACING_MARK => { self.state = ClusterMachineState::CcsExtend; } GraphemeProperty::L => { self.state = ClusterMachineState::HangulSyllableL; } GraphemeProperty::V => { self.state = ClusterMachineState::HangulSyllableV; } GraphemeProperty::T => { self.state = ClusterMachineState::HangulSyllableT; } GraphemeProperty::LV => { self.state = ClusterMachineState::HangulSyllableV; } GraphemeProperty::LVT => { self.state = ClusterMachineState::HangulSyllableT; } GraphemeProperty::EXTENDED_GRAPHEME => { self.state = ClusterMachineState::Emoji; } 
GraphemeProperty::REGIONAL_INDICATOR => { self.state = ClusterMachineState::Flag; } _ => { self.state = ClusterMachineState::Other; } } Break::None } } #[inline] fn is_continuation(property: u8) -> bool { property != 0 && property & 0xc == 0 } // Symbolic names for properties in data tables struct GraphemeProperty {} impl GraphemeProperty { const EXTEND: u8 = 0x01; const SPACING_MARK: u8 = 0x02; const ZWJ: u8 = 0x03; const CONTROL: u8 = 0x04; const PREPEND: u8 = 0x05; const EXTENDED_GRAPHEME: u8 = 0x06; const REGIONAL_INDICATOR: u8 = 0x07; const L: u8 = 0x0c; const V: u8 = 0x08; const T: u8 = 0x09; const LV: u8 = 0x0d; const LVT: u8 = 0x0e; } #[inline] fn get_property(c: char) -> u8 { GP_PAGES[usize::from(GP_TABLE[(c as usize) >> 8])][(c as usize) & 0xff] } #[cfg(test)] pub (crate) mod tests { use crate::grapheme_clusters::*; #[test] fn low_level_interface_test() { let mut machine = ClusterMachine::new(); assert_eq!(machine.find_cluster('\r'), Break::None); assert_eq!(machine.find_cluster('a'), Break::Before); assert_eq!(machine.find_cluster('\r'), Break::Before); assert_eq!(machine.find_cluster('\n'), Break::After); } #[test] fn can_get_clusters() { let mut peekable_index = "\r\ne\u{301}f".char_indices().peekable(); assert_eq!(Some("\r\n".to_string()), peekable_index.next_cluster()); assert_eq!(Some("e\u{301}".to_string()), peekable_index.next_cluster()); assert_eq!(Some("f".to_string()), peekable_index.next_cluster()); } pub (crate) fn grapheme_test(input: &str, expected_output: &[&str], message: &str) { let mut iter = input.char_indices().peekable(); let mut clusters = vec!(); while let Some(cluster) = iter.next_cluster() { clusters.push(cluster); } assert_eq!(clusters.len(), expected_output.len(), "Lengths did not match on Grapheme Cluster\n\t{message}\n\tOutput: {clusters:?}\n\tExpected: {expected_output:?}"); clusters.iter().zip(expected_output.into_iter()) .for_each(|(actual, &expected)| assert_eq!(actual.as_str(), expected, "GraphemeCluster mismatch: 
{message}")); let iter = Graphemes::new(input); let clusters = iter.collect::>(); assert_eq!(clusters.len(), expected_output.len(), "Lengths did not match on Grapheme Cluster Indices\n\t{message}\n\tOutput: {clusters:?}\n\tExpected: {expected_output:?}"); clusters.iter().zip(expected_output.into_iter()) .for_each(|(actual, &expected)| assert_eq!(*actual, expected, "Grapheme cluster indices mismatch: {message}\n{} ≠ {}", actual.escape_unicode(), expected.escape_unicode())); } } finl_unicode-1.2.0/src/lib.rs000064400000000000000000000023461046102023000141720ustar 00000000000000//! `finl_unicode` is a crate to provide Unicode support for the finl project. This is not necessarily //! meant to be a comoprehensive Unicode support, although I will consider adding additional use cases //! as necessary. Unicode 14.0.0 is implemented in the current version. //! //! Two features are currently supported: //! - **Unicode segmentation**. (Specify `clusters` as a feature when importing the crate.) For a peekable iterator of `CharIndices`, we extend that iterator to //! include a `next_cluster` method which returns `Option` which will contain the next //! grapheme cluster if there is one or `None` if there isn't. //! - **Character category**. (Specify `categories` as a feature when importing the crate.) Extends the `char` class with methods for testing the //! category of the character. //! //! The default is to compile all features. Note that the Rust compiler/linker will not automatically //! link unused code, so you most of the time, there will be no need to remove features. //! //! Building the crate runs a build script which connects to unicode.org to download the data files. #[cfg(feature = "categories")] pub mod categories; #[cfg(feature = "grapheme_clusters")] pub mod grapheme_clusters; mod data;