==> regex-automata-0.1.8/.gitignore <==
/target
/regex-automata-debug/target
tags
/Cargo.lock

==> regex-automata-0.1.8/COPYING <==
This project is dual-licensed under the Unlicense and MIT licenses.

You may use this code under the terms of either license.

==> regex-automata-0.1.8/Cargo.toml.orig <==
[package]
name = "regex-automata"
version = "0.1.8"  #:version
authors = ["Andrew Gallant <jamslam@gmail.com>"]
description = "Automata construction and matching using regular expressions."
documentation = "https://docs.rs/regex-automata"
homepage = "https://github.com/BurntSushi/regex-automata"
repository = "https://github.com/BurntSushi/regex-automata"
readme = "README.md"
keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
license = "Unlicense/MIT"
categories = ["text-processing"]
exclude = [
  "/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug",
]
autotests = false

[badges]
travis-ci = { repository = "BurntSushi/regex-automata" }
appveyor = { repository = "BurntSushi/regex-automata" }

[workspace]
members = ["bench"]
# We'd ideally not do this, but since the debug tool uses Rust 2018, older
# versions of Rust (such as 1.28) fail to parse the manifest because it treats
# `edition = "2018"` as an unstable feature.
#
# When we move our MSRV to Rust 2018, then we should be able to add this back
# to the workspace.
exclude = ["regex-automata-debug"]

[lib]
bench = false

[features]
default = ["std"]
std = ["regex-syntax", "utf8-ranges"]

[dependencies]
byteorder = { version = "1.2.7", default-features = false }
regex-syntax = { version = "0.6.4", optional = true }
utf8-ranges = { version = "1", optional = true }

[dev-dependencies]
lazy_static = "1.2.0"
regex = "1.1"
serde = "1.0.82"
serde_bytes = "0.11"
serde_derive = "1.0.82"
toml = "0.4.10"

[[test]]
path = "tests/tests.rs"
name = "default"

[profile.dev]
# Running tests takes too long in debug mode, so we forcefully always build
# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
opt-level = 3
debug = true

[profile.test]
# Running tests takes too long in debug mode, so we forcefully always build
# with optimizations. Unfortunate, but, ¯\_(ツ)_/¯.
opt-level = 3
debug = true

[profile.release]
debug = true

[profile.bench]
debug = true

==> regex-automata-0.1.8/Cargo.toml <==
# THIS FILE IS AUTOMATICALLY GENERATED BY CARGO
#
# When uploading crates to the registry Cargo will automatically
# "normalize" Cargo.toml files for maximal compatibility
# with all versions of Cargo and also rewrite `path` dependencies
# to registry (e.g., crates.io) dependencies
#
# If you believe there's an error in this file please file an
# issue against the rust-lang/cargo repository. If you're
# editing this file be aware that the upstream Cargo.toml
# will likely look very different (and much more reasonable)

[package]
name = "regex-automata"
version = "0.1.8"
authors = ["Andrew Gallant <jamslam@gmail.com>"]
exclude = ["/.travis.yml", "/appveyor.yml", "/ci/*", "/scripts/*", "/regex-automata-debug"]
autotests = false
description = "Automata construction and matching using regular expressions."
homepage = "https://github.com/BurntSushi/regex-automata"
documentation = "https://docs.rs/regex-automata"
readme = "README.md"
keywords = ["regex", "dfa", "automata", "automaton", "nfa"]
categories = ["text-processing"]
license = "Unlicense/MIT"
repository = "https://github.com/BurntSushi/regex-automata"

[profile.test]
opt-level = 3
debug = true

[profile.bench]
debug = true

[profile.dev]
opt-level = 3
debug = true

[profile.release]
debug = true

[lib]
bench = false

[[test]]
name = "default"
path = "tests/tests.rs"

[dependencies.byteorder]
version = "1.2.7"
default-features = false

[dependencies.regex-syntax]
version = "0.6.4"
optional = true

[dependencies.utf8-ranges]
version = "1"
optional = true

[dev-dependencies.lazy_static]
version = "1.2.0"

[dev-dependencies.regex]
version = "1.1"

[dev-dependencies.serde]
version = "1.0.82"

[dev-dependencies.serde_bytes]
version = "0.11"

[dev-dependencies.serde_derive]
version = "1.0.82"

[dev-dependencies.toml]
version = "0.4.10"

[features]
default = ["std"]
std = ["regex-syntax", "utf8-ranges"]

[badges.appveyor]
repository = "BurntSushi/regex-automata"

[badges.travis-ci]
repository = "BurntSushi/regex-automata"

==> regex-automata-0.1.8/LICENSE-MIT <==
The MIT License (MIT)

Copyright (c) 2015 Andrew Gallant <jamslam@gmail.com>

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to
deal in the Software without restriction, including without limitation the
rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
sell copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
IN THE SOFTWARE.

==> regex-automata-0.1.8/README.md <==
regex-automata
==============
A low level regular expression library that uses deterministic finite
automata. It supports a rich syntax with Unicode support, has extensive
options for configuring the best space vs time trade off for your use case,
and provides support for cheap deserialization of automata for use in
`no_std` environments.

[![Linux build status](https://api.travis-ci.org/BurntSushi/regex-automata.svg)](https://travis-ci.org/BurntSushi/regex-automata)
[![Windows build status](https://ci.appveyor.com/api/projects/status/github/BurntSushi/regex-automata?svg=true)](https://ci.appveyor.com/project/BurntSushi/regex-automata)
[![](http://meritbadge.herokuapp.com/regex-automata)](https://crates.io/crates/regex-automata)

Dual-licensed under MIT or the [UNLICENSE](http://unlicense.org).
### Documentation

https://docs.rs/regex-automata

### Usage

Add this to your `Cargo.toml`:

```toml
[dependencies]
regex-automata = "0.1"
```

and this to your crate root (if you're using Rust 2015):

```rust
extern crate regex_automata;
```

### Example: basic regex searching

This example shows how to compile a regex using the default configuration
and then use it to find matches in a byte string:

```rust
use regex_automata::Regex;

let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap();
let text = b"2018-12-24 2016-10-08";
let matches: Vec<(usize, usize)> = re.find_iter(text).collect();
assert_eq!(matches, vec![(0, 10), (11, 21)]);
```

For more examples and information about the various knobs that can be
turned, please see the [docs](https://docs.rs/regex-automata).

### Support for `no_std`

This crate comes with a `std` feature that is enabled by default. When the
`std` feature is enabled, the API of this crate will include the facilities
necessary for compiling, serializing, deserializing and searching with
regular expressions. When the `std` feature is disabled, the API of this
crate will shrink such that it only includes the facilities necessary for
deserializing and searching with regular expressions.

The intended workflow for `no_std` environments is thus as follows:

* Write a program with the `std` feature that compiles and serializes a
  regular expression. Serialization should only happen after first converting
  the DFAs to use a fixed size state identifier instead of the default
  `usize`. You may also need to serialize both little and big endian versions
  of each DFA. (So that's 4 DFAs in total for each regex.)
* In your `no_std` environment, follow the examples above for deserializing
  your previously serialized DFAs into regexes. You can then search with them
  as you would any regex. Deserialization can happen anywhere. For example,
  with bytes embedded into a binary or with a file memory mapped at runtime.
  A sketch of this round trip is shown below.

Note that the [`ucd-generate`](https://github.com/BurntSushi/ucd-generate)
tool will do the first step for you with its `dfa` or `regex` sub-commands.
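For a concrete picture of that workflow, here is a minimal sketch of the
serialize/deserialize round trip, modeled on the crate's documentation for
the 0.1 `DenseDFA` API. The pattern is illustrative, the snippet assumes a
function returning a `Result`, and in a real `no_std` setup the serialized
bytes would be produced by a separate `std`-enabled program:

```rust
use regex_automata::{DFA, DenseDFA};

// With the `std` feature: compile, shrink state identifiers from usize to
// u16 and serialize. (A real setup would emit both little and big endian
// copies; native endian keeps this sketch short.)
let dfa = DenseDFA::new("foo[0-9]+")?.to_u16()?;
let bytes = dfa.to_bytes_native_endian()?;

// With or without `std`: deserialize and search. `from_bytes` is unsafe
// because it does not verify that the given bytes encode a valid DFA.
let dfa: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&bytes) };
assert_eq!(Some(8), dfa.find(b"foo12345"));
```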
### Differences with the regex crate

The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a
general purpose regular expression engine. It aims to automatically balance
low compile times, fast search times and low memory usage, while also
providing a convenient API for users. In contrast, this crate provides a
lower level regular expression interface that is a bit less convenient while
providing more explicit control over memory usage and search times.

Here are some specific negative differences:

* **Compilation can take an exponential amount of time and space** in the
  size of the regex pattern. While most patterns do not exhibit worst case
  exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will
  build a DFA with `2^(N+1)` states. For this reason, untrusted patterns
  should not be compiled with this library. (In the future, the API may
  expose an option to return an error if the DFA gets too big.)
* This crate does not support sub-match extraction, which can be achieved
  with the regex crate's "captures" API. This may be added in the future, but
  is unlikely.
* While the regex crate doesn't necessarily sport fast compilation times, the
  regexes in this crate are almost universally slow to compile, especially
  when they contain large Unicode character classes. For example, on my
  system, compiling `\w{3}` with byte classes enabled takes just over 1
  second and almost 5MB of memory! (Compiling a sparse regex takes about the
  same time but only uses about 500KB of memory.) Conversely, compiling the
  same regex without Unicode support, e.g., `(?-u)\w{3}`, takes under 1
  millisecond and less than 5KB of memory. For this reason, you should only
  use Unicode character classes if you absolutely need them!
* This crate does not support regex sets.
* This crate does not support zero-width assertions such as `^`, `$`, `\b`
  or `\B`.
* As a lower level crate, this library does not do literal optimizations. In
  exchange, you get predictable performance regardless of input. The
  philosophy here is that literal optimizations should be applied at a higher
  level, although there is no easy support for this in the ecosystem yet.
* There is no `&str` API like in the regex crate. In this crate, all APIs
  operate on `&[u8]`. By default, match indices are guaranteed to fall on
  UTF-8 boundaries, unless `RegexBuilder::allow_invalid_utf8` is enabled.

With some of the downsides out of the way, here are some positive
differences:

* Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply
  deserialized. Deserialization always takes constant time since searching
  can be performed directly on the raw serialized bytes of a DFA.
* This crate was specifically designed so that the searching phase of a DFA
  has minimal runtime requirements, and can therefore be used in `no_std`
  environments. While `no_std` environments cannot compile regexes, they can
  deserialize pre-compiled regexes.
* Since this crate builds DFAs ahead of time, it will generally outperform
  the `regex` crate on equivalent tasks. The performance difference is likely
  not large. However, because of a complex set of optimizations in the regex
  crate (like literal optimizations), an accurate performance comparison may
  be difficult to do.
* Sparse DFAs provide a way to build a DFA ahead of time that sacrifices
  search performance a small amount, but uses much less storage space.
  Potentially even less than what the regex crate uses.
* This crate exposes DFAs directly, such as `DenseDFA` and `SparseDFA`, which
  enables one to do less work in some cases. For example, if you only need
  the end of a match and not the start of a match, then you can use a DFA
  directly without building a `Regex`, which always requires a second DFA to
  find the start of a match.
* Aside from choosing between dense and sparse DFAs, there are several
  options for configuring the space usage vs search time trade off. These
  include choosing a smaller state identifier representation, premultiplying
  state identifiers and splitting a DFA's alphabet into equivalence classes.
  Finally, DFA minimization is also provided, but can increase compilation
  times dramatically. A sketch of these knobs follows this list.
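To make those configuration knobs concrete, here is a minimal sketch
assuming the 0.1 `dense::Builder` API; treat the exact method names
(`minimize`, `byte_classes`, `premultiply`) and the pattern as illustrative
rather than a definitive recipe:

```rust
use regex_automata::{dense, DFA};

// Assumed 0.1 builder knobs: minimize the DFA (slow to compile, smaller
// result), collapse the 256-byte alphabet into equivalence classes and
// premultiply state identifiers to speed up transition lookups.
let dfa = dense::Builder::new()
    .minimize(true)
    .byte_classes(true)
    .premultiply(true)
    .build("[a-z]+")?
    // Shrink the state identifier representation from usize to u16.
    .to_u16()?;
assert!(dfa.is_match(b"xyz"));
```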
### Future work

* Look into being smarter about generating NFA states for large Unicode
  character classes. These can create a lot of additional work for both the
  determinizer and the minimizer, and I suspect this is the key thing we'll
  want to improve if we want to make DFA compile times faster. I *believe*
  it's possible to potentially build minimal or nearly minimal NFAs for the
  special case of Unicode character classes by leveraging Daciuk's algorithms
  for building minimal automata in linear time for sets of strings. See
  https://blog.burntsushi.net/transducers/#construction for more details. The
  key adaptation I think we need to make is to modify the algorithm to
  operate on byte ranges instead of enumerating every codepoint in the set.
  Otherwise, it might not be worth doing.
* Add support for regex sets. It should be possible to do this by "simply"
  introducing more match states. I think we can also report the positions at
  each match, similar to how Aho-Corasick works. I think the long pole in the
  tent here is probably the API design work and arranging it so that we don't
  introduce extra overhead into the non-regex-set case without duplicating a
  lot of code. It seems doable.
* Stretch goal: support capturing groups by implementing "tagged" DFA
  (transducers). Laurikari's paper is the usual reference here, but
  Trofimovich has a much more thorough treatment here:
  http://re2c.org/2017_trofimovich_tagged_deterministic_finite_automata_with_lookahead.pdf
  I've only read the paper once. I suspect it will require at least a few
  more read-throughs before I understand it. See also: http://re2c.org/
* Possibly less ambitious goal: can we select a portion of Trofimovich's work
  to make small fixed length look-around work? It would be really nice to
  support `^`, `$` and `\b`, especially the Unicode variant of `\b` and a
  CRLF aware `$`.
* Experiment with code generation, i.e., emitting a DFA as Rust source code.
  There is an early experiment in src/codegen.rs that is thoroughly
  bit-rotted. At the time, I was experimenting with whether or not codegen
  would significantly decrease the size of a DFA, since if you squint hard
  enough, it's kind of like a sparse representation. However, it didn't
  shrink as much as I thought it would, so I gave up. The other problem is
  that Rust doesn't support gotos, so I don't even know whether the "match on
  each state" in a loop thing will be fast enough. Either way, it's probably
  a good option to have. For one thing, it would be endian independent,
  whereas the serialization format of the DFAs in this crate is endian
  dependent (so you need two versions of every DFA, but you only need to
  compile one of them for any given arch).
* Experiment with unrolling the match loops and fill out the benchmarks.
* Add some kind of streaming API. I believe users of the library can already
  implement something for this outside of the crate, but it would be good to
  provide an official API. The key thing here is figuring out the API. I
  suspect we might want to support several variants.
* Make a decision on whether or not there is room for literal optimizations
  in this crate. My original intent was to not let this crate sink down into
  that very very very deep rabbit hole. But instead, we might want to provide
  some way for literal optimizations to hook into the match routines. The
  right path forward here is to probably build something outside of the
  crate and then see about integrating it. After all, users can implement
  their own match routines just as efficiently as what the crate provides.
* A key downside of DFAs is that they can take up a lot of memory and can be
  quite costly to build. Their worst case compilation time is O(2^n), where
  n is the number of NFA states. A paper by Yang and Prasanna (2011) actually
  seems to provide a way to characterize state blow up such that it is
  detectable. If we could know whether a regex will exhibit state explosion
  or not, then we could make an intelligent decision about whether to
  ahead-of-time compile a DFA. See:
  https://www.researchgate.net/profile/XU_Shutu/publication/229032602_Characterization_of_a_global_germplasm_collection_and_its_potential_utilization_for_analysis_of_complex_quantitative_traits_in_maize/links/02bfe50f914d04c837000000.pdf
==> regex-automata-0.1.8/UNLICENSE <==
This is free and unencumbered software released into the public domain.

Anyone is free to copy, modify, publish, use, compile, sell, or distribute
this software, either in source code form or as a compiled binary, for any
purpose, commercial or non-commercial, and by any means.

In jurisdictions that recognize copyright laws, the author or authors of
this software dedicate any and all copyright interest in the software to the
public domain. We make this dedication for the benefit of the public at
large and to the detriment of our heirs and successors. We intend this
dedication to be an overt act of relinquishment in perpetuity of all present
and future rights to this software under copyright law.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

For more information, please refer to <http://unlicense.org/>

==> regex-automata-0.1.8/data/fowler-tests/LICENSE <==
The following license covers testregex.c and all associated test data.

Permission is hereby granted, free of charge, to any person obtaining a copy
of THIS SOFTWARE FILE (the "Software"), to deal in the Software without
restriction, including without limitation the rights to use, copy, modify,
merge, publish, distribute, and/or sell copies of the Software, and to
permit persons to whom the Software is furnished to do so, subject to the
following disclaimer:

THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

==> regex-automata-0.1.8/data/fowler-tests/README <==
Test data was taken from the Go distribution, which was in turn taken from
the testregex test suite:

  http://www2.research.att.com/~astopen/testregex/testregex.html

The LICENSE in this directory corresponds to the LICENSE that the data was
released under.

The tests themselves were modified for RE2/Go. A couple were modified
further by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would
pass them. (Yes, it seems like RE2/Go includes failing test cases.) This may
or may not have been a bad idea, but I think being consistent with an
established Regex library is worth something.
Note that these files are read by 'scripts/regex-match-tests.py' and turned into Rust tests found in 'regex_macros/tests/matches.rs'. regex-automata-0.1.8/data/fowler-tests/basic.dat010064400017500000144000000210341340157411400200020ustar0000000000000000NOTE all standard compliant implementations should pass these : 2002-05-31 BE abracadabra$ abracadabracadabra (7,18) BE a...b abababbb (2,7) BE XXXXXX ..XXXXXX (2,8) E \) () (1,2) BE a] a]a (0,2) B } } (0,1) E \} } (0,1) BE \] ] (0,1) B ] ] (0,1) E ] ] (0,1) B { { (0,1) B } } (0,1) BE ^a ax (0,1) BE \^a a^a (1,3) BE a\^ a^ (0,2) BE a$ aa (1,2) BE a\$ a$ (0,2) BE ^$ NULL (0,0) E $^ NULL (0,0) E a($) aa (1,2)(2,2) E a*(^a) aa (0,1)(0,1) E (..)*(...)* a (0,0) E (..)*(...)* abcd (0,4)(2,4) E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) E (ab)c|abc abc (0,3)(0,2) E a{0}b ab (1,2) E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) E a{9876543210} NULL BADBR E ((a|a)|a) a (0,1)(0,1)(0,1) E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) E a*(a.|aa) aaaa (0,4)(2,4) E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) E (a|b)?.* b (0,1)(0,1) E (a|b)c|a(b|c) ac (0,2)(0,1) E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) E (a|b)*c|(a|ab)*c abc (0,3)(1,2) E (a|b)*c|(a|ab)*c xc (1,2) E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) E a?(ab|ba)ab abab (0,4)(0,2) E a?(ac{0}b|ba)ab abab (0,4)(0,2) E ab|abab abbabab (0,2) E aba|bab|bba baaabbbaba (5,8) E aba|bab baaabbbaba (6,9) E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) E (a.|.a.)*|(a|.a...) aa (0,2)(0,2) E ab|a xabc (1,3) E ab|a xxabc (2,4) Ei (Ab|cD)* aBcD (0,4)(2,4) BE [^-] --a (2,3) BE [a-]* --a (0,3) BE [a-m-]* --amoma-- (0,4) E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) {E [[:upper:]] A (0,1) [[]] not supported E [[:lower:]]+ `az{ (1,3) E [[:upper:]]+ @AZ[ (1,3) # No collation in Go #BE [[-]] [[-]] (2,4) #BE [[.NIL.]] NULL ECOLLATE #BE [[=aleph=]] NULL ECOLLATE } BE$ \n \n (0,1) BEn$ \n \n (0,1) BE$ [^a] \n (0,1) BE$ \na \na (0,2) E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) BE xxx xxx (0,3) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) BE$ .* \x01\x7f (0,2) E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH E a*a*a*a*a*b aaaaaaaaab (0,10) BE ^ NULL (0,0) BE $ NULL (0,0) BE ^$ NULL (0,0) BE ^a$ a (0,1) BE abc abc (0,3) BE abc xabcy (1,4) BE abc ababc (2,5) BE ab*c abc (0,3) BE ab*bc abc (0,3) BE ab*bc abbc (0,4) BE ab*bc abbbbc (0,6) E ab+bc abbc (0,4) E ab+bc abbbbc (0,6) E ab?bc abbc (0,4) E ab?bc abc (0,3) E ab?c abc (0,3) BE ^abc$ abc (0,3) BE ^abc abcc (0,3) BE abc$ aabc (1,4) BE ^ abc (0,0) BE $ abc (3,3) BE a.c abc (0,3) BE a.c axc (0,3) BE a.*c axyzc (0,5) BE a[bc]d abd (0,3) BE a[b-d]e ace (0,3) BE a[b-d] 
aac (1,3) BE a[-b] a- (0,2) BE a[b-] a- (0,2) BE a] a] (0,2) BE a[]]b a]b (0,3) BE a[^bc]d aed (0,3) BE a[^-b]c adc (0,3) BE a[^]b]c adc (0,3) E ab|cd abc (0,2) E ab|cd abcd (0,2) E a\(b a(b (0,3) E a\(*b ab (0,2) E a\(*b a((b (0,4) E ((a)) abc (0,1)(0,1)(0,1) E (a)b(c) abc (0,3)(0,1)(2,3) E a+b+c aabbabc (4,7) E a* aaa (0,3) #E (a*)* - (0,0)(0,0) E (a*)* - (0,0)(?,?) RE2/Go E (a*)+ - (0,0)(0,0) #E (a*|b)* - (0,0)(0,0) E (a*|b)* - (0,0)(?,?) RE2/Go E (a+|b)* ab (0,2)(1,2) E (a+|b)+ ab (0,2)(1,2) E (a+|b)? ab (0,1)(0,1) BE [^ab]* cde (0,3) #E (^)* - (0,0)(0,0) E (^)* - (0,0)(?,?) RE2/Go BE a* NULL (0,0) E ([abc])*d abbbcd (0,6)(4,5) E ([abc])*bcd abcd (0,4)(0,1) E a|b|c|d|e e (0,1) E (a|b|c|d|e)f ef (0,2)(0,1) #E ((a*|b))* - (0,0)(0,0)(0,0) E ((a*|b))* - (0,0)(?,?)(?,?) RE2/Go BE abcd*efg abcdefg (0,7) BE ab* xabyabbbz (1,3) BE ab* xayabbbz (1,2) E (ab|cd)e abcde (2,5)(2,4) BE [abhgefdc]ij hij (0,3) E (a|b)c*d abcd (1,4)(1,2) E (ab|ab*)bc abc (0,3)(0,1) E a([bc]*)c* abc (0,3)(1,3) E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) E a[bcd]*dcdcde adcdcde (0,7) E (ab|a)b*c abc (0,3)(0,2) E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) E ^a(bc+|b[eh])g|.h$ abh (1,3) E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) BE multiple words multiple words yeah (0,14) E (.*)c(.*) abcde (0,5)(0,2)(3,5) BE abcd abcd (0,4) E a(bc)d abcd (0,4)(1,3) E a[-]?c ac (0,3) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) E a+(b|c)*d+ aabcdd (0,6)(3,4) E ^.+$ vivi (0,4) E ^(.+)$ vivi (0,4)(0,4) E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) E ^([^!]+!)?([^!]+)$ bas 
(0,3)(?,?)(0,3) E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) E ((foo)|bar)!bas bar!bas (0,7)(0,3) E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) E (foo|(bar))!bas foo!bas (0,7)(0,3) E (foo|bar)!bas bar!bas (0,7)(0,3) E (foo|bar)!bas foo!bar!bas (4,11)(4,7) E (foo|bar)!bas foo!bas (0,7)(0,3) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bas (0,3)(?,?)(0,3) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) E .*(/XXX).* /XXX (0,4)(0,4) E .*(\\XXX).* \XXX (0,4)(0,4) E \\XXX \XXX (0,4) E .*(/000).* /000 (0,4)(0,4) E .*(\\000).* \000 (0,4)(0,4) E \\000 \000 (0,4) regex-automata-0.1.8/data/fowler-tests/nullsubexpr.dat010064400017500000144000000040461340157411400213100ustar0000000000000000NOTE null subexpression matches : 2002-06-06 E (a*)* a (0,1)(0,1) #E SAME x (0,0)(0,0) E SAME x (0,0)(?,?) RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a*)+ a (0,1)(0,1) E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a+)* a (0,1)(0,1) E SAME x (0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a+)+ a (0,1)(0,1) E SAME x NOMATCH E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)* a (0,1)(0,1) #E SAME x (0,0)(0,0) E SAME x (0,0)(?,?) RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)+ a (0,1)(0,1) E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([^b]*)* a (0,1)(0,1) #E SAME b (0,0)(0,0) E SAME b (0,0)(?,?) RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaab (0,6)(0,6) E ([ab]*)* a (0,1)(0,1) E SAME aaaaaa (0,6)(0,6) E SAME ababab (0,6)(0,6) E SAME bababa (0,6)(0,6) E SAME b (0,1)(0,1) E SAME bbbbbb (0,6)(0,6) E SAME aaaabcde (0,5)(0,5) E ([^a]*)* b (0,1)(0,1) E SAME bbbbbb (0,6)(0,6) #E SAME aaaaaa (0,0)(0,0) E SAME aaaaaa (0,0)(?,?) RE2/Go E ([^ab]*)* ccccxx (0,6)(0,6) #E SAME ababab (0,0)(0,0) E SAME ababab (0,0)(?,?) RE2/Go E ((z)+|a)* zabcde (0,2)(1,2) #{E a+? aaaaaa (0,1) no *? +? mimimal match ops #E (a) aaa (0,1)(0,1) #E (a*?) aaa (0,0)(0,0) #E (a)*? aaa (0,0) #E (a*?)*? 
aaa (0,0) #} B \(a*\)*\(x\) x (0,1)(0,0)(0,1) B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) #E (a*)*(x) x (0,1)(0,0)(0,1) E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go E (a*)*(x) ax (0,2)(0,1)(1,2) E (a*)*(x) axa (0,2)(0,1)(1,2) E (a*)+(x) x (0,1)(0,0)(0,1) E (a*)+(x) ax (0,2)(0,1)(1,2) E (a*)+(x) axa (0,2)(0,1)(1,2) E (a*){2}(x) x (0,1)(0,0)(0,1) E (a*){2}(x) ax (0,2)(1,1)(1,2) E (a*){2}(x) axa (0,2)(1,1)(1,2) regex-automata-0.1.8/data/fowler-tests/repetition.dat010064400017500000144000000151601340157411400211060ustar0000000000000000NOTE implicit vs. explicit repetitions : 2009-02-02 # Glenn Fowler # conforming matches (column 4) must match one of the following BREs # NOMATCH # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* # i.e., each 3-tuple has two identical elements and one (?,?) E ((..)|(.)) NULL NOMATCH E ((..)|(.))((..)|(.)) NULL NOMATCH E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH E ((..)|(.)){1} NULL NOMATCH E ((..)|(.)){2} NULL NOMATCH E ((..)|(.)){3} NULL NOMATCH E ((..)|(.))* NULL (0,0) E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) E ((..)|(.))((..)|(.)) a NOMATCH E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) E ((..)|(.)){2} a NOMATCH E ((..)|(.)){3} a NOMATCH E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) E ((..)|(.)){3} aa NOMATCH E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 
NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 # These test a bug in OS X / FreeBSD / NetBSD, and libtree. # Linux/GLIBC gets the {8,} and {8,8} wrong. :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) # These test a fixed bug in my regex-tdfa that did not keep the expanded # form properly grouped, so right association did the wrong thing with # these ambiguous patterns (crafted just to test my code when I became # suspicious of my implementation). The first subexpression should use # "ab" then "a" then "bcd". # OS X / FreeBSD / NetBSD badly fail many of these, with impossible # results like (0,6)(4,5)(6,6). :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) # The above worked on Linux/GLIBC but the following often fail. 
# They also trip up OS X / FreeBSD / NetBSD: #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go regex-automata-0.1.8/data/tests/crazy.toml010064400017500000144000000061241341277301100167620ustar0000000000000000[[tests]] name = "crazy-misc1" pattern = '[-+]?[0-9]*\.?[0-9]+' input = "0.1" matches = [[0, 3]] [[tests]] name = "crazy-misc2" pattern = '[-+]?[0-9]*\.?[0-9]+' input = "0.1.2" matches = [[0, 3]] [[tests]] name = "crazy-misc3" pattern = '[-+]?[0-9]*\.?[0-9]+' input = "a1.2" matches = [[1, 4]] [[tests]] options = ["case-insensitive"] name = "crazy-misc4" pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' input = "mine is jam.slam@gmail.com " matches = [[8, 26]] [[tests]] options = ["case-insensitive"] name = "crazy-misc5" pattern = '[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}' input = "mine is jam.slam@gmail " matches = [] [[tests]] name = "crazy-misc6" pattern = '''[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*@(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?''' input = "mine is jam.slam@gmail.com " matches = [[8, 26]] [[tests]] name = "crazy-misc7" pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' input = "1900-01-01" matches = [[0, 10]] [[tests]] name = "crazy-misc8" pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' input = "1900-00-01" matches = [] [[tests]] name = "crazy-misc9" pattern = '(19|20)\d\d[- /.](0[1-9]|1[012])[- /.](0[1-9]|[12][0-9]|3[01])' input = "1900-13-01" matches = [] [[tests]] name = "crazy-negclass1" pattern = "[^ac]" input = "acx" matches = [[2, 3]] [[tests]] name = "crazy-negclass2" pattern = "[^a,]" input = "a,x" matches = [[2, 3]] [[tests]] name = "crazy-negclass3" pattern = '[^a\s]' input = "a x" matches = [[2, 3]] [[tests]] name = "crazy-negclass4" pattern = "[^,]" input = ",,x" matches = [[2, 3]] [[tests]] name = "crazy-negclass5" pattern = '[^\s]' input = " a" matches = [[1, 2]] [[tests]] name = "crazy-negclass6" pattern = '[^,\s]' input = ", a" matches = [[2, 3]] [[tests]] name = "crazy-negclass7" pattern = '[^\s,]' input = " ,a" matches = [[2, 3]] [[tests]] name = "crazy-negclass8" pattern = "[^[:alpha:]Z]" input = "A1" matches = [[1, 2]] [[tests]] name = "crazy-empty-repeat1" pattern = "((.*)*?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat2" pattern = 
"((.?)*?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat3" pattern = "((.*)+?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat4" pattern = "((.?)+?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat5" pattern = "((.*){1,}?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat6" pattern = "((.*){1,2}?)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat7" pattern = "((.*)*)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat8" pattern = "((.?)*)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat9" pattern = "((.*)+)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat10" pattern = "((.?)+)=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat11" pattern = "((.*){1,})=" input = "a=b" matches = [[0, 2]] [[tests]] name = "crazy-empty-repeat12" pattern = "((.*){1,2})=" input = "a=b" matches = [[0, 2]] regex-automata-0.1.8/data/tests/flags.toml010064400017500000144000000015001341032023700167130ustar0000000000000000[[tests]] name = "flags1" pattern = "(?i)abc" input = "ABC" matches = [[0, 3]] [[tests]] name = "flags2" pattern = "(?i)a(?-i)bc" input = "Abc" matches = [[0, 3]] [[tests]] name = "flags3" pattern = "(?i)a(?-i)bc" input = "ABC" matches = [] [[tests]] name = "flags4" pattern = "(?is)a." input = "A\n" matches = [[0, 2]] [[tests]] name = "flags5" pattern = "(?is)a.(?-is)a." input = "A\nab" matches = [[0, 4]] [[tests]] name = "flags6" pattern = "(?is)a.(?-is)a." input = "A\na\n" matches = [] [[tests]] name = "flags7" pattern = "(?is)a.(?-is:a.)?" input = "A\na\n" matches = [[0, 2]] [[tests]] name = "flags8" pattern = "(?U)a+" input = "aa" matches = [[0, 1]] [[tests]] name = "flags9" pattern = "(?U)a+?" input = "aa" matches = [[0, 2]] [[tests]] name = "flags10" pattern = "(?U)(?-U)a+" input = "aa" matches = [[0, 2]] regex-automata-0.1.8/data/tests/fowler/LICENSE010064400017500000144000000022041340550524000172320ustar0000000000000000The following license covers testregex.c and all associated test data. Permission is hereby granted, free of charge, to any person obtaining a copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following disclaimer: THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
regex-automata-0.1.8/data/tests/fowler/README010064400017500000144000000016661340550545600171310ustar0000000000000000Test data was taken from the Go distribution, which was in turn taken from the testregex test suite: http://www2.research.att.com/~astopen/testregex/testregex.html Unfortunately, the above link is now dead, but the test data lives on. The LICENSE in this directory corresponds to the LICENSE that the data was originally released under. The tests themselves were modified for RE2/Go. A couple were modified further by me (Andrew Gallant) (only in repetition.dat) so that RE2/Go would pass them. (Yes, it seems like RE2/Go includes failing test cases.) This may or may not have been a bad idea, but I think being consistent with an established Regex library is worth something. After some number of years, these tests were transformed into a JSON format using the fowler-to-json script in this directory, e.g., ./fowler-to-json basic.dat > basic.json which brings them into a sensible structured format in which other tests can be written. regex-automata-0.1.8/data/tests/fowler/basic.dat010064400017500000144000000210341340550531600200060ustar0000000000000000NOTE all standard compliant implementations should pass these : 2002-05-31 BE abracadabra$ abracadabracadabra (7,18) BE a...b abababbb (2,7) BE XXXXXX ..XXXXXX (2,8) E \) () (1,2) BE a] a]a (0,2) B } } (0,1) E \} } (0,1) BE \] ] (0,1) B ] ] (0,1) E ] ] (0,1) B { { (0,1) B } } (0,1) BE ^a ax (0,1) BE \^a a^a (1,3) BE a\^ a^ (0,2) BE a$ aa (1,2) BE a\$ a$ (0,2) BE ^$ NULL (0,0) E $^ NULL (0,0) E a($) aa (1,2)(2,2) E a*(^a) aa (0,1)(0,1) E (..)*(...)* a (0,0) E (..)*(...)* abcd (0,4)(2,4) E (ab|a)(bc|c) abc (0,3)(0,2)(2,3) E (ab)c|abc abc (0,3)(0,2) E a{0}b ab (1,2) E (a*)(b?)(b+)b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) E (a*)(b{0,1})(b{1,})b{3} aaabbbbbbb (0,10)(0,3)(3,4)(4,7) E a{9876543210} NULL BADBR E ((a|a)|a) a (0,1)(0,1)(0,1) E (a*)(a|aa) aaaa (0,4)(0,3)(3,4) E a*(a.|aa) aaaa (0,4)(2,4) E a(b)|c(d)|a(e)f aef (0,3)(?,?)(?,?)(1,2) E (a|b)?.* b (0,1)(0,1) E (a|b)c|a(b|c) ac (0,2)(0,1) E (a|b)c|a(b|c) ab (0,2)(?,?)(1,2) E (a|b)*c|(a|ab)*c abc (0,3)(1,2) E (a|b)*c|(a|ab)*c xc (1,2) E (.a|.b).*|.*(.a|.b) xa (0,2)(0,2) E a?(ab|ba)ab abab (0,4)(0,2) E a?(ac{0}b|ba)ab abab (0,4)(0,2) E ab|abab abbabab (0,2) E aba|bab|bba baaabbbaba (5,8) E aba|bab baaabbbaba (6,9) E (aa|aaa)*|(a|aaaaa) aa (0,2)(0,2) E (a.|.a.)*|(a|.a...) 
aa (0,2)(0,2) E ab|a xabc (1,3) E ab|a xxabc (2,4) Ei (Ab|cD)* aBcD (0,4)(2,4) BE [^-] --a (2,3) BE [a-]* --a (0,3) BE [a-m-]* --amoma-- (0,4) E :::1:::0:|:::1:1:0: :::0:::1:::1:::0: (8,17) E :::1:::0:|:::1:1:1: :::0:::1:::1:::0: (8,17) {E [[:upper:]] A (0,1) [[]] not supported E [[:lower:]]+ `az{ (1,3) E [[:upper:]]+ @AZ[ (1,3) # No collation in Go #BE [[-]] [[-]] (2,4) #BE [[.NIL.]] NULL ECOLLATE #BE [[=aleph=]] NULL ECOLLATE } BE$ \n \n (0,1) BEn$ \n \n (0,1) BE$ [^a] \n (0,1) BE$ \na \na (0,2) E (a)(b)(c) abc (0,3)(0,1)(1,2)(2,3) BE xxx xxx (0,3) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 6, (0,6) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) 2/7 (0,3) E1 (^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$) feb 1,Feb 6 (5,11) E3 ((((((((((((((((((((((((((((((x)))))))))))))))))))))))))))))) x (0,1)(0,1)(0,1) E3 ((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))* xx (0,2)(1,2)(1,2) E a?(ab|ba)* ababababababababababababababababababababababababababababababababababababababababa (0,81)(79,81) E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabbbbaa (18,25) E abaa|abbaa|abbbaa|abbbbaa ababbabbbabbbabbbbabaa (18,22) E aaac|aabc|abac|abbc|baac|babc|bbac|bbbc baaabbbabac (7,11) BE$ .* \x01\x7f (0,2) E aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa (53,57) L aaaa\nbbbb\ncccc\nddddd\neeeeee\nfffffff\ngggg\nhhhh\niiiii\njjjjj\nkkkkk\nllll XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa NOMATCH E a*a*a*a*a*b aaaaaaaaab (0,10) BE ^ NULL (0,0) BE $ NULL (0,0) BE ^$ NULL (0,0) BE ^a$ a (0,1) BE abc abc (0,3) BE abc xabcy (1,4) BE abc ababc (2,5) BE ab*c abc (0,3) BE ab*bc abc (0,3) BE ab*bc abbc (0,4) BE ab*bc abbbbc (0,6) E ab+bc abbc (0,4) E ab+bc abbbbc (0,6) E ab?bc abbc (0,4) E ab?bc abc (0,3) E ab?c abc (0,3) BE ^abc$ abc (0,3) BE ^abc abcc (0,3) BE abc$ aabc (1,4) BE ^ abc (0,0) BE $ abc (3,3) BE a.c abc (0,3) BE a.c axc (0,3) BE a.*c axyzc (0,5) BE a[bc]d abd (0,3) BE a[b-d]e ace (0,3) BE a[b-d] aac (1,3) BE a[-b] a- (0,2) BE a[b-] a- (0,2) BE a] a] (0,2) BE a[]]b a]b (0,3) BE a[^bc]d aed (0,3) BE a[^-b]c adc (0,3) BE a[^]b]c adc (0,3) E ab|cd abc (0,2) E ab|cd abcd (0,2) E a\(b a(b (0,3) E a\(*b ab (0,2) E a\(*b a((b (0,4) E ((a)) abc (0,1)(0,1)(0,1) E (a)b(c) abc (0,3)(0,1)(2,3) E a+b+c aabbabc (4,7) E a* aaa (0,3) #E (a*)* - (0,0)(0,0) E (a*)* - (0,0)(?,?) RE2/Go E (a*)+ - (0,0)(0,0) #E (a*|b)* - (0,0)(0,0) E (a*|b)* - (0,0)(?,?) RE2/Go E (a+|b)* ab (0,2)(1,2) E (a+|b)+ ab (0,2)(1,2) E (a+|b)? ab (0,1)(0,1) BE [^ab]* cde (0,3) #E (^)* - (0,0)(0,0) E (^)* - (0,0)(?,?) RE2/Go BE a* NULL (0,0) E ([abc])*d abbbcd (0,6)(4,5) E ([abc])*bcd abcd (0,4)(0,1) E a|b|c|d|e e (0,1) E (a|b|c|d|e)f ef (0,2)(0,1) #E ((a*|b))* - (0,0)(0,0)(0,0) E ((a*|b))* - (0,0)(?,?)(?,?) 
RE2/Go BE abcd*efg abcdefg (0,7) BE ab* xabyabbbz (1,3) BE ab* xayabbbz (1,2) E (ab|cd)e abcde (2,5)(2,4) BE [abhgefdc]ij hij (0,3) E (a|b)c*d abcd (1,4)(1,2) E (ab|ab*)bc abc (0,3)(0,1) E a([bc]*)c* abc (0,3)(1,3) E a([bc]*)(c*d) abcd (0,4)(1,3)(3,4) E a([bc]+)(c*d) abcd (0,4)(1,3)(3,4) E a([bc]*)(c+d) abcd (0,4)(1,2)(2,4) E a[bcd]*dcdcde adcdcde (0,7) E (ab|a)b*c abc (0,3)(0,2) E ((a)(b)c)(d) abcd (0,4)(0,3)(0,1)(1,2)(3,4) BE [A-Za-z_][A-Za-z0-9_]* alpha (0,5) E ^a(bc+|b[eh])g|.h$ abh (1,3) E (bc+d$|ef*g.|h?i(j|k)) effgz (0,5)(0,5) E (bc+d$|ef*g.|h?i(j|k)) ij (0,2)(0,2)(1,2) E (bc+d$|ef*g.|h?i(j|k)) reffgz (1,6)(1,6) E (((((((((a))))))))) a (0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1)(0,1) BE multiple words multiple words yeah (0,14) E (.*)c(.*) abcde (0,5)(0,2)(3,5) BE abcd abcd (0,4) E a(bc)d abcd (0,4)(1,3) E a[-]?c ac (0,3) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qaddafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mo'ammar Gadhafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Kaddafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Qadhafi (0,15)(?,?)(10,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gadafi (0,14)(?,?)(10,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadafi (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moamar Gaddafi (0,14)(?,?)(9,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Mu'ammar Qadhdhafi (0,18)(?,?)(13,15) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Khaddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafy (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghadafi (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Ghaddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muamar Kaddafi (0,14)(?,?)(9,11) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Quathafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Muammar Gheddafi (0,16)(?,?)(11,13) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Khadafy (0,15)(?,?)(11,12) E M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy] Moammar Qudhafi (0,15)(?,?)(10,12) E a+(b|c)*d+ aabcdd (0,6)(3,4) E ^.+$ vivi (0,4) E ^(.+)$ vivi (0,4)(0,4) E ^([^!.]+).att.com!(.+)$ gryphon.att.com!eby (0,19)(0,7)(16,19) E ^([^!]+!)?([^!]+)$ bas (0,3)(?,?)(0,3) E ^([^!]+!)?([^!]+)$ bar!bas (0,7)(0,4)(4,7) E ^([^!]+!)?([^!]+)$ foo!bas (0,7)(0,4)(4,7) E ^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(4,8)(8,11) E ((foo)|(bar))!bas bar!bas (0,7)(0,3)(?,?)(0,3) E ((foo)|(bar))!bas foo!bar!bas (4,11)(4,7)(?,?)(4,7) E ((foo)|(bar))!bas foo!bas (0,7)(0,3)(0,3) E ((foo)|bar)!bas bar!bas (0,7)(0,3) E ((foo)|bar)!bas foo!bar!bas (4,11)(4,7) E ((foo)|bar)!bas foo!bas (0,7)(0,3)(0,3) E (foo|(bar))!bas bar!bas (0,7)(0,3)(0,3) E (foo|(bar))!bas foo!bar!bas (4,11)(4,7)(4,7) E (foo|(bar))!bas foo!bas (0,7)(0,3) E (foo|bar)!bas bar!bas (0,7)(0,3) E (foo|bar)!bas foo!bar!bas (4,11)(4,7) E (foo|bar)!bas foo!bas (0,7)(0,3) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ 
bas (0,3)(?,?)(0,3) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ bar!bas (0,7)(0,4)(4,7) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bar!bas (0,11)(?,?)(?,?)(4,8)(8,11) E ^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$ foo!bas (0,7)(0,4)(4,7) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bas (0,3)(0,3)(?,?)(0,3) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ bar!bas (0,7)(0,7)(0,4)(4,7) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bar!bas (0,11)(0,11)(?,?)(?,?)(4,8)(8,11) E ^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$ foo!bas (0,7)(0,7)(0,4)(4,7) E .*(/XXX).* /XXX (0,4)(0,4) E .*(\\XXX).* \XXX (0,4)(0,4) E \\XXX \XXX (0,4) E .*(/000).* /000 (0,4)(0,4) E .*(\\000).* \000 (0,4)(0,4) E \\000 \000 (0,4) regex-automata-0.1.8/data/tests/fowler/basic.toml010064400017500000144000000615231341033352600202160ustar0000000000000000[[tests]] name = "basic3" options = ['escaped'] pattern = '''abracadabra$''' input = '''abracadabracadabra''' matches = [[7, 18]] [[tests]] name = "basic4" options = ['escaped'] pattern = '''a...b''' input = '''abababbb''' matches = [[2, 7]] [[tests]] name = "basic5" options = ['escaped'] pattern = '''XXXXXX''' input = '''..XXXXXX''' matches = [[2, 8]] [[tests]] name = "basic6" options = ['escaped'] pattern = '''\)''' input = '''()''' matches = [[1, 2]] [[tests]] name = "basic7" options = ['escaped'] pattern = '''a]''' input = '''a]a''' matches = [[0, 2]] [[tests]] name = "basic9" options = ['escaped'] pattern = '''\}''' input = '''}''' matches = [[0, 1]] [[tests]] name = "basic10" options = ['escaped'] pattern = '''\]''' input = ''']''' matches = [[0, 1]] [[tests]] name = "basic12" options = ['escaped'] pattern = ''']''' input = ''']''' matches = [[0, 1]] [[tests]] name = "basic15" options = ['escaped'] pattern = '''^a''' input = '''ax''' matches = [[0, 1]] [[tests]] name = "basic16" options = ['escaped'] pattern = '''\^a''' input = '''a^a''' matches = [[1, 3]] [[tests]] name = "basic17" options = ['escaped'] pattern = '''a\^''' input = '''a^''' matches = [[0, 2]] [[tests]] name = "basic18" options = ['escaped'] pattern = '''a$''' input = '''aa''' matches = [[1, 2]] [[tests]] name = "basic19" options = ['escaped'] pattern = '''a\$''' input = '''a$''' matches = [[0, 2]] [[tests]] name = "basic20" options = ['escaped'] pattern = '''^$''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic21" options = ['escaped'] pattern = '''$^''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic22" options = ['escaped'] pattern = '''a($)''' input = '''aa''' matches = [[1, 2]] [[tests]] name = "basic23" options = ['escaped'] pattern = '''a*(^a)''' input = '''aa''' matches = [[0, 1]] [[tests]] name = "basic24" options = ['escaped'] pattern = '''(..)*(...)*''' input = '''a''' matches = [[0, 0]] [[tests]] name = "basic25" options = ['escaped'] pattern = '''(..)*(...)*''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic26" options = ['escaped'] pattern = '''(ab|a)(bc|c)''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic27" options = ['escaped'] pattern = '''(ab)c|abc''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic28" options = ['escaped'] pattern = '''a{0}b''' input = '''ab''' matches = [[1, 2]] [[tests]] name = "basic29" options = ['escaped'] pattern = '''(a*)(b?)(b+)b{3}''' input = '''aaabbbbbbb''' matches = [[0, 10]] [[tests]] name = "basic30" options = ['escaped'] pattern = '''(a*)(b{0,1})(b{1,})b{3}''' input = '''aaabbbbbbb''' matches = [[0, 10]] [[tests]] name = "basic32" options = ['escaped'] pattern = '''((a|a)|a)''' input = '''a''' matches = [[0, 1]] 
[[tests]] name = "basic33" options = ['escaped'] pattern = '''(a*)(a|aa)''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "basic34" options = ['escaped'] pattern = '''a*(a.|aa)''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "basic35" options = ['escaped'] pattern = '''a(b)|c(d)|a(e)f''' input = '''aef''' matches = [[0, 3]] [[tests]] name = "basic36" options = ['escaped'] pattern = '''(a|b)?.*''' input = '''b''' matches = [[0, 1]] [[tests]] name = "basic37" options = ['escaped'] pattern = '''(a|b)c|a(b|c)''' input = '''ac''' matches = [[0, 2]] [[tests]] name = "basic38" options = ['escaped'] pattern = '''(a|b)c|a(b|c)''' input = '''ab''' matches = [[0, 2]] [[tests]] name = "basic39" options = ['escaped'] pattern = '''(a|b)*c|(a|ab)*c''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic40" options = ['escaped'] pattern = '''(a|b)*c|(a|ab)*c''' input = '''xc''' matches = [[1, 2]] [[tests]] name = "basic41" options = ['escaped'] pattern = '''(.a|.b).*|.*(.a|.b)''' input = '''xa''' matches = [[0, 2]] [[tests]] name = "basic42" options = ['escaped'] pattern = '''a?(ab|ba)ab''' input = '''abab''' matches = [[0, 4]] [[tests]] name = "basic43" options = ['escaped'] pattern = '''a?(ac{0}b|ba)ab''' input = '''abab''' matches = [[0, 4]] [[tests]] name = "basic44" options = ['escaped'] pattern = '''ab|abab''' input = '''abbabab''' matches = [[0, 2]] [[tests]] name = "basic45" options = ['escaped'] pattern = '''aba|bab|bba''' input = '''baaabbbaba''' matches = [[5, 8]] [[tests]] name = "basic46" options = ['escaped'] pattern = '''aba|bab''' input = '''baaabbbaba''' matches = [[6, 9]] [[tests]] name = "basic47" options = ['escaped'] pattern = '''(aa|aaa)*|(a|aaaaa)''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "basic48" options = ['escaped'] pattern = '''(a.|.a.)*|(a|.a...)''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "basic49" options = ['escaped'] pattern = '''ab|a''' input = '''xabc''' matches = [[1, 3]] [[tests]] name = "basic50" options = ['escaped'] pattern = '''ab|a''' input = '''xxabc''' matches = [[2, 4]] [[tests]] name = "basic51" options = ['escaped', 'case-insensitive'] pattern = '''(Ab|cD)*''' input = '''aBcD''' matches = [[0, 4]] [[tests]] name = "basic52" options = ['escaped'] pattern = '''[^-]''' input = '''--a''' matches = [[2, 3]] [[tests]] name = "basic53" options = ['escaped'] pattern = '''[a-]*''' input = '''--a''' matches = [[0, 3]] [[tests]] name = "basic54" options = ['escaped'] pattern = '''[a-m-]*''' input = '''--amoma--''' matches = [[0, 4]] [[tests]] name = "basic55" options = ['escaped'] pattern = ''':::1:::0:|:::1:1:0:''' input = ''':::0:::1:::1:::0:''' matches = [[8, 17]] [[tests]] name = "basic56" options = ['escaped'] pattern = ''':::1:::0:|:::1:1:1:''' input = ''':::0:::1:::1:::0:''' matches = [[8, 17]] [[tests]] name = "basic57" options = ['escaped'] pattern = '''[[:upper:]]''' input = '''A''' matches = [[0, 1]] [[tests]] name = "basic58" options = ['escaped'] pattern = '''[[:lower:]]+''' input = '''`az{''' matches = [[1, 3]] [[tests]] name = "basic59" options = ['escaped'] pattern = '''[[:upper:]]+''' input = '''@AZ[''' matches = [[1, 3]] [[tests]] name = "basic65" options = ['escaped'] pattern = '''\n''' input = '''\n''' matches = [[0, 1]] [[tests]] name = "basic66" options = ['escaped'] pattern = '''\n''' input = '''\n''' matches = [[0, 1]] [[tests]] name = "basic67" options = ['escaped'] pattern = '''[^a]''' input = '''\n''' matches = [[0, 1]] [[tests]] name = "basic68" options = ['escaped'] pattern = '''\na''' 
input = '''\na''' matches = [[0, 2]] [[tests]] name = "basic69" options = ['escaped'] pattern = '''(a)(b)(c)''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic70" options = ['escaped'] pattern = '''xxx''' input = '''xxx''' matches = [[0, 3]] [[tests]] name = "basic71" options = ['escaped'] pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' input = '''feb 6,''' matches = [[0, 6]] [[tests]] name = "basic72" options = ['escaped'] pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' input = '''2/7''' matches = [[0, 3]] [[tests]] name = "basic73" options = ['escaped'] pattern = '''(^|[ (,;])((([Ff]eb[^ ]* *|0*2/|\* */?)0*[6-7]))([^0-9]|$)''' input = '''feb 1,Feb 6''' matches = [[5, 11]] [[tests]] name = "basic74" options = ['escaped'] pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))''' input = '''x''' matches = [[0, 1]] [[tests]] name = "basic75" options = ['escaped'] pattern = '''((((((((((((((((((((((((((((((x))))))))))))))))))))))))))))))*''' input = '''xx''' matches = [[0, 2]] [[tests]] name = "basic76" options = ['escaped'] pattern = '''a?(ab|ba)*''' input = '''ababababababababababababababababababababababababababababababababababababababababa''' matches = [[0, 81]] [[tests]] name = "basic77" options = ['escaped'] pattern = '''abaa|abbaa|abbbaa|abbbbaa''' input = '''ababbabbbabbbabbbbabbbbaa''' matches = [[18, 25]] [[tests]] name = "basic78" options = ['escaped'] pattern = '''abaa|abbaa|abbbaa|abbbbaa''' input = '''ababbabbbabbbabbbbabaa''' matches = [[18, 22]] [[tests]] name = "basic79" options = ['escaped'] pattern = '''aaac|aabc|abac|abbc|baac|babc|bbac|bbbc''' input = '''baaabbbabac''' matches = [[7, 11]] [[tests]] name = "basic80" options = ['escaped'] pattern = '''.*''' input = '''\x01\x7f''' matches = [[0, 2]] [[tests]] name = "basic81" options = ['escaped'] pattern = '''aaaa|bbbb|cccc|ddddd|eeeeee|fffffff|gggg|hhhh|iiiii|jjjjj|kkkkk|llll''' input = '''XaaaXbbbXcccXdddXeeeXfffXgggXhhhXiiiXjjjXkkkXlllXcbaXaaaa''' matches = [[53, 57]] [[tests]] name = "basic83" options = ['escaped'] pattern = '''a*a*a*a*a*b''' input = '''aaaaaaaaab''' matches = [[0, 10]] [[tests]] name = "basic84" options = ['escaped'] pattern = '''^''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic85" options = ['escaped'] pattern = '''$''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic86" options = ['escaped'] pattern = '''^$''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic87" options = ['escaped'] pattern = '''^a$''' input = '''a''' matches = [[0, 1]] [[tests]] name = "basic88" options = ['escaped'] pattern = '''abc''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic89" options = ['escaped'] pattern = '''abc''' input = '''xabcy''' matches = [[1, 4]] [[tests]] name = "basic90" options = ['escaped'] pattern = '''abc''' input = '''ababc''' matches = [[2, 5]] [[tests]] name = "basic91" options = ['escaped'] pattern = '''ab*c''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic92" options = ['escaped'] pattern = '''ab*bc''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic93" options = ['escaped'] pattern = '''ab*bc''' input = '''abbc''' matches = [[0, 4]] [[tests]] name = "basic94" options = ['escaped'] pattern = '''ab*bc''' input = '''abbbbc''' matches = [[0, 6]] [[tests]] name = "basic95" options = ['escaped'] pattern = '''ab+bc''' input = '''abbc''' matches = [[0, 4]] [[tests]] name = "basic96" options = ['escaped'] pattern = '''ab+bc''' input = '''abbbbc''' matches = 
[[0, 6]] [[tests]] name = "basic97" options = ['escaped'] pattern = '''ab?bc''' input = '''abbc''' matches = [[0, 4]] [[tests]] name = "basic98" options = ['escaped'] pattern = '''ab?bc''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic99" options = ['escaped'] pattern = '''ab?c''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic100" options = ['escaped'] pattern = '''^abc$''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic101" options = ['escaped'] pattern = '''^abc''' input = '''abcc''' matches = [[0, 3]] [[tests]] name = "basic102" options = ['escaped'] pattern = '''abc$''' input = '''aabc''' matches = [[1, 4]] [[tests]] name = "basic103" options = ['escaped'] pattern = '''^''' input = '''abc''' matches = [[0, 0]] [[tests]] name = "basic104" options = ['escaped'] pattern = '''$''' input = '''abc''' matches = [[3, 3]] [[tests]] name = "basic105" options = ['escaped'] pattern = '''a.c''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic106" options = ['escaped'] pattern = '''a.c''' input = '''axc''' matches = [[0, 3]] [[tests]] name = "basic107" options = ['escaped'] pattern = '''a.*c''' input = '''axyzc''' matches = [[0, 5]] [[tests]] name = "basic108" options = ['escaped'] pattern = '''a[bc]d''' input = '''abd''' matches = [[0, 3]] [[tests]] name = "basic109" options = ['escaped'] pattern = '''a[b-d]e''' input = '''ace''' matches = [[0, 3]] [[tests]] name = "basic110" options = ['escaped'] pattern = '''a[b-d]''' input = '''aac''' matches = [[1, 3]] [[tests]] name = "basic111" options = ['escaped'] pattern = '''a[-b]''' input = '''a-''' matches = [[0, 2]] [[tests]] name = "basic112" options = ['escaped'] pattern = '''a[b-]''' input = '''a-''' matches = [[0, 2]] [[tests]] name = "basic113" options = ['escaped'] pattern = '''a]''' input = '''a]''' matches = [[0, 2]] [[tests]] name = "basic114" options = ['escaped'] pattern = '''a[]]b''' input = '''a]b''' matches = [[0, 3]] [[tests]] name = "basic115" options = ['escaped'] pattern = '''a[^bc]d''' input = '''aed''' matches = [[0, 3]] [[tests]] name = "basic116" options = ['escaped'] pattern = '''a[^-b]c''' input = '''adc''' matches = [[0, 3]] [[tests]] name = "basic117" options = ['escaped'] pattern = '''a[^]b]c''' input = '''adc''' matches = [[0, 3]] [[tests]] name = "basic118" options = ['escaped'] pattern = '''ab|cd''' input = '''abc''' matches = [[0, 2]] [[tests]] name = "basic119" options = ['escaped'] pattern = '''ab|cd''' input = '''abcd''' matches = [[0, 2]] [[tests]] name = "basic120" options = ['escaped'] pattern = '''a\(b''' input = '''a(b''' matches = [[0, 3]] [[tests]] name = "basic121" options = ['escaped'] pattern = '''a\(*b''' input = '''ab''' matches = [[0, 2]] [[tests]] name = "basic122" options = ['escaped'] pattern = '''a\(*b''' input = '''a((b''' matches = [[0, 4]] [[tests]] name = "basic123" options = ['escaped'] pattern = '''((a))''' input = '''abc''' matches = [[0, 1]] [[tests]] name = "basic124" options = ['escaped'] pattern = '''(a)b(c)''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic125" options = ['escaped'] pattern = '''a+b+c''' input = '''aabbabc''' matches = [[4, 7]] [[tests]] name = "basic126" options = ['escaped'] pattern = '''a*''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "basic128" options = ['escaped'] pattern = '''(a*)*''' input = '''-''' matches = [[0, 0]] [[tests]] name = "basic129" options = ['escaped'] pattern = '''(a*)+''' input = '''-''' matches = [[0, 0]] [[tests]] name = "basic131" options = ['escaped'] pattern = 
'''(a*|b)*''' input = '''-''' matches = [[0, 0]] [[tests]] name = "basic132" options = ['escaped'] pattern = '''(a+|b)*''' input = '''ab''' matches = [[0, 2]] [[tests]] name = "basic133" options = ['escaped'] pattern = '''(a+|b)+''' input = '''ab''' matches = [[0, 2]] [[tests]] name = "basic134" options = ['escaped'] pattern = '''(a+|b)?''' input = '''ab''' matches = [[0, 1]] [[tests]] name = "basic135" options = ['escaped'] pattern = '''[^ab]*''' input = '''cde''' matches = [[0, 3]] [[tests]] name = "basic137" options = ['escaped'] pattern = '''(^)*''' input = '''-''' matches = [[0, 0]] [[tests]] name = "basic138" options = ['escaped'] pattern = '''a*''' input = '''''' matches = [[0, 0]] [[tests]] name = "basic139" options = ['escaped'] pattern = '''([abc])*d''' input = '''abbbcd''' matches = [[0, 6]] [[tests]] name = "basic140" options = ['escaped'] pattern = '''([abc])*bcd''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic141" options = ['escaped'] pattern = '''a|b|c|d|e''' input = '''e''' matches = [[0, 1]] [[tests]] name = "basic142" options = ['escaped'] pattern = '''(a|b|c|d|e)f''' input = '''ef''' matches = [[0, 2]] [[tests]] name = "basic144" options = ['escaped'] pattern = '''((a*|b))*''' input = '''-''' matches = [[0, 0]] [[tests]] name = "basic145" options = ['escaped'] pattern = '''abcd*efg''' input = '''abcdefg''' matches = [[0, 7]] [[tests]] name = "basic146" options = ['escaped'] pattern = '''ab*''' input = '''xabyabbbz''' matches = [[1, 3]] [[tests]] name = "basic147" options = ['escaped'] pattern = '''ab*''' input = '''xayabbbz''' matches = [[1, 2]] [[tests]] name = "basic148" options = ['escaped'] pattern = '''(ab|cd)e''' input = '''abcde''' matches = [[2, 5]] [[tests]] name = "basic149" options = ['escaped'] pattern = '''[abhgefdc]ij''' input = '''hij''' matches = [[0, 3]] [[tests]] name = "basic150" options = ['escaped'] pattern = '''(a|b)c*d''' input = '''abcd''' matches = [[1, 4]] [[tests]] name = "basic151" options = ['escaped'] pattern = '''(ab|ab*)bc''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic152" options = ['escaped'] pattern = '''a([bc]*)c*''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic153" options = ['escaped'] pattern = '''a([bc]*)(c*d)''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic154" options = ['escaped'] pattern = '''a([bc]+)(c*d)''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic155" options = ['escaped'] pattern = '''a([bc]*)(c+d)''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic156" options = ['escaped'] pattern = '''a[bcd]*dcdcde''' input = '''adcdcde''' matches = [[0, 7]] [[tests]] name = "basic157" options = ['escaped'] pattern = '''(ab|a)b*c''' input = '''abc''' matches = [[0, 3]] [[tests]] name = "basic158" options = ['escaped'] pattern = '''((a)(b)c)(d)''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic159" options = ['escaped'] pattern = '''[A-Za-z_][A-Za-z0-9_]*''' input = '''alpha''' matches = [[0, 5]] [[tests]] name = "basic160" options = ['escaped'] pattern = '''^a(bc+|b[eh])g|.h$''' input = '''abh''' matches = [[1, 3]] [[tests]] name = "basic161" options = ['escaped'] pattern = '''(bc+d$|ef*g.|h?i(j|k))''' input = '''effgz''' matches = [[0, 5]] [[tests]] name = "basic162" options = ['escaped'] pattern = '''(bc+d$|ef*g.|h?i(j|k))''' input = '''ij''' matches = [[0, 2]] [[tests]] name = "basic163" options = ['escaped'] pattern = '''(bc+d$|ef*g.|h?i(j|k))''' input = '''reffgz''' matches = [[1, 6]] [[tests]] name = "basic164" options = 
['escaped'] pattern = '''(((((((((a)))))))))''' input = '''a''' matches = [[0, 1]] [[tests]] name = "basic165" options = ['escaped'] pattern = '''multiple words''' input = '''multiple words yeah''' matches = [[0, 14]] [[tests]] name = "basic166" options = ['escaped'] pattern = '''(.*)c(.*)''' input = '''abcde''' matches = [[0, 5]] [[tests]] name = "basic167" options = ['escaped'] pattern = '''abcd''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic168" options = ['escaped'] pattern = '''a(bc)d''' input = '''abcd''' matches = [[0, 4]] [[tests]] name = "basic169" options = ['escaped'] pattern = '''a[\x01-\x03]?c''' input = '''a\x02c''' matches = [[0, 3]] [[tests]] name = "basic170" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Qaddafi''' matches = [[0, 15]] [[tests]] name = "basic171" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Mo'ammar Gadhafi''' matches = [[0, 16]] [[tests]] name = "basic172" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Kaddafi''' matches = [[0, 15]] [[tests]] name = "basic173" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Qadhafi''' matches = [[0, 15]] [[tests]] name = "basic174" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Gadafi''' matches = [[0, 14]] [[tests]] name = "basic175" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Mu'ammar Qadafi''' matches = [[0, 15]] [[tests]] name = "basic176" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Moamar Gaddafi''' matches = [[0, 14]] [[tests]] name = "basic177" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Mu'ammar Qadhdhafi''' matches = [[0, 18]] [[tests]] name = "basic178" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Khaddafi''' matches = [[0, 16]] [[tests]] name = "basic179" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Ghaddafy''' matches = [[0, 16]] [[tests]] name = "basic180" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Ghadafi''' matches = [[0, 15]] [[tests]] name = "basic181" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Ghaddafi''' matches = [[0, 16]] [[tests]] name = "basic182" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muamar Kaddafi''' matches = [[0, 14]] [[tests]] name = "basic183" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Quathafi''' matches = [[0, 16]] [[tests]] name = "basic184" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Muammar Gheddafi''' matches = [[0, 16]] [[tests]] name = "basic185" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = 
'''Moammar Khadafy''' matches = [[0, 15]] [[tests]] name = "basic186" options = ['escaped'] pattern = '''M[ou]'?am+[ae]r .*([AEae]l[- ])?[GKQ]h?[aeu]+([dtz][dhz]?)+af[iy]''' input = '''Moammar Qudhafi''' matches = [[0, 15]] [[tests]] name = "basic187" options = ['escaped'] pattern = '''a+(b|c)*d+''' input = '''aabcdd''' matches = [[0, 6]] [[tests]] name = "basic188" options = ['escaped'] pattern = '''^.+$''' input = '''vivi''' matches = [[0, 4]] [[tests]] name = "basic189" options = ['escaped'] pattern = '''^(.+)$''' input = '''vivi''' matches = [[0, 4]] [[tests]] name = "basic190" options = ['escaped'] pattern = '''^([^!.]+).att.com!(.+)$''' input = '''gryphon.att.com!eby''' matches = [[0, 19]] [[tests]] name = "basic191" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$''' input = '''bas''' matches = [[0, 3]] [[tests]] name = "basic192" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic193" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic194" options = ['escaped'] pattern = '''^.+!([^!]+!)([^!]+)$''' input = '''foo!bar!bas''' matches = [[0, 11]] [[tests]] name = "basic195" options = ['escaped'] pattern = '''((foo)|(bar))!bas''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic196" options = ['escaped'] pattern = '''((foo)|(bar))!bas''' input = '''foo!bar!bas''' matches = [[4, 11]] [[tests]] name = "basic197" options = ['escaped'] pattern = '''((foo)|(bar))!bas''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic198" options = ['escaped'] pattern = '''((foo)|bar)!bas''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic199" options = ['escaped'] pattern = '''((foo)|bar)!bas''' input = '''foo!bar!bas''' matches = [[4, 11]] [[tests]] name = "basic200" options = ['escaped'] pattern = '''((foo)|bar)!bas''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic201" options = ['escaped'] pattern = '''(foo|(bar))!bas''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic202" options = ['escaped'] pattern = '''(foo|(bar))!bas''' input = '''foo!bar!bas''' matches = [[4, 11]] [[tests]] name = "basic203" options = ['escaped'] pattern = '''(foo|(bar))!bas''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic204" options = ['escaped'] pattern = '''(foo|bar)!bas''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic205" options = ['escaped'] pattern = '''(foo|bar)!bas''' input = '''foo!bar!bas''' matches = [[4, 11]] [[tests]] name = "basic206" options = ['escaped'] pattern = '''(foo|bar)!bas''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic207" options = ['escaped'] pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' input = '''foo!bar!bas''' matches = [[0, 11]] [[tests]] name = "basic208" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' input = '''bas''' matches = [[0, 3]] [[tests]] name = "basic209" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic210" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' input = '''foo!bar!bas''' matches = [[0, 11]] [[tests]] name = "basic211" options = ['escaped'] pattern = '''^([^!]+!)?([^!]+)$|^.+!([^!]+!)([^!]+)$''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic212" options = ['escaped'] pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' 
input = '''bas''' matches = [[0, 3]] [[tests]] name = "basic213" options = ['escaped'] pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' input = '''bar!bas''' matches = [[0, 7]] [[tests]] name = "basic214" options = ['escaped'] pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' input = '''foo!bar!bas''' matches = [[0, 11]] [[tests]] name = "basic215" options = ['escaped'] pattern = '''^(([^!]+!)?([^!]+)|.+!([^!]+!)([^!]+))$''' input = '''foo!bas''' matches = [[0, 7]] [[tests]] name = "basic216" options = ['escaped'] pattern = '''.*(/XXX).*''' input = '''/XXX''' matches = [[0, 4]] [[tests]] name = "basic217" options = ['escaped'] pattern = '''.*(\\XXX).*''' input = '''\\XXX''' matches = [[0, 4]] [[tests]] name = "basic218" options = ['escaped'] pattern = '''\\XXX''' input = '''\\XXX''' matches = [[0, 4]] [[tests]] name = "basic219" options = ['escaped'] pattern = '''.*(/000).*''' input = '''/000''' matches = [[0, 4]] [[tests]] name = "basic220" options = ['escaped'] pattern = '''.*(\\000).*''' input = '''\\000''' matches = [[0, 4]] [[tests]] name = "basic221" options = ['escaped'] pattern = '''\\000''' input = '''\\000''' matches = [[0, 4]] regex-automata-0.1.8/data/tests/fowler/fowler-to-toml010075500017500000144000000046701341033352500210540ustar0000000000000000#!/usr/bin/env python from __future__ import absolute_import, division, print_function import argparse import os.path as path def read_tests(f): basename, _ = path.splitext(path.basename(f)) tests = [] prev_pattern = None for lineno, line in enumerate(open(f), 1): fields = list(filter(None, map(str.strip, line.split('\t')))) if not (4 <= len(fields) <= 5) \ or 'E' not in fields[0] or fields[0][0] == '#': continue terse_opts, pat, text, sgroups = fields[0:4] groups = [] # groups as integer ranges if sgroups == 'NOMATCH': groups = [] elif ',' in sgroups: noparen = map(lambda s: s.strip('()'), sgroups.split(')(')) for g in noparen: s, e = map(str.strip, g.split(',')) groups.append([int(s), int(e)]) break else: # This skips tests that should result in an error. # There aren't many, so I think we can just capture those # manually. Possibly fix this in future. continue opts = [] if text == "NULL": text = "" if pat == 'SAME': pat = prev_pattern if '$' in terse_opts: pat = pat.encode('utf-8').decode('unicode_escape') text = text.encode('utf-8').decode('unicode_escape') text = text.encode('unicode_escape').decode('utf-8') opts.append('escaped') else: opts.append('escaped') text = text.encode('unicode_escape').decode('utf-8') if 'i' in terse_opts: opts.append('case-insensitive') pat = pat.encode('unicode_escape').decode('utf-8') pat = pat.replace('\\\\', '\\') tests.append({ 'name': '"%s%d"' % (basename, lineno), 'options': repr(opts), 'pattern': "'''%s'''" % pat, 'input': "'''%s'''" % text, 'matches': str(groups), }) prev_pattern = pat return tests if __name__ == '__main__': parser = argparse.ArgumentParser( description='Generate match tests from an AT&T POSIX test file.') aa = parser.add_argument aa('datfile', help='A dat AT&T POSIX test file.') args = parser.parse_args() tests = read_tests(args.datfile) for t in tests: print('[[tests]]') for k, v in t.items(): print('%s = %s' % (k, v)) print('') regex-automata-0.1.8/data/tests/fowler/nullsubexpr.dat010064400017500000144000000040461340550531600213140ustar0000000000000000NOTE null subexpression matches : 2002-06-06 E (a*)* a (0,1)(0,1) #E SAME x (0,0)(0,0) E SAME x (0,0)(?,?) 
RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a*)+ a (0,1)(0,1) E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a+)* a (0,1)(0,1) E SAME x (0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E (a+)+ a (0,1)(0,1) E SAME x NOMATCH E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)* a (0,1)(0,1) #E SAME x (0,0)(0,0) E SAME x (0,0)(?,?) RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([a]*)+ a (0,1)(0,1) E SAME x (0,0)(0,0) E SAME aaaaaa (0,6)(0,6) E SAME aaaaaax (0,6)(0,6) E ([^b]*)* a (0,1)(0,1) #E SAME b (0,0)(0,0) E SAME b (0,0)(?,?) RE2/Go E SAME aaaaaa (0,6)(0,6) E SAME aaaaaab (0,6)(0,6) E ([ab]*)* a (0,1)(0,1) E SAME aaaaaa (0,6)(0,6) E SAME ababab (0,6)(0,6) E SAME bababa (0,6)(0,6) E SAME b (0,1)(0,1) E SAME bbbbbb (0,6)(0,6) E SAME aaaabcde (0,5)(0,5) E ([^a]*)* b (0,1)(0,1) E SAME bbbbbb (0,6)(0,6) #E SAME aaaaaa (0,0)(0,0) E SAME aaaaaa (0,0)(?,?) RE2/Go E ([^ab]*)* ccccxx (0,6)(0,6) #E SAME ababab (0,0)(0,0) E SAME ababab (0,0)(?,?) RE2/Go E ((z)+|a)* zabcde (0,2)(1,2) #{E a+? aaaaaa (0,1) no *? +? mimimal match ops #E (a) aaa (0,1)(0,1) #E (a*?) aaa (0,0)(0,0) #E (a)*? aaa (0,0) #E (a*?)*? aaa (0,0) #} B \(a*\)*\(x\) x (0,1)(0,0)(0,1) B \(a*\)*\(x\) ax (0,2)(0,1)(1,2) B \(a*\)*\(x\) axa (0,2)(0,1)(1,2) B \(a*\)*\(x\)\(\1\) x (0,1)(0,0)(0,1)(1,1) B \(a*\)*\(x\)\(\1\) ax (0,2)(1,1)(1,2)(2,2) B \(a*\)*\(x\)\(\1\) axa (0,3)(0,1)(1,2)(2,3) B \(a*\)*\(x\)\(\1\)\(x\) axax (0,4)(0,1)(1,2)(2,3)(3,4) B \(a*\)*\(x\)\(\1\)\(x\) axxa (0,3)(1,1)(1,2)(2,2)(2,3) #E (a*)*(x) x (0,1)(0,0)(0,1) E (a*)*(x) x (0,1)(?,?)(0,1) RE2/Go E (a*)*(x) ax (0,2)(0,1)(1,2) E (a*)*(x) axa (0,2)(0,1)(1,2) E (a*)+(x) x (0,1)(0,0)(0,1) E (a*)+(x) ax (0,2)(0,1)(1,2) E (a*)+(x) axa (0,2)(0,1)(1,2) E (a*){2}(x) x (0,1)(0,0)(0,1) E (a*){2}(x) ax (0,2)(1,1)(1,2) E (a*){2}(x) axa (0,2)(1,1)(1,2) regex-automata-0.1.8/data/tests/fowler/nullsubexpr.toml010064400017500000144000000133671341033352600215230ustar0000000000000000[[tests]] name = "nullsubexpr3" options = ['escaped'] pattern = '''(a*)*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr5" options = ['escaped'] pattern = '''(a*)*''' input = '''x''' matches = [[0, 0]] [[tests]] name = "nullsubexpr6" options = ['escaped'] pattern = '''(a*)*''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr7" options = ['escaped'] pattern = '''(a*)*''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr8" options = ['escaped'] pattern = '''(a*)+''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr9" options = ['escaped'] pattern = '''(a*)+''' input = '''x''' matches = [[0, 0]] [[tests]] name = "nullsubexpr10" options = ['escaped'] pattern = '''(a*)+''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr11" options = ['escaped'] pattern = '''(a*)+''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr12" options = ['escaped'] pattern = '''(a+)*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr13" options = ['escaped'] pattern = '''(a+)*''' input = '''x''' matches = [[0, 0]] [[tests]] name = "nullsubexpr14" options = ['escaped'] pattern = '''(a+)*''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr15" options = ['escaped'] pattern = '''(a+)*''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr16" options = ['escaped'] pattern = '''(a+)+''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr17" options = ['escaped'] pattern = '''(a+)+''' input 
= '''x''' matches = [] [[tests]] name = "nullsubexpr18" options = ['escaped'] pattern = '''(a+)+''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr19" options = ['escaped'] pattern = '''(a+)+''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr21" options = ['escaped'] pattern = '''([a]*)*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr23" options = ['escaped'] pattern = '''([a]*)*''' input = '''x''' matches = [[0, 0]] [[tests]] name = "nullsubexpr24" options = ['escaped'] pattern = '''([a]*)*''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr25" options = ['escaped'] pattern = '''([a]*)*''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr26" options = ['escaped'] pattern = '''([a]*)+''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr27" options = ['escaped'] pattern = '''([a]*)+''' input = '''x''' matches = [[0, 0]] [[tests]] name = "nullsubexpr28" options = ['escaped'] pattern = '''([a]*)+''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr29" options = ['escaped'] pattern = '''([a]*)+''' input = '''aaaaaax''' matches = [[0, 6]] [[tests]] name = "nullsubexpr30" options = ['escaped'] pattern = '''([^b]*)*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr32" options = ['escaped'] pattern = '''([^b]*)*''' input = '''b''' matches = [[0, 0]] [[tests]] name = "nullsubexpr33" options = ['escaped'] pattern = '''([^b]*)*''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr34" options = ['escaped'] pattern = '''([^b]*)*''' input = '''aaaaaab''' matches = [[0, 6]] [[tests]] name = "nullsubexpr35" options = ['escaped'] pattern = '''([ab]*)*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "nullsubexpr36" options = ['escaped'] pattern = '''([ab]*)*''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr37" options = ['escaped'] pattern = '''([ab]*)*''' input = '''ababab''' matches = [[0, 6]] [[tests]] name = "nullsubexpr38" options = ['escaped'] pattern = '''([ab]*)*''' input = '''bababa''' matches = [[0, 6]] [[tests]] name = "nullsubexpr39" options = ['escaped'] pattern = '''([ab]*)*''' input = '''b''' matches = [[0, 1]] [[tests]] name = "nullsubexpr40" options = ['escaped'] pattern = '''([ab]*)*''' input = '''bbbbbb''' matches = [[0, 6]] [[tests]] name = "nullsubexpr41" options = ['escaped'] pattern = '''([ab]*)*''' input = '''aaaabcde''' matches = [[0, 5]] [[tests]] name = "nullsubexpr42" options = ['escaped'] pattern = '''([^a]*)*''' input = '''b''' matches = [[0, 1]] [[tests]] name = "nullsubexpr43" options = ['escaped'] pattern = '''([^a]*)*''' input = '''bbbbbb''' matches = [[0, 6]] [[tests]] name = "nullsubexpr45" options = ['escaped'] pattern = '''([^a]*)*''' input = '''aaaaaa''' matches = [[0, 0]] [[tests]] name = "nullsubexpr46" options = ['escaped'] pattern = '''([^ab]*)*''' input = '''ccccxx''' matches = [[0, 6]] [[tests]] name = "nullsubexpr48" options = ['escaped'] pattern = '''([^ab]*)*''' input = '''ababab''' matches = [[0, 0]] [[tests]] name = "nullsubexpr50" options = ['escaped'] pattern = '''((z)+|a)*''' input = '''zabcde''' matches = [[0, 2]] [[tests]] name = "nullsubexpr69" options = ['escaped'] pattern = '''(a*)*(x)''' input = '''x''' matches = [[0, 1]] [[tests]] name = "nullsubexpr70" options = ['escaped'] pattern = '''(a*)*(x)''' input = '''ax''' matches = [[0, 2]] [[tests]] name = "nullsubexpr71" options = ['escaped'] pattern = '''(a*)*(x)''' input = '''axa''' matches = 
[[0, 2]] [[tests]] name = "nullsubexpr73" options = ['escaped'] pattern = '''(a*)+(x)''' input = '''x''' matches = [[0, 1]] [[tests]] name = "nullsubexpr74" options = ['escaped'] pattern = '''(a*)+(x)''' input = '''ax''' matches = [[0, 2]] [[tests]] name = "nullsubexpr75" options = ['escaped'] pattern = '''(a*)+(x)''' input = '''axa''' matches = [[0, 2]] [[tests]] name = "nullsubexpr77" options = ['escaped'] pattern = '''(a*){2}(x)''' input = '''x''' matches = [[0, 1]] [[tests]] name = "nullsubexpr78" options = ['escaped'] pattern = '''(a*){2}(x)''' input = '''ax''' matches = [[0, 2]] [[tests]] name = "nullsubexpr79" options = ['escaped'] pattern = '''(a*){2}(x)''' input = '''axa''' matches = [[0, 2]] regex-automata-0.1.8/data/tests/fowler/repetition-long.dat010064400017500000144000000076451340605254400220610ustar0000000000000000NOTE implicit vs. explicit repetitions : 2009-02-02 # Glenn Fowler # conforming matches (column 4) must match one of the following BREs # NOMATCH # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* # i.e., each 3-tuple has two identical elements and one (?,?) NOTE additional repetition tests graciously provided by Chris Kuklewicz www.haskell.org 2009-02-02 :HA#100:E X(.?){0,}Y X1234567Y (0,9)(7,8) :HA#101:E X(.?){1,}Y X1234567Y (0,9)(7,8) :HA#102:E X(.?){2,}Y X1234567Y (0,9)(7,8) :HA#103:E X(.?){3,}Y X1234567Y (0,9)(7,8) :HA#104:E X(.?){4,}Y X1234567Y (0,9)(7,8) :HA#105:E X(.?){5,}Y X1234567Y (0,9)(7,8) :HA#106:E X(.?){6,}Y X1234567Y (0,9)(7,8) :HA#107:E X(.?){7,}Y X1234567Y (0,9)(7,8) :HA#108:E X(.?){8,}Y X1234567Y (0,9)(8,8) #:HA#110:E X(.?){0,8}Y X1234567Y (0,9)(7,8) :HA#110:E X(.?){0,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#111:E X(.?){1,8}Y X1234567Y (0,9)(7,8) :HA#111:E X(.?){1,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#112:E X(.?){2,8}Y X1234567Y (0,9)(7,8) :HA#112:E X(.?){2,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#113:E X(.?){3,8}Y X1234567Y (0,9)(7,8) :HA#113:E X(.?){3,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#114:E X(.?){4,8}Y X1234567Y (0,9)(7,8) :HA#114:E X(.?){4,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#115:E X(.?){5,8}Y X1234567Y (0,9)(7,8) :HA#115:E X(.?){5,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#116:E X(.?){6,8}Y X1234567Y (0,9)(7,8) :HA#116:E X(.?){6,8}Y X1234567Y (0,9)(8,8) RE2/Go #:HA#117:E X(.?){7,8}Y X1234567Y (0,9)(7,8) :HA#117:E X(.?){7,8}Y X1234567Y (0,9)(8,8) RE2/Go :HA#118:E X(.?){8,8}Y X1234567Y (0,9)(8,8) # These test a fixed bug in my regex-tdfa that did not keep the expanded # form properly grouped, so right association did the wrong thing with # these ambiguous patterns (crafted just to test my code when I became # suspicious of my implementation). The first subexpression should use # "ab" then "a" then "bcd". # OS X / FreeBSD / NetBSD badly fail many of these, with impossible # results like (0,6)(4,5)(6,6). :HA#260:E (a|ab|c|bcd){0,}(d*) ababcd (0,1)(0,1)(1,1) :HA#261:E (a|ab|c|bcd){1,}(d*) ababcd (0,1)(0,1)(1,1) :HA#262:E (a|ab|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) :HA#263:E (a|ab|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) :HA#264:E (a|ab|c|bcd){4,}(d*) ababcd NOMATCH :HA#265:E (a|ab|c|bcd){0,10}(d*) ababcd (0,1)(0,1)(1,1) :HA#266:E (a|ab|c|bcd){1,10}(d*) ababcd (0,1)(0,1)(1,1) :HA#267:E (a|ab|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#268:E (a|ab|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#269:E (a|ab|c|bcd){4,10}(d*) ababcd NOMATCH :HA#270:E (a|ab|c|bcd)*(d*) ababcd (0,1)(0,1)(1,1) :HA#271:E (a|ab|c|bcd)+(d*) ababcd (0,1)(0,1)(1,1) # The above worked on Linux/GLIBC but the following often fail. 
# They also trip up OS X / FreeBSD / NetBSD: #:HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(3,6)(6,6) :HA#280:E (ab|a|c|bcd){0,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(3,6)(6,6) :HA#281:E (ab|a|c|bcd){1,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(3,6)(6,6) :HA#282:E (ab|a|c|bcd){2,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(3,6)(6,6) :HA#283:E (ab|a|c|bcd){3,}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go :HA#284:E (ab|a|c|bcd){4,}(d*) ababcd NOMATCH #:HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#285:E (ab|a|c|bcd){0,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#286:E (ab|a|c|bcd){1,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#287:E (ab|a|c|bcd){2,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(3,6)(6,6) :HA#288:E (ab|a|c|bcd){3,10}(d*) ababcd (0,6)(4,5)(5,6) RE2/Go :HA#289:E (ab|a|c|bcd){4,10}(d*) ababcd NOMATCH #:HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(3,6)(6,6) :HA#290:E (ab|a|c|bcd)*(d*) ababcd (0,6)(4,5)(5,6) RE2/Go #:HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(3,6)(6,6) :HA#291:E (ab|a|c|bcd)+(d*) ababcd (0,6)(4,5)(5,6) RE2/Go regex-automata-0.1.8/data/tests/fowler/repetition-long.toml010064400017500000144000000127651341033411400222520ustar0000000000000000[[tests]] name = "repetition-long12" options = ['escaped'] pattern = '''X(.?){0,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long13" options = ['escaped'] pattern = '''X(.?){1,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long14" options = ['escaped'] pattern = '''X(.?){2,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long15" options = ['escaped'] pattern = '''X(.?){3,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long16" options = ['escaped'] pattern = '''X(.?){4,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long17" options = ['escaped'] pattern = '''X(.?){5,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long18" options = ['escaped'] pattern = '''X(.?){6,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long19" options = ['escaped'] pattern = '''X(.?){7,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long20" options = ['escaped'] pattern = '''X(.?){8,}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long22" options = ['escaped'] pattern = '''X(.?){0,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long24" options = ['escaped'] pattern = '''X(.?){1,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long26" options = ['escaped'] pattern = '''X(.?){2,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long28" options = ['escaped'] pattern = '''X(.?){3,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long30" options = ['escaped'] pattern = '''X(.?){4,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long32" options = ['escaped'] pattern = '''X(.?){5,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long34" options = ['escaped'] pattern = '''X(.?){6,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long36" options = 
['escaped'] pattern = '''X(.?){7,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long37" options = ['escaped'] pattern = '''X(.?){8,8}Y''' input = '''X1234567Y''' matches = [[0, 9]] [[tests]] name = "repetition-long48" options = ['escaped'] pattern = '''(a|ab|c|bcd){0,}(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long49" options = ['escaped'] pattern = '''(a|ab|c|bcd){1,}(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long50" options = ['escaped'] pattern = '''(a|ab|c|bcd){2,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long51" options = ['escaped'] pattern = '''(a|ab|c|bcd){3,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long52" options = ['escaped'] pattern = '''(a|ab|c|bcd){4,}(d*)''' input = '''ababcd''' matches = [] [[tests]] name = "repetition-long53" options = ['escaped'] pattern = '''(a|ab|c|bcd){0,10}(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long54" options = ['escaped'] pattern = '''(a|ab|c|bcd){1,10}(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long55" options = ['escaped'] pattern = '''(a|ab|c|bcd){2,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long56" options = ['escaped'] pattern = '''(a|ab|c|bcd){3,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long57" options = ['escaped'] pattern = '''(a|ab|c|bcd){4,10}(d*)''' input = '''ababcd''' matches = [] [[tests]] name = "repetition-long58" options = ['escaped'] pattern = '''(a|ab|c|bcd)*(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long59" options = ['escaped'] pattern = '''(a|ab|c|bcd)+(d*)''' input = '''ababcd''' matches = [[0, 1]] [[tests]] name = "repetition-long65" options = ['escaped'] pattern = '''(ab|a|c|bcd){0,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long67" options = ['escaped'] pattern = '''(ab|a|c|bcd){1,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long69" options = ['escaped'] pattern = '''(ab|a|c|bcd){2,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long71" options = ['escaped'] pattern = '''(ab|a|c|bcd){3,}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long72" options = ['escaped'] pattern = '''(ab|a|c|bcd){4,}(d*)''' input = '''ababcd''' matches = [] [[tests]] name = "repetition-long74" options = ['escaped'] pattern = '''(ab|a|c|bcd){0,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long76" options = ['escaped'] pattern = '''(ab|a|c|bcd){1,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long78" options = ['escaped'] pattern = '''(ab|a|c|bcd){2,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long80" options = ['escaped'] pattern = '''(ab|a|c|bcd){3,10}(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long81" options = ['escaped'] pattern = '''(ab|a|c|bcd){4,10}(d*)''' input = '''ababcd''' matches = [] [[tests]] name = "repetition-long83" options = ['escaped'] pattern = '''(ab|a|c|bcd)*(d*)''' input = '''ababcd''' matches = [[0, 6]] [[tests]] name = "repetition-long85" options = ['escaped'] pattern = '''(ab|a|c|bcd)+(d*)''' input = '''ababcd''' matches = [[0, 6]] 
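NOTE: generated test files such as repetition-long.toml above are plain TOML, so they can be sanity-checked with any TOML parser before being fed to the Rust test harness. Below is a minimal sketch of such a check, assuming Python 3 with the third-party toml package installed (the crate itself reads these files from Rust via the toml dev-dependency) and a working directory at the crate root:

import toml  # third-party package: pip install toml

doc = toml.load('data/tests/fowler/repetition-long.toml')
for test in doc['tests']:
    # Each entry records a name, a pattern/input pair, and the expected
    # overall match spans as [start, end] byte offsets ([] means NOMATCH).
    print('%s: %d expected match(es)' % (test['name'], len(test['matches'])))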
regex-automata-0.1.8/data/tests/fowler/repetition.dat010064400017500000144000000056241340605255100211150ustar0000000000000000NOTE implicit vs. explicit repetitions : 2009-02-02 # Glenn Fowler # conforming matches (column 4) must match one of the following BREs # NOMATCH # (0,.)\((\(.\),\(.\))(?,?)(\2,\3)\)* # (0,.)\((\(.\),\(.\))(\2,\3)(?,?)\)* # i.e., each 3-tuple has two identical elements and one (?,?) E ((..)|(.)) NULL NOMATCH E ((..)|(.))((..)|(.)) NULL NOMATCH E ((..)|(.))((..)|(.))((..)|(.)) NULL NOMATCH E ((..)|(.)){1} NULL NOMATCH E ((..)|(.)){2} NULL NOMATCH E ((..)|(.)){3} NULL NOMATCH E ((..)|(.))* NULL (0,0) E ((..)|(.)) a (0,1)(0,1)(?,?)(0,1) E ((..)|(.))((..)|(.)) a NOMATCH E ((..)|(.))((..)|(.))((..)|(.)) a NOMATCH E ((..)|(.)){1} a (0,1)(0,1)(?,?)(0,1) E ((..)|(.)){2} a NOMATCH E ((..)|(.)){3} a NOMATCH E ((..)|(.))* a (0,1)(0,1)(?,?)(0,1) E ((..)|(.)) aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aa (0,2)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2) E ((..)|(.))((..)|(.))((..)|(.)) aa NOMATCH E ((..)|(.)){1} aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aa (0,2)(1,2)(?,?)(1,2) E ((..)|(.)){3} aa NOMATCH E ((..)|(.))* aa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)) aaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaa (0,3)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3) E ((..)|(.))((..)|(.))((..)|(.)) aaa (0,3)(0,1)(?,?)(0,1)(1,2)(?,?)(1,2)(2,3)(?,?)(2,3) E ((..)|(.)){1} aaa (0,2)(0,2)(0,2)(?,?) #E ((..)|(.)){2} aaa (0,3)(2,3)(?,?)(2,3) E ((..)|(.)){2} aaa (0,3)(2,3)(0,2)(2,3) RE2/Go E ((..)|(.)){3} aaa (0,3)(2,3)(?,?)(2,3) #E ((..)|(.))* aaa (0,3)(2,3)(?,?)(2,3) E ((..)|(.))* aaa (0,3)(2,3)(0,2)(2,3) RE2/Go E ((..)|(.)) aaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaa (0,4)(0,2)(0,2)(?,?)(2,3)(?,?)(2,3)(3,4)(?,?)(3,4) E ((..)|(.)){1} aaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaa (0,4)(2,4)(2,4)(?,?) #E ((..)|(.)){3} aaaa (0,4)(3,4)(?,?)(3,4) E ((..)|(.)){3} aaaa (0,4)(3,4)(0,2)(3,4) RE2/Go E ((..)|(.))* aaaa (0,4)(2,4)(2,4)(?,?) E ((..)|(.)) aaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaaa (0,5)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,5)(?,?)(4,5) E ((..)|(.)){1} aaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaaa (0,4)(2,4)(2,4)(?,?) #E ((..)|(.)){3} aaaaa (0,5)(4,5)(?,?)(4,5) E ((..)|(.)){3} aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go #E ((..)|(.))* aaaaa (0,5)(4,5)(?,?)(4,5) E ((..)|(.))* aaaaa (0,5)(4,5)(2,4)(4,5) RE2/Go E ((..)|(.)) aaaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.))((..)|(.)) aaaaaa (0,4)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?) E ((..)|(.))((..)|(.))((..)|(.)) aaaaaa (0,6)(0,2)(0,2)(?,?)(2,4)(2,4)(?,?)(4,6)(4,6)(?,?) E ((..)|(.)){1} aaaaaa (0,2)(0,2)(0,2)(?,?) E ((..)|(.)){2} aaaaaa (0,4)(2,4)(2,4)(?,?) E ((..)|(.)){3} aaaaaa (0,6)(4,6)(4,6)(?,?) E ((..)|(.))* aaaaaa (0,6)(4,6)(4,6)(?,?) 
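NOTE: the repetition.toml file that follows was generated from repetition.dat above by the fowler-to-toml script earlier in this tree. Below is a minimal sketch of that mapping for a single test line, assuming Python 3 and that the script's read_tests function is in scope; the temporary file is only illustrative, since the real script derives test names from the .dat basename.

import tempfile

# One AT&T-style test line: tab-separated flags, pattern, input, groups.
dat_line = 'E\t((..)|(.))*\taaaaaa\t(0,6)(4,6)(4,6)(?,?)\n'
with tempfile.NamedTemporaryFile('w', suffix='.dat', delete=False) as f:
    f.write(dat_line)
    path = f.name

# read_tests keeps only the overall match span: it breaks out of its
# group loop after the first (start, end) pair, which is also how it
# avoids choking on unmatched-group markers such as (?,?).
for t in read_tests(path):
    print('[[tests]]')
    for k, v in t.items():
        print('%s = %s' % (k, v))

# Prints (modulo the generated name) options = ['escaped'], the pattern,
# the input, and matches = [[0, 6]], matching the entries below.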
regex-automata-0.1.8/data/tests/fowler/repetition.toml010064400017500000144000000136271341033352600213210ustar0000000000000000[[tests]] name = "repetition10" options = ['escaped'] pattern = '''((..)|(.))''' input = '''''' matches = [] [[tests]] name = "repetition11" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''''' matches = [] [[tests]] name = "repetition12" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''''' matches = [] [[tests]] name = "repetition14" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''''' matches = [] [[tests]] name = "repetition15" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''''' matches = [] [[tests]] name = "repetition16" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''''' matches = [] [[tests]] name = "repetition18" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''''' matches = [[0, 0]] [[tests]] name = "repetition20" options = ['escaped'] pattern = '''((..)|(.))''' input = '''a''' matches = [[0, 1]] [[tests]] name = "repetition21" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''a''' matches = [] [[tests]] name = "repetition22" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''a''' matches = [] [[tests]] name = "repetition24" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''a''' matches = [[0, 1]] [[tests]] name = "repetition25" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''a''' matches = [] [[tests]] name = "repetition26" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''a''' matches = [] [[tests]] name = "repetition28" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''a''' matches = [[0, 1]] [[tests]] name = "repetition30" options = ['escaped'] pattern = '''((..)|(.))''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "repetition31" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "repetition32" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''aa''' matches = [] [[tests]] name = "repetition34" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "repetition35" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "repetition36" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''aa''' matches = [] [[tests]] name = "repetition38" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''aa''' matches = [[0, 2]] [[tests]] name = "repetition40" options = ['escaped'] pattern = '''((..)|(.))''' input = '''aaa''' matches = [[0, 2]] [[tests]] name = "repetition41" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "repetition42" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "repetition44" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''aaa''' matches = [[0, 2]] [[tests]] name = "repetition46" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "repetition47" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "repetition50" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''aaa''' matches = [[0, 3]] [[tests]] name = "repetition52" options = ['escaped'] pattern = 
'''((..)|(.))''' input = '''aaaa''' matches = [[0, 2]] [[tests]] name = "repetition53" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "repetition54" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "repetition56" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''aaaa''' matches = [[0, 2]] [[tests]] name = "repetition57" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "repetition59" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "repetition61" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''aaaa''' matches = [[0, 4]] [[tests]] name = "repetition63" options = ['escaped'] pattern = '''((..)|(.))''' input = '''aaaaa''' matches = [[0, 2]] [[tests]] name = "repetition64" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''aaaaa''' matches = [[0, 4]] [[tests]] name = "repetition65" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''aaaaa''' matches = [[0, 5]] [[tests]] name = "repetition67" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''aaaaa''' matches = [[0, 2]] [[tests]] name = "repetition68" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''aaaaa''' matches = [[0, 4]] [[tests]] name = "repetition70" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''aaaaa''' matches = [[0, 5]] [[tests]] name = "repetition73" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''aaaaa''' matches = [[0, 5]] [[tests]] name = "repetition75" options = ['escaped'] pattern = '''((..)|(.))''' input = '''aaaaaa''' matches = [[0, 2]] [[tests]] name = "repetition76" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))''' input = '''aaaaaa''' matches = [[0, 4]] [[tests]] name = "repetition77" options = ['escaped'] pattern = '''((..)|(.))((..)|(.))((..)|(.))''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "repetition79" options = ['escaped'] pattern = '''((..)|(.)){1}''' input = '''aaaaaa''' matches = [[0, 2]] [[tests]] name = "repetition80" options = ['escaped'] pattern = '''((..)|(.)){2}''' input = '''aaaaaa''' matches = [[0, 4]] [[tests]] name = "repetition81" options = ['escaped'] pattern = '''((..)|(.)){3}''' input = '''aaaaaa''' matches = [[0, 6]] [[tests]] name = "repetition83" options = ['escaped'] pattern = '''((..)|(.))*''' input = '''aaaaaa''' matches = [[0, 6]] regex-automata-0.1.8/data/tests/iter.toml010064400017500000144000000027351341326256500166110ustar0000000000000000[[tests]] name = "iter1" pattern = "a" input = "aaa" matches = [[0, 1], [1, 2], [2, 3]] [[tests]] name = "iter2" pattern = "a" input = "aba" matches = [[0, 1], [2, 3]] [[tests]] name = "iter-empty1" pattern = '' input = '' matches = [[0, 0]] [[tests]] name = "iter-empty2" pattern = '' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty3" pattern = '()' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty4" pattern = '()*' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty5" pattern = '()+' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty6" pattern = '()?' 
input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty7" pattern = '()()' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty8" pattern = '()+|z' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty9" pattern = 'z|()+' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty10" pattern = '()+|b' input = 'abc' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] name = "iter-empty11" pattern = 'b|()+' input = 'abc' matches = [[0, 0], [1, 2], [3, 3]] [[tests]] options = ["anchored"] name = "iter-anchored1" pattern = "a" input = "a" matches = [[0, 1]] [[tests]] options = ["anchored"] name = "iter-anchored2" pattern = "a" input = "aa" matches = [[0, 1]] regex-automata-0.1.8/data/tests/no-unicode.toml010064400017500000144000000046071341102263000176670ustar0000000000000000[[tests]] name = "invalid-utf8-literal1" options = ["escaped", "invalid-utf8", "no-unicode"] pattern = '\xFF' input = '\xFF' matches = [[0, 1]] [[tests]] name = "no-unicode-mixed" options = ["escaped", "invalid-utf8"] pattern = '(.+)(?-u)(.+)' input = '\xCE\x93\xCE\x94\xFF' matches = [[0, 5]] [[tests]] name = "no-unicode-case1" options = ["case-insensitive", "no-unicode"] pattern = "a" input = "A" matches = [[0, 1]] [[tests]] name = "no-unicode-case2" options = ["case-insensitive", "no-unicode"] pattern = "[a-z]+" input = "AaAaA" matches = [[0, 5]] [[tests]] name = "no-unicode-case3" options = ["case-insensitive"] pattern = "[a-z]+" input = "aA\u212AaA" matches = [[0, 7]] [[tests]] name = "no-unicode-case4" options = ["case-insensitive", "no-unicode"] pattern = "[a-z]+" input = "aA\u212AaA" matches = [[0, 2]] [[tests]] name = "no-unicode-negate1" options = [] pattern = "[^a]" input = "δ" matches = [[0, 2]] [[tests]] name = "no-unicode-negate2" options = ["no-unicode", "invalid-utf8"] pattern = "[^a]" input = "δ" matches = [[0, 1]] [[tests]] name = "no-unicode-dotstar-prefix1" options = ["escaped", "no-unicode", "invalid-utf8"] pattern = "a" input = '\xFFa' matches = [[1, 2]] [[tests]] name = "no-unicode-dotstar-prefix2" options = ["escaped", "invalid-utf8"] pattern = "a" input = '\xFFa' matches = [[1, 2]] [[tests]] name = "no-unicode-null-bytes1" options = ["escaped", "no-unicode", "invalid-utf8"] pattern = '[^\x00]+\x00' input = 'foo\x00' matches = [[0, 4]] [[tests]] name = "no-unicode1" options = ["no-unicode"] pattern = '\w+' input = "aδ" matches = [[0, 1]] [[tests]] name = "no-unicode2" options = [] pattern = '\w+' input = "aδ" matches = [[0, 3]] [[tests]] name = "no-unicode3" options = ["no-unicode"] pattern = '\d+' input = "1२३9" matches = [[0, 1]] [[tests]] name = "no-unicode4" pattern = '\d+' input = "1२३9" matches = [[0, 8]] [[tests]] name = "no-unicode5" options = ["no-unicode"] pattern = '\s+' input = " \u1680" matches = [[0, 1]] [[tests]] name = "no-unicode6" pattern = '\s+' input = " \u1680" matches = [[0, 4]] [[tests]] # See: https://github.com/rust-lang/regex/issues/484 name = "no-unicode-iter1" pattern = '' input = "☃" matches = [[0, 0], [1, 1], [2, 2], [3, 3]] [[tests]] # See: https://github.com/rust-lang/regex/issues/484 options = ['escaped'] name = "no-unicode-iter2" pattern = '' input = 'b\xFFr' matches = [[0, 0], [1, 1], [2, 2], [3, 3]] regex-automata-0.1.8/data/tests/unicode.toml010064400017500000144000000201511341032270700172540ustar0000000000000000[[tests]] name = "unicode-literal1" pattern = '☃' input = "☃" matches = [[0, 3]] [[tests]] name = "unicode-literal2" pattern = '☃+' 
input = "☃" matches = [[0, 3]] [[tests]] name = "unicode-literal3" options = ["case-insensitive"] pattern = '☃+' input = "☃" matches = [[0, 3]] [[tests]] name = "unicode-literal4" options = ["case-insensitive"] pattern = 'Δ' input = "δ" matches = [[0, 2]] [[tests]] name = "unicode-class1" pattern = '[☃Ⅰ]+' input = "☃" matches = [[0, 3]] [[tests]] name = "unicode-class2" pattern = '\pN' input = "Ⅰ" matches = [[0, 3]] [[tests]] name = "unicode-class3" pattern = '\pN+' input = "Ⅰ1Ⅱ2" matches = [[0, 8]] [[tests]] name = "unicode-class4" pattern = '\PN+' input = "abⅠ" matches = [[0, 2]] [[tests]] name = "unicode-class5" pattern = '[\PN]+' input = "abⅠ" matches = [[0, 2]] [[tests]] name = "unicode-class6" pattern = '[^\PN]+' input = "abⅠ" matches = [[2, 5]] [[tests]] name = "unicode-class7" pattern = '\p{Lu}+' input = "ΛΘΓΔα" matches = [[0, 8]] [[tests]] name = "unicode-class8" options = ["case-insensitive"] pattern = '\p{Lu}+' input = "ΛΘΓΔα" matches = [[0, 10]] [[tests]] name = "unicode-class9" pattern = '\pL+' input = "ΛΘΓΔα" matches = [[0, 10]] [[tests]] name = "unicode-class10" pattern = '\p{Ll}+' input = "ΛΘΓΔα" matches = [[8, 10]] [[tests]] name = "unicode-perl1" pattern = '\w+' input = "dδd" matches = [[0, 4]] [[tests]] name = "unicode-perl2" pattern = '\w+' input = "⥡" matches = [] [[tests]] name = "unicode-perl3" pattern = '\W+' input = "⥡" matches = [[0, 3]] [[tests]] name = "unicode-perl4" pattern = '\d+' input = "1२३9" matches = [[0, 8]] [[tests]] name = "unicode-perl5" pattern = '\d+' input = "Ⅱ" matches = [] [[tests]] name = "unicode-perl6" pattern = '\D+' input = "Ⅱ" matches = [[0, 3]] [[tests]] name = "unicode-perl7" pattern = '\s+' input = " " matches = [[0, 3]] [[tests]] name = "unicode-perl8" pattern = '\s+' input = "☃" matches = [] [[tests]] name = "unicode-perl9" pattern = '\S+' input = "☃" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat1" pattern = '\p{Cased_Letter}' input = "A" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat2" pattern = '\p{Close_Punctuation}' input = "❯" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat3" pattern = '\p{Connector_Punctuation}' input = "⁀" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat4" pattern = '\p{Control}' input = "\u009F" matches = [[0, 2]] [[tests]] name = "unicode-class-gencat5" pattern = '\p{Currency_Symbol}' input = "£" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat6" pattern = '\p{Dash_Punctuation}' input = "〰" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat7" pattern = '\p{Decimal_Number}' input = "𑓙" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat8" pattern = '\p{Enclosing_Mark}' input = "\uA672" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat9" pattern = '\p{Final_Punctuation}' input = "⸡" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat10" pattern = '\p{Format}' input = "\U000E007F" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat11" pattern = '\p{Initial_Punctuation}' input = "⸜" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat12" pattern = '\p{Letter}' input = "Έ" matches = [[0, 2]] [[tests]] name = "unicode-class-gencat13" pattern = '\p{Letter_Number}' input = "ↂ" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat14" pattern = '\p{Line_Separator}' input = "\u2028" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat15" pattern = '\p{Lowercase_Letter}' input = "ϛ" matches = [[0, 2]] [[tests]] name = "unicode-class-gencat16" pattern = '\p{Mark}' input = "\U000E01EF" matches = [[0, 4]] [[tests]] name = 
"unicode-class-gencat17" pattern = '\p{Math}' input = "⋿" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat18" pattern = '\p{Modifier_Letter}' input = "𖭃" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat19" pattern = '\p{Modifier_Symbol}' input = "🏿" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat20" pattern = '\p{Nonspacing_Mark}' input = "\U0001E94A" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat21" pattern = '\p{Number}' input = "⓿" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat22" pattern = '\p{Open_Punctuation}' input = "⦅" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat23" pattern = '\p{Other}' input = "\u0BC9" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat24" pattern = '\p{Other_Letter}' input = "ꓷ" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat25" pattern = '\p{Other_Number}' input = "㉏" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat26" pattern = '\p{Other_Punctuation}' input = "𞥞" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat27" pattern = '\p{Other_Symbol}' input = "⅌" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat28" pattern = '\p{Paragraph_Separator}' input = "\u2029" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat29" pattern = '\p{Private_Use}' input = "\U0010FFFD" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat30" pattern = '\p{Punctuation}' input = "𑁍" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat31" pattern = '\p{Separator}' input = "\u3000" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat32" pattern = '\p{Space_Separator}' input = "\u205F" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat33" pattern = '\p{Spacing_Mark}' input = "\U00016F7E" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat34" pattern = '\p{Symbol}' input = "⯈" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat35" pattern = '\p{Titlecase_Letter}' input = "ῼ" matches = [[0, 3]] [[tests]] name = "unicode-class-gencat36" pattern = '\p{Unassigned}' input = "\U0010FFFF" matches = [[0, 4]] [[tests]] name = "unicode-class-gencat37" pattern = '\p{Uppercase_Letter}' input = "Ꝋ" matches = [[0, 3]] [[tests]] name = "unicode-class-emoji1" pattern = '\p{Emoji}' input = "\u23E9" matches = [[0, 3]] [[tests]] name = "unicode-class-emoji2" pattern = '\p{emoji}' input = "\U0001F21A" matches = [[0, 4]] [[tests]] name = "unicode-class-emoji3" pattern = '\p{extendedpictographic}' input = "\U0001FA6E" matches = [[0, 4]] [[tests]] name = "unicode-class-emoji4" pattern = '\p{extendedpictographic}' input = "\U0001FFFD" matches = [[0, 4]] [[tests]] name = "unicode-class-gcb1" pattern = '\p{grapheme_cluster_break=prepend}' input = "\U00011D46" matches = [[0, 4]] [[tests]] name = "unicode-class-gcb2" pattern = '\p{gcb=regional_indicator}' input = "\U0001F1E6" matches = [[0, 4]] [[tests]] name = "unicode-class-gcb3" pattern = '\p{gcb=ri}' input = "\U0001F1E7" matches = [[0, 4]] [[tests]] name = "unicode-class-gcb4" pattern = '\p{regionalindicator}' input = "\U0001F1FF" matches = [[0, 4]] [[tests]] name = "unicode-class-gcb5" pattern = '\p{gcb=lvt}' input = "\uC989" matches = [[0, 3]] [[tests]] name = "unicode-class-gcb6" pattern = '\p{gcb=zwj}' input = "\u200D" matches = [[0, 3]] [[tests]] name = "unicode-class-word-break1" pattern = '\p{word_break=Hebrew_Letter}' input = "\uFB46" matches = [[0, 3]] [[tests]] name = "unicode-class-word-break2" pattern = '\p{wb=hebrewletter}' input = "\uFB46" matches = [[0, 3]] [[tests]] name = "unicode-class-word-break3" 
pattern = '\p{wb=ExtendNumLet}' input = "\uFF3F" matches = [[0, 3]] [[tests]] name = "unicode-class-word-break4" pattern = '\p{wb=WSegSpace}' input = "\u3000" matches = [[0, 3]] [[tests]] name = "unicode-class-word-break5" pattern = '\p{wb=numeric}' input = "\U0001E950" matches = [[0, 4]] [[tests]] name = "unicode-class-sentence-break1" pattern = '\p{sentence_break=Lower}' input = "\u0469" matches = [[0, 2]] [[tests]] name = "unicode-class-sentence-break2" pattern = '\p{sb=lower}' input = "\u0469" matches = [[0, 2]] [[tests]] name = "unicode-class-sentence-break3" pattern = '\p{sb=Close}' input = "\uFF60" matches = [[0, 3]] [[tests]] name = "unicode-class-sentence-break4" pattern = '\p{sb=Close}' input = "\U0001F677" matches = [[0, 4]] [[tests]] name = "unicode-class-sentence-break5" pattern = '\p{sb=SContinue}' input = "\uFF64" matches = [[0, 3]] regex-automata-0.1.8/src/classes.rs010064400017500000144000000114661341324351400154620ustar0000000000000000use core::fmt; /// A representation of byte oriented equivalence classes. /// /// This is used in a DFA to reduce the size of the transition table. This can /// have a particularly large impact not only on the total size of a dense DFA, /// but also on compile times. #[derive(Clone, Copy)] pub struct ByteClasses([u8; 256]); impl ByteClasses { /// Creates a new set of equivalence classes where all bytes are mapped to /// the same class. pub fn empty() -> ByteClasses { ByteClasses([0; 256]) } /// Creates a new set of equivalence classes where each byte belongs to /// its own equivalence class. pub fn singletons() -> ByteClasses { let mut classes = ByteClasses::empty(); for i in 0..256 { classes.set(i as u8, i as u8); } classes } /// Copies the byte classes given. The given slice must have length 0 or /// length 256. Slices of length 0 are treated as singletons (every byte /// is its own class). pub fn from_slice(slice: &[u8]) -> ByteClasses { assert!(slice.is_empty() || slice.len() == 256); if slice.is_empty() { ByteClasses::singletons() } else { let mut classes = ByteClasses::empty(); for (b, &class) in slice.iter().enumerate() { classes.set(b as u8, class); } classes } } /// Set the equivalence class for the given byte. #[inline] pub fn set(&mut self, byte: u8, class: u8) { self.0[byte as usize] = class; } /// Get the equivalence class for the given byte. #[inline] pub fn get(&self, byte: u8) -> u8 { self.0[byte as usize] } /// Get the equivalence class for the given byte while forcefully /// eliding bounds checks. #[inline] pub unsafe fn get_unchecked(&self, byte: u8) -> u8 { *self.0.get_unchecked(byte as usize) } /// Return the total number of elements in the alphabet represented by /// these equivalence classes. Equivalently, this returns the total number /// of equivalence classes. #[inline] pub fn alphabet_len(&self) -> usize { self.0[255] as usize + 1 } /// Returns true if and only if every byte in this class maps to its own /// equivalence class. Equivalently, there are 256 equivalence classes /// and each class contains exactly one byte. #[inline] pub fn is_singleton(&self) -> bool { self.alphabet_len() == 256 } /// Returns an iterator over a sequence of representative bytes from each /// equivalence class. Namely, this yields exactly N items, where N is /// equivalent to the number of equivalence classes. Each item is an /// arbitrary byte drawn from each equivalence class. /// /// This is useful when one is determinizing an NFA and the NFA's alphabet /// hasn't been converted to equivalence classes yet. 
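To make the representative-byte idea described in this doc comment concrete, here is a small self-contained sketch (not the crate's code; the `classes` array is a stand-in for the 256-entry `ByteClasses` table) of drawing one byte from each contiguous run of equal classes, mirroring the iterator defined below:

```rust
fn representatives(classes: &[u8; 256]) -> Vec<u8> {
    let mut reps = Vec::new();
    let mut last_class = None;
    for byte in 0..256usize {
        let class = classes[byte];
        // A byte is a representative exactly when its class differs from the
        // previous byte's class, just like ByteClassRepresentatives below.
        if last_class != Some(class) {
            last_class = Some(class);
            reps.push(byte as u8);
        }
    }
    reps
}

fn main() {
    // Toy map: ASCII digits in class 1, everything else in class 0.
    let mut classes = [0u8; 256];
    for b in b'0'..=b'9' {
        classes[b as usize] = 1;
    }
    // Class 0 occurs in two contiguous runs, so it contributes two
    // representatives; determinization still probes 3 bytes here, not 256.
    assert_eq!(vec![0x00, b'0', b':'], representatives(&classes));
}
```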
Picking an arbitrary /// byte from each equivalence class then permits a full exploration of /// the NFA instead of using every possible byte value. #[cfg(feature = "std")] pub fn representatives(&self) -> ByteClassRepresentatives { ByteClassRepresentatives { classes: self, byte: 0, last_class: None } } /// Returns all of the bytes in the given equivalence class. /// /// The second element in the tuple indicates the number of elements in /// the array. fn elements(&self, equiv: u8) -> ([u8; 256], usize) { let (mut array, mut len) = ([0; 256], 0); for b in 0..256 { if self.get(b as u8) == equiv { array[len] = b as u8; len += 1; } } (array, len) } } impl fmt::Debug for ByteClasses { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { if self.is_singleton() { write!(f, "ByteClasses({{singletons}})") } else { write!(f, "ByteClasses(")?; for equiv in 0..self.alphabet_len() { let (members, len) = self.elements(equiv as u8); write!(f, "{} => {:?}", equiv, &members[..len])?; } write!(f, ")") } } } /// An iterator over representative bytes from each equivalence class. #[cfg(feature = "std")] #[derive(Debug)] pub struct ByteClassRepresentatives<'a> { classes: &'a ByteClasses, byte: usize, last_class: Option<u8>, } #[cfg(feature = "std")] impl<'a> Iterator for ByteClassRepresentatives<'a> { type Item = u8; fn next(&mut self) -> Option<u8> { while self.byte < 256 { let byte = self.byte as u8; let class = self.classes.get(byte); self.byte += 1; if self.last_class != Some(class) { self.last_class = Some(class); return Some(byte); } } None } } regex-automata-0.1.8/src/codegen.rs010064400017500000144000000063371341227633500154370ustar0000000000000000// This module is unused. It was written as an experiment to get a ballpark // idea of what state machines look like when translated to Rust code, and // in particular, an idea of how much code it generates. The implementation // below isn't optimal with respect to size, but the result wasn't exactly // small. At some point, we should pursue building this out beyond // experimentation, and in particular, probably provide a command line tool // and/or a macro. It's a fair bit of work, so I abandoned it for the initial // release. ---AG use std::collections::HashMap; use std::io::Write; use dense::DFA; use state_id::StateID; macro_rules! wstr { ($($tt:tt)*) => { write!($($tt)*).unwrap() } } macro_rules!
wstrln { ($($tt:tt)*) => { writeln!($($tt)*).unwrap() } } pub fn is_match_forward(dfa: &DFA) -> String { let names = state_variant_names(dfa); let mut buf = vec![]; wstrln!(buf, "pub fn is_match(input: &[u8]) -> bool {{"); if dfa.is_match_state(dfa.start()) { wstrln!(buf, " return true;"); wstrln!(buf, "}}"); return String::from_utf8(buf).unwrap(); } wstrln!(buf, "{}", state_enum_def(dfa, &names)); wstrln!(buf, " let mut state = {};", names[&dfa.start()]); wstrln!(buf, " for &b in input.iter() {{"); wstrln!(buf, " state = match state {{"); for (id, s) in dfa.iter() { if dfa.is_match_state(id) { continue; } wstrln!(buf, " {} => {{", &names[&id]); wstrln!(buf, " match b {{"); for (start, end, next_id) in s.sparse_transitions() { if dfa.is_match_state(next_id) { wstrln!(buf, " {:?}...{:?} => return true,", start, end); } else { if start == end { wstrln!(buf, " {:?} => {},", start, &names[&next_id]); } else { wstrln!(buf, " {:?}...{:?} => {},", start, end, &names[&next_id]); } } } wstrln!(buf, " _ => S::S0,"); wstrln!(buf, " }}"); wstrln!(buf, " }}"); } wstrln!(buf, " }};"); wstrln!(buf, " }}"); wstrln!(buf, " false"); wstrln!(buf, "}}"); String::from_utf8(buf).unwrap() } fn state_enum_def( dfa: &DFA, variant_names: &HashMap, ) -> String { let mut buf = vec![]; wstrln!(buf, " #[derive(Clone, Copy)]"); wstr!(buf, " enum S {{"); let mut i = 0; for (id, _) in dfa.iter() { if dfa.is_match_state(id) { continue; } if i % 10 == 0 { wstr!(buf, "\n "); } let name = format!("S{}", id.to_usize()); wstr!(buf, " {},", name); i += 1; } wstr!(buf, "\n"); wstrln!(buf, " }}"); String::from_utf8(buf).unwrap() } fn state_variant_names(dfa: &DFA) -> HashMap { let mut variants = HashMap::new(); for (id, _) in dfa.iter() { if dfa.is_match_state(id) { continue; } variants.insert(id, format!("S::S{}", id.to_usize())); } variants } regex-automata-0.1.8/src/dense.rs010064400017500000144000002646511341551772200151400ustar0000000000000000#[cfg(feature = "std")] use core::fmt; #[cfg(feature = "std")] use core::iter; use core::mem; use core::slice; use byteorder::{ByteOrder, NativeEndian}; #[cfg(feature = "std")] use byteorder::{BigEndian, LittleEndian}; #[cfg(feature = "std")] use regex_syntax::ParserBuilder; use classes::ByteClasses; #[cfg(feature = "std")] use determinize::Determinizer; use dfa::DFA; #[cfg(feature = "std")] use error::{Error, Result}; #[cfg(feature = "std")] use minimize::Minimizer; #[cfg(feature = "std")] use nfa::{NFA, NFABuilder}; #[cfg(feature = "std")] use sparse::SparseDFA; use state_id::{StateID, dead_id}; #[cfg(feature = "std")] use state_id::{ premultiply_overflow_error, next_state_id, write_state_id_bytes, }; /// The size of the alphabet in a standard DFA. /// /// Specifically, this length controls the number of transitions present in /// each DFA state. However, when the byte class optimization is enabled, /// then each DFA maps the space of all possible 256 byte values to at most /// 256 distinct equivalence classes. In this case, the number of distinct /// equivalence classes corresponds to the internal alphabet of the DFA, in the /// sense that each DFA state has a number of transitions equal to the number /// of equivalence classes despite supporting matching on all possible byte /// values. const ALPHABET_LEN: usize = 256; /// Masks used in serialization of DFAs. pub(crate) const MASK_PREMULTIPLIED: u16 = 0b0000_0000_0000_0001; pub(crate) const MASK_ANCHORED: u16 = 0b0000_0000_0000_0010; /// A dense table-based deterministic finite automaton (DFA). 
/// /// A dense DFA represents the core matching primitive in this crate. That is, /// logically, all DFAs have a single start state, one or more match states /// and a transition table that maps the current state and the current byte of /// input to the next state. A DFA can use this information to implement fast /// searching. In particular, the use of a dense DFA generally makes the trade /// off that match speed is the most valuable characteristic, even if building /// the regex may take significant time *and* space. As such, the processing /// of every byte of input is done with a small constant number of operations /// that does not vary with the pattern, its size or the size of the alphabet. /// If your needs don't line up with this trade off, then a dense DFA may not /// be an adequate solution to your problem. /// /// In contrast, a [sparse DFA](enum.SparseDFA.html) makes the opposite /// trade off: it uses less space but will execute a variable number of /// instructions per byte at match time, which makes it slower for matching. /// /// A DFA can be built using the default configuration via the /// [`DenseDFA::new`](enum.DenseDFA.html#method.new) constructor. Otherwise, /// one can configure various aspects via the /// [`dense::Builder`](dense/struct.Builder.html). /// /// A single DFA fundamentally supports the following operations: /// /// 1. Detection of a match. /// 2. Location of the end of the first possible match. /// 3. Location of the end of the leftmost-first match. /// /// A notable absence from the above list of capabilities is the location of /// the *start* of a match. In order to provide both the start and end of a /// match, *two* DFAs are required. This functionality is provided by a /// [`Regex`](struct.Regex.html), which can be built with its basic /// constructor, [`Regex::new`](struct.Regex.html#method.new), or with /// a [`RegexBuilder`](struct.RegexBuilder.html). /// /// # State size /// /// A `DenseDFA` has two type parameters, `T` and `S`. `T` corresponds to /// the type of the DFA's transition table while `S` corresponds to the /// representation used for the DFA's state identifiers as described by the /// [`StateID`](trait.StateID.html) trait. This type parameter is typically /// `usize`, but other valid choices provided by this crate include `u8`, /// `u16`, `u32` and `u64`. The primary reason for choosing a different state /// identifier representation than the default is to reduce the amount of /// memory used by a DFA. Note though, that if the chosen representation cannot /// accommodate the size of your DFA, then building the DFA will fail and /// return an error. /// /// While the reduction in heap memory used by a DFA is one reason for choosing /// a smaller state identifier representation, another possible reason is for /// decreasing the serialization size of a DFA, as returned by /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian), /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) /// or /// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). /// /// The type of the transition table is typically either `Vec` or `&[S]`, /// depending on where the transition table is stored. /// /// # Variants /// /// This DFA is defined as a non-exhaustive enumeration of different types of /// dense DFAs. All of these dense DFAs use the same internal representation /// for the transition table, but they vary in how the transition table is /// read. 
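Since locating the start of a match is the one capability singled out here as requiring a second DFA, a brief usage sketch of the higher-level `Regex` mentioned above (this assumes the crate's `Regex::find`, which I believe returns `(start, end)` byte offsets):

```rust
extern crate regex_automata;

use regex_automata::Regex;

fn main() -> Result<(), regex_automata::Error> {
    // A Regex couples a forward DFA (to find where a match ends) with a
    // reverse DFA (to then find where that match starts).
    let re = Regex::new("foo[0-9]+")?;
    assert_eq!(Some((0, 8)), re.find(b"foo12345"));
    Ok(())
}
```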
A DFA's specific variant depends on the configuration options set via /// [`dense::Builder`](dense/struct.Builder.html). The default variant is /// `PremultipliedByteClass`. /// /// # The `DFA` trait /// /// This type implements the [`DFA`](trait.DFA.html) trait, which means it /// can be used for searching. For example: /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` /// /// The `DFA` trait also provides an assortment of other lower level methods /// for DFAs, such as `start_state` and `next_state`. While these are correctly /// implemented, it is an anti-pattern to use them in performance sensitive /// code on the `DenseDFA` type directly. Namely, each implementation requires /// a branch to determine which type of dense DFA is being used. Instead, /// this branch should be pushed up a layer in the code since walking the /// transitions of a DFA is usually a hot path. If you do need to use these /// lower level methods in performance critical code, then you should match on /// the variants of this DFA and use each variant's implementation of the `DFA` /// trait directly. #[derive(Clone, Debug)] pub enum DenseDFA, S: StateID> { /// A standard DFA that does not use premultiplication or byte classes. Standard(Standard), /// A DFA that shrinks its alphabet to a set of equivalence classes instead /// of using all possible byte values. Any two bytes belong to the same /// equivalence class if and only if they can be used interchangeably /// anywhere in the DFA while never discriminating between a match and a /// non-match. /// /// This type of DFA can result in significant space reduction with a very /// small match time performance penalty. ByteClass(ByteClass), /// A DFA that premultiplies all of its state identifiers in its /// transition table. This saves an instruction per byte at match time /// which improves search performance. /// /// The only downside of premultiplication is that it may prevent one from /// using a smaller state identifier representation than you otherwise /// could. Premultiplied(Premultiplied), /// The default configuration of a DFA, which uses byte classes and /// premultiplies its state identifiers. PremultipliedByteClass(PremultipliedByteClass), /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } impl, S: StateID> DenseDFA { /// Return the internal DFA representation. /// /// All variants share the same internal representation. fn repr(&self) -> &Repr { match *self { DenseDFA::Standard(ref r) => &r.0, DenseDFA::ByteClass(ref r) => &r.0, DenseDFA::Premultiplied(ref r) => &r.0, DenseDFA::PremultipliedByteClass(ref r) => &r.0, DenseDFA::__Nonexhaustive => unreachable!(), } } } #[cfg(feature = "std")] impl DenseDFA, usize> { /// Parse the given regular expression using a default configuration and /// return the corresponding DFA. /// /// The default configuration uses `usize` for state IDs, premultiplies /// them and reduces the alphabet size by splitting bytes into equivalence /// classes. The DFA is *not* minimized. 
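Following the advice above about hoisting the variant branch out of hot code, here is a sketch (the helper names `is_match_hoisted` and `search` are hypothetical) of matching once on the `DenseDFA` variant and then running a generic loop that monomorphizes against the concrete variant's `DFA` implementation:

```rust
extern crate regex_automata;

use regex_automata::{DFA, DenseDFA};

// Match on the variant once, outside the per-byte loop...
fn is_match_hoisted<T: AsRef<[usize]>>(
    dfa: &DenseDFA<T, usize>,
    bytes: &[u8],
) -> bool {
    match *dfa {
        DenseDFA::Standard(ref d) => search(d, bytes),
        DenseDFA::ByteClass(ref d) => search(d, bytes),
        DenseDFA::Premultiplied(ref d) => search(d, bytes),
        DenseDFA::PremultipliedByteClass(ref d) => search(d, bytes),
        _ => unreachable!(),
    }
}

// ...so this loop compiles separately per variant, with no per-transition
// branching on which kind of dense DFA is in use. (For simplicity it ignores
// the corner case of the start state itself being a match state.)
fn search<D: DFA>(dfa: &D, bytes: &[u8]) -> bool {
    let mut state = dfa.start_state();
    for &b in bytes.iter() {
        state = dfa.next_state(state, b);
        if dfa.is_match_state(state) {
            return true;
        }
        if dfa.is_dead_state(state) {
            return false;
        }
    }
    false
}

fn main() -> Result<(), regex_automata::Error> {
    let dfa = DenseDFA::new("foo[0-9]+")?;
    assert!(is_match_hoisted(&dfa, b"foo123"));
    Ok(())
}
```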
/// /// If you want a non-default configuration, then use the /// [`dense::Builder`](dense/struct.Builder.html) /// to set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+bar")?; /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); /// # Ok(()) }; example().unwrap() /// ``` pub fn new(pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> { Builder::new().build(pattern) } } #[cfg(feature = "std")] impl<S: StateID> DenseDFA<Vec<S>, S> { /// Create a new empty DFA that never matches any input. /// /// # Example /// /// In order to build an empty DFA, callers must provide a type hint /// indicating their choice of state identifier representation. /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa: DenseDFA<Vec<usize>, usize> = DenseDFA::empty(); /// assert_eq!(None, dfa.find(b"")); /// assert_eq!(None, dfa.find(b"foo")); /// # Ok(()) }; example().unwrap() /// ``` pub fn empty() -> DenseDFA<Vec<S>, S> { Repr::empty().into_dense_dfa() } } impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> { /// Cheaply return a borrowed version of this dense DFA. Specifically, the /// DFA returned always uses `&[S]` for its transition table while keeping /// the same state identifier representation. pub fn as_ref<'a>(&'a self) -> DenseDFA<&'a [S], S> { match *self { DenseDFA::Standard(ref r) => { DenseDFA::Standard(Standard(r.0.as_ref())) } DenseDFA::ByteClass(ref r) => { DenseDFA::ByteClass(ByteClass(r.0.as_ref())) } DenseDFA::Premultiplied(ref r) => { DenseDFA::Premultiplied(Premultiplied(r.0.as_ref())) } DenseDFA::PremultipliedByteClass(ref r) => { let inner = PremultipliedByteClass(r.0.as_ref()); DenseDFA::PremultipliedByteClass(inner) } DenseDFA::__Nonexhaustive => unreachable!(), } } /// Return an owned version of this dense DFA. Specifically, the DFA /// returned always uses `Vec<S>` for its transition table while keeping /// the same state identifier representation. /// /// Effectively, this returns a dense DFA whose transition table lives /// on the heap. #[cfg(feature = "std")] pub fn to_owned(&self) -> DenseDFA<Vec<S>, S> { match *self { DenseDFA::Standard(ref r) => { DenseDFA::Standard(Standard(r.0.to_owned())) } DenseDFA::ByteClass(ref r) => { DenseDFA::ByteClass(ByteClass(r.0.to_owned())) } DenseDFA::Premultiplied(ref r) => { DenseDFA::Premultiplied(Premultiplied(r.0.to_owned())) } DenseDFA::PremultipliedByteClass(ref r) => { let inner = PremultipliedByteClass(r.0.to_owned()); DenseDFA::PremultipliedByteClass(inner) } DenseDFA::__Nonexhaustive => unreachable!(), } } /// Returns the memory usage, in bytes, of this DFA. /// /// The memory usage is computed based on the number of bytes used to /// represent this DFA's transition table. This corresponds to heap memory /// usage. /// /// This does **not** include the stack size used up by this DFA. To /// compute that, use `std::mem::size_of::<DenseDFA>()`. pub fn memory_usage(&self) -> usize { self.repr().memory_usage() } } /// Routines for converting a dense DFA to other representations, such as /// sparse DFAs, smaller state identifiers or raw bytes suitable for persistent /// storage. #[cfg(feature = "std")] impl<T: AsRef<[S]>, S: StateID> DenseDFA<T, S> { /// Convert this dense DFA to a sparse DFA. /// /// This is a convenience routine for `to_sparse_sized` that fixes the /// state identifier representation of the sparse DFA to the same /// representation used for this dense DFA.
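As a quick illustration of the space savings these conversion routines exist for, a usage sketch built only from methods documented here (`to_u16` and `memory_usage`); the exact savings depend on the pattern:

```rust
extern crate regex_automata;

use regex_automata::DenseDFA;

fn main() -> Result<(), regex_automata::Error> {
    let dfa = DenseDFA::new("foo[0-9]+")?;
    // Same states and transitions, but each transition table entry shrinks
    // from a usize (8 bytes on 64-bit targets) to a u16.
    let smaller = dfa.to_u16()?;
    assert!(smaller.memory_usage() < dfa.memory_usage());
    Ok(())
}
```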
/// /// If the chosen state identifier representation is too small to represent /// all states in the sparse DFA, then this returns an error. In most /// cases, if a dense DFA is constructable with `S` then a sparse DFA will /// be as well. However, it is not guaranteed. /// /// # Example /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dense = DenseDFA::new("foo[0-9]+")?; /// let sparse = dense.to_sparse()?; /// assert_eq!(Some(8), sparse.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` pub fn to_sparse(&self) -> Result, S>> { self.to_sparse_sized() } /// Convert this dense DFA to a sparse DFA. /// /// Using this routine requires supplying a type hint to choose the state /// identifier representation for the resulting sparse DFA. /// /// If the chosen state identifier representation is too small to represent /// all states in the sparse DFA, then this returns an error. /// /// # Example /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dense = DenseDFA::new("foo[0-9]+")?; /// let sparse = dense.to_sparse_sized::()?; /// assert_eq!(Some(8), sparse.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` pub fn to_sparse_sized( &self, ) -> Result, A>> { self.repr().to_sparse_sized() } /// Create a new DFA whose match semantics are equivalent to this DFA, /// but attempt to use `u8` for the representation of state identifiers. /// If `u8` is insufficient to represent all state identifiers in this /// DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. pub fn to_u8(&self) -> Result, u8>> { self.to_sized() } /// Create a new DFA whose match semantics are equivalent to this DFA, /// but attempt to use `u16` for the representation of state identifiers. /// If `u16` is insufficient to represent all state identifiers in this /// DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. pub fn to_u16(&self) -> Result, u16>> { self.to_sized() } /// Create a new DFA whose match semantics are equivalent to this DFA, /// but attempt to use `u32` for the representation of state identifiers. /// If `u32` is insufficient to represent all state identifiers in this /// DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] pub fn to_u32(&self) -> Result, u32>> { self.to_sized() } /// Create a new DFA whose match semantics are equivalent to this DFA, /// but attempt to use `u64` for the representation of state identifiers. /// If `u64` is insufficient to represent all state identifiers in this /// DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. #[cfg(target_pointer_width = "64")] pub fn to_u64(&self) -> Result, u64>> { self.to_sized() } /// Create a new DFA whose match semantics are equivalent to this DFA, but /// attempt to use `A` for the representation of state identifiers. If `A` /// is insufficient to represent all state identifiers in this DFA, then /// this returns an error. /// /// An alternative way to construct such a DFA is to use /// [`dense::Builder::build_with_size`](dense/struct.Builder.html#method.build_with_size). 
/// In general, using the builder is preferred since it will use the given /// state identifier representation throughout determinization (and /// minimization, if done), and thereby using less memory throughout the /// entire construction process. However, these routines are necessary /// in cases where, say, a minimized DFA could fit in a smaller state /// identifier representation, but the initial determinized DFA would not. pub fn to_sized(&self) -> Result, A>> { self.repr().to_sized().map(|r| r.into_dense_dfa()) } /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in little /// endian format. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_little_endian(&self) -> Result> { self.repr().to_bytes::() } /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in big /// endian format. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_big_endian(&self) -> Result> { self.repr().to_bytes::() } /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary, in native /// endian format. Generally, it is better to pick an explicit endianness /// using either `to_bytes_little_endian` or `to_bytes_big_endian`. This /// routine is useful in tests where the DFA is serialized and deserialized /// on the same platform. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_native_endian(&self) -> Result> { self.repr().to_bytes::() } } impl<'a, S: StateID> DenseDFA<&'a [S], S> { /// Deserialize a DFA with a specific state identifier representation. /// /// Deserializing a DFA using this routine will never allocate heap memory. /// This is also guaranteed to be a constant time operation that does not /// vary with the size of the DFA. /// /// The bytes given should be generated by the serialization of a DFA with /// either the /// [`to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) /// method or the /// [`to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian) /// endian, depending on the endianness of the machine you are /// deserializing this DFA from. /// /// If the state identifier representation is `usize`, then deserialization /// is dependent on the pointer size. For this reason, it is best to /// serialize DFAs using a fixed size representation for your state /// identifiers, such as `u8`, `u16`, `u32` or `u64`. /// /// # Panics /// /// The bytes given should be *trusted*. In particular, if the bytes /// are not a valid serialization of a DFA, or if the given bytes are /// not aligned to an 8 byte boundary, or if the endianness of the /// serialized bytes is different than the endianness of the machine that /// is deserializing the DFA, then this routine will panic. Moreover, it is /// possible for this deserialization routine to succeed even if the given /// bytes do not represent a valid serialized dense DFA. /// /// # Safety /// /// This routine is unsafe because it permits callers to provide an /// arbitrary transition table with possibly incorrect transitions. 
While /// the various serialization routines will never return an incorrect /// transition table, there is no guarantee that the bytes provided here /// are correct. While deserialization does many checks (as documented /// above in the panic conditions), this routine does not check that the /// transition table is correct. Given an incorrect transition table, it is /// possible for the search routines to access out-of-bounds memory because /// of explicit bounds check elision. /// /// # Example /// /// This example shows how to serialize a DFA to raw bytes, deserialize it /// and then use it for searching. Note that we first convert the DFA to /// using `u16` for its state identifier representation before serializing /// it. While this isn't strictly necessary, it's good practice in order to /// decrease the size of the DFA and to avoid platform specific pitfalls /// such as differing pointer sizes. /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let initial = DenseDFA::new("foo[0-9]+")?; /// let bytes = initial.to_u16()?.to_bytes_native_endian()?; /// let dfa: DenseDFA<&[u16], u16> = unsafe { /// DenseDFA::from_bytes(&bytes) /// }; /// /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` pub unsafe fn from_bytes(buf: &'a [u8]) -> DenseDFA<&'a [S], S> { Repr::from_bytes(buf).into_dense_dfa() } } #[cfg(feature = "std")] impl DenseDFA, S> { /// Minimize this DFA in place. /// /// This is not part of the public API. It is only exposed to allow for /// more granular external benchmarking. #[doc(hidden)] pub fn minimize(&mut self) { self.repr_mut().minimize(); } /// Return a mutable reference to the internal DFA representation. fn repr_mut(&mut self) -> &mut Repr, S> { match *self { DenseDFA::Standard(ref mut r) => &mut r.0, DenseDFA::ByteClass(ref mut r) => &mut r.0, DenseDFA::Premultiplied(ref mut r) => &mut r.0, DenseDFA::PremultipliedByteClass(ref mut r) => &mut r.0, DenseDFA::__Nonexhaustive => unreachable!(), } } } impl, S: StateID> DFA for DenseDFA { type ID = S; #[inline] fn start_state(&self) -> S { self.repr().start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.repr().is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.repr().is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { match *self { DenseDFA::Standard(ref r) => r.next_state(current, input), DenseDFA::ByteClass(ref r) => r.next_state(current, input), DenseDFA::Premultiplied(ref r) => r.next_state(current, input), DenseDFA::PremultipliedByteClass(ref r) => { r.next_state(current, input) } DenseDFA::__Nonexhaustive => unreachable!(), } } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { match *self { DenseDFA::Standard(ref r) => { r.next_state_unchecked(current, input) } DenseDFA::ByteClass(ref r) => { r.next_state_unchecked(current, input) } DenseDFA::Premultiplied(ref r) => { r.next_state_unchecked(current, input) } DenseDFA::PremultipliedByteClass(ref r) => { r.next_state_unchecked(current, input) } DenseDFA::__Nonexhaustive => unreachable!(), } } // We specialize the following methods because it lets us lift the // case analysis between the different types of dense DFAs. 
Instead of // doing the case analysis for every transition, we do it once before // searching. #[inline] fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { match *self { DenseDFA::Standard(ref r) => r.is_match_at(bytes, start), DenseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), DenseDFA::Premultiplied(ref r) => r.is_match_at(bytes, start), DenseDFA::PremultipliedByteClass(ref r) => { r.is_match_at(bytes, start) } DenseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option { match *self { DenseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), DenseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), DenseDFA::Premultiplied(ref r) => { r.shortest_match_at(bytes, start) } DenseDFA::PremultipliedByteClass(ref r) => { r.shortest_match_at(bytes, start) } DenseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn find_at(&self, bytes: &[u8], start: usize) -> Option { match *self { DenseDFA::Standard(ref r) => r.find_at(bytes, start), DenseDFA::ByteClass(ref r) => r.find_at(bytes, start), DenseDFA::Premultiplied(ref r) => r.find_at(bytes, start), DenseDFA::PremultipliedByteClass(ref r) => r.find_at(bytes, start), DenseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn rfind_at(&self, bytes: &[u8], start: usize) -> Option { match *self { DenseDFA::Standard(ref r) => r.rfind_at(bytes, start), DenseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), DenseDFA::Premultiplied(ref r) => r.rfind_at(bytes, start), DenseDFA::PremultipliedByteClass(ref r) => { r.rfind_at(bytes, start) } DenseDFA::__Nonexhaustive => unreachable!(), } } } /// A standard dense DFA that does not use premultiplication or byte classes. /// /// Generally, it isn't necessary to use this type directly, since a `DenseDFA` /// can be used for searching directly. One possible reason why one might want /// to use this type directly is if you are implementing your own search /// routines by walking a DFA's transitions directly. In that case, you'll want /// to use this type (or any of the other DFA variant types) directly, since /// they implement `next_state` more efficiently. #[derive(Clone, Debug)] pub struct Standard, S: StateID>(Repr); impl, S: StateID> DFA for Standard { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() * ALPHABET_LEN + input as usize; self.0.trans()[o] } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { let o = current.to_usize() * ALPHABET_LEN + input as usize; *self.0.trans().get_unchecked(o) } } /// A dense DFA that shrinks its alphabet. /// /// Alphabet shrinking is achieved by using a set of equivalence classes /// instead of using all possible byte values. Any two bytes belong to the same /// equivalence class if and only if they can be used interchangeably anywhere /// in the DFA while never discriminating between a match and a non-match. /// /// This type of DFA can result in significant space reduction with a very /// small match time performance penalty. 
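The two-step lookup this variant's documentation describes, shown in isolation as a self-contained sketch that uses plain slices rather than the crate's internal representation:

```rust
// Row-major table with `alphabet_len` entries per state, exactly as in the
// ByteClass implementation below.
fn next_state_byte_class(
    trans: &[usize],
    byte_to_class: &[u8; 256],
    alphabet_len: usize,
    current: usize,
    input: u8,
) -> usize {
    // Step 1: collapse the raw byte into its equivalence class.
    let class = byte_to_class[input as usize] as usize;
    // Step 2: dense row lookup, indexed by class rather than by byte.
    trans[current * alphabet_len + class]
}

fn main() {
    // Toy setup: two classes (0 and 1), two states, so each row has 2 entries.
    let mut byte_to_class = [0u8; 256];
    byte_to_class[b'a' as usize] = 1;
    // State 0 stays on 0 unless it sees class 1, which moves to state 1;
    // state 1 loops on itself.
    let trans = vec![0, 1, 1, 1];
    assert_eq!(1, next_state_byte_class(&trans, &byte_to_class, 2, 0, b'a'));
    assert_eq!(0, next_state_byte_class(&trans, &byte_to_class, 2, 0, b'z'));
}
```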
/// /// Generally, it isn't necessary to use this type directly, since a `DenseDFA` /// can be used for searching directly. One possible reason why one might want /// to use this type directly is if you are implementing your own search /// routines by walking a DFA's transitions directly. In that case, you'll want /// to use this type (or any of the other DFA variant types) directly, since /// they implement `next_state` more efficiently. #[derive(Clone, Debug)] pub struct ByteClass, S: StateID>(Repr); impl, S: StateID> DFA for ByteClass { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { let input = self.0.byte_classes().get(input); let o = current.to_usize() * self.0.alphabet_len() + input as usize; self.0.trans()[o] } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { let input = self.0.byte_classes().get_unchecked(input); let o = current.to_usize() * self.0.alphabet_len() + input as usize; *self.0.trans().get_unchecked(o) } } /// A dense DFA that premultiplies all of its state identifiers in its /// transition table. /// /// This saves an instruction per byte at match time which improves search /// performance. /// /// The only downside of premultiplication is that it may prevent one from /// using a smaller state identifier representation than you otherwise could. /// /// Generally, it isn't necessary to use this type directly, since a `DenseDFA` /// can be used for searching directly. One possible reason why one might want /// to use this type directly is if you are implementing your own search /// routines by walking a DFA's transitions directly. In that case, you'll want /// to use this type (or any of the other DFA variant types) directly, since /// they implement `next_state` more efficiently. #[derive(Clone, Debug)] pub struct Premultiplied, S: StateID>(Repr); impl, S: StateID> DFA for Premultiplied { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { let o = current.to_usize() + input as usize; self.0.trans()[o] } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { let o = current.to_usize() + input as usize; *self.0.trans().get_unchecked(o) } } /// The default configuration of a dense DFA, which uses byte classes and /// premultiplies its state identifiers. /// /// Generally, it isn't necessary to use this type directly, since a `DenseDFA` /// can be used for searching directly. One possible reason why one might want /// to use this type directly is if you are implementing your own search /// routines by walking a DFA's transitions directly. In that case, you'll want /// to use this type (or any of the other DFA variant types) directly, since /// they implement `next_state` more efficiently. 
#[derive(Clone, Debug)] pub struct PremultipliedByteClass, S: StateID>(Repr); impl, S: StateID> DFA for PremultipliedByteClass { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { let input = self.0.byte_classes().get(input); let o = current.to_usize() + input as usize; self.0.trans()[o] } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { let input = self.0.byte_classes().get_unchecked(input); let o = current.to_usize() + input as usize; *self.0.trans().get_unchecked(o) } } /// The internal representation of a dense DFA. /// /// This representation is shared by all DFA variants. #[derive(Clone)] #[cfg_attr(not(feature = "std"), derive(Debug))] pub(crate) struct Repr { /// Whether the state identifiers in the transition table have been /// premultiplied or not. /// /// Premultiplied identifiers means that instead of your matching loop /// looking something like this: /// /// state = dfa.start /// for byte in haystack: /// next = dfa.transitions[state * len(alphabet) + byte] /// if dfa.is_match(next): /// return true /// return false /// /// it can instead look like this: /// /// state = dfa.start /// for byte in haystack: /// next = dfa.transitions[state + byte] /// if dfa.is_match(next): /// return true /// return false /// /// In other words, we save a multiplication instruction in the critical /// path. This turns out to be a decent performance win. The cost of using /// premultiplied state ids is that they can require a bigger state id /// representation. premultiplied: bool, /// Whether this DFA can only match at the beginning of input or not. /// /// When true, a match should only be reported if it begins at the 0th /// index of the haystack. anchored: bool, /// The initial start state ID. start: S, /// The total number of states in this DFA. Note that a DFA always has at /// least one state---the dead state---even the empty DFA. In particular, /// the dead state always has ID 0 and is correspondingly always the first /// state. The dead state is never a match state. state_count: usize, /// States in a DFA have a *partial* ordering such that a match state /// always precedes any non-match state (except for the special dead /// state). /// /// `max_match` corresponds to the last state that is a match state. This /// encoding has two critical benefits. Firstly, we are not required to /// store any additional per-state information about whether it is a match /// state or not. Secondly, when searching with the DFA, we can do a single /// comparison with `max_match` for each byte instead of two comparisons /// for each byte (one testing whether it is a match and the other testing /// whether we've reached a dead state). Namely, to determine the status /// of the next state, we can do this: /// /// next_state = transition[cur_state * alphabet_len + cur_byte] /// if next_state <= max_match: /// // next_state is either dead (no-match) or a match /// return next_state != dead max_match: S, /// A set of equivalence classes, where a single equivalence class /// represents a set of bytes that never discriminate between a match /// and a non-match in the DFA. 
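A sketch of how the `max_match` encoding described in the comments above becomes a single comparison per byte. This uses raw indices instead of the crate's types, and assumes the start state is not itself a match state:

```rust
fn shortest_match(
    trans: &[usize],
    alphabet_len: usize,
    max_match: usize,
    start: usize,
    haystack: &[u8],
) -> Option<usize> {
    let mut state = start;
    for (i, &b) in haystack.iter().enumerate() {
        state = trans[state * alphabet_len + b as usize];
        // One comparison covers both terminal cases: every id <= max_match
        // is either the dead state (id 0) or a match state.
        if state <= max_match {
            return if state == 0 { None } else { Some(i + 1) };
        }
    }
    None
}

fn main() {
    // Toy DFA: id 0 = dead, id 1 = match, id 2 = start; max_match = 1.
    let alphabet_len = 256;
    let mut trans = vec![0usize; 3 * alphabet_len];
    trans[2 * alphabet_len + b'a' as usize] = 1; // start --'a'--> match
    assert_eq!(Some(1), shortest_match(&trans, alphabet_len, 1, 2, b"abc"));
    assert_eq!(None, shortest_match(&trans, alphabet_len, 1, 2, b"xyz"));
}
```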
Each equivalence class corresponds to /// a single letter in this DFA's alphabet, where the maximum number of /// letters is 256 (each possible value of a byte). Consequently, the /// number of equivalence classes corresponds to the number of transitions /// for each DFA state. /// /// The only time the number of equivalence classes is fewer than 256 is /// if the DFA's kind uses byte classes. If the DFA doesn't use byte /// classes, then this vector is empty. byte_classes: ByteClasses, /// A contiguous region of memory representing the transition table in /// row-major order. The representation is dense. That is, every state has /// precisely the same number of transitions. The maximum number of /// transitions is 256. If a DFA has been instructed to use byte classes, /// then the number of transitions can be much less. /// /// In practice, T is either Vec or &[S]. trans: T, } #[cfg(feature = "std")] impl Repr, S> { /// Create a new empty DFA with singleton byte classes (every byte is its /// own equivalence class). pub fn empty() -> Repr, S> { Repr::empty_with_byte_classes(ByteClasses::singletons()) } /// Create a new empty DFA with the given set of byte equivalence classes. /// An empty DFA never matches any input. pub fn empty_with_byte_classes( byte_classes: ByteClasses, ) -> Repr, S> { let mut dfa = Repr { premultiplied: false, anchored: true, start: dead_id(), state_count: 0, max_match: S::from_usize(0), byte_classes: byte_classes, trans: vec![], }; // Every state ID repr must be able to fit at least one state. dfa.add_empty_state().unwrap(); dfa } /// Sets whether this DFA is anchored or not. pub fn anchored(mut self, yes: bool) -> Repr, S> { self.anchored = yes; self } } impl, S: StateID> Repr { /// Convert this internal DFA representation to a DenseDFA based on its /// transition table access pattern. pub fn into_dense_dfa(self) -> DenseDFA { match (self.premultiplied, self.byte_classes().is_singleton()) { // no premultiplication, no byte classes (false, true) => DenseDFA::Standard(Standard(self)), // no premultiplication, yes byte classes (false, false) => DenseDFA::ByteClass(ByteClass(self)), // yes premultiplication, no byte classes (true, true) => DenseDFA::Premultiplied(Premultiplied(self)), // yes premultiplication, yes byte classes (true, false) => { DenseDFA::PremultipliedByteClass(PremultipliedByteClass(self)) } } } fn as_ref<'a>(&'a self) -> Repr<&'a [S], S> { Repr { premultiplied: self.premultiplied, anchored: self.anchored, start: self.start, state_count: self.state_count, max_match: self.max_match, byte_classes: self.byte_classes().clone(), trans: self.trans(), } } #[cfg(feature = "std")] fn to_owned(&self) -> Repr, S> { Repr { premultiplied: self.premultiplied, anchored: self.anchored, start: self.start, state_count: self.state_count, max_match: self.max_match, byte_classes: self.byte_classes().clone(), trans: self.trans().to_vec(), } } /// Return the starting state of this DFA. /// /// All searches using this DFA must begin at this state. There is exactly /// one starting state for every DFA. A starting state may be a dead state /// or a matching state or neither. pub fn start_state(&self) -> S { self.start } /// Returns true if and only if the given identifier corresponds to a match /// state. pub fn is_match_state(&self, id: S) -> bool { id <= self.max_match && id != dead_id() } /// Returns true if and only if the given identifier corresponds to a dead /// state. 
pub fn is_dead_state(&self, id: S) -> bool { id == dead_id() } /// Returns true if and only if the given identifier could correspond to /// either a match state or a dead state. If this returns false, then the /// given identifier does not correspond to either a match state or a dead /// state. pub fn is_match_or_dead_state(&self, id: S) -> bool { id <= self.max_match_state() } /// Returns the maximum identifier for which a match state can exist. /// /// More specifically, the return identifier always corresponds to either /// a match state or a dead state. Namely, either /// `is_match_state(returned)` or `is_dead_state(returned)` is guaranteed /// to be true. pub fn max_match_state(&self) -> S { self.max_match } /// Returns true if and only if this DFA is anchored. pub fn is_anchored(&self) -> bool { self.anchored } /// Return the byte classes used by this DFA. pub fn byte_classes(&self) -> &ByteClasses { &self.byte_classes } /// Returns an iterator over all states in this DFA. /// /// This iterator yields a tuple for each state. The first element of the /// tuple corresponds to a state's identifier, and the second element /// corresponds to the state itself (comprised of its transitions). /// /// If this DFA is premultiplied, then the state identifiers are in /// turn premultiplied as well, making them usable without additional /// modification. #[cfg(feature = "std")] pub fn states(&self) -> StateIter { let it = self.trans().chunks(self.alphabet_len()); StateIter { dfa: self, it: it.enumerate() } } /// Return the total number of states in this DFA. Every DFA has at least /// 1 state, even the empty DFA. #[cfg(feature = "std")] pub fn state_count(&self) -> usize { self.state_count } /// Return the number of elements in this DFA's alphabet. /// /// If this DFA doesn't use byte classes, then this is always equivalent /// to 256. Otherwise, it is guaranteed to be some value less than or equal /// to 256. pub fn alphabet_len(&self) -> usize { self.byte_classes().alphabet_len() } /// Returns the memory usage, in bytes, of this DFA. pub fn memory_usage(&self) -> usize { self.trans().len() * mem::size_of::() } /// Convert the given state identifier to the state's index. The state's /// index corresponds to the position in which it appears in the transition /// table. When a DFA is NOT premultiplied, then a state's identifier is /// also its index. When a DFA is premultiplied, then a state's identifier /// is equal to `index * alphabet_len`. This routine reverses that. #[cfg(feature = "std")] pub fn state_id_to_index(&self, id: S) -> usize { if self.premultiplied { id.to_usize() / self.alphabet_len() } else { id.to_usize() } } /// Return this DFA's transition table as a slice. fn trans(&self) -> &[S] { self.trans.as_ref() } /// Create a sparse DFA from the internal representation of a dense DFA. #[cfg(feature = "std")] pub fn to_sparse_sized( &self, ) -> Result, A>> { SparseDFA::from_dense_sized(self) } /// Create a new DFA whose match semantics are equivalent to this DFA, but /// attempt to use `A` for the representation of state identifiers. If `A` /// is insufficient to represent all state identifiers in this DFA, then /// this returns an error. #[cfg(feature = "std")] pub fn to_sized(&self) -> Result, A>> { // Check that this DFA can fit into A's representation. let mut last_state_id = self.state_count - 1; if self.premultiplied { last_state_id *= self.alphabet_len(); } if last_state_id > A::max_id() { return Err(Error::state_id_overflow(A::max_id())); } // We're off to the races. 
The new DFA is the same as the old one, // but its transition table is truncated. let mut new = Repr { premultiplied: self.premultiplied, anchored: self.anchored, start: A::from_usize(self.start.to_usize()), state_count: self.state_count, max_match: A::from_usize(self.max_match.to_usize()), byte_classes: self.byte_classes().clone(), trans: vec![dead_id::(); self.trans().len()], }; for (i, id) in new.trans.iter_mut().enumerate() { *id = A::from_usize(self.trans()[i].to_usize()); } Ok(new) } /// Serialize a DFA to raw bytes, aligned to an 8 byte boundary. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. #[cfg(feature = "std")] pub(crate) fn to_bytes(&self) -> Result> { let label = b"rust-regex-automata-dfa\x00"; assert_eq!(24, label.len()); let trans_size = mem::size_of::() * self.trans().len(); let size = // For human readable label. label.len() // endiannes check, must be equal to 0xFEFF for native endian + 2 // For version number. + 2 // Size of state ID representation, in bytes. // Must be 1, 2, 4 or 8. + 2 // For DFA misc options. + 2 // For start state. + 8 // For state count. + 8 // For max match state. + 8 // For byte class map. + 256 // For transition table. + trans_size; // sanity check, this can be updated if need be assert_eq!(312 + trans_size, size); // This must always pass. It checks that the transition table is at // a properly aligned address. assert_eq!(0, (size - trans_size) % 8); let mut buf = vec![0; size]; let mut i = 0; // write label for &b in label { buf[i] = b; i += 1; } // endianness check A::write_u16(&mut buf[i..], 0xFEFF); i += 2; // version number A::write_u16(&mut buf[i..], 1); i += 2; // size of state ID let state_size = mem::size_of::(); if ![1, 2, 4, 8].contains(&state_size) { return Err(Error::serialize(&format!( "state size of {} not supported, must be 1, 2, 4 or 8", state_size ))); } A::write_u16(&mut buf[i..], state_size as u16); i += 2; // DFA misc options let mut options = 0u16; if self.premultiplied { options |= MASK_PREMULTIPLIED; } if self.anchored { options |= MASK_ANCHORED; } A::write_u16(&mut buf[i..], options); i += 2; // start state A::write_u64(&mut buf[i..], self.start.to_usize() as u64); i += 8; // state count A::write_u64(&mut buf[i..], self.state_count as u64); i += 8; // max match state A::write_u64( &mut buf[i..], self.max_match.to_usize() as u64, ); i += 8; // byte class map for b in (0..256).map(|b| b as u8) { buf[i] = self.byte_classes().get(b); i += 1; } // transition table for &id in self.trans() { write_state_id_bytes::(&mut buf[i..], id); i += state_size; } assert_eq!(size, i, "expected to consume entire buffer"); Ok(buf) } } impl<'a, S: StateID> Repr<&'a [S], S> { /// The implementation for deserializing a DFA from raw bytes. unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [S], S> { assert_eq!( 0, buf.as_ptr() as usize % mem::align_of::(), "DenseDFA starting at address {} is not aligned to {} bytes", buf.as_ptr() as usize, mem::align_of::() ); // skip over label match buf.iter().position(|&b| b == b'\x00') { None => panic!("could not find label"), Some(i) => buf = &buf[i+1..], } // check that current endianness is same as endianness of DFA let endian_check = NativeEndian::read_u16(buf); buf = &buf[2..]; if endian_check != 0xFEFF { panic!( "endianness mismatch, expected 0xFEFF but got 0x{:X}. 
\ are you trying to load a DenseDFA serialized with a \ different endianness?", endian_check, ); } // check that the version number is supported let version = NativeEndian::read_u16(buf); buf = &buf[2..]; if version != 1 { panic!( "expected version 1, but found unsupported version {}", version, ); } // read size of state let state_size = NativeEndian::read_u16(buf) as usize; if state_size != mem::size_of::() { panic!( "state size of DenseDFA ({}) does not match \ requested state size ({})", state_size, mem::size_of::(), ); } buf = &buf[2..]; // read miscellaneous options let opts = NativeEndian::read_u16(buf); buf = &buf[2..]; // read start state let start = S::from_usize(NativeEndian::read_u64(buf) as usize); buf = &buf[8..]; // read state count let state_count = NativeEndian::read_u64(buf) as usize; buf = &buf[8..]; // read max match state let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); buf = &buf[8..]; // read byte classes let byte_classes = ByteClasses::from_slice(&buf[..256]); buf = &buf[256..]; let len = state_count * byte_classes.alphabet_len(); let len_bytes = len * state_size; assert!( buf.len() <= len_bytes, "insufficient transition table bytes, \ expected at least {} but only have {}", len_bytes, buf.len() ); assert_eq!( 0, buf.as_ptr() as usize % mem::align_of::(), "DenseDFA transition table is not properly aligned" ); // SAFETY: This is the only actual unsafe thing in this entire routine. // The key things we need to worry about here are alignment and size. // The two asserts above should cover both conditions. let trans = slice::from_raw_parts(buf.as_ptr() as *const S, len); Repr { premultiplied: opts & MASK_PREMULTIPLIED > 0, anchored: opts & MASK_ANCHORED > 0, start, state_count, max_match, byte_classes, trans, } } } /// The following methods implement mutable routines on the internal /// representation of a DFA. As such, we must fix the first type parameter to /// a `Vec` since a generic `T: AsRef<[S]>` does not permit mutation. We /// can get away with this because these methods are internal to the crate and /// are exclusively used during construction of the DFA. #[cfg(feature = "std")] impl Repr, S> { pub fn premultiply(&mut self) -> Result<()> { if self.premultiplied || self.state_count <= 1 { return Ok(()); } let alpha_len = self.alphabet_len(); premultiply_overflow_error( S::from_usize(self.state_count - 1), alpha_len, )?; for id in (0..self.state_count).map(S::from_usize) { for (_, next) in self.get_state_mut(id).iter_mut() { *next = S::from_usize(next.to_usize() * alpha_len); } } self.premultiplied = true; self.start = S::from_usize(self.start.to_usize() * alpha_len); self.max_match = S::from_usize(self.max_match.to_usize() * alpha_len); Ok(()) } /// Minimize this DFA using Hopcroft's algorithm. /// /// This cannot be called on a premultiplied DFA. pub fn minimize(&mut self) { assert!(!self.premultiplied, "can't minimize premultiplied DFA"); Minimizer::new(self).run(); } /// Set the start state of this DFA. /// /// Note that a start state cannot be set on a premultiplied DFA. Instead, /// DFAs should first be completely constructed and then premultiplied. pub fn set_start_state(&mut self, start: S) { assert!(!self.premultiplied, "can't set start on premultiplied DFA"); assert!(start.to_usize() < self.state_count, "invalid start state"); self.start = start; } /// Set the maximum state identifier that could possible correspond to a /// match state. 
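Pulling the byte layout that `to_bytes` writes and `from_bytes` reads back into one place (the constants here are illustrative only, not crate API):

```rust
// Header fields, in order, preceding the transition table.
const LABEL: usize = 24;         // b"rust-regex-automata-dfa\x00"
const ENDIAN_CHECK: usize = 2;   // 0xFEFF written in native byte order
const VERSION: usize = 2;        // currently 1
const STATE_SIZE: usize = 2;     // size of the state id type: 1, 2, 4 or 8
const OPTIONS: usize = 2;        // MASK_PREMULTIPLIED and MASK_ANCHORED bits
const START: usize = 8;          // start state id
const STATE_COUNT: usize = 8;
const MAX_MATCH: usize = 8;      // last state id that can be a match state
const BYTE_CLASSES: usize = 256; // one equivalence class per byte value

fn main() {
    let header = LABEL + ENDIAN_CHECK + VERSION + STATE_SIZE + OPTIONS
        + START + STATE_COUNT + MAX_MATCH + BYTE_CLASSES;
    // Agrees with the `assert_eq!(312 + trans_size, size)` sanity check in
    // to_bytes, and 312 % 8 == 0 keeps the transition table 8-byte aligned.
    assert_eq!(312, header);
}
```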
/// /// Callers must uphold the invariant that any state identifier less than /// or equal to the identifier given is either a match state or the special /// dead state (which always has identifier 0 and whose transitions all /// lead back to itself). /// /// This cannot be called on a premultiplied DFA. pub fn set_max_match_state(&mut self, id: S) { assert!(!self.premultiplied, "can't set match on premultiplied DFA"); assert!(id.to_usize() < self.state_count, "invalid max match state"); self.max_match = id; } /// Add the given transition to this DFA. Both the `from` and `to` states /// must already exist. /// /// This cannot be called on a premultiplied DFA. pub fn add_transition(&mut self, from: S, byte: u8, to: S) { assert!(!self.premultiplied, "can't add trans to premultiplied DFA"); assert!(from.to_usize() < self.state_count, "invalid from state"); assert!(to.to_usize() < self.state_count, "invalid to state"); let class = self.byte_classes().get(byte); let offset = from.to_usize() * self.alphabet_len() + class as usize; self.trans[offset] = to; } /// Add an empty state (a state where all transitions lead to a dead state) /// and return its identifier. The identifier returned is guaranteed to /// not point to any other existing state. /// /// If adding a state would exhaust the state identifier space (given by /// `S`), then this returns an error. In practice, this means that the /// state identifier representation chosen is too small. /// /// This cannot be called on a premultiplied DFA. pub fn add_empty_state(&mut self) -> Result<S> { assert!(!self.premultiplied, "can't add state to premultiplied DFA"); let id = if self.state_count == 0 { S::from_usize(0) } else { next_state_id(S::from_usize(self.state_count - 1))? }; let alphabet_len = self.alphabet_len(); self.trans.extend(iter::repeat(dead_id::<S>()).take(alphabet_len)); // This should never panic, since state_count is a usize. The // transition table size would have run out of room long ago. self.state_count = self.state_count.checked_add(1).unwrap(); Ok(id) } /// Return a mutable representation of the state corresponding to the given /// id. This is useful for implementing routines that manipulate DFA states /// (e.g., swapping states). /// /// This cannot be called on a premultiplied DFA. pub fn get_state_mut(&mut self, id: S) -> StateMut<S> { assert!(!self.premultiplied, "can't get state in premultiplied DFA"); let alphabet_len = self.alphabet_len(); let offset = id.to_usize() * alphabet_len; StateMut { transitions: &mut self.trans[offset..offset + alphabet_len], } } /// Swap the two states given in the transition table. /// /// This routine does not do anything to check the correctness of this /// swap. Callers must ensure that other states pointing to id1 and id2 are /// updated appropriately. /// /// This cannot be called on a premultiplied DFA. pub fn swap_states(&mut self, id1: S, id2: S) { assert!(!self.premultiplied, "can't swap states in premultiplied DFA"); let o1 = id1.to_usize() * self.alphabet_len(); let o2 = id2.to_usize() * self.alphabet_len(); for b in 0..self.alphabet_len() { self.trans.swap(o1 + b, o2 + b); } } /// Truncate the states in this DFA to the given count. /// /// This routine does not do anything to check the correctness of this /// truncation. Callers must ensure that other states pointing to truncated /// states are updated appropriately. /// /// This cannot be called on a premultiplied DFA.
pub fn truncate_states(&mut self, count: usize) { assert!(!self.premultiplied, "can't truncate in premultiplied DFA"); let alphabet_len = self.alphabet_len(); self.trans.truncate(count * alphabet_len); self.state_count = count; } /// This routine shuffles all match states in this DFA---according to the /// given map---to the beginning of the DFA such that every non-match state /// appears after every match state. (With one exception: the special dead /// state remains as the first state.) The given map should have length /// exactly equivalent to the number of states in this DFA. /// /// The purpose of doing this shuffling is to avoid the need to store /// additional state to determine whether a state is a match state or not. /// It also enables a single conditional in the core matching loop instead /// of two. /// /// This updates `self.max_match` to point to the last matching state as /// well as `self.start` if the starting state was moved. pub fn shuffle_match_states(&mut self, is_match: &[bool]) { assert!( !self.premultiplied, "cannot shuffle match states of premultiplied DFA" ); assert_eq!(self.state_count, is_match.len()); if self.state_count <= 1 { return; } let mut first_non_match = 1; while first_non_match < self.state_count && is_match[first_non_match] { first_non_match += 1; } let mut swaps: Vec<S> = vec![dead_id(); self.state_count]; let mut cur = self.state_count - 1; while cur > first_non_match { if is_match[cur] { self.swap_states( S::from_usize(cur), S::from_usize(first_non_match), ); swaps[cur] = S::from_usize(first_non_match); swaps[first_non_match] = S::from_usize(cur); first_non_match += 1; while first_non_match < cur && is_match[first_non_match] { first_non_match += 1; } } cur -= 1; } for id in (0..self.state_count).map(S::from_usize) { for (_, next) in self.get_state_mut(id).iter_mut() { if swaps[next.to_usize()] != dead_id() { *next = swaps[next.to_usize()]; } } } if swaps[self.start.to_usize()] != dead_id() { self.start = swaps[self.start.to_usize()]; } self.max_match = S::from_usize(first_non_match - 1); } } #[cfg(feature = "std")] impl<T: AsRef<[S]>, S: StateID> fmt::Debug for Repr<T, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn state_status<T: AsRef<[S]>, S: StateID>( dfa: &Repr<T, S>, id: S, ) -> &'static str { if id == dead_id() { if dfa.is_match_state(id) { "D*" } else { "D " } } else if id == dfa.start_state() { if dfa.is_match_state(id) { ">*" } else { "> " } } else { if dfa.is_match_state(id) { " *" } else { " " } } } writeln!(f, "DenseDFA(")?; for (id, state) in self.states() { let status = state_status(self, id); writeln!(f, "{}{:04}: {:?}", status, id.to_usize(), state)?; } writeln!(f, ")")?; Ok(()) } } /// An iterator over all states in a DFA. /// /// This iterator yields a tuple for each state. The first element of the /// tuple corresponds to a state's identifier, and the second element /// corresponds to the state itself (comprised of its transitions). /// /// If this DFA is premultiplied, then the state identifiers are in turn /// premultiplied as well, making them usable without additional modification. /// /// `'a` corresponds to the lifetime of the original DFA, `T` corresponds to /// the type of the transition table itself and `S` corresponds to the state /// identifier representation.
#[cfg(feature = "std")] pub(crate) struct StateIter<'a, T: 'a, S: 'a> { dfa: &'a Repr, it: iter::Enumerate>, } #[cfg(feature = "std")] impl<'a, T: AsRef<[S]>, S: StateID> Iterator for StateIter<'a, T, S> { type Item = (S, State<'a, S>); fn next(&mut self) -> Option<(S, State<'a, S>)> { self.it.next().map(|(id, chunk)| { let state = State { transitions: chunk }; let id = if self.dfa.premultiplied { id * self.dfa.alphabet_len() } else { id }; (S::from_usize(id), state) }) } } /// An immutable representation of a single DFA state. /// /// `'a` correspondings to the lifetime of a DFA's transition table and `S` /// corresponds to the state identifier representation. #[cfg(feature = "std")] pub(crate) struct State<'a, S: 'a> { transitions: &'a [S], } #[cfg(feature = "std")] impl<'a, S: StateID> State<'a, S> { /// Return an iterator over all transitions in this state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. /// /// Each transition is represented by a tuple. The first element is /// the input byte for that transition and the second element is the /// transitions itself. pub fn transitions(&self) -> StateTransitionIter { StateTransitionIter { it: self.transitions.iter().enumerate() } } /// Return an iterator over a sparse representation of the transitions in /// this state. Only non-dead transitions are returned. /// /// The "sparse" representation in this case corresponds to a sequence of /// triples. The first two elements of the triple comprise an inclusive /// byte range while the last element corresponds to the transition taken /// for all bytes in the range. /// /// This is somewhat more condensed than the classical sparse /// representation (where you have an element for every non-dead /// transition), but in practice, checking if a byte is in a range is very /// cheap and using ranges tends to conserve quite a bit more space. pub fn sparse_transitions(&self) -> StateSparseTransitionIter { StateSparseTransitionIter { dense: self.transitions(), cur: None } } } #[cfg(feature = "std")] impl<'a, S: StateID> fmt::Debug for State<'a, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut transitions = vec![]; for (start, end, next_id) in self.sparse_transitions() { let line = if start == end { format!("{} => {}", escape(start), next_id.to_usize()) } else { format!( "{}-{} => {}", escape(start), escape(end), next_id.to_usize(), ) }; transitions.push(line); } write!(f, "{}", transitions.join(", "))?; Ok(()) } } /// An iterator over all transitions in a single DFA state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. /// /// Each transition is represented by a tuple. The first element is the input /// byte for that transition and the second element is the transitions itself. #[cfg(feature = "std")] #[derive(Debug)] pub(crate) struct StateTransitionIter<'a, S: 'a> { it: iter::Enumerate>, } #[cfg(feature = "std")] impl<'a, S: StateID> Iterator for StateTransitionIter<'a, S> { type Item = (u8, S); fn next(&mut self) -> Option<(u8, S)> { self.it.next().map(|(i, &id)| (i as u8, id)) } } /// An iterator over all transitions in a single DFA state using a sparse /// representation. /// /// Each transition is represented by a triple. The first two elements of the /// triple comprise an inclusive byte range while the last element corresponds /// to the transition taken for all bytes in the range. 
#[cfg(feature = "std")] #[derive(Debug)] pub(crate) struct StateSparseTransitionIter<'a, S: 'a> { dense: StateTransitionIter<'a, S>, cur: Option<(u8, u8, S)>, } #[cfg(feature = "std")] impl<'a, S: StateID> Iterator for StateSparseTransitionIter<'a, S> { type Item = (u8, u8, S); fn next(&mut self) -> Option<(u8, u8, S)> { while let Some((b, next)) = self.dense.next() { let (prev_start, prev_end, prev_next) = match self.cur { Some(t) => t, None => { self.cur = Some((b, b, next)); continue; } }; if prev_next == next { self.cur = Some((prev_start, b, prev_next)); } else { self.cur = Some((b, b, next)); if prev_next != dead_id() { return Some((prev_start, prev_end, prev_next)); } } } if let Some((start, end, next)) = self.cur.take() { if next != dead_id() { return Some((start, end, next)); } } None } } /// A mutable representation of a single DFA state. /// /// `'a` correspondings to the lifetime of a DFA's transition table and `S` /// corresponds to the state identifier representation. #[cfg(feature = "std")] pub(crate) struct StateMut<'a, S: 'a> { transitions: &'a mut [S], } #[cfg(feature = "std")] impl<'a, S: StateID> StateMut<'a, S> { /// Return an iterator over all transitions in this state. This yields /// a number of transitions equivalent to the alphabet length of the /// corresponding DFA. /// /// Each transition is represented by a tuple. The first element is the /// input byte for that transition and the second element is a mutable /// reference to the transition itself. pub fn iter_mut(&mut self) -> StateTransitionIterMut { StateTransitionIterMut { it: self.transitions.iter_mut().enumerate() } } } #[cfg(feature = "std")] impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fmt::Debug::fmt(&State { transitions: self.transitions }, f) } } /// A mutable iterator over all transitions in a DFA state. /// /// Each transition is represented by a tuple. The first element is the /// input byte for that transition and the second element is a mutable /// reference to the transition itself. #[cfg(feature = "std")] #[derive(Debug)] pub(crate) struct StateTransitionIterMut<'a, S: 'a> { it: iter::Enumerate>, } #[cfg(feature = "std")] impl<'a, S: StateID> Iterator for StateTransitionIterMut<'a, S> { type Item = (u8, &'a mut S); fn next(&mut self) -> Option<(u8, &'a mut S)> { self.it.next().map(|(i, id)| (i as u8, id)) } } /// A builder for constructing a deterministic finite automaton from regular /// expressions. /// /// This builder permits configuring several aspects of the construction /// process such as case insensitivity, Unicode support and various options /// that impact the size of the generated DFA. In some cases, options (like /// performing DFA minimization) can come with a substantial additional cost. /// /// This builder always constructs a *single* DFA. As such, this builder can /// only be used to construct regexes that either detect the presence of a /// match or find the end location of a match. A single DFA cannot produce both /// the start and end of a match. For that information, use a /// [`Regex`](struct.Regex.html), which can be similarly configured using /// [`RegexBuilder`](struct.RegexBuilder.html). #[cfg(feature = "std")] #[derive(Clone, Debug)] pub struct Builder { parser: ParserBuilder, nfa: NFABuilder, anchored: bool, minimize: bool, premultiply: bool, byte_classes: bool, reverse: bool, longest_match: bool, } #[cfg(feature = "std")] impl Builder { /// Create a new DenseDFA builder with the default configuration. 
pub fn new() -> Builder { Builder { parser: ParserBuilder::new(), nfa: NFABuilder::new(), anchored: false, minimize: false, premultiply: true, byte_classes: true, reverse: false, longest_match: false, } } /// Build a DFA from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. pub fn build(&self, pattern: &str) -> Result<DenseDFA<Vec<usize>, usize>> { self.build_with_size::<usize>(pattern) } /// Build a DFA from the given pattern using a specific representation for /// the DFA's state IDs. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. /// /// The representation of state IDs is determined by the `S` type /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` /// or `usize`, where `usize` is the default used for `build`. The purpose /// of specifying a representation for state IDs is to reduce the memory /// footprint of a DFA. /// /// When using this routine, the chosen state ID representation will be /// used throughout determinization and minimization, if minimization /// was requested. Even if the minimized DFA can fit into the chosen /// state ID representation but the initial determinized DFA cannot, /// then this will still return an error. To get a minimized DFA with a /// smaller state ID representation, first build it with a bigger state ID /// representation, and then shrink the size of the DFA using one of its /// conversion routines, such as /// [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). pub fn build_with_size<S: StateID>( &self, pattern: &str, ) -> Result<DenseDFA<Vec<S>, S>> { if self.longest_match && !self.anchored { return Err(Error::unsupported_longest_match()); } let nfa = self.build_nfa(pattern)?; let mut dfa = if self.byte_classes { Determinizer::new(&nfa) .with_byte_classes() .longest_match(self.longest_match) .build() } else { Determinizer::new(&nfa) .longest_match(self.longest_match) .build() }?; if self.minimize { dfa.minimize(); } if self.premultiply { dfa.premultiply()?; } Ok(dfa.into_dense_dfa()) } /// Builds an NFA from the given pattern. pub(crate) fn build_nfa(&self, pattern: &str) -> Result<NFA> { let hir = self .parser .build() .parse(pattern) .map_err(Error::syntax)?; Ok(self.nfa.build(hir)?) } /// Set whether matching must be anchored at the beginning of the input. /// /// When enabled, a match must begin at the start of the input. When /// disabled, the DFA will act as if the pattern started with a `.*?`, /// which enables a match to appear anywhere. /// /// By default this is disabled. pub fn anchored(&mut self, yes: bool) -> &mut Builder { self.anchored = yes; self.nfa.anchored(yes); self } /// Enable or disable the case insensitive flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. pub fn case_insensitive(&mut self, yes: bool) -> &mut Builder { self.parser.case_insensitive(yes); self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insignificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(&mut self, yes: bool) -> &mut Builder { self.parser.ignore_whitespace(yes); self } /// Enable or disable the "dot matches any character" flag by default.
/// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut Builder { self.parser.dot_matches_new_line(yes); self } /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. pub fn swap_greed(&mut self, yes: bool) -> &mut Builder { self.parser.swap_greed(yes); self } /// Enable or disable the Unicode flag (`u`) by default. /// /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by /// default), a regular expression will fail to parse if Unicode mode is /// disabled and a sub-expression could possibly match invalid UTF-8. pub fn unicode(&mut self, yes: bool) -> &mut Builder { self.parser.unicode(yes); self } /// When enabled, the builder will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// When disabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut Builder { self.parser.allow_invalid_utf8(yes); self.nfa.allow_invalid_utf8(yes); self } /// Set the nesting limit used for the regular expression parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow when building a finite automaton from a regular expression's /// abstract syntax tree. In particular, construction currently uses /// recursion. In the future, the implementation may stop using recursion /// and this option will no longer be necessary. /// /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since the parser will /// limit itself to heap space proportional to the length of the pattern /// string. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation AST item, which results /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax, therefore, it /// should not be used in a granular way. pub fn nest_limit(&mut self, limit: u32) -> &mut Builder { self.parser.nest_limit(limit); self } /// Minimize the DFA. /// /// When enabled, the DFA built will be minimized such that it is as small /// as possible. /// /// Whether one enables minimization or not depends on the types of costs /// you're willing to pay and how much you care about its benefits. In /// particular, minimization has worst case `O(n*k*logn)` time and `O(k*n)` /// space, where `n` is the number of DFA states and `k` is the alphabet /// size.
In practice, minimization can be quite costly in terms of both /// space and time, so it should only be done if you're willing to wait /// longer to produce a DFA. In general, you might want a minimal DFA in /// the following circumstances: /// /// 1. You would like to optimize for the size of the automaton. This can /// manifest in one of two ways. Firstly, if you're converting the /// DFA into Rust code (or a table embedded in the code), then a minimal /// DFA will translate into a corresponding reduction in code size, and /// thus, also the final compiled binary size. Secondly, if you are /// building many DFAs and putting them on the heap, you'll be able to /// fit more if they are smaller. Note though that building a minimal /// DFA itself requires additional space; you only realize the space /// savings once the minimal DFA is constructed (at which point, the /// space used for minimization is freed). /// 2. You've observed that a smaller DFA results in faster match /// performance. Naively, this isn't guaranteed since there is no /// inherent difference between matching with a bigger-than-minimal /// DFA and a minimal DFA. However, a smaller DFA may make use of your /// CPU's cache more efficiently. /// 3. You are trying to establish an equivalence between regular /// languages. The standard method for this is to build a minimal DFA /// for each language and then compare them. If the DFAs are equivalent /// (up to state renaming), then the languages are equivalent. /// /// This option is disabled by default. pub fn minimize(&mut self, yes: bool) -> &mut Builder { self.minimize = yes; self } /// Premultiply state identifiers in the DFA's transition table. /// /// When enabled, state identifiers are premultiplied to point to their /// corresponding row in the DFA's transition table. That is, given the /// `i`th state, its corresponding premultiplied identifier is `i * k` /// where `k` is the alphabet size of the DFA. (The alphabet size is at /// most 256, but is in practice smaller if byte classes are enabled.) /// /// When state identifiers are not premultiplied, then the identifier of /// the `i`th state is `i`. /// /// The advantage of premultiplying state identifiers is that it saves /// a multiplication instruction per byte when searching with the DFA. /// This has been observed to lead to a 20% performance benefit in /// micro-benchmarks. /// /// The primary disadvantage of premultiplying state identifiers is /// that they require a larger integer size to represent. For example, /// if your DFA has 200 states, then its premultiplied form requires /// 16 bits to represent every possible state identifier, whereas its /// non-premultiplied form only requires 8 bits. /// /// This option is enabled by default. pub fn premultiply(&mut self, yes: bool) -> &mut Builder { self.premultiply = yes; self } /// Shrink the size of the DFA's alphabet by mapping bytes to their /// equivalence classes. /// /// When enabled, each DFA will use a map from all possible bytes to their /// corresponding equivalence class. Each equivalence class represents a /// set of bytes that does not discriminate between a match and a non-match /// in the DFA. For example, the pattern `[ab]+` has at least two /// equivalence classes: a set containing `a` and `b` and a set containing /// every byte except for `a` and `b`. `a` and `b` are in the same /// equivalence class because they never discriminate between a match /// and a non-match.
/// /// The advantage of this map is that the size of the transition table can /// be reduced drastically from `#states * 256 * sizeof(id)` to /// `#states * k * sizeof(id)` where `k` is the number of equivalence /// classes. As a result, total space usage can decrease substantially. /// Moreover, since a smaller alphabet is used, compilation becomes faster /// as well. /// /// The disadvantage of this map is that every byte searched must be /// passed through this map before it can be used to determine the next /// transition. This has a small match time performance cost. /// /// This option is enabled by default. pub fn byte_classes(&mut self, yes: bool) -> &mut Builder { self.byte_classes = yes; self } /// Reverse the DFA. /// /// A DFA reversal is performed by reversing all of the concatenated /// sub-expressions in the original pattern, recursively. The resulting /// DFA can be used to match the pattern starting from the end of a string /// instead of the beginning of a string. /// /// Generally speaking, a reversed DFA is most useful for finding the start /// of a match, since a single forward DFA is only capable of finding the /// end of a match. This start of match handling is done for you /// automatically if you build a [`Regex`](struct.Regex.html). pub fn reverse(&mut self, yes: bool) -> &mut Builder { self.reverse = yes; self.nfa.reverse(yes); self } /// Find the longest possible match. /// /// This is distinct from the default leftmost-first match semantics in /// that it treats all NFA states as having equivalent priority. In other /// words, the longest possible match is always found and it is not /// possible to implement non-greedy match semantics when this is set. That /// is, `a+` and `a+?` are equivalent when this is enabled. /// /// In particular, a practical issue with this option at the moment is that /// it prevents unanchored searches from working correctly, since /// unanchored searches are implemented by prepending a non-greedy `.*?` /// to the beginning of the pattern. As stated above, non-greedy match /// semantics aren't supported. Therefore, if this option is enabled and /// an unanchored search is requested, then building a DFA will return an /// error. /// /// This option is principally useful when building a reverse DFA for /// finding the start of a match. If you are building a regex with /// [`RegexBuilder`](struct.RegexBuilder.html), then this is handled for /// you automatically. The reason why this is necessary for start of match /// handling is because we want to find the earliest possible starting /// position of a match to satisfy leftmost-first match semantics. When /// matching in reverse, this means finding the longest possible match, /// hence, this option. /// /// By default this is disabled. pub fn longest_match(&mut self, yes: bool) -> &mut Builder { // There is prior art in RE2 that shows how this can support unanchored // searches. Instead of treating all NFA states as having equivalent // priority, we instead group NFA states into sets, and treat members // of each set as having equivalent priority, but having greater // priority than all following members of different sets. We then // essentially assign a higher priority to everything over the prefix // `.*?`. self.longest_match = yes; self } } #[cfg(feature = "std")] impl Default for Builder { fn default() -> Builder { Builder::new() } } /// Return the given byte as its escaped string form.
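///
/// For example, `escape(b'\n')` returns `"\\n"`, while a printable byte
/// like `escape(b'a')` returns `"a"` unchanged.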
#[cfg(feature = "std")] fn escape(b: u8) -> String { use std::ascii; String::from_utf8(ascii::escape_default(b).collect::>()).unwrap() } #[cfg(test)] #[allow(dead_code)] mod tests { use nfa::NFA; use super::*; #[test] fn errors_when_converting_to_smaller_dfa() { let pattern = r"\w"; let dfa = Builder::new() .byte_classes(false) .anchored(true) .premultiply(false) .build_with_size::(pattern) .unwrap(); assert!(dfa.to_u8().is_err()); } #[test] fn errors_when_determinization_would_overflow() { let pattern = r"\w"; let mut builder = Builder::new(); builder.byte_classes(false).anchored(true).premultiply(false); // using u16 is fine assert!(builder.build_with_size::(pattern).is_ok()); // // ... but u8 results in overflow (because there are >256 states) assert!(builder.build_with_size::(pattern).is_err()); } #[test] fn errors_when_premultiply_would_overflow() { let pattern = r"[a-z]"; let mut builder = Builder::new(); builder.byte_classes(false).anchored(true).premultiply(false); // without premultiplication is OK assert!(builder.build_with_size::(pattern).is_ok()); // ... but with premultiplication overflows u8 builder.premultiply(true); assert!(builder.build_with_size::(pattern).is_err()); } fn print_automata(pattern: &str) { println!("BUILDING AUTOMATA"); let (nfa, dfa, mdfa) = build_automata(pattern); println!("{}", "#".repeat(100)); println!("PATTERN: {:?}", pattern); println!("NFA:"); println!("{:?}", nfa); println!("{}", "~".repeat(79)); println!("DFA:"); print!("{:?}", dfa); println!("{}", "~".repeat(79)); println!("Minimal DFA:"); print!("{:?}", mdfa); println!("{}", "~".repeat(79)); println!("{}", "#".repeat(100)); } // fn print_automata_counts(pattern: &str) { // let (nfa, dfa, mdfa) = build_automata(pattern); // println!("nfa # states: {:?}", nfa.len()); // println!("dfa # states: {:?}", dfa.len()); // println!("minimal dfa # states: {:?}", mdfa.len()); // } fn build_automata( pattern: &str, ) -> (NFA, DenseDFA, usize>, DenseDFA, usize>) { let mut builder = Builder::new(); builder.byte_classes(true).premultiply(false); builder.anchored(true); builder.allow_invalid_utf8(false); let nfa = builder.build_nfa(pattern).unwrap(); let dfa = builder.build(pattern).unwrap(); let min = builder.minimize(true).build(pattern).unwrap(); (nfa, dfa, min) } #[test] fn scratch() { // let data = ::std::fs::read_to_string("/usr/share/dict/words").unwrap(); // let mut words: Vec<&str> = data.lines().collect(); // println!("{} words", words.len()); // words.sort_by(|w1, w2| w1.len().cmp(&w2.len()).reverse()); // let pattern = words.join("|"); // print_automata_counts(&pattern); // print_automata(&pattern); // print_automata(r"[01]*1[01]{5}"); // print_automata(r"X(.?){0,8}Y"); // print_automata_counts(r"\p{alphabetic}"); // print_automata(r"a*b+|cdefg"); // print_automata(r"(..)*(...)*"); // let pattern = r"\p{any}*?\p{Other_Uppercase}"; // let pattern = r"\p{any}*?\w+"; // print_automata_counts(pattern); // print_automata_counts(r"(?-u:\w)"); // let pattern = r"\p{Greek}"; // let pattern = r"zZzZzZzZzZ"; // let pattern = grapheme_pattern(); // let pattern = r"\p{Ideographic}"; // let pattern = r"\w{10}"; // 51784 --> 41264 // let pattern = r"\w"; // 5182 // let pattern = r"a*"; // print_automata(pattern); // let (_, _, dfa) = build_automata(pattern); let dfa = DenseDFA::new("foo[0-9]+").unwrap(); let sparse = dfa.to_sparse_sized::().unwrap(); println!("{:?}", sparse); println!( "dense mem: {:?}, sparse mem: {:?}", dfa.to_u16().unwrap().memory_usage(), sparse.memory_usage(), ); } fn grapheme_pattern() -> 
&'static str { r"(?x) (?: \p{gcb=CR}\p{gcb=LF} | [\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] | \p{gcb=Prepend}* (?: (?: (?: \p{gcb=L}* (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) \p{gcb=T}* ) | \p{gcb=L}+ | \p{gcb=T}+ ) | \p{gcb=RI}\p{gcb=RI} | \p{Extended_Pictographic} (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* | [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] ) [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* ) " } } regex-automata-0.1.8/src/determinize.rs010064400017500000144000000240351343506346200163460ustar0000000000000000use std::collections::HashMap; use std::mem; use std::rc::Rc; use dense; use error::Result; use nfa::{self, NFA}; use sparse_set::SparseSet; use state_id::{StateID, dead_id}; type DFARepr = dense::Repr, S>; /// A determinizer converts an NFA to a DFA. /// /// This determinizer follows the typical powerset construction, where each /// DFA state is comprised of one or more NFA states. In the worst case, there /// is one DFA state for every possible combination of NFA states. In practice, /// this only happens in certain conditions, typically when there are bounded /// repetitions. /// /// The type variable `S` refers to the chosen state identifier representation /// used for the DFA. /// /// The lifetime variable `'a` refers to the lifetime of the NFA being /// converted to a DFA. #[derive(Debug)] pub(crate) struct Determinizer<'a, S: StateID> { /// The NFA we're converting into a DFA. nfa: &'a NFA, /// The DFA we're building. dfa: DFARepr, /// Each DFA state being built is defined as an *ordered* set of NFA /// states, along with a flag indicating whether the state is a match /// state or not. /// /// This is never empty. The first state is always a dummy state such that /// a state id == 0 corresponds to a dead state. builder_states: Vec>, /// A cache of DFA states that already exist and can be easily looked up /// via ordered sets of NFA states. cache: HashMap, S>, /// Scratch space for a stack of NFA states to visit, for depth first /// visiting without recursion. stack: Vec, /// Scratch space for storing an ordered sequence of NFA states, for /// amortizing allocation. scratch_nfa_states: Vec, /// Whether to build a DFA that finds the longest possible match. longest_match: bool, } /// An intermediate representation for a DFA state during determinization. #[derive(Debug, Eq, Hash, PartialEq)] struct State { /// Whether this state is a match state or not. is_match: bool, /// An ordered sequence of NFA states that make up this DFA state. nfa_states: Vec, } impl<'a, S: StateID> Determinizer<'a, S> { /// Create a new determinizer for converting the given NFA to a DFA. pub fn new(nfa: &'a NFA) -> Determinizer<'a, S> { let dead = Rc::new(State::dead()); let mut cache = HashMap::default(); cache.insert(dead.clone(), dead_id()); Determinizer { nfa: nfa, dfa: DFARepr::empty().anchored(nfa.is_anchored()), builder_states: vec![dead], cache: cache, stack: vec![], scratch_nfa_states: vec![], longest_match: false, } } /// Instruct the determinizer to use equivalence classes as the transition /// alphabet instead of all possible byte values. pub fn with_byte_classes(mut self) -> Determinizer<'a, S> { let byte_classes = self.nfa.byte_classes().clone(); self.dfa = DFARepr::empty_with_byte_classes(byte_classes) .anchored(self.nfa.is_anchored()); self } /// Instruct the determinizer to build a DFA that recognizes the longest /// possible match instead of the leftmost first match. This is useful when /// constructing reverse DFAs for finding the start of a match. 
pub fn longest_match(mut self, yes: bool) -> Determinizer<'a, S> { self.longest_match = yes; self } /// Build the DFA. If there was a problem constructing the DFA (e.g., if /// the chosen state identifier representation is too small), then an error /// is returned. pub fn build(mut self) -> Result<DFARepr<S>> { let representative_bytes: Vec<u8> = self.dfa.byte_classes().representatives().collect(); let mut sparse = self.new_sparse_set(); let mut uncompiled = vec![self.add_start(&mut sparse)?]; while let Some(dfa_id) = uncompiled.pop() { for &b in &representative_bytes { let (next_dfa_id, is_new) = self.cached_state( dfa_id, b, &mut sparse, )?; self.dfa.add_transition(dfa_id, b, next_dfa_id); if is_new { uncompiled.push(next_dfa_id); } } } // At this point, we shuffle the matching states in the final DFA to // the beginning. This permits a DFA's match loop to detect a match // condition by merely inspecting the current state's identifier, and // avoids the need for any additional auxiliary storage. let is_match: Vec<bool> = self .builder_states .iter() .map(|s| s.is_match) .collect(); self.dfa.shuffle_match_states(&is_match); Ok(self.dfa) } /// Return the identifier for the next DFA state given an existing DFA /// state and an input byte. If the next DFA state already exists, then /// return its identifier from the cache. Otherwise, build the state, cache /// it and return its identifier. /// /// The given sparse set is used for scratch space. It must have a capacity /// equivalent to the total number of NFA states, but its contents are /// otherwise unspecified. /// /// This routine returns a boolean indicating whether a new state was /// built. If a new state is built, then the caller needs to add it to its /// frontier of uncompiled DFA states to compute transitions for. fn cached_state( &mut self, dfa_id: S, b: u8, sparse: &mut SparseSet, ) -> Result<(S, bool)> { sparse.clear(); // Compute the set of all reachable NFA states, including epsilons. self.next(dfa_id, b, sparse); // Build a candidate state and check if it has already been built. let state = self.new_state(sparse); if let Some(&cached_id) = self.cache.get(&state) { // Since we have a cached state, put the constructed state's // memory back into our scratch space, so that it can be reused. mem::replace(&mut self.scratch_nfa_states, state.nfa_states); return Ok((cached_id, false)); } // Nothing was in the cache, so add this state to the cache. self.add_state(state).map(|s| (s, true)) } /// Compute the set of all reachable NFA states, including the full epsilon /// closure, from a DFA state for a single byte of input. fn next( &mut self, dfa_id: S, b: u8, next_nfa_states: &mut SparseSet, ) { next_nfa_states.clear(); for i in 0..self.builder_states[dfa_id.to_usize()].nfa_states.len() { let nfa_id = self.builder_states[dfa_id.to_usize()].nfa_states[i]; match *self.nfa.state(nfa_id) { nfa::State::Union { .. } | nfa::State::Match => {} nfa::State::Range { start, end, next } => { if start <= b && b <= end { self.epsilon_closure(next, next_nfa_states); } } } } } /// Compute the epsilon closure for the given NFA state. fn epsilon_closure(&mut self, start: nfa::StateID, set: &mut SparseSet) { if !self.nfa.state(start).is_epsilon() { set.insert(start); return; } self.stack.push(start); while let Some(mut id) = self.stack.pop() { loop { if set.contains(id) { break; } set.insert(id); match *self.nfa.state(id) { nfa::State::Range { ..
} | nfa::State::Match => break, nfa::State::Union { ref alternates } => { id = match alternates.get(0) { None => break, Some(&id) => id, }; self.stack.extend(alternates[1..].iter().rev()); } } } } } /// Compute the initial DFA state and return its identifier. /// /// The sparse set given is used for scratch space, and must have capacity /// equal to the total number of NFA states. Its contents are unspecified. fn add_start(&mut self, sparse: &mut SparseSet) -> Result<S> { sparse.clear(); self.epsilon_closure(self.nfa.start(), sparse); let state = self.new_state(&sparse); let id = self.add_state(state)?; self.dfa.set_start_state(id); Ok(id) } /// Add the given state to the DFA and make it available in the cache. /// /// The state initially has no transitions. That is, it transitions to the /// dead state for all possible inputs. fn add_state(&mut self, state: State) -> Result<S> { let id = self.dfa.add_empty_state()?; let rstate = Rc::new(state); self.builder_states.push(rstate.clone()); self.cache.insert(rstate, id); Ok(id) } /// Convert the given set of ordered NFA states to a DFA state. fn new_state(&mut self, set: &SparseSet) -> State { let mut state = State { is_match: false, nfa_states: mem::replace(&mut self.scratch_nfa_states, vec![]), }; state.nfa_states.clear(); for &id in set { match *self.nfa.state(id) { nfa::State::Range { .. } => { state.nfa_states.push(id); } nfa::State::Match => { state.is_match = true; if !self.longest_match { break; } } nfa::State::Union { .. } => {} } } state } /// Create a new sparse set with enough capacity to hold all NFA states. fn new_sparse_set(&self) -> SparseSet { SparseSet::new(self.nfa.len()) } } impl State { /// Create a new empty dead state. fn dead() -> State { State { nfa_states: vec![], is_match: false } } } regex-automata-0.1.8/src/dfa.rs010064400017500000144000000330661343506344300145640ustar0000000000000000use state_id::StateID; /// A trait describing the interface of a deterministic finite automaton (DFA). /// /// Every DFA has exactly one start state and at least one dead state (which /// may be the same, as in the case of an empty DFA). In all cases, a state /// identifier of `0` must be a dead state such that `DFA::is_dead_state(0)` /// always returns `true`. /// /// Every DFA also has zero or more match states, such that /// `DFA::is_match_state(id)` returns `true` if and only if `id` corresponds to /// a match state. /// /// In general, users of this trait likely will only need to use the search /// routines such as `is_match`, `shortest_match`, `find` or `rfind`. The other /// methods are lower level and are used for walking the transitions of a DFA /// manually. In particular, the aforementioned search routines are implemented /// generically in terms of the lower level transition walking routines. pub trait DFA { /// The representation used for state identifiers in this DFA. /// /// Typically, this is one of `u8`, `u16`, `u32`, `u64` or `usize`. type ID: StateID; /// Return the identifier of this DFA's start state. fn start_state(&self) -> Self::ID; /// Returns true if and only if the given identifier corresponds to a match /// state. fn is_match_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to a dead /// state. When a DFA enters a dead state, it is impossible to leave and /// thus can never lead to a match.
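///
/// # Example
///
/// A small sketch of walking transitions by hand until the DFA dies, using
/// the concrete `DenseDFA` implementation of this trait:
///
/// ```
/// use regex_automata::{dense, DFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = dense::Builder::new().anchored(true).build("abc")?;
/// let mut state = dfa.start_state();
/// for &b in b"abz".iter() {
///     state = dfa.next_state(state, b);
/// }
/// // `z` can never extend a match of `abc`, so the DFA is now dead.
/// assert!(dfa.is_dead_state(state));
/// # Ok(()) }; example().unwrap()
/// ```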
fn is_dead_state(&self, id: Self::ID) -> bool; /// Returns true if and only if the given identifier corresponds to either /// a dead state or a match state, such that one of `is_match_state(id)` /// or `is_dead_state(id)` must return true. /// /// Depending on the implementation of the DFA, this routine can be used /// to save a branch in the core matching loop. Nevertheless, /// `is_match_state(id) || is_dead_state(id)` is always a valid /// implementation. fn is_match_or_dead_state(&self, id: Self::ID) -> bool; /// Returns true if and only if this DFA is anchored. /// /// When a DFA is anchored, it is only allowed to report matches that /// start at index `0`. fn is_anchored(&self) -> bool; /// Given the current state that this DFA is in and the next input byte, /// this method returns the identifier of the next state. The identifier /// returned is always valid, but it may correspond to a dead state. fn next_state(&self, current: Self::ID, input: u8) -> Self::ID; /// Like `next_state`, but its implementation may look up the next state /// without memory safety checks such as bounds checks. As such, callers /// must ensure that the given identifier corresponds to a valid DFA /// state. Implementors must, in turn, ensure that this routine is safe /// for all valid state identifiers and for all possible `u8` values. unsafe fn next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID; /// Returns true if and only if the given bytes match this DFA. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if a DFA enters /// a match state or a dead state, then this routine will return `true` or /// `false`, respectively, without inspecting any future input. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+bar")?; /// assert_eq!(true, dfa.is_match(b"foo12345bar")); /// assert_eq!(false, dfa.is_match(b"foobar")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn is_match(&self, bytes: &[u8]) -> bool { self.is_match_at(bytes, 0) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(4), dfa.shortest_match(b"foo12345")); /// /// // Normally, the end of the leftmost first match here would be 3, /// // but the shortest match semantics detect a match earlier. /// let dfa = DenseDFA::new("abc|a")?; /// assert_eq!(Some(1), dfa.shortest_match(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn shortest_match(&self, bytes: &[u8]) -> Option<usize> { self.shortest_match_at(bytes, 0) } /// Returns the end offset of the longest match. If no match exists, /// then `None` is returned.
/// /// Implementors of this trait are not required to implement any particular /// match semantics (such as leftmost-first), which are instead manifest in /// the DFA's topology itself. /// /// In particular, this method must continue searching even after it /// enters a match state. The search should only terminate once it has /// reached the end of the input or when it has entered a dead state. Upon /// termination, the position of the last byte seen while still in a match /// state is returned. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). By default, a dense DFA uses /// "leftmost first" match semantics. /// /// Leftmost first match semantics corresponds to the match with the /// smallest starting offset, but where the end offset is determined by /// preferring earlier branches in the original regular expression. For /// example, `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` /// will match `Samwise` in `Samwise`. /// /// Generally speaking, the "leftmost first" match is how most backtracking /// regular expressions tend to work. This is in contrast to POSIX-style /// regular expressions that yield "leftmost longest" matches. Namely, /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using /// leftmost longest semantics. /// /// ``` /// use regex_automata::{DFA, DenseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = DenseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let dfa = DenseDFA::new("abc|a")?; /// assert_eq!(Some(3), dfa.find(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn find(&self, bytes: &[u8]) -> Option<usize> { self.find_at(bytes, 0) } /// Returns the start offset of the longest match in reverse, by searching /// from the end of the input towards the start of the input. If no match /// exists, then `None` is returned. In other words, this has the same /// match semantics as `find`, but in reverse. /// /// # Example /// /// This example shows how to use this method with a /// [`DenseDFA`](enum.DenseDFA.html). In particular, this routine /// is principally useful when used in conjunction with the /// [`dense::Builder::reverse`](dense/struct.Builder.html#method.reverse) /// configuration knob. In general, it's unlikely to be correct to use both /// `find` and `rfind` with the same DFA since any particular DFA will only /// support searching in one direction. /// /// ``` /// use regex_automata::{dense, DFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?; /// assert_eq!(Some(0), dfa.rfind(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` #[inline] fn rfind(&self, bytes: &[u8]) -> Option<usize> { self.rfind_at(bytes, bytes.len()) } /// Returns the same as `is_match`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`.
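///
/// # Example
///
/// A small sketch using `DenseDFA` with an unanchored pattern searched
/// from a non-zero offset:
///
/// ```
/// use regex_automata::{DFA, DenseDFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = DenseDFA::new("[0-9]+")?;
/// // Only the suffix starting at offset 3 is searched.
/// assert!(dfa.is_match_at(b"abc123", 3));
/// # Ok(()) }; example().unwrap()
/// ```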
#[inline] fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { if self.is_anchored() && start > 0 { return false; } let mut state = self.start_state(); if self.is_match_or_dead_state(state) { return self.is_match_state(state); } for &b in bytes[start..].iter() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { return self.is_match_state(state); } } false } /// Returns the same as `shortest_match`, but starts the search at the /// given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. #[inline] fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option<usize> { if self.is_anchored() && start > 0 { return None; } let mut state = self.start_state(); if self.is_match_or_dead_state(state) { return if self.is_dead_state(state) { None } else { Some(start) }; } for (i, &b) in bytes[start..].iter().enumerate() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { return if self.is_dead_state(state) { None } else { Some(start + i + 1) }; } } None } /// Returns the same as `find`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. #[inline] fn find_at(&self, bytes: &[u8], start: usize) -> Option<usize> { if self.is_anchored() && start > 0 { return None; } let mut state = self.start_state(); let mut last_match = if self.is_dead_state(state) { return None; } else if self.is_match_state(state) { Some(start) } else { None }; for (i, &b) in bytes[start..].iter().enumerate() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { if self.is_dead_state(state) { return last_match; } last_match = Some(start + i + 1); } } last_match } /// Returns the same as `rfind`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == bytes.len()`.
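///
/// # Example
///
/// A sketch with a reverse DFA, where the search scans `bytes[..start]`
/// backwards:
///
/// ```
/// use regex_automata::{dense, DFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = dense::Builder::new().reverse(true).build("foo[0-9]+")?;
/// // Searching in reverse over the full input is equivalent to `rfind`.
/// assert_eq!(Some(0), dfa.rfind_at(b"foo12345", 8));
/// # Ok(()) }; example().unwrap()
/// ```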
#[inline(never)] fn rfind_at(&self, bytes: &[u8], start: usize) -> Option<usize> { if self.is_anchored() && start < bytes.len() { return None; } let mut state = self.start_state(); let mut last_match = if self.is_dead_state(state) { return None; } else if self.is_match_state(state) { Some(start) } else { None }; for (i, &b) in bytes[..start].iter().enumerate().rev() { state = unsafe { self.next_state_unchecked(state, b) }; if self.is_match_or_dead_state(state) { if self.is_dead_state(state) { return last_match; } last_match = Some(i); } } last_match } } impl<'a, T: DFA> DFA for &'a T { type ID = T::ID; #[inline] fn start_state(&self) -> Self::ID { (**self).start_state() } #[inline] fn is_match_state(&self, id: Self::ID) -> bool { (**self).is_match_state(id) } #[inline] fn is_match_or_dead_state(&self, id: Self::ID) -> bool { (**self).is_match_or_dead_state(id) } #[inline] fn is_dead_state(&self, id: Self::ID) -> bool { (**self).is_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { (**self).is_anchored() } #[inline] fn next_state(&self, current: Self::ID, input: u8) -> Self::ID { (**self).next_state(current, input) } #[inline] unsafe fn next_state_unchecked( &self, current: Self::ID, input: u8, ) -> Self::ID { (**self).next_state_unchecked(current, input) } } regex-automata-0.1.8/src/error.rs010064400017500000144000000127531341271333100151540ustar0000000000000000use std::error; use std::fmt; use std::result; use regex_syntax; pub type Result<T> = result::Result<T, Error>; /// An error that occurred during the construction of a DFA. #[derive(Clone, Debug)] pub struct Error { kind: ErrorKind, } /// The kind of error that occurred. #[derive(Clone, Debug)] pub enum ErrorKind { /// An error that occurred while parsing a regular expression. Note that /// this error may be printed over multiple lines, and is generally /// intended to be end user readable on its own. Syntax(String), /// An error that occurred because an unsupported regex feature was used. /// The message string describes which unsupported feature was used. /// /// The primary regex features that are unsupported are those that require /// look-around, such as the `^` and `$` anchors and the word boundary /// assertion `\b`. These may be supported in the future. Unsupported(String), /// An error that occurred when attempting to serialize a DFA to bytes. Serialize(String), /// An error that occurs when constructing a DFA would require the use of /// a state ID that overflows the chosen state ID representation. For /// example, if one is using `u8` for state IDs and builds a DFA with /// 257 states, then the last state's ID will be `256` which cannot be /// represented with `u8`. /// /// Typically, this error occurs in the determinization process of building /// a DFA (the conversion step from NFA to DFA). It can also occur when /// trying to build a smaller DFA from an existing one. StateIDOverflow { /// The maximum possible state ID. max: usize, }, /// An error that occurs when premultiplication of state IDs is requested, /// but doing so would overflow the chosen state ID representation. /// /// When `max == requested_max`, then the state ID would overflow `usize`. PremultiplyOverflow { /// The maximum possible state id. max: usize, /// The maximum ID required by premultiplication. requested_max: usize, } } impl Error { /// Return the kind of this error.
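///
/// # Example
///
/// A small sketch of inspecting the kind of a construction error:
///
/// ```
/// use regex_automata::{ErrorKind, Regex};
///
/// let err = match Regex::new("(unclosed") {
///     Err(err) => err,
///     Ok(_) => unreachable!("the pattern should be rejected"),
/// };
/// match err.kind() {
///     ErrorKind::Syntax(_) => {} // expected
///     kind => panic!("unexpected error kind: {:?}", kind),
/// }
/// ```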
pub fn kind(&self) -> &ErrorKind { &self.kind } pub(crate) fn syntax(err: regex_syntax::Error) -> Error { Error { kind: ErrorKind::Syntax(err.to_string()) } } pub(crate) fn unsupported_anchor() -> Error { let msg = r"anchors such as ^, $, \A and \z are not supported"; Error { kind: ErrorKind::Unsupported(msg.to_string()) } } pub(crate) fn unsupported_word() -> Error { let msg = r"word boundary assertions (\b and \B) are not supported"; Error { kind: ErrorKind::Unsupported(msg.to_string()) } } pub(crate) fn unsupported_longest_match() -> Error { let msg = "unanchored searches with longest match \ semantics are not supported"; Error { kind: ErrorKind::Unsupported(msg.to_string()) } } pub(crate) fn serialize(message: &str) -> Error { Error { kind: ErrorKind::Serialize(message.to_string()) } } pub(crate) fn state_id_overflow(max: usize) -> Error { Error { kind: ErrorKind::StateIDOverflow { max } } } pub(crate) fn premultiply_overflow( max: usize, requested_max: usize, ) -> Error { Error { kind: ErrorKind::PremultiplyOverflow { max, requested_max } } } } impl error::Error for Error { fn description(&self) -> &str { match self.kind { ErrorKind::Syntax(_) => "syntax error", ErrorKind::Unsupported(_) => "unsupported syntax", ErrorKind::Serialize(_) => "serialization error", ErrorKind::StateIDOverflow { .. } => { "state id representation too small" } ErrorKind::PremultiplyOverflow { .. } => { "state id representation too small for premultiplication" } } } } impl fmt::Display for Error { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { match self.kind { ErrorKind::Syntax(ref msg) => write!(f, "{}", msg), ErrorKind::Unsupported(ref msg) => write!(f, "{}", msg), ErrorKind::Serialize(ref msg) => { write!(f, "DFA serialization error: {}", msg) } ErrorKind::StateIDOverflow { max } => { write!( f, "building the DFA failed because it required building \ more states than can be identified, where the maximum \ ID for the chosen representation is {}", max, ) } ErrorKind::PremultiplyOverflow { max, requested_max } => { if max == requested_max { write!( f, "premultiplication of states requires the ability to \ represent a state ID greater than what can fit on \ this platform's usize, which is {}", ::std::usize::MAX, ) } else { write!( f, "premultiplication of states requires the ability to \ represent at least a state ID of {}, but the chosen \ representation only permits a maximum state ID of {}", requested_max, max, ) } } } } } regex-automata-0.1.8/src/lib.rs010064400017500000144000000363541341354172300146010ustar0000000000000000/*! A low level regular expression library that uses deterministic finite automata. It supports a rich syntax with Unicode support, has extensive options for configuring the best space vs time trade off for your use case and provides support for cheap deserialization of automata for use in `no_std` environments. # Overview This section gives a brief overview of the primary types in this crate: * A [`Regex`](struct.Regex.html) provides a way to search for matches of a regular expression. This includes iterating over matches with both the start and end positions of each match. * A [`RegexBuilder`](struct.RegexBuilder.html) provides a way to configure many compilation options for a regex. * A [`DenseDFA`](enum.DenseDFA.html) provides low level access to a DFA that uses a dense representation (uses lots of space, but fast searching). * A [`SparseDFA`](enum.SparseDFA.html) provides the same API as a `DenseDFA`, but uses a sparse representation (uses less space, but slower matching).
* A [`DFA`](trait.DFA.html) trait that defines an interface that all DFAs must implement. * Both dense DFAs and sparse DFAs support [serialization to raw bytes](enum.DenseDFA.html#method.to_bytes_little_endian) and [cheap deserialization](enum.DenseDFA.html#method.from_bytes). # Example: basic regex searching This example shows how to compile a regex using the default configuration and then use it to find matches in a byte string: ``` use regex_automata::Regex; let re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let text = b"2018-12-24 2016-10-08"; let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); assert_eq!(matches, vec![(0, 10), (11, 21)]); ``` # Example: use sparse DFAs By default, compiling a regex will use dense DFAs internally. This uses more memory, but executes searches more quickly. If you can abide slower searches (somewhere around 3-5x), then sparse DFAs might make more sense since they can use significantly less space. Using sparse DFAs is as easy as using `Regex::new_sparse` instead of `Regex::new`: ``` use regex_automata::Regex; # fn example() -> Result<(), regex_automata::Error> { let re = Regex::new_sparse(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let text = b"2018-12-24 2016-10-08"; let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); assert_eq!(matches, vec![(0, 10), (11, 21)]); # Ok(()) }; example().unwrap() ``` If you already have dense DFAs for some reason, they can be converted to sparse DFAs and used to build a new `Regex`. For example: ``` use regex_automata::Regex; # fn example() -> Result<(), regex_automata::Error> { let dense_re = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); let sparse_re = Regex::from_dfas( dense_re.forward().to_sparse()?, dense_re.reverse().to_sparse()?, ); let text = b"2018-12-24 2016-10-08"; let matches: Vec<(usize, usize)> = sparse_re.find_iter(text).collect(); assert_eq!(matches, vec![(0, 10), (11, 21)]); # Ok(()) }; example().unwrap() ``` # Example: deserialize a DFA This shows how to first serialize a DFA into raw bytes, and then deserialize those raw bytes back into a DFA. While this particular example is a bit contrived, this same technique can be used in your program to deserialize a DFA at start up time or by memory mapping a file. In particular, deserialization is guaranteed to be cheap because it will always be a constant time operation. ``` use regex_automata::{DenseDFA, Regex}; # fn example() -> Result<(), regex_automata::Error> { let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both the forward and reverse DFAs, see note below let fwd_bytes = re1.forward().to_u16()?.to_bytes_native_endian()?; let rev_bytes = re1.reverse().to_u16()?.to_bytes_native_endian()?; // now deserialize both---we need to specify the correct type! let fwd: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&fwd_bytes) }; let rev: DenseDFA<&[u16], u16> = unsafe { DenseDFA::from_bytes(&rev_bytes) }; // finally, reconstruct our regex let re2 = Regex::from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; let matches: Vec<(usize, usize)> = re2.find_iter(text).collect(); assert_eq!(matches, vec![(0, 10), (11, 21)]); # Ok(()) }; example().unwrap() ``` There are a few points worth noting here: * We need to extract the raw DFAs used by the regex and serialize those. You can build the DFAs manually yourself using [`dense::Builder`](dense/struct.Builder.html), but using the DFAs from a `Regex` guarantees that the DFAs are built correctly. 
* We specifically convert the dense DFA to a representation that uses `u16` for its state identifiers using [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). While this isn't strictly necessary, if we skipped this step, then the serialized bytes would use `usize` for state identifiers, which does not have a fixed size. Using `u16` ensures that we can deserialize this DFA even on platforms with a smaller pointer size. If our DFA is too big for `u16` state identifiers, then one can use `u32` or `u64`. * To convert the DFA to raw bytes, we use the `to_bytes_native_endian` method. In practice, you'll want to use either [`DenseDFA::to_bytes_little_endian`](enum.DenseDFA.html#method.to_bytes_little_endian) or [`DenseDFA::to_bytes_big_endian`](enum.DenseDFA.html#method.to_bytes_big_endian), depending on which platform you're deserializing your DFA on. If you intend to deserialize on either platform, then you'll need to serialize both and deserialize the right one depending on your target's endianness. * Deserializing a DFA requires the use of `unsafe` because the raw bytes must be *trusted*. In particular, while some sanity checks are performed, nothing guarantees the integrity of the DFA's transition table since deserialization is a constant time operation. Since searching with a DFA must be able to follow transitions blindly for performance reasons, giving incorrect bytes to the deserialization API can result in memory unsafety. The same process can be achieved with sparse DFAs as well: ``` use regex_automata::{SparseDFA, Regex}; # fn example() -> Result<(), regex_automata::Error> { let re1 = Regex::new(r"[0-9]{4}-[0-9]{2}-[0-9]{2}").unwrap(); // serialize both let fwd_bytes = re1.forward().to_u16()?.to_sparse()?.to_bytes_native_endian()?; let rev_bytes = re1.reverse().to_u16()?.to_sparse()?.to_bytes_native_endian()?; // now deserialize both---we need to specify the correct type! let fwd: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&fwd_bytes) }; let rev: SparseDFA<&[u8], u16> = unsafe { SparseDFA::from_bytes(&rev_bytes) }; // finally, reconstruct our regex let re2 = Regex::from_dfas(fwd, rev); // we can use it like normal let text = b"2018-12-24 2016-10-08"; let matches: Vec<(usize, usize)> = re2.find_iter(text).collect(); assert_eq!(matches, vec![(0, 10), (11, 21)]); # Ok(()) }; example().unwrap() ``` Note that unlike dense DFAs, sparse DFAs have no alignment requirements. Conversely, dense DFAs must be aligned to the same alignment as their state identifier representation. # Support for `no_std` This crate comes with a `std` feature that is enabled by default. When the `std` feature is enabled, the API of this crate will include the facilities necessary for compiling, serializing, deserializing and searching with regular expressions. When the `std` feature is disabled, the API of this crate will shrink such that it only includes the facilities necessary for deserializing and searching with regular expressions. The intended workflow for `no_std` environments is thus as follows: * Write a program with the `std` feature that compiles and serializes a regular expression. Serialization should only happen after first converting the DFAs to use a fixed size state identifier instead of the default `usize`. You may also need to serialize both little and big endian versions of each DFA. (So that's 4 DFAs in total for each regex.) * In your `no_std` environment, follow the examples above for deserializing your previously serialized DFAs into regexes.
You can then search with them as you would any regex. Deserialization can happen anywhere. For example, with bytes embedded into a binary or with a file memory mapped at runtime. Note that the [`ucd-generate`](https://github.com/BurntSushi/ucd-generate) tool will do the first step for you with its `dfa` or `regex` sub-commands. # Syntax This crate supports the same syntax as the `regex` crate, since they share the same parser. You can find an exhaustive list of supported syntax in the [documentation for the `regex` crate](https://docs.rs/regex/1.1/regex/#syntax). Currently, there are a couple of limitations. In general, this crate does not support zero-width assertions, although they may be added in the future. This includes: * Anchors such as `^`, `$`, `\A` and `\z`. * Word boundary assertions such as `\b` and `\B`. It is possible to run a search that is anchored at the beginning of the input. To do that, set the [`RegexBuilder::anchored`](struct.RegexBuilder.html#method.anchored) option when building a regex. By default, all searches are unanchored. # Differences with the regex crate The main goal of the [`regex`](https://docs.rs/regex) crate is to serve as a general purpose regular expression engine. It aims to automatically balance low compile times, fast search times and low memory usage, while also providing a convenient API for users. In contrast, this crate provides a lower level regular expression interface that is a bit less convenient while providing more explicit control over memory usage and search times. Here are some specific negative differences: * **Compilation can take an exponential amount of time and space** in the size of the regex pattern. While most patterns do not exhibit worst case exponential time, such patterns do exist. For example, `[01]*1[01]{N}` will build a DFA with `2^(N+1)` states. For this reason, untrusted patterns should not be compiled with this library. (In the future, the API may expose an option to return an error if the DFA gets too big.) * This crate does not support sub-match extraction, which can be achieved with the regex crate's "captures" API. This may be added in the future, but is unlikely. * While the regex crate doesn't necessarily sport fast compilation times, the regexes in this crate are almost universally slow to compile, especially when they contain large Unicode character classes. For example, on my system, compiling `\w{3}` with byte classes enabled takes just over 1 second and almost 5MB of memory! (Compiling a sparse regex takes about the same time but only uses about 500KB of memory.) Conversely, compiling the same regex without Unicode support, e.g., `(?-u)\w{3}`, takes under 1 millisecond and less than 5KB of memory. For this reason, you should only use Unicode character classes if you absolutely need them! * This crate does not support regex sets. * This crate does not support zero-width assertions such as `^`, `$`, `\b` or `\B`. * As a lower level crate, this library does not do literal optimizations. In exchange, you get predictable performance regardless of input. The philosophy here is that literal optimizations should be applied at a higher level, although there is no easy support for this in the ecosystem yet. * There is no `&str` API like in the regex crate. In this crate, all APIs operate on `&[u8]`. By default, match indices are guaranteed to fall on UTF-8 boundaries, unless [`RegexBuilder::allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) is enabled.
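To make the final point above concrete, here is a short sketch of the byte oriented API (it only uses `Regex::new` and `Regex::find`, which are shown elsewhere in this documentation):

```
use regex_automata::Regex;

# fn example() -> Result<(), regex_automata::Error> {
let re = Regex::new("[a-z]+")?;
// All searches operate on `&[u8]` directly; no `&str` is ever required.
let haystack: &[u8] = b"123abc456";
assert_eq!(Some((3, 6)), re.find(haystack));
# Ok(()) }; example().unwrap()
```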
With some of the downsides out of the way, here are some positive differences: * Both dense and sparse DFAs can be serialized to raw bytes, and then cheaply deserialized. Deserialization always takes constant time since searching can be performed directly on the raw serialized bytes of a DFA. * This crate was specifically designed so that the searching phase of a DFA has minimal runtime requirements, and can therefore be used in `no_std` environments. While `no_std` environments cannot compile regexes, they can deserialize pre-compiled regexes. * Since this crate builds DFAs ahead of time, it will generally out-perform the `regex` crate on equivalent tasks. The performance difference is likely not large. However, because of a complex set of optimizations in the regex crate (like literal optimizations), an accurate performance comparison may be difficult to do. * Sparse DFAs provide a way to build a DFA ahead of time that sacrifices search performance a small amount, but uses much less storage space. Potentially even less than what the regex crate uses. * This crate exposes DFAs directly, such as [`DenseDFA`](enum.DenseDFA.html) and [`SparseDFA`](enum.SparseDFA.html), which enables one to do less work in some cases. For example, if you only need the end of a match and not the start of a match, then you can use a DFA directly without building a `Regex`, which always requires a second DFA to find the start of a match. * Aside from choosing between dense and sparse DFAs, there are several options for configuring the space usage vs search time trade off. These include things like choosing a smaller state identifier representation, to premultiplying state identifiers and splitting a DFA's alphabet into equivalence classes. Finally, DFA minimization is also provided, but can increase compilation times dramatically. */ #![deny(missing_docs)] #![cfg_attr(not(feature = "std"), no_std)] #[cfg(feature = "std")] extern crate core; extern crate byteorder; #[cfg(feature = "std")] extern crate regex_syntax; #[cfg(feature = "std")] extern crate utf8_ranges; pub use dense::DenseDFA; pub use dfa::DFA; #[cfg(feature = "std")] pub use error::{Error, ErrorKind}; pub use regex::Regex; #[cfg(feature = "std")] pub use regex::RegexBuilder; pub use sparse::SparseDFA; pub use state_id::StateID; mod classes; #[cfg(feature = "std")] mod determinize; #[path = "dense.rs"] mod dense_imp; mod dfa; #[cfg(feature = "std")] mod error; mod regex; #[cfg(feature = "std")] mod minimize; #[cfg(feature = "std")] mod nfa; #[path = "sparse.rs"] mod sparse_imp; #[cfg(feature = "std")] mod sparse_set; mod state_id; /// Types and routines specific to dense DFAs. /// /// This module is the home of [`DenseDFA`](enum.DenseDFA.html) and each of its /// corresponding variant DFA types, such as [`Standard`](struct.Standard.html) /// and [`ByteClass`](struct.ByteClass.html). /// /// This module also contains a [builder](struct.Builder.html) for /// configuring the construction of a dense DFA. pub mod dense { pub use dense_imp::*; } /// Types and routines specific to sparse DFAs. /// /// This module is the home of [`SparseDFA`](enum.SparseDFA.html) and each of /// its corresponding variant DFA types, such as /// [`Standard`](struct.Standard.html) and /// [`ByteClass`](struct.ByteClass.html). /// /// Unlike the [`dense`](../dense/index.html) module, this module does not /// contain a builder specific for sparse DFAs. 
Instead, the intended way to /// build a sparse DFA is either by using a default configuration with its /// [constructor](enum.SparseDFA.html#method.new), /// or by first /// [configuring the construction of a dense DFA](../dense/struct.Builder.html) /// and then calling /// [`DenseDFA::to_sparse`](../enum.DenseDFA.html#method.to_sparse). pub mod sparse { pub use sparse_imp::*; } regex-automata-0.1.8/src/minimize.rs010064400017500000144000000324501341442421200156370ustar0000000000000000use std::cell::RefCell; use std::fmt; use std::mem; use std::rc::Rc; use dense; use state_id::{StateID, dead_id}; type DFARepr<S> = dense::Repr<Vec<S>, S>; /// An implementation of Hopcroft's algorithm for minimizing DFAs. /// /// The algorithm implemented here is mostly taken from Wikipedia: /// https://en.wikipedia.org/wiki/DFA_minimization#Hopcroft's_algorithm /// /// This code has had some light optimization attention paid to it, /// particularly in the form of reducing allocation as much as possible. /// However, it is still generally slow. Future optimization work should /// probably focus on the bigger picture rather than micro-optimizations. For /// example: /// /// 1. Figure out how to more intelligently create initial partitions. That is, /// Hopcroft's algorithm starts by creating two partitions of DFA states /// that are known to NOT be equivalent: match states and non-match states. /// The algorithm proceeds by progressively refining these partitions into /// smaller partitions. If we could start with more partitions, then we /// could reduce the amount of work that Hopcroft's algorithm needs to do. /// 2. For every partition that we visit, we find all incoming transitions to /// every state in the partition for *every* element in the alphabet. (This /// is why using byte classes can significantly decrease minimization times, /// since byte classes shrink the alphabet.) This is quite costly and there /// is perhaps some redundant work being performed depending on the specific /// states in the set. For example, we might be able to only visit some /// elements of the alphabet based on the transitions. /// 3. Move parts of minimization into determinization. If minimization has /// fewer states to deal with, then it should run faster. A prime example /// of this might be large Unicode classes, which are generated in a way /// that can create a lot of redundant states. pub(crate) struct Minimizer<'a, S: 'a> { dfa: &'a mut DFARepr<S>, in_transitions: Vec<Vec<Vec<S>>>, partitions: Vec<StateSet<S>>, waiting: Vec<StateSet<S>>, } impl<'a, S: StateID> fmt::Debug for Minimizer<'a, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { f.debug_struct("Minimizer") .field("dfa", &self.dfa) .field("in_transitions", &self.in_transitions) .field("partitions", &self.partitions) .field("waiting", &self.waiting) .finish() } } /// A set of states. A state set makes up a single partition in Hopcroft's /// algorithm. /// /// It is represented by an ordered set of state identifiers. We use shared /// ownership so that a single state set can be in both the set of partitions /// and in the set of waiting sets simultaneously without an additional /// allocation. Generally, once a state set is built, it becomes immutable. /// /// We use this representation because it avoids the overhead of more /// traditional set data structures (HashSet/BTreeSet), and also because /// computing intersection/subtraction on this representation is especially /// fast.
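///
/// For example (an illustrative sketch, not code from this module): because
/// the identifiers are kept sorted, intersecting the sets `{1, 3, 5}` and
/// `{3, 5, 8}` is a single linear merge pass over both sequences that yields
/// `{3, 5}`, and subtraction is the same merge with the emit condition
/// inverted, yielding `{1}`.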
#[derive(Clone, Debug, Eq, PartialEq, PartialOrd, Ord)] struct StateSet(Rc>>); impl<'a, S: StateID> Minimizer<'a, S> { pub fn new(dfa: &'a mut DFARepr) -> Minimizer<'a, S> { let in_transitions = Minimizer::incoming_transitions(dfa); let partitions = Minimizer::initial_partitions(dfa); let waiting = vec![partitions[0].clone()]; Minimizer { dfa, in_transitions, partitions, waiting } } pub fn run(mut self) { let mut incoming = StateSet::empty(); let mut scratch1 = StateSet::empty(); let mut scratch2 = StateSet::empty(); let mut newparts = vec![]; while let Some(set) = self.waiting.pop() { for b in (0..self.dfa.alphabet_len()).map(|b| b as u8) { self.find_incoming_to(b, &set, &mut incoming); for p in 0..self.partitions.len() { self.partitions[p].intersection(&incoming, &mut scratch1); if scratch1.is_empty() { newparts.push(self.partitions[p].clone()); continue; } self.partitions[p].subtract(&incoming, &mut scratch2); if scratch2.is_empty() { newparts.push(self.partitions[p].clone()); continue; } let (x, y) = (scratch1.deep_clone(), scratch2.deep_clone()); newparts.push(x.clone()); newparts.push(y.clone()); match self.find_waiting(&self.partitions[p]) { Some(i) => { self.waiting[i] = x; self.waiting.push(y); } None => { if x.len() <= y.len() { self.waiting.push(x); } else { self.waiting.push(y); } } } } newparts = mem::replace(&mut self.partitions, newparts); newparts.clear(); } } // At this point, we now have a minimal partitioning of states, where // each partition is an equivalence class of DFA states. Now we need to // use this partioning to update the DFA to only contain one state for // each partition. // Create a map from DFA state ID to the representative ID of the // equivalence class to which it belongs. The representative ID of an // equivalence class of states is the minimum ID in that class. let mut state_to_part = vec![dead_id(); self.dfa.state_count()]; for p in &self.partitions { p.iter(|id| state_to_part[id.to_usize()] = p.min()); } // Generate a new contiguous sequence of IDs for minimal states, and // create a map from equivalence IDs to the new IDs. Thus, the new // minimal ID of *any* state in the unminimized DFA can be obtained // with minimals_ids[state_to_part[old_id]]. let mut minimal_ids = vec![dead_id(); self.dfa.state_count()]; let mut new_id = S::from_usize(0); for (id, _) in self.dfa.states() { if state_to_part[id.to_usize()] == id { minimal_ids[id.to_usize()] = new_id; new_id = S::from_usize(new_id.to_usize() + 1); } } // The total number of states in the minimal DFA. let minimal_count = new_id.to_usize(); // Re-map this DFA in place such that the only states remaining // correspond to the representative states of every equivalence class. for id in (0..self.dfa.state_count()).map(S::from_usize) { // If this state isn't a representative for an equivalence class, // then we skip it since it won't appear in the minimal DFA. if state_to_part[id.to_usize()] != id { continue; } for (_, next) in self.dfa.get_state_mut(id).iter_mut() { *next = minimal_ids[state_to_part[next.to_usize()].to_usize()]; } self.dfa.swap_states(id, minimal_ids[id.to_usize()]); } // Trim off all unused states from the pre-minimized DFA. This // represents all states that were merged into a non-singleton // equivalence class of states, and appeared after the first state // in each such class. (Because the state with the smallest ID in each // equivalence class is its representative ID.) 
self.dfa.truncate_states(minimal_count); // Update the new start state, which is now just the minimal ID of // whatever state the old start state was collapsed into. let old_start = self.dfa.start_state(); self.dfa.set_start_state( minimal_ids[state_to_part[old_start.to_usize()].to_usize()], ); // In order to update the ID of the maximum match state, we need to // find the maximum ID among all of the match states in the minimized // DFA. This is not necessarily the new ID of the unminimized maximum // match state, since that could have been collapsed with a much // earlier match state. Therefore, to find the new max match state, // we iterate over all previous match states, find their corresponding // new minimal ID, and take the maximum of those. let old_max = self.dfa.max_match_state(); self.dfa.set_max_match_state(dead_id()); for id in (0..(old_max.to_usize() + 1)).map(S::from_usize) { let part = state_to_part[id.to_usize()]; let new_id = minimal_ids[part.to_usize()]; if new_id > self.dfa.max_match_state() { self.dfa.set_max_match_state(new_id); } } } fn find_waiting(&self, set: &StateSet) -> Option { self.waiting.iter().position(|s| s == set) } fn find_incoming_to( &self, b: u8, set: &StateSet, incoming: &mut StateSet, ) { incoming.clear(); set.iter(|id| { for &inid in &self.in_transitions[id.to_usize()][b as usize] { incoming.add(inid); } }); incoming.canonicalize(); } fn initial_partitions(dfa: &DFARepr) -> Vec> { let mut is_match = StateSet::empty(); let mut no_match = StateSet::empty(); for (id, _) in dfa.states() { if dfa.is_match_state(id) { is_match.add(id); } else { no_match.add(id); } } let mut sets = vec![is_match]; if !no_match.is_empty() { sets.push(no_match); } sets.sort_by_key(|s| s.len()); sets } fn incoming_transitions(dfa: &DFARepr) -> Vec>> { let mut incoming = vec![]; for _ in dfa.states() { incoming.push(vec![vec![]; dfa.alphabet_len()]); } for (id, state) in dfa.states() { for (b, next) in state.transitions() { incoming[next.to_usize()][b as usize].push(id); } } incoming } } impl StateSet { fn empty() -> StateSet { StateSet(Rc::new(RefCell::new(vec![]))) } fn add(&mut self, id: S) { self.0.borrow_mut().push(id); } fn min(&self) -> S { self.0.borrow()[0] } fn canonicalize(&mut self) { self.0.borrow_mut().sort(); self.0.borrow_mut().dedup(); } fn clear(&mut self) { self.0.borrow_mut().clear(); } fn len(&self) -> usize { self.0.borrow().len() } fn is_empty(&self) -> bool { self.len() == 0 } fn deep_clone(&self) -> StateSet { let ids = self.0.borrow().iter().cloned().collect(); StateSet(Rc::new(RefCell::new(ids))) } fn iter(&self, mut f: F) { for &id in self.0.borrow().iter() { f(id); } } fn intersection(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { return; } let (seta, setb) = (self.0.borrow(), other.0.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => break, Some(b) => b, }; } else if a < b { a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => break, Some(b) => b, }; } } } fn subtract(&self, other: &StateSet, dest: &mut StateSet) { dest.clear(); if self.is_empty() || other.is_empty() { self.iter(|s| dest.add(s)); return; } let (seta, setb) = (self.0.borrow(), other.0.borrow()); let (mut ita, mut itb) = (seta.iter().cloned(), setb.iter().cloned()); let (mut a, 
mut b) = (ita.next().unwrap(), itb.next().unwrap()); loop { if a == b { a = match ita.next() { None => break, Some(a) => a, }; b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } else if a < b { dest.add(a); a = match ita.next() { None => break, Some(a) => a, }; } else { b = match itb.next() { None => { dest.add(a); break; } Some(b) => b, }; } } for a in ita { dest.add(a); } } } regex-automata-0.1.8/src/nfa.rs010064400017500000144000001033031347717653200146000ustar0000000000000000use std::cell::RefCell; use std::fmt; use std::iter; use regex_syntax::hir::{self, Hir, HirKind}; use classes::ByteClasses; use error::{Error, Result}; /// The representation for an NFA state identifier. pub type StateID = usize; /// A final compiled NFA. /// /// The states of the NFA are indexed by state IDs, which are how transitions /// are expressed. #[derive(Clone)] pub struct NFA { /// Whether this NFA can only match at the beginning of input or not. /// /// When true, a match should only be reported if it begins at the 0th /// index of the haystack. anchored: bool, /// The starting state of this NFA. start: StateID, /// The state list. This list is guaranteed to be indexable by the starting /// state ID, and it is also guaranteed to contain exactly one `Match` /// state. states: Vec, /// A mapping from any byte value to its corresponding equivalence class /// identifier. Two bytes in the same equivalence class cannot discriminate /// between a match or a non-match. This map can be used to shrink the /// total size of a DFA's transition table with a small match-time cost. /// /// Note that the NFA's transitions are *not* defined in terms of these /// equivalence classes. The NFA's transitions are defined on the original /// byte values. byte_classes: ByteClasses, } /// A state in a final compiled NFA. #[derive(Clone, Debug, Eq, PartialEq)] pub enum State { /// A state that transitions to `next` if and only if the current input /// byte is in the range `[start, end]` (inclusive). Range { start: u8, end: u8, next: StateID }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. Union { alternates: Vec }, /// A match state. There is exactly one such occurrence of this state in /// an NFA. Match, } impl NFA { /// Returns true if and only if this NFA is anchored. pub fn is_anchored(&self) -> bool { self.anchored } /// Return the number of states in this NFA. pub fn len(&self) -> usize { self.states.len() } /// Return the ID of the initial state of this NFA. pub fn start(&self) -> StateID { self.start } /// Return the NFA state corresponding to the given ID. pub fn state(&self, id: StateID) -> &State { &self.states[id] } /// Return the set of equivalence classes for this NFA. The slice returned /// always has length 256 and maps each possible byte value to its /// corresponding equivalence class ID (which is never more than 255). pub fn byte_classes(&self) -> &ByteClasses { &self.byte_classes } } impl State { /// Returns true if and only if this state contains one or more epsilon /// transitions. pub fn is_epsilon(&self) -> bool { match *self { State::Range { .. } | State::Match => false, State::Union { .. } => true, } } /// Remap the transitions in this state using the given map. Namely, the /// given map should be indexed according to the transitions currently /// in this state. 
/// /// This is used during the final phase of the NFA compiler, which turns /// its intermediate NFA into the final NFA. fn remap(&mut self, remap: &[StateID]) { match *self { State::Range { ref mut next, .. } => *next = remap[*next], State::Union { ref mut alternates } => { for alt in alternates { *alt = remap[*alt]; } } State::Match => {} } } } /// A builder for compiling an NFA. #[derive(Clone, Debug)] pub struct NFABuilder { anchored: bool, allow_invalid_utf8: bool, reverse: bool, } impl NFABuilder { /// Create a new NFA builder with its default configuration. pub fn new() -> NFABuilder { NFABuilder { anchored: false, allow_invalid_utf8: false, reverse: false, } } /// Compile the given high level intermediate representation of a regular /// expression into an NFA. /// /// If there was a problem building the NFA, then an error is returned. /// For example, if the regex uses unsupported features (such as zero-width /// assertions), then an error is returned. pub fn build(&self, mut expr: Hir) -> Result { if self.reverse { expr = reverse_hir(expr); } let compiler = NFACompiler { states: RefCell::new(vec![]), reverse: self.reverse, }; let mut start = compiler.add_empty(); if !self.anchored { let compiled = if self.allow_invalid_utf8 { compiler.compile_unanchored_prefix_invalid_utf8() } else { compiler.compile_unanchored_prefix_valid_utf8() }?; compiler.patch(start, compiled.start); start = compiled.end; } let compiled = compiler.compile(&expr)?; let match_id = compiler.add_match(); compiler.patch(start, compiled.start); compiler.patch(compiled.end, match_id); Ok(NFA { anchored: self.anchored, ..compiler.to_nfa() }) } /// Set whether matching must be anchored at the beginning of the input. /// /// When enabled, a match must begin at the start of the input. When /// disabled, the NFA will act as if the pattern started with a `.*?`, /// which enables a match to appear anywhere. /// /// By default this is disabled. pub fn anchored(&mut self, yes: bool) -> &mut NFABuilder { self.anchored = yes; self } /// When enabled, the builder will permit the construction of an NFA that /// may match invalid UTF-8. /// /// When disabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut NFABuilder { self.allow_invalid_utf8 = yes; self } /// Reverse the NFA. /// /// A NFA reversal is performed by reversing all of the concatenated /// sub-expressions in the original pattern, recursively. The resulting /// NFA can be used to match the pattern starting from the end of a string /// instead of the beginning of a string. /// /// Reversing the NFA is useful for building a reverse DFA, which is most /// useful for finding the start of a match. pub fn reverse(&mut self, yes: bool) -> &mut NFABuilder { self.reverse = yes; self } } /// A compiler that converts a regex AST (well, a high-level IR) to an NFA via /// Thompson's construction. Namely, we permit epsilon transitions. /// /// The compiler deals with a slightly expanded set of NFA states that notably /// includes an empty node that has exactly one epsilon transition to the /// next state. In other words, it's a "goto" instruction if one views /// Thompson's NFA as a set of bytecode instructions. These goto instructions /// are removed in a subsequent phase before returning the NFA to the caller. 
/// The purpose of these empty nodes is that they make the construction /// algorithm substantially simpler to implement. #[derive(Debug)] struct NFACompiler { /// The set of compiled NFA states. Once a state is compiled, it is /// assigned a state ID equivalent to its index in this list. Subsequent /// compilation can modify previous states by adding new transitions. /// /// We use a RefCell here because the borrow checker otherwise makes /// logical decomposition into methods much harder otherwise. states: RefCell>, /// When true, we are compiling an HIR in reverse. Note that we actually /// reverse the HIR before handing it to this compiler, but the compiler /// does need to know to reverse UTF-8 automata since the HIR is expressed /// in terms of Unicode codepoints. reverse: bool, } /// A "builder" intermediate state representation for an NFA that is only used /// during compilation. Once compilation is done, `BState`s are converted to /// `State`s, which have a much simpler representation. #[derive(Clone, Debug, Eq, PartialEq)] enum BState { /// An empty state whose only purpose is to forward the automaton to /// another state via en epsilon transition. These are useful during /// compilation but are otherwise removed at the end. Empty { next: StateID }, /// A state that only transitions to `next` if the current input byte is /// in the range `[start, end]` (inclusive on both ends). Range { start: u8, end: u8, next: StateID }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via earlier transitions /// are preferred over later transitions. Union { alternates: Vec }, /// An alternation such that there exists an epsilon transition to all /// states in `alternates`, where matches found via later transitions /// are preferred over earlier transitions. /// /// This "reverse" state exists for convenience during compilation that /// permits easy construction of non-greedy combinations of NFA states. /// At the end of compilation, Union and UnionReverse states are merged /// into one Union type of state, where the latter has its epsilon /// transitions reversed to reflect the priority inversion. UnionReverse { alternates: Vec }, /// A match state. There is exactly one such occurrence of this state in /// an NFA. Match, } /// A value that represents the result of compiling a sub-expression of a /// regex's HIR. Specifically, this represents a sub-graph of the NFA that /// has an initial state at `start` and a final state at `end`. #[derive(Clone, Copy, Debug)] struct ThompsonRef { start: StateID, end: StateID, } impl NFACompiler { /// Convert the current intermediate NFA to its final compiled form. fn to_nfa(&self) -> NFA { let bstates = self.states.borrow(); let mut states = vec![]; let mut remap = vec![0; bstates.len()]; let mut empties = vec![]; let mut byteset = ByteClassSet::new(); // The idea here is to convert our intermediate states to their final // form. The only real complexity here is the process of converting // transitions, which are expressed in terms of state IDs. The new // set of states will be smaller because of partial epsilon removal, // so the state IDs will not be the same. for (id, bstate) in bstates.iter().enumerate() { match *bstate { BState::Empty { next } => { // Since we're removing empty states, we need to handle // them later since we don't yet know which new state this // empty state will be mapped to. 
empties.push((id, next)); } BState::Range { start, end, next } => { remap[id] = states.len(); states.push(State::Range { start, end, next }); byteset.set_range(start, end); } BState::Union { ref alternates } => { remap[id] = states.len(); let alternates = alternates.clone(); states.push(State::Union { alternates }); } BState::UnionReverse { ref alternates } => { remap[id] = states.len(); let mut alternates = alternates.clone(); alternates.reverse(); states.push(State::Union { alternates }); } BState::Match => { remap[id] = states.len(); states.push(State::Match); } } } for (empty_id, mut empty_next) in empties { // empty states can point to other empty states, forming a chain. // So we must follow the chain until the end, which must point to // a non-empty state, and therefore, a state that is correctly // remapped. while let BState::Empty { next } = bstates[empty_next] { empty_next = next; } remap[empty_id] = remap[empty_next]; } for state in &mut states { state.remap(&remap); } // The compiler always begins the NFA at the first state. let byte_classes = byteset.byte_classes(); NFA { anchored: false, start: remap[0], states, byte_classes } } fn compile(&self, expr: &Hir) -> Result { match *expr.kind() { HirKind::Empty => { let id = self.add_empty(); Ok(ThompsonRef { start: id, end: id }) } HirKind::Literal(hir::Literal::Unicode(ch)) => { let mut buf = [0; 4]; let it = ch .encode_utf8(&mut buf) .as_bytes() .iter() .map(|&b| Ok(self.compile_range(b, b))); self.compile_concat(it) } HirKind::Literal(hir::Literal::Byte(b)) => { Ok(self.compile_range(b, b)) } HirKind::Class(hir::Class::Bytes(ref cls)) => { let it = cls .iter() .map(|rng| Ok(self.compile_range(rng.start(), rng.end()))); self.compile_alternation(it) } HirKind::Class(hir::Class::Unicode(ref cls)) => { self.compile_unicode_class(cls) } HirKind::Repetition(ref rep) => { self.compile_repetition(rep) } HirKind::Group(ref group) => { self.compile(&*group.hir) } HirKind::Concat(ref exprs) => { self.compile_concat(exprs.iter().map(|e| self.compile(e))) } HirKind::Alternation(ref exprs) => { self.compile_alternation(exprs.iter().map(|e| self.compile(e))) } HirKind::Anchor(_) => { Err(Error::unsupported_anchor()) } HirKind::WordBoundary(_) => { Err(Error::unsupported_word()) } } } fn compile_concat( &self, mut it: I, ) -> Result where I: Iterator> { let ThompsonRef { start, mut end } = match it.next() { Some(result) => result?, None => return Ok(self.compile_empty()), }; for result in it { let compiled = result?; self.patch(end, compiled.start); end = compiled.end; } Ok(ThompsonRef { start, end }) } fn compile_alternation( &self, it: I, ) -> Result where I: Iterator> { let alternates = it.collect::>>()?; assert!(!alternates.is_empty(), "alternations must be non-empty"); if alternates.len() == 1 { return Ok(alternates[0]); } let union = self.add_union(); let empty = self.add_empty(); for compiled in alternates { self.patch(union, compiled.start); self.patch(compiled.end, empty); } Ok(ThompsonRef { start: union, end: empty }) } fn compile_repetition( &self, rep: &hir::Repetition, ) -> Result { match rep.kind { hir::RepetitionKind::ZeroOrOne => { self.compile_zero_or_one(&rep.hir, rep.greedy) } hir::RepetitionKind::ZeroOrMore => { self.compile_at_least(&rep.hir, rep.greedy, 0) } hir::RepetitionKind::OneOrMore => { self.compile_at_least(&rep.hir, rep.greedy, 1) } hir::RepetitionKind::Range(ref rng) => { match *rng { hir::RepetitionRange::Exactly(count) => { self.compile_exactly(&rep.hir, count) } hir::RepetitionRange::AtLeast(m) => { 
self.compile_at_least(&rep.hir, rep.greedy, m) } hir::RepetitionRange::Bounded(min, max) => { self.compile_bounded(&rep.hir, rep.greedy, min, max) } } } } } fn compile_bounded( &self, expr: &Hir, greedy: bool, min: u32, max: u32, ) -> Result { let prefix = self.compile_exactly(expr, min)?; if min == max { return Ok(prefix); } let suffix = self.compile_concat( (min..max).map(|_| self.compile_zero_or_one(expr, greedy)) )?; self.patch(prefix.end, suffix.start); Ok(ThompsonRef { start: prefix.start, end: suffix.end, }) } fn compile_at_least( &self, expr: &Hir, greedy: bool, n: u32, ) -> Result { if n == 0 { let union = if greedy { self.add_union() } else { self.add_reverse_union() }; let compiled = self.compile(expr)?; self.patch(union, compiled.start); self.patch(compiled.end, union); Ok(ThompsonRef { start: union, end: union }) } else if n == 1 { let compiled = self.compile(expr)?; let union = if greedy { self.add_union() } else { self.add_reverse_union() }; self.patch(compiled.end, union); self.patch(union, compiled.start); Ok(ThompsonRef { start: compiled.start, end: union }) } else { let prefix = self.compile_exactly(expr, n - 1)?; let last = self.compile(expr)?; let union = if greedy { self.add_union() } else { self.add_reverse_union() }; self.patch(prefix.end, last.start); self.patch(last.end, union); self.patch(union, last.start); Ok(ThompsonRef { start: prefix.start, end: union }) } } fn compile_zero_or_one( &self, expr: &Hir, greedy: bool, ) -> Result { let union = if greedy { self.add_union() } else { self.add_reverse_union() }; let compiled = self.compile(expr)?; let empty = self.add_empty(); self.patch(union, compiled.start); self.patch(union, empty); self.patch(compiled.end, empty); Ok(ThompsonRef { start: union, end: empty }) } fn compile_exactly(&self, expr: &Hir, n: u32) -> Result { let it = iter::repeat(()) .take(n as usize) .map(|_| self.compile(expr)); self.compile_concat(it) } fn compile_unicode_class( &self, cls: &hir::ClassUnicode, ) -> Result { use utf8_ranges::Utf8Sequences; let it = cls .iter() .flat_map(|rng| Utf8Sequences::new(rng.start(), rng.end())) .map(|seq| { if self.reverse { self.compile_concat( seq.as_slice() .iter() .rev() .map(|rng| { Ok(self.compile_range(rng.start, rng.end)) }) ) } else { self.compile_concat( seq.as_slice() .iter() .map(|rng| { Ok(self.compile_range(rng.start, rng.end)) }) ) } }); self.compile_alternation(it) } fn compile_range(&self, start: u8, end: u8) -> ThompsonRef { let id = self.add_range(start, end); ThompsonRef { start: id, end: id } } fn compile_empty(&self) -> ThompsonRef { let id = self.add_empty(); ThompsonRef { start: id, end: id } } fn compile_unanchored_prefix_valid_utf8(&self) -> Result { self.compile(&Hir::repetition(hir::Repetition { kind: hir::RepetitionKind::ZeroOrMore, greedy: false, hir: Box::new(Hir::any(false)), })) } fn compile_unanchored_prefix_invalid_utf8(&self) -> Result { self.compile(&Hir::repetition(hir::Repetition { kind: hir::RepetitionKind::ZeroOrMore, greedy: false, hir: Box::new(Hir::any(true)), })) } fn patch(&self, from: StateID, to: StateID) { match self.states.borrow_mut()[from] { BState::Empty { ref mut next } => { *next = to; } BState::Range { ref mut next, .. 
} => { *next = to; } BState::Union { ref mut alternates } => { alternates.push(to); } BState::UnionReverse { ref mut alternates } => { alternates.push(to); } BState::Match => {} } } fn add_empty(&self) -> StateID { let id = self.states.borrow().len(); self.states.borrow_mut().push(BState::Empty { next: 0 }); id } fn add_range(&self, start: u8, end: u8) -> StateID { let id = self.states.borrow().len(); let state = BState::Range { start, end, next: 0 }; self.states.borrow_mut().push(state); id } fn add_union(&self) -> StateID { let id = self.states.borrow().len(); let state = BState::Union { alternates: vec![] }; self.states.borrow_mut().push(state); id } fn add_reverse_union(&self) -> StateID { let id = self.states.borrow().len(); let state = BState::UnionReverse { alternates: vec![] }; self.states.borrow_mut().push(state); id } fn add_match(&self) -> StateID { let id = self.states.borrow().len(); self.states.borrow_mut().push(BState::Match); id } } /// A byte class set keeps track of an *approximation* of equivalence classes /// of bytes during NFA construction. That is, every byte in an equivalence /// class cannot discriminate between a match and a non-match. /// /// For example, in the regex `[ab]+`, the bytes `a` and `b` would be in the /// same equivalence class because it never matters whether an `a` or a `b` is /// seen, and no combination of `a`s and `b`s in the text can discriminate /// a match. /// /// Note though that this does not compute the minimal set of equivalence /// classes. For example, in the regex `[ac]+`, both `a` and `c` are in the /// same equivalence class for the same reason that `a` and `b` are in the /// same equivalence class in the aforementioned regex. However, in this /// implementation, `a` and `c` are put into distinct equivalence classes. /// The reason for this is implementation complexity. In the future, we should /// endeavor to compute the minimal equivalence classes since they can have a /// rather large impact on the size of the DFA. /// /// The representation here is 256 booleans, all initially set to false. Each /// boolean maps to its corresponding byte based on position. A `true` value /// indicates the end of an equivalence class, where its corresponding byte /// and all of the bytes corresponding to all previous contiguous `false` /// values are in the same equivalence class. /// /// This particular representation only permits contiguous ranges of bytes to /// be in the same equivalence class, which means that we can never discover /// the true minimal set of equivalence classes. #[derive(Debug)] struct ByteClassSet(Vec); impl ByteClassSet { /// Create a new set of byte classes where all bytes are part of the same /// equivalence class. fn new() -> Self { ByteClassSet(vec![false; 256]) } /// Indicate the the range of byte given (inclusive) can discriminate a /// match between it and all other bytes outside of the range. fn set_range(&mut self, start: u8, end: u8) { debug_assert!(start <= end); if start > 0 { self.0[start as usize - 1] = true; } self.0[end as usize] = true; } /// Convert this boolean set to a map that maps all byte values to their /// corresponding equivalence class. The last mapping indicates the largest /// equivalence class identifier (which is never bigger than 255). 
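///
/// For example (a sketch that mirrors the `byte_classes` test at the bottom
/// of this file): if the only range set is `set_range(b'a', b'z')`, then
/// bytes `0x00` through `b'a' - 1` map to class `0`, bytes `a` through `z`
/// map to class `1`, and bytes `b'z' + 1` through `0xFF` map to class `2`.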
fn byte_classes(&self) -> ByteClasses { let mut classes = ByteClasses::empty(); let mut class = 0u8; let mut i = 0; loop { classes.set(i as u8, class as u8); if i >= 255 { break; } if self.0[i] { class = class.checked_add(1).unwrap(); } i += 1; } classes } } impl fmt::Debug for NFA { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { for (i, state) in self.states.iter().enumerate() { let status = if i == self.start { '>' } else { ' ' }; writeln!(f, "{}{:06X}: {:?}", status, i, state)?; } Ok(()) } } /// Reverse the given HIR expression. fn reverse_hir(expr: Hir) -> Hir { match expr.into_kind() { HirKind::Empty => Hir::empty(), HirKind::Literal(hir::Literal::Byte(b)) => { Hir::literal(hir::Literal::Byte(b)) } HirKind::Literal(hir::Literal::Unicode(c)) => { Hir::concat( c.encode_utf8(&mut [0; 4]) .as_bytes() .iter() .cloned() .rev() .map(|b| { if b <= 0x7F { hir::Literal::Unicode(b as char) } else { hir::Literal::Byte(b) } }) .map(Hir::literal) .collect() ) } HirKind::Class(cls) => Hir::class(cls), HirKind::Anchor(anchor) => Hir::anchor(anchor), HirKind::WordBoundary(anchor) => Hir::word_boundary(anchor), HirKind::Repetition(mut rep) => { rep.hir = Box::new(reverse_hir(*rep.hir)); Hir::repetition(rep) } HirKind::Group(mut group) => { group.hir = Box::new(reverse_hir(*group.hir)); Hir::group(group) } HirKind::Concat(exprs) => { let mut reversed = vec![]; for e in exprs { reversed.push(reverse_hir(e)); } reversed.reverse(); Hir::concat(reversed) } HirKind::Alternation(exprs) => { let mut reversed = vec![]; for e in exprs { reversed.push(reverse_hir(e)); } Hir::alternation(reversed) } } } #[cfg(test)] mod tests { use regex_syntax::ParserBuilder; use regex_syntax::hir::Hir; use super::{ByteClassSet, NFA, NFABuilder, State, StateID}; fn parse(pattern: &str) -> Hir { ParserBuilder::new().build().parse(pattern).unwrap() } fn build(pattern: &str) -> NFA { NFABuilder::new().anchored(true).build(parse(pattern)).unwrap() } fn s_byte(byte: u8, next: StateID) -> State { State::Range { start: byte, end: byte, next } } fn s_range(start: u8, end: u8, next: StateID) -> State { State::Range { start, end, next } } fn s_union(alts: &[StateID]) -> State { State::Union { alternates: alts.to_vec() } } fn s_match() -> State { State::Match } #[test] fn errors() { // unsupported anchors assert!(NFABuilder::new().build(parse(r"^")).is_err()); assert!(NFABuilder::new().build(parse(r"$")).is_err()); assert!(NFABuilder::new().build(parse(r"\A")).is_err()); assert!(NFABuilder::new().build(parse(r"\z")).is_err()); // unsupported word boundaries assert!(NFABuilder::new().build(parse(r"\b")).is_err()); assert!(NFABuilder::new().build(parse(r"\B")).is_err()); assert!(NFABuilder::new().build(parse(r"(?-u)\b")).is_err()); } // Test that building an unanchored NFA has an appropriate `.*?` prefix. #[test] fn compile_unanchored_prefix() { // When the machine can only match valid UTF-8. let nfa = NFABuilder::new() .anchored(false) .build(parse(r"a")) .unwrap(); // There should be many states since the `.` in `.*?` matches any // Unicode scalar value. assert_eq!(31, nfa.len()); assert_eq!(nfa.states[30], s_match()); assert_eq!(nfa.states[29], s_byte(b'a', 30)); // When the machine can match invalid UTF-8. 
let nfa = NFABuilder::new() .anchored(false) .allow_invalid_utf8(true) .build(parse(r"a")) .unwrap(); assert_eq!(nfa.states, &[ s_union(&[2, 1]), s_range(0, 255, 0), s_byte(b'a', 3), s_match(), ]); } #[test] fn compile_empty() { assert_eq!(build("").states, &[ s_match(), ]); } #[test] fn compile_literal() { assert_eq!(build("a").states, &[ s_byte(b'a', 1), s_match(), ]); assert_eq!(build("ab").states, &[ s_byte(b'a', 1), s_byte(b'b', 2), s_match(), ]); assert_eq!(build("☃").states, &[ s_byte(0xE2, 1), s_byte(0x98, 2), s_byte(0x83, 3), s_match(), ]); // Check that non-UTF-8 literals work. let hir = ParserBuilder::new() .allow_invalid_utf8(true) .build() .parse(r"(?-u)\xFF") .unwrap(); let nfa = NFABuilder::new() .anchored(true) .allow_invalid_utf8(true) .build(hir) .unwrap(); assert_eq!(nfa.states, &[ s_byte(b'\xFF', 1), s_match(), ]); } #[test] fn compile_class() { assert_eq!(build(r"[a-z]").states, &[ s_range(b'a', b'z', 1), s_match(), ]); assert_eq!(build(r"[x-za-c]").states, &[ s_range(b'a', b'c', 3), s_range(b'x', b'z', 3), s_union(&[0, 1]), s_match(), ]); assert_eq!(build(r"[\u03B1-\u03B4]").states, &[ s_byte(0xCE, 1), s_range(0xB1, 0xB4, 2), s_match(), ]); assert_eq!(build(r"[\u03B1-\u03B4\u{1F919}-\u{1F91E}]").states, &[ s_byte(0xCE, 1), s_range(0xB1, 0xB4, 7), s_byte(0xF0, 3), s_byte(0x9F, 4), s_byte(0xA4, 5), s_range(0x99, 0x9E, 7), s_union(&[0, 2]), s_match(), ]); } #[test] fn compile_repetition() { assert_eq!(build(r"a?").states, &[ s_union(&[1, 2]), s_byte(b'a', 2), s_match(), ]); assert_eq!(build(r"a??").states, &[ s_union(&[2, 1]), s_byte(b'a', 2), s_match(), ]); } #[test] fn compile_group() { assert_eq!(build(r"ab+").states, &[ s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[1, 3]), s_match(), ]); assert_eq!(build(r"(ab)").states, &[ s_byte(b'a', 1), s_byte(b'b', 2), s_match(), ]); assert_eq!(build(r"(ab)+").states, &[ s_byte(b'a', 1), s_byte(b'b', 2), s_union(&[0, 3]), s_match(), ]); } #[test] fn compile_alternation() { assert_eq!(build(r"a|b").states, &[ s_byte(b'a', 3), s_byte(b'b', 3), s_union(&[0, 1]), s_match(), ]); } #[test] fn byte_classes() { let mut set = ByteClassSet::new(); set.set_range(b'a', b'z'); let classes = set.byte_classes(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(b'a' - 1), 0); assert_eq!(classes.get(b'a'), 1); assert_eq!(classes.get(b'm'), 1); assert_eq!(classes.get(b'z'), 1); assert_eq!(classes.get(b'z' + 1), 2); assert_eq!(classes.get(254), 2); assert_eq!(classes.get(255), 2); let mut set = ByteClassSet::new(); set.set_range(0, 2); set.set_range(4, 6); let classes = set.byte_classes(); assert_eq!(classes.get(0), 0); assert_eq!(classes.get(1), 0); assert_eq!(classes.get(2), 0); assert_eq!(classes.get(3), 1); assert_eq!(classes.get(4), 2); assert_eq!(classes.get(5), 2); assert_eq!(classes.get(6), 2); assert_eq!(classes.get(7), 3); assert_eq!(classes.get(255), 3); } #[test] fn full_byte_classes() { let mut set = ByteClassSet::new(); for i in 0..256u16 { set.set_range(i as u8, i as u8); } assert_eq!(set.byte_classes().alphabet_len(), 256); } } regex-automata-0.1.8/src/regex.rs010064400017500000144000000746141341352022600151410ustar0000000000000000#[cfg(feature = "std")] use dense::{self, DenseDFA}; use dfa::DFA; #[cfg(feature = "std")] use error::Result; #[cfg(feature = "std")] use sparse::SparseDFA; #[cfg(feature = "std")] use state_id::StateID; /// A regular expression that uses deterministic finite automata for fast /// searching. 
/// /// A regular expression is comprised of two DFAs, a "forward" DFA and a /// "reverse" DFA. The forward DFA is responsible for detecting the end of a /// match while the reverse DFA is responsible for detecting the start of a /// match. Thus, in order to find the bounds of any given match, a forward /// search must first be run followed by a reverse search. A match found by /// the forward DFA guarantees that the reverse DFA will also find a match. /// /// The type of the DFA used by a `Regex` corresponds to the `D` type /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but /// search faster, while sparse DFAs use less memory but search more slowly. /// /// By default, a regex's DFA type parameter is set to /// `DenseDFA, usize>`. For most in-memory work loads, this is the /// most convenient type that gives the best search performance. /// /// # Sparse DFAs /// /// Since a `Regex` is generic over the `DFA` trait, it can be used with any /// kind of DFA. While this crate constructs dense DFAs by default, it is easy /// enough to build corresponding sparse DFAs, and then build a regex from /// them: /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// // First, build a regex that uses dense DFAs. /// let dense_re = Regex::new("foo[0-9]+")?; /// /// // Second, build sparse DFAs from the forward and reverse dense DFAs. /// let fwd = dense_re.forward().to_sparse()?; /// let rev = dense_re.reverse().to_sparse()?; /// /// // Third, build a new regex from the constituent sparse DFAs. /// let sparse_re = Regex::from_dfas(fwd, rev); /// /// // A regex that uses sparse DFAs can be used just like with dense DFAs. /// assert_eq!(true, sparse_re.is_match(b"foo123")); /// # Ok(()) }; example().unwrap() /// ``` #[cfg(feature = "std")] #[derive(Clone, Debug)] pub struct Regex, usize>> { forward: D, reverse: D, } /// A regular expression that uses deterministic finite automata for fast /// searching. /// /// A regular expression is comprised of two DFAs, a "forward" DFA and a /// "reverse" DFA. The forward DFA is responsible for detecting the end of a /// match while the reverse DFA is responsible for detecting the start of a /// match. Thus, in order to find the bounds of any given match, a forward /// search must first be run followed by a reverse search. A match found by /// the forward DFA guarantees that the reverse DFA will also find a match. /// /// The type of the DFA used by a `Regex` corresponds to the `D` type /// parameter, which must satisfy the [`DFA`](trait.DFA.html) trait. Typically, /// `D` is either a [`DenseDFA`](enum.DenseDFA.html) or a /// [`SparseDFA`](enum.SparseDFA.html), where dense DFAs use more memory but /// search faster, while sparse DFAs use less memory but search more slowly. /// /// When using this crate without the standard library, the `Regex` type has /// no default type parameter. /// /// # Sparse DFAs /// /// Since a `Regex` is generic over the `DFA` trait, it can be used with any /// kind of DFA. While this crate constructs dense DFAs by default, it is easy /// enough to build corresponding sparse DFAs, and then build a regex from /// them: /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// // First, build a regex that uses dense DFAs. 
/// let dense_re = Regex::new("foo[0-9]+")?; /// /// // Second, build sparse DFAs from the forward and reverse dense DFAs. /// let fwd = dense_re.forward().to_sparse()?; /// let rev = dense_re.reverse().to_sparse()?; /// /// // Third, build a new regex from the constituent sparse DFAs. /// let sparse_re = Regex::from_dfas(fwd, rev); /// /// // A regex that uses sparse DFAs can be used just like with dense DFAs. /// assert_eq!(true, sparse_re.is_match(b"foo123")); /// # Ok(()) }; example().unwrap() /// ``` #[cfg(not(feature = "std"))] #[derive(Clone, Debug)] pub struct Regex { forward: D, reverse: D, } #[cfg(feature = "std")] impl Regex { /// Parse the given regular expression using a default configuration and /// return the corresponding regex. /// /// The default configuration uses `usize` for state IDs, premultiplies /// them and reduces the alphabet size by splitting bytes into equivalence /// classes. The underlying DFAs are *not* minimized. /// /// If you want a non-default configuration, then use the /// [`RegexBuilder`](struct.RegexBuilder.html) /// to set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); /// # Ok(()) }; example().unwrap() /// ``` pub fn new(pattern: &str) -> Result { RegexBuilder::new().build(pattern) } } #[cfg(feature = "std")] impl Regex, usize>> { /// Parse the given regular expression using a default configuration and /// return the corresponding regex using sparse DFAs. /// /// The default configuration uses `usize` for state IDs, reduces the /// alphabet size by splitting bytes into equivalence classes. The /// underlying DFAs are *not* minimized. /// /// If you want a non-default configuration, then use the /// [`RegexBuilder`](struct.RegexBuilder.html) /// to set your own configuration. /// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new_sparse("foo[0-9]+bar")?; /// assert_eq!(Some((3, 14)), re.find(b"zzzfoo12345barzzz")); /// # Ok(()) }; example().unwrap() /// ``` pub fn new_sparse( pattern: &str, ) -> Result, usize>>> { RegexBuilder::new().build_sparse(pattern) } } impl Regex { /// Returns true if and only if the given bytes match. /// /// This routine may short circuit if it knows that scanning future input /// will never lead to a different result. In particular, if the underlying /// DFA enters a match state or a dead state, then this routine will return /// `true` or `false`, respectively, without inspecting any future input. /// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new("foo[0-9]+bar")?; /// assert_eq!(true, re.is_match(b"foo12345bar")); /// assert_eq!(false, re.is_match(b"foobar")); /// # Ok(()) }; example().unwrap() /// ``` pub fn is_match(&self, input: &[u8]) -> bool { self.is_match_at(input, 0) } /// Returns the first position at which a match is found. /// /// This routine stops scanning input in precisely the same circumstances /// as `is_match`. The key difference is that this routine returns the /// position at which it stopped scanning input if and only if a match /// was found. If no match is found, then `None` is returned. 
/// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!(Some(4), re.shortest_match(b"foo12345")); /// /// // Normally, the end of the leftmost first match here would be 3, /// // but the shortest match semantics detect a match earlier. /// let re = Regex::new("abc|a")?; /// assert_eq!(Some(1), re.shortest_match(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` pub fn shortest_match(&self, input: &[u8]) -> Option { self.shortest_match_at(input, 0) } /// Returns the start and end offset of the leftmost first match. If no /// match exists, then `None` is returned. /// /// The "leftmost first" match corresponds to the match with the smallest /// starting offset, but where the end offset is determined by preferring /// earlier branches in the original regular expression. For example, /// `Sam|Samwise` will match `Sam` in `Samwise`, but `Samwise|Sam` will /// match `Samwise` in `Samwise`. /// /// Generally speaking, the "leftmost first" match is how most backtracking /// regular expressions tend to work. This is in contrast to POSIX-style /// regular expressions that yield "leftmost longest" matches. Namely, /// both `Sam|Samwise` and `Samwise|Sam` match `Samwise` when using /// leftmost longest semantics. /// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new("foo[0-9]+")?; /// assert_eq!(Some((3, 11)), re.find(b"zzzfoo12345zzz")); /// /// // Even though a match is found after reading the first byte (`a`), /// // the leftmost first match semantics demand that we find the earliest /// // match that prefers earlier parts of the pattern over latter parts. /// let re = Regex::new("abc|a")?; /// assert_eq!(Some((0, 3)), re.find(b"abc")); /// # Ok(()) }; example().unwrap() /// ``` pub fn find(&self, input: &[u8]) -> Option<(usize, usize)> { self.find_at(input, 0) } /// Returns the same as `is_match`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. pub fn is_match_at(&self, input: &[u8], start: usize) -> bool { self.forward().is_match_at(input, start) } /// Returns the same as `shortest_match`, but starts the search at the /// given offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. pub fn shortest_match_at( &self, input: &[u8], start: usize, ) -> Option { self.forward().shortest_match_at(input, start) } /// Returns the same as `find`, but starts the search at the given /// offset. /// /// The significance of the starting point is that it takes the surrounding /// context into consideration. For example, if the DFA is anchored, then /// a match can only occur when `start == 0`. pub fn find_at( &self, input: &[u8], start: usize, ) -> Option<(usize, usize)> { let end = match self.forward().find_at(input, start) { None => return None, Some(end) => end, }; let start = self .reverse() .rfind(&input[start..end]) .map(|i| start + i) .expect("reverse search must match if forward search does"); Some((start, end)) } /// Returns an iterator over all non-overlapping leftmost first matches /// in the given bytes. 
If no match exists, then the iterator yields no /// elements. /// /// Note that if the regex can match the empty string, then it is /// possible for the iterator to yield a zero-width match at a location /// that is not a valid UTF-8 boundary (for example, between the code units /// of a UTF-8 encoded codepoint). This can happen regardless of whether /// [`allow_invalid_utf8`](struct.RegexBuilder.html#method.allow_invalid_utf8) /// was enabled or not. /// /// # Example /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let re = Regex::new("foo[0-9]+")?; /// let text = b"foo1 foo12 foo123"; /// let matches: Vec<(usize, usize)> = re.find_iter(text).collect(); /// assert_eq!(matches, vec![(0, 4), (5, 10), (11, 17)]); /// # Ok(()) }; example().unwrap() /// ``` pub fn find_iter<'r, 't>( &'r self, input: &'t [u8], ) -> Matches<'r, 't, D> { Matches::new(self, input) } /// Build a new regex from its constituent forward and reverse DFAs. /// /// This is useful when deserializing a regex from some arbitrary /// memory region. This is also useful for building regexes from other /// types of DFAs. /// /// # Example /// /// This example is a bit contrived. The usual use of these methods /// would involve serializing `initial_re` somewhere and then deserializing /// it later to build a regex. /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let (fwd, rev) = (initial_re.forward(), initial_re.reverse()); /// let re = Regex::from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok(()) }; example().unwrap() /// ``` /// /// This example shows how you might build smaller DFAs, and then use those /// smaller DFAs to build a new regex. /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let fwd = initial_re.forward().to_u16()?; /// let rev = initial_re.reverse().to_u16()?; /// let re = Regex::from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok(()) }; example().unwrap() /// ``` /// /// This example shows how to build a `Regex` that uses sparse DFAs instead /// of dense DFAs: /// /// ``` /// use regex_automata::Regex; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let initial_re = Regex::new("foo[0-9]+")?; /// assert_eq!(true, initial_re.is_match(b"foo123")); /// /// let fwd = initial_re.forward().to_sparse()?; /// let rev = initial_re.reverse().to_sparse()?; /// let re = Regex::from_dfas(fwd, rev); /// assert_eq!(true, re.is_match(b"foo123")); /// # Ok(()) }; example().unwrap() /// ``` pub fn from_dfas(forward: D, reverse: D) -> Regex<D> { Regex { forward, reverse } } /// Return the underlying DFA responsible for forward matching. pub fn forward(&self) -> &D { &self.forward } /// Return the underlying DFA responsible for reverse matching. pub fn reverse(&self) -> &D { &self.reverse } } /// An iterator over all non-overlapping matches for a particular search. /// /// The iterator yields a `(usize, usize)` value until no more matches could be /// found. The first `usize` is the start of the match (inclusive) while the /// second `usize` is the end of the match (exclusive). /// /// `D` is the type of the underlying DFA used by the /// regex.
The lifetime variables are as follows: /// /// * `'r` is the lifetime of the regular expression value itself. /// * `'t` is the lifetime of the text being searched. #[derive(Clone, Debug)] pub struct Matches<'r, 't, D: DFA + 'r> { re: &'r Regex, text: &'t [u8], last_end: usize, last_match: Option, } impl<'r, 't, D: DFA> Matches<'r, 't, D> { fn new(re: &'r Regex, text: &'t [u8]) -> Matches<'r, 't, D> { Matches { re: re, text: text, last_end: 0, last_match: None, } } } impl<'r, 't, D: DFA> Iterator for Matches<'r, 't, D> { type Item = (usize, usize); fn next(&mut self) -> Option<(usize, usize)> { if self.last_end > self.text.len() { return None; } let (s, e) = match self.re.find_at(self.text, self.last_end) { None => return None, Some((s, e)) => (s, e), }; if s == e { // This is an empty match. To ensure we make progress, start // the next search at the smallest possible starting position // of the next match following this one. self.last_end = e + 1; // Don't accept empty matches immediately following a match. // Just move on to the next match. if Some(e) == self.last_match { return self.next(); } } else { self.last_end = e; } self.last_match = Some(e); Some((s, e)) } } /// A builder for a regex based on deterministic finite automatons. /// /// This builder permits configuring several aspects of the construction /// process such as case insensitivity, Unicode support and various options /// that impact the size of the underlying DFAs. In some cases, options (like /// performing DFA minimization) can come with a substantial additional cost. /// /// This builder generally constructs two DFAs, where one is responsible for /// finding the end of a match and the other is responsible for finding the /// start of a match. If you only need to detect whether something matched, /// or only the end of a match, then you should use a /// [`dense::Builder`](dense/struct.Builder.html) /// to construct a single DFA, which is cheaper than building two DFAs. #[cfg(feature = "std")] #[derive(Clone, Debug)] pub struct RegexBuilder { dfa: dense::Builder, } #[cfg(feature = "std")] impl RegexBuilder { /// Create a new regex builder with the default configuration. pub fn new() -> RegexBuilder { RegexBuilder { dfa: dense::Builder::new(), } } /// Build a regex from the given pattern. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. pub fn build( &self, pattern: &str, ) -> Result { self.build_with_size::(pattern) } /// Build a regex from the given pattern using sparse DFAs. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. pub fn build_sparse( &self, pattern: &str, ) -> Result, usize>>> { self.build_with_size_sparse::(pattern) } /// Build a regex from the given pattern using a specific representation /// for the underlying DFA state IDs. /// /// If there was a problem parsing or compiling the pattern, then an error /// is returned. /// /// The representation of state IDs is determined by the `S` type /// parameter. In general, `S` is usually one of `u8`, `u16`, `u32`, `u64` /// or `usize`, where `usize` is the default used for `build`. The purpose /// of specifying a representation for state IDs is to reduce the memory /// footprint of the underlying DFAs. /// /// When using this routine, the chosen state ID representation will be /// used throughout determinization and minimization, if minimization was /// requested. 
Even if the minimized DFAs can fit into the chosen state ID /// representation, this will still return an error if the initial /// determinized DFA cannot. To get a minimized DFA with a smaller state ID /// representation, first build it with a bigger state ID representation, /// and then shrink the sizes of the DFAs using one of its conversion /// routines, such as [`DenseDFA::to_u16`](enum.DenseDFA.html#method.to_u16). /// Finally, reconstitute the regex via /// [`Regex::from_dfas`](struct.Regex.html#method.from_dfas). pub fn build_with_size<S: StateID>( &self, pattern: &str, ) -> Result<Regex<DenseDFA<Vec<S>, S>>> { let forward = self.dfa.build_with_size(pattern)?; let reverse = self.dfa .clone() .anchored(true) .reverse(true) .longest_match(true) .build_with_size(pattern)?; Ok(Regex::from_dfas(forward, reverse)) } /// Build a regex from the given pattern using a specific representation /// for the underlying DFA state IDs, using sparse DFAs. pub fn build_with_size_sparse<S: StateID>( &self, pattern: &str, ) -> Result<Regex<SparseDFA<Vec<u8>, S>>> { let re = self.build_with_size(pattern)?; let fwd = re.forward().to_sparse()?; let rev = re.reverse().to_sparse()?; Ok(Regex::from_dfas(fwd, rev)) } /// Set whether matching must be anchored at the beginning of the input. /// /// When enabled, a match must begin at the start of the input. When /// disabled, the regex will act as if the pattern started with a `.*?`, /// which enables a match to appear anywhere. /// /// By default this is disabled. pub fn anchored(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.anchored(yes); self } /// Enable or disable the case insensitive flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `i` flag. pub fn case_insensitive(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.case_insensitive(yes); self } /// Enable verbose mode in the regular expression. /// /// When enabled, verbose mode permits insignificant whitespace in many /// places in the regular expression, as well as comments. Comments are /// started using `#` and continue until the end of the line. /// /// By default, this is disabled. It may be selectively enabled in the /// regular expression by using the `x` flag regardless of this setting. pub fn ignore_whitespace(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.ignore_whitespace(yes); self } /// Enable or disable the "dot matches any character" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `s` flag. pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.dot_matches_new_line(yes); self } /// Enable or disable the "swap greed" flag by default. /// /// By default this is disabled. It may alternatively be selectively /// enabled in the regular expression itself via the `U` flag. pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.swap_greed(yes); self } /// Enable or disable the Unicode flag (`u`) by default. /// /// By default this is **enabled**. It may alternatively be selectively /// disabled in the regular expression itself via the `u` flag. /// /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by /// default), a regular expression will fail to parse if Unicode mode is /// disabled and a sub-expression could possibly match invalid UTF-8.
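///
/// # Example
///
/// A minimal sketch (using only the builder methods documented here) of
/// pairing `unicode(false)` with `allow_invalid_utf8(true)` so that a
/// byte-oriented class can match arbitrary bytes:
///
/// ```
/// use regex_automata::RegexBuilder;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// // With Unicode mode disabled, `[^b]` matches any byte except `b`,
/// // including bytes that are not valid UTF-8, so the builder must be
/// // told that matching invalid UTF-8 is acceptable.
/// let re = RegexBuilder::new()
///     .unicode(false)
///     .allow_invalid_utf8(true)
///     .build(r"a[^b]+")?;
/// assert_eq!(true, re.is_match(b"a\xFF\xFF"));
/// # Ok(()) }; example().unwrap()
/// ```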
pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.unicode(yes); self } /// When enabled, the builder will permit the construction of a regular /// expression that may match invalid UTF-8. /// /// When disabled (the default), the builder is guaranteed to produce a /// regex that will only ever match valid UTF-8 (otherwise, the builder /// will return an error). pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.allow_invalid_utf8(yes); self } /// Set the nesting limit used for the regular expression parser. /// /// The nesting limit controls how deep the abstract syntax tree is allowed /// to be. If the AST exceeds the given limit (e.g., with too many nested /// groups), then an error is returned by the parser. /// /// The purpose of this limit is to act as a heuristic to prevent stack /// overflow when building a finite automaton from a regular expression's /// abstract syntax tree. In particular, construction currently uses /// recursion. In the future, the implementation may stop using recursion /// and this option will no longer be necessary. /// /// This limit is not checked until the entire AST is parsed. Therefore, /// if callers want to put a limit on the amount of heap space used, then /// they should impose a limit on the length, in bytes, of the concrete /// pattern string. In particular, this is viable since the parser will /// limit itself to heap space proportional to the length of the pattern /// string. /// /// Note that a nest limit of `0` will return a nest limit error for most /// patterns but not all. For example, a nest limit of `0` permits `a` but /// not `ab`, since `ab` requires a concatenation AST item, which results /// in a nest depth of `1`. In general, a nest limit is not something that /// manifests in an obvious way in the concrete syntax; therefore, it /// should not be used in a granular way. pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { self.dfa.nest_limit(limit); self } /// Minimize the underlying DFAs. /// /// When enabled, the DFAs powering the resulting regex will be minimized /// such that they are as small as possible. /// /// Whether one enables minimization or not depends on the types of costs /// you're willing to pay and how much you care about its benefits. In /// particular, minimization has worst case `O(n * k * log n)` time and /// `O(k * n)` space, where `n` is the number of DFA states and `k` is the /// alphabet size. In practice, minimization can be quite costly in terms /// of both space and time, so it should only be done if you're willing to /// wait longer to produce a DFA. In general, you might want a minimal DFA /// in the following circumstances: /// /// 1. You would like to optimize for the size of the automaton. This can /// manifest in one of two ways. Firstly, if you're converting the /// DFA into Rust code (or a table embedded in the code), then a minimal /// DFA will translate into a corresponding reduction in code size, and /// thus, also the final compiled binary size. Secondly, if you are /// building many DFAs and putting them on the heap, you'll be able to /// fit more if they are smaller. Note though that building a minimal /// DFA itself requires additional space; you only realize the space /// savings once the minimal DFA is constructed (at which point, the /// space used for minimization is freed). /// 2. You've observed that a smaller DFA results in faster match /// performance.
Naively, this isn't guaranteed since there is no /// inherent difference between matching with a bigger-than-minimal /// DFA and a minimal DFA. However, a smaller DFA may make use of your /// CPU's cache more efficiently. /// 3. You are trying to establish an equivalence between regular /// languages. The standard method for this is to build a minimal DFA /// for each language and then compare them. If the DFAs are equivalent /// (up to state renaming), then the languages are equivalent. /// /// This option is disabled by default. pub fn minimize(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.minimize(yes); self } /// Premultiply state identifiers in the underlying DFA transition tables. /// /// When enabled, state identifiers are premultiplied to point to their /// corresponding row in the DFA's transition table. That is, given the /// `i`th state, its corresponding premultiplied identifier is `i * k` /// where `k` is the alphabet size of the DFA. (The alphabet size is at /// most 256, but is in practice smaller if byte classes are enabled.) /// /// When state identifiers are not premultiplied, then the identifier of /// the `i`th state is `i`. /// /// The advantage of premultiplying state identifiers is that it saves /// a multiplication instruction per byte when searching with the DFA. /// This has been observed to lead to a 20% performance benefit in /// micro-benchmarks. /// /// The primary disadvantage of premultiplying state identifiers is /// that they require a larger integer size to represent. For example, /// if your DFA has 200 states, then its premultiplied form requires /// 16 bits to represent every possible state identifier, whereas its /// non-premultiplied form only requires 8 bits. /// /// This option is enabled by default. pub fn premultiply(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.premultiply(yes); self } /// Shrink the size of the underlying DFA alphabet by mapping bytes to /// their equivalence classes. /// /// When enabled, each DFA will use a map from all possible bytes to their /// corresponding equivalence class. Each equivalence class represents a /// set of bytes that does not discriminate between a match and a non-match /// in the DFA. For example, the pattern `[ab]+` has at least two /// equivalence classes: a set containing `a` and `b` and a set containing /// every byte except for `a` and `b`. `a` and `b` are in the same /// equivalence class because they never discriminate between a match /// and a non-match. /// /// The advantage of this map is that the size of the transition table can /// be reduced drastically from `#states * 256 * sizeof(id)` to /// `#states * k * sizeof(id)` where `k` is the number of equivalence /// classes. As a result, total space usage can decrease substantially. /// Moreover, since a smaller alphabet is used, compilation becomes faster /// as well. /// /// The disadvantage of this map is that every byte searched must be /// passed through this map before it can be used to determine the next /// transition. This has a small match time performance cost. /// /// This option is enabled by default.
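///
/// # Example
///
/// A minimal sketch comparing the memory usage of the forward DFA with
/// and without byte classes (the exact sizes depend on the pattern):
///
/// ```
/// use regex_automata::RegexBuilder;
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let small = RegexBuilder::new().byte_classes(true).build("[ab]+")?;
/// let big = RegexBuilder::new().byte_classes(false).build("[ab]+")?;
/// // Fewer alphabet symbols means fewer transition table columns.
/// assert!(small.forward().memory_usage() < big.forward().memory_usage());
/// # Ok(()) }; example().unwrap()
/// ```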
pub fn byte_classes(&mut self, yes: bool) -> &mut RegexBuilder { self.dfa.byte_classes(yes); self } } #[cfg(feature = "std")] impl Default for RegexBuilder { fn default() -> RegexBuilder { RegexBuilder::new() } } regex-automata-0.1.8/src/sparse.rs010064400017500000144000001347541341444542500153360ustar0000000000000000#[cfg(feature = "std")] use std::collections::HashMap; #[cfg(feature = "std")] use core::fmt; #[cfg(feature = "std")] use core::iter; use core::marker::PhantomData; use core::mem::size_of; use byteorder::{ByteOrder, NativeEndian}; #[cfg(feature = "std")] use byteorder::{BigEndian, LittleEndian}; use classes::ByteClasses; use dense; use dfa::DFA; #[cfg(feature = "std")] use error::{Error, Result}; #[cfg(feature = "std")] use state_id::{StateID, dead_id, usize_to_state_id, write_state_id_bytes}; #[cfg(not(feature = "std"))] use state_id::{StateID, dead_id}; /// A sparse table-based deterministic finite automaton (DFA). /// /// In contrast to a [dense DFA](enum.DenseDFA.html), a sparse DFA uses a /// more space efficient representation for its transition table. Consequently, /// sparse DFAs can use much less memory than dense DFAs, but this comes at a /// price. In particular, reading the more space efficient transitions takes /// more work, and consequently, searching using a sparse DFA is typically /// slower than a dense DFA. /// /// A sparse DFA can be built using the default configuration via the /// [`SparseDFA::new`](enum.SparseDFA.html#method.new) constructor. Otherwise, /// one can configure various aspects of a dense DFA via /// [`dense::Builder`](dense/struct.Builder.html), and then convert a dense /// DFA to a sparse DFA using /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse). /// /// In general, a sparse DFA supports all the same operations as a dense DFA. /// /// Making the choice between a dense and sparse DFA depends on your specific /// work load. If you can sacrifice a bit of search time performance, then a /// sparse DFA might be the best choice. In particular, while sparse DFAs are /// probably always slower than dense DFAs, you may find that they are easily /// fast enough for your purposes! /// /// # State size /// /// A `SparseDFA` has two type parameters, `T` and `S`. `T` corresponds to /// the type of the DFA's transition table while `S` corresponds to the /// representation used for the DFA's state identifiers as described by the /// [`StateID`](trait.StateID.html) trait. This type parameter is typically /// `usize`, but other valid choices provided by this crate include `u8`, /// `u16`, `u32` and `u64`. The primary reason for choosing a different state /// identifier representation than the default is to reduce the amount of /// memory used by a DFA. Note though, that if the chosen representation cannot /// accommodate the size of your DFA, then building the DFA will fail and /// return an error. /// /// While the reduction in heap memory used by a DFA is one reason for choosing /// a smaller state identifier representation, another possible reason is for /// decreasing the serialization size of a DFA, as returned by /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian), /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) /// or /// [`to_bytes_native_endian`](enum.DenseDFA.html#method.to_bytes_native_endian). /// /// The type of the transition table is typically either `Vec` or `&[u8]`, /// depending on where the transition table is stored. 
Note that this is /// different than a dense DFA, whose transition table is typically /// `Vec` or `&[S]`. The reason for this is that a sparse DFA always reads /// its transition table from raw bytes because the table is compactly packed. /// /// # Variants /// /// This DFA is defined as a non-exhaustive enumeration of different types of /// dense DFAs. All of the variants use the same internal representation /// for the transition table, but they vary in how the transition table is /// read. A DFA's specific variant depends on the configuration options set via /// [`dense::Builder`](dense/struct.Builder.html). The default variant is /// `ByteClass`. /// /// # The `DFA` trait /// /// This type implements the [`DFA`](trait.DFA.html) trait, which means it /// can be used for searching. For example: /// /// ``` /// use regex_automata::{DFA, SparseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = SparseDFA::new("foo[0-9]+")?; /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` /// /// The `DFA` trait also provides an assortment of other lower level methods /// for DFAs, such as `start_state` and `next_state`. While these are correctly /// implemented, it is an anti-pattern to use them in performance sensitive /// code on the `SparseDFA` type directly. Namely, each implementation requires /// a branch to determine which type of sparse DFA is being used. Instead, /// this branch should be pushed up a layer in the code since walking the /// transitions of a DFA is usually a hot path. If you do need to use these /// lower level methods in performance critical code, then you should match on /// the variants of this DFA and use each variant's implementation of the `DFA` /// trait directly. #[derive(Clone, Debug)] pub enum SparseDFA, S: StateID = usize> { /// A standard DFA that does not use byte classes. Standard(Standard), /// A DFA that shrinks its alphabet to a set of equivalence classes instead /// of using all possible byte values. Any two bytes belong to the same /// equivalence class if and only if they can be used interchangeably /// anywhere in the DFA while never discriminating between a match and a /// non-match. /// /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much /// from using byte classes. In some cases, using byte classes can even /// marginally increase the size of a sparse DFA's transition table. The /// reason for this is that a sparse DFA already compacts each state's /// transitions separate from whether byte classes are used. ByteClass(ByteClass), /// Hints that destructuring should not be exhaustive. /// /// This enum may grow additional variants, so this makes sure clients /// don't count on exhaustive matching. (Otherwise, adding a new variant /// could break existing code.) #[doc(hidden)] __Nonexhaustive, } #[cfg(feature = "std")] impl SparseDFA, usize> { /// Parse the given regular expression using a default configuration and /// return the corresponding sparse DFA. /// /// The default configuration uses `usize` for state IDs and reduces the /// alphabet size by splitting bytes into equivalence classes. The /// resulting DFA is *not* minimized. /// /// If you want a non-default configuration, then use the /// [`dense::Builder`](dense/struct.Builder.html) /// to set your own configuration, and then call /// [`DenseDFA::to_sparse`](enum.DenseDFA.html#method.to_sparse) /// to create a sparse DFA. 
/// /// # Example /// /// ``` /// use regex_automata::{DFA, SparseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa = SparseDFA::new("foo[0-9]+bar")?; /// assert_eq!(Some(11), dfa.find(b"foo12345bar")); /// # Ok(()) }; example().unwrap() /// ``` pub fn new(pattern: &str) -> Result<SparseDFA<Vec<u8>, usize>> { dense::Builder::new() .build(pattern) .and_then(|dense| dense.to_sparse()) } } #[cfg(feature = "std")] impl<S: StateID> SparseDFA<Vec<u8>, S> { /// Create a new empty sparse DFA that never matches any input. /// /// # Example /// /// In order to build an empty DFA, callers must provide a type hint /// indicating their choice of state identifier representation. /// /// ``` /// use regex_automata::{DFA, SparseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let dfa: SparseDFA<Vec<u8>, usize> = SparseDFA::empty(); /// assert_eq!(None, dfa.find(b"")); /// assert_eq!(None, dfa.find(b"foo")); /// # Ok(()) }; example().unwrap() /// ``` pub fn empty() -> SparseDFA<Vec<u8>, S> { dense::DenseDFA::empty().to_sparse().unwrap() } pub(crate) fn from_dense_sized<T: AsRef<[S]>, A: StateID>( dfa: &dense::Repr<T, S>, ) -> Result<SparseDFA<Vec<u8>, A>> { Repr::from_dense_sized(dfa).map(|r| r.into_sparse_dfa()) } } impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { /// Cheaply return a borrowed version of this sparse DFA. Specifically, the /// DFA returned always uses `&[u8]` for its transition table while keeping /// the same state identifier representation. pub fn as_ref<'a>(&'a self) -> SparseDFA<&'a [u8], S> { match *self { SparseDFA::Standard(Standard(ref r)) => { SparseDFA::Standard(Standard(r.as_ref())) } SparseDFA::ByteClass(ByteClass(ref r)) => { SparseDFA::ByteClass(ByteClass(r.as_ref())) } SparseDFA::__Nonexhaustive => unreachable!(), } } /// Return an owned version of this sparse DFA. Specifically, the DFA /// returned always uses `Vec<u8>` for its transition table while keeping /// the same state identifier representation. /// /// Effectively, this returns a sparse DFA whose transition table lives /// on the heap. #[cfg(feature = "std")] pub fn to_owned(&self) -> SparseDFA<Vec<u8>, S> { match *self { SparseDFA::Standard(Standard(ref r)) => { SparseDFA::Standard(Standard(r.to_owned())) } SparseDFA::ByteClass(ByteClass(ref r)) => { SparseDFA::ByteClass(ByteClass(r.to_owned())) } SparseDFA::__Nonexhaustive => unreachable!(), } } /// Returns the memory usage, in bytes, of this DFA. /// /// The memory usage is computed based on the number of bytes used to /// represent this DFA's transition table. This typically corresponds to /// heap memory usage. /// /// This does **not** include the stack size used up by this DFA. To /// compute that, use `std::mem::size_of::<SparseDFA>()`. pub fn memory_usage(&self) -> usize { self.repr().memory_usage() } fn repr(&self) -> &Repr<T, S> { match *self { SparseDFA::Standard(ref r) => &r.0, SparseDFA::ByteClass(ref r) => &r.0, SparseDFA::__Nonexhaustive => unreachable!(), } } } /// Routines for converting a sparse DFA to other representations, such as /// smaller state identifiers or raw bytes suitable for persistent storage. #[cfg(feature = "std")] impl<T: AsRef<[u8]>, S: StateID> SparseDFA<T, S> { /// Create a new sparse DFA whose match semantics are equivalent to /// this DFA, but attempt to use `u8` for the representation of state /// identifiers. If `u8` is insufficient to represent all state identifiers /// in this DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::<u8>()`.
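///
/// # Example
///
/// A minimal sketch; this succeeds because the DFA for such a small
/// pattern easily fits into `u8` state identifiers:
///
/// ```
/// use regex_automata::{DFA, SparseDFA};
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = SparseDFA::new("a")?;
/// let small = dfa.to_u8()?;
/// assert_eq!(Some(1), small.find(b"a"));
/// # Ok(()) }; example().unwrap()
/// ```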
pub fn to_u8(&self) -> Result, u8>> { self.to_sized() } /// Create a new sparse DFA whose match semantics are equivalent to /// this DFA, but attempt to use `u16` for the representation of state /// identifiers. If `u16` is insufficient to represent all state /// identifiers in this DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. pub fn to_u16(&self) -> Result, u16>> { self.to_sized() } /// Create a new sparse DFA whose match semantics are equivalent to /// this DFA, but attempt to use `u32` for the representation of state /// identifiers. If `u32` is insufficient to represent all state /// identifiers in this DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. #[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))] pub fn to_u32(&self) -> Result, u32>> { self.to_sized() } /// Create a new sparse DFA whose match semantics are equivalent to /// this DFA, but attempt to use `u64` for the representation of state /// identifiers. If `u64` is insufficient to represent all state /// identifiers in this DFA, then this returns an error. /// /// This is a convenience routine for `to_sized::()`. #[cfg(target_pointer_width = "64")] pub fn to_u64(&self) -> Result, u64>> { self.to_sized() } /// Create a new sparse DFA whose match semantics are equivalent to /// this DFA, but attempt to use `A` for the representation of state /// identifiers. If `A` is insufficient to represent all state identifiers /// in this DFA, then this returns an error. /// /// An alternative way to construct such a DFA is to use /// [`DenseDFA::to_sparse_sized`](enum.DenseDFA.html#method.to_sparse_sized). /// In general, picking the appropriate size upon initial construction of /// a sparse DFA is preferred, since it will do the conversion in one /// step instead of two. pub fn to_sized(&self) -> Result, A>> { self.repr().to_sized().map(|r| r.into_sparse_dfa()) } /// Serialize a sparse DFA to raw bytes in little endian format. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_little_endian(&self) -> Result> { self.repr().to_bytes::() } /// Serialize a sparse DFA to raw bytes in big endian format. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_big_endian(&self) -> Result> { self.repr().to_bytes::() } /// Serialize a sparse DFA to raw bytes in native endian format. /// Generally, it is better to pick an explicit endianness using either /// `to_bytes_little_endian` or `to_bytes_big_endian`. This routine is /// useful in tests where the DFA is serialized and deserialized on the /// same platform. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. pub fn to_bytes_native_endian(&self) -> Result> { self.repr().to_bytes::() } } impl<'a, S: StateID> SparseDFA<&'a [u8], S> { /// Deserialize a sparse DFA with a specific state identifier /// representation. /// /// Deserializing a DFA using this routine will never allocate heap memory. 
/// This is also guaranteed to be a constant time operation that does not /// vary with the size of the DFA. /// /// The bytes given should be generated by the serialization of a DFA with /// either the /// [`to_bytes_little_endian`](enum.SparseDFA.html#method.to_bytes_little_endian) /// method or the /// [`to_bytes_big_endian`](enum.SparseDFA.html#method.to_bytes_big_endian) /// method, depending on the endianness of the machine you are /// deserializing this DFA from. /// /// If the state identifier representation is `usize`, then deserialization /// is dependent on the pointer size. For this reason, it is best to /// serialize DFAs using a fixed size representation for your state /// identifiers, such as `u8`, `u16`, `u32` or `u64`. /// /// # Panics /// /// The bytes given should be *trusted*. In particular, if the bytes /// are not a valid serialization of a DFA, or if the endianness of the /// serialized bytes is different than the endianness of the machine that /// is deserializing the DFA, then this routine will panic. Moreover, it /// is possible for this deserialization routine to succeed even if the /// given bytes do not represent a valid serialized sparse DFA. /// /// # Safety /// /// This routine is unsafe because it permits callers to provide an /// arbitrary transition table with possibly incorrect transitions. While /// the various serialization routines will never return an incorrect /// transition table, there is no guarantee that the bytes provided here /// are correct. While deserialization does many checks (as documented /// above in the panic conditions), this routine does not check that the /// transition table is correct. Given an incorrect transition table, it is /// possible for the search routines to access out-of-bounds memory because /// of explicit bounds check elision. /// /// # Example /// /// This example shows how to serialize a DFA to raw bytes, deserialize it /// and then use it for searching. Note that we first convert the DFA to /// use `u16` for its state identifier representation before serializing /// it. While this isn't strictly necessary, it's good practice in order to /// decrease the size of the DFA and to avoid platform specific pitfalls /// such as differing pointer sizes.
/// /// ``` /// use regex_automata::{DFA, DenseDFA, SparseDFA}; /// /// # fn example() -> Result<(), regex_automata::Error> { /// let sparse = SparseDFA::new("foo[0-9]+")?; /// let bytes = sparse.to_u16()?.to_bytes_native_endian()?; /// /// let dfa: SparseDFA<&[u8], u16> = unsafe { /// SparseDFA::from_bytes(&bytes) /// }; /// /// assert_eq!(Some(8), dfa.find(b"foo12345")); /// # Ok(()) }; example().unwrap() /// ``` pub unsafe fn from_bytes(buf: &'a [u8]) -> SparseDFA<&'a [u8], S> { Repr::from_bytes(buf).into_sparse_dfa() } } impl, S: StateID> DFA for SparseDFA { type ID = S; #[inline] fn start_state(&self) -> S { self.repr().start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.repr().is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.repr().is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.repr().is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.repr().is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { match *self { SparseDFA::Standard(ref r) => r.next_state(current, input), SparseDFA::ByteClass(ref r) => r.next_state(current, input), SparseDFA::__Nonexhaustive => unreachable!(), } } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { self.next_state(current, input) } // We specialize the following methods because it lets us lift the // case analysis between the different types of sparse DFAs. Instead of // doing the case analysis for every transition, we do it once before // searching. For sparse DFAs, this doesn't seem to benefit performance as // much as it does for the dense DFAs, but it's easy to do so we might as // well do it. #[inline] fn is_match_at(&self, bytes: &[u8], start: usize) -> bool { match *self { SparseDFA::Standard(ref r) => r.is_match_at(bytes, start), SparseDFA::ByteClass(ref r) => r.is_match_at(bytes, start), SparseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn shortest_match_at(&self, bytes: &[u8], start: usize) -> Option { match *self { SparseDFA::Standard(ref r) => r.shortest_match_at(bytes, start), SparseDFA::ByteClass(ref r) => r.shortest_match_at(bytes, start), SparseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn find_at(&self, bytes: &[u8], start: usize) -> Option { match *self { SparseDFA::Standard(ref r) => r.find_at(bytes, start), SparseDFA::ByteClass(ref r) => r.find_at(bytes, start), SparseDFA::__Nonexhaustive => unreachable!(), } } #[inline] fn rfind_at(&self, bytes: &[u8], start: usize) -> Option { match *self { SparseDFA::Standard(ref r) => r.rfind_at(bytes, start), SparseDFA::ByteClass(ref r) => r.rfind_at(bytes, start), SparseDFA::__Nonexhaustive => unreachable!(), } } } /// A standard sparse DFA that does not use premultiplication or byte classes. /// /// Generally, it isn't necessary to use this type directly, since a /// `SparseDFA` can be used for searching directly. One possible reason why /// one might want to use this type directly is if you are implementing your /// own search routines by walking a DFA's transitions directly. In that case, /// you'll want to use this type (or any of the other DFA variant types) /// directly, since they implement `next_state` more efficiently. 
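///
/// # Example
///
/// A minimal sketch of the pattern described above: `count_steps` is a
/// hypothetical search routine that drives `next_state` directly, and the
/// match on the `SparseDFA` variants happens once, outside of it:
///
/// ```
/// use regex_automata::{DFA, SparseDFA};
///
/// // Walk the DFA over the haystack, counting transitions taken. Since
/// // this is generic over `DFA`, each variant's `next_state` can be
/// // inlined without any per-byte variant dispatch.
/// fn count_steps<D: DFA>(dfa: &D, haystack: &[u8]) -> usize {
///     let mut state = dfa.start_state();
///     let mut steps = 0;
///     for &b in haystack {
///         state = dfa.next_state(state, b);
///         steps += 1;
///         if dfa.is_dead_state(state) {
///             break;
///         }
///     }
///     steps
/// }
///
/// # fn example() -> Result<(), regex_automata::Error> {
/// let dfa = SparseDFA::new("foo[0-9]+")?;
/// let steps = match dfa {
///     SparseDFA::Standard(ref s) => count_steps(s, b"foo123"),
///     SparseDFA::ByteClass(ref s) => count_steps(s, b"foo123"),
///     _ => unreachable!(),
/// };
/// assert_eq!(6, steps);
/// # Ok(()) }; example().unwrap()
/// ```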
#[derive(Clone, Debug)] pub struct Standard, S: StateID = usize>( Repr, ); impl, S: StateID> DFA for Standard { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { self.0.state(current).next(input) } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { self.next_state(current, input) } } /// A sparse DFA that shrinks its alphabet. /// /// Alphabet shrinking is achieved by using a set of equivalence classes /// instead of using all possible byte values. Any two bytes belong to the same /// equivalence class if and only if they can be used interchangeably anywhere /// in the DFA while never discriminating between a match and a non-match. /// /// Unlike dense DFAs, sparse DFAs do not tend to benefit nearly as much from /// using byte classes. In some cases, using byte classes can even marginally /// increase the size of a sparse DFA's transition table. The reason for this /// is that a sparse DFA already compacts each state's transitions separate /// from whether byte classes are used. /// /// Generally, it isn't necessary to use this type directly, since a /// `SparseDFA` can be used for searching directly. One possible reason why /// one might want to use this type directly is if you are implementing your /// own search routines by walking a DFA's transitions directly. In that case, /// you'll want to use this type (or any of the other DFA variant types) /// directly, since they implement `next_state` more efficiently. #[derive(Clone, Debug)] pub struct ByteClass, S: StateID = usize>( Repr, ); impl, S: StateID> DFA for ByteClass { type ID = S; #[inline] fn start_state(&self) -> S { self.0.start_state() } #[inline] fn is_match_state(&self, id: S) -> bool { self.0.is_match_state(id) } #[inline] fn is_dead_state(&self, id: S) -> bool { self.0.is_dead_state(id) } #[inline] fn is_match_or_dead_state(&self, id: S) -> bool { self.0.is_match_or_dead_state(id) } #[inline] fn is_anchored(&self) -> bool { self.0.is_anchored() } #[inline] fn next_state(&self, current: S, input: u8) -> S { let input = self.0.byte_classes.get(input); self.0.state(current).next(input) } #[inline] unsafe fn next_state_unchecked(&self, current: S, input: u8) -> S { self.next_state(current, input) } } /// The underlying representation of a sparse DFA. This is shared by all of /// the different variants of a sparse DFA. 
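///
/// A state identifier in this representation is the byte offset in
/// `trans` at which that state's encoding begins, which is why the
/// transition table is always read as a sequence of raw bytes.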
#[derive(Clone)] #[cfg_attr(not(feature = "std"), derive(Debug))] struct Repr, S: StateID = usize> { anchored: bool, start: S, state_count: usize, max_match: S, byte_classes: ByteClasses, trans: T, } impl, S: StateID> Repr { fn into_sparse_dfa(self) -> SparseDFA { if self.byte_classes.is_singleton() { SparseDFA::Standard(Standard(self)) } else { SparseDFA::ByteClass(ByteClass(self)) } } fn as_ref<'a>(&'a self) -> Repr<&'a [u8], S> { Repr { anchored: self.anchored, start: self.start, state_count: self.state_count, max_match: self.max_match, byte_classes: self.byte_classes.clone(), trans: self.trans(), } } #[cfg(feature = "std")] fn to_owned(&self) -> Repr, S> { Repr { anchored: self.anchored, start: self.start, state_count: self.state_count, max_match: self.max_match, byte_classes: self.byte_classes.clone(), trans: self.trans().to_vec(), } } /// Return a convenient representation of the given state. /// /// This is marked as inline because it doesn't seem to get inlined /// otherwise, which leads to a fairly significant performance loss (~25%). #[inline] fn state<'a>(&'a self, id: S) -> State<'a, S> { let mut pos = id.to_usize(); let ntrans = NativeEndian::read_u16(&self.trans()[pos..]) as usize; pos += 2; let input_ranges = &self.trans()[pos..pos + (ntrans * 2)]; pos += 2 * ntrans; let next = &self.trans()[pos..pos + (ntrans * size_of::())]; State { _state_id_repr: PhantomData, ntrans, input_ranges, next } } /// Return an iterator over all of the states in this DFA. /// /// The iterator returned yields tuples, where the first element is the /// state ID and the second element is the state itself. #[cfg(feature = "std")] fn states<'a>(&'a self) -> StateIter<'a, T, S> { StateIter { dfa: self, id: dead_id() } } fn memory_usage(&self) -> usize { self.trans().len() } fn start_state(&self) -> S { self.start } fn is_match_state(&self, id: S) -> bool { self.is_match_or_dead_state(id) && !self.is_dead_state(id) } fn is_dead_state(&self, id: S) -> bool { id == dead_id() } fn is_match_or_dead_state(&self, id: S) -> bool { id <= self.max_match } fn is_anchored(&self) -> bool { self.anchored } fn trans(&self) -> &[u8] { self.trans.as_ref() } /// Create a new sparse DFA whose match semantics are equivalent to this /// DFA, but attempt to use `A` for the representation of state /// identifiers. If `A` is insufficient to represent all state identifiers /// in this DFA, then this returns an error. #[cfg(feature = "std")] fn to_sized(&self) -> Result, A>> { // To build the new DFA, we proceed much like the initial construction // of the sparse DFA. Namely, since the state ID size is changing, // we don't actually know all of our state IDs until we've allocated // all necessary space. So we do one pass that allocates all of the // storage we need, and then another pass to fill in the transitions. 
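// As a purely hypothetical illustration: when shrinking from `u64` to
// `u8` identifiers, a state whose encoding previously began at byte
// offset 40 may now begin at offset 12, so every transition written in
// the second pass must be rewritten through the map built in the first
// pass.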
let mut trans = Vec::with_capacity(size_of::() * self.state_count); let mut map: HashMap = HashMap::with_capacity(self.state_count); for (old_id, state) in self.states() { let pos = trans.len(); map.insert(old_id, usize_to_state_id(pos)?); let n = state.ntrans; let zeros = 2 + (n * 2) + (n * size_of::()); trans.extend(iter::repeat(0).take(zeros)); NativeEndian::write_u16(&mut trans[pos..], n as u16); let (s, e) = (pos + 2, pos + 2 + (n * 2)); trans[s..e].copy_from_slice(state.input_ranges); } let mut new = Repr { anchored: self.anchored, start: map[&self.start], state_count: self.state_count, max_match: map[&self.max_match], byte_classes: self.byte_classes.clone(), trans: trans, }; for (&old_id, &new_id) in map.iter() { let old_state = self.state(old_id); let mut new_state = new.state_mut(new_id); for i in 0..new_state.ntrans { let next = map[&old_state.next_at(i)]; new_state.set_next_at(i, usize_to_state_id(next.to_usize())?); } } new.start = map[&self.start]; new.max_match = map[&self.max_match]; Ok(new) } /// Serialize a sparse DFA to raw bytes using the provided endianness. /// /// If the state identifier representation of this DFA has a size different /// than 1, 2, 4 or 8 bytes, then this returns an error. All /// implementations of `StateID` provided by this crate satisfy this /// requirement. /// /// Unlike dense DFAs, the result is not necessarily aligned since a /// sparse DFA's transition table is always read as a sequence of bytes. #[cfg(feature = "std")] fn to_bytes(&self) -> Result> { let label = b"rust-regex-automata-sparse-dfa\x00"; let size = // For human readable label. label.len() // endiannes check, must be equal to 0xFEFF for native endian + 2 // For version number. + 2 // Size of state ID representation, in bytes. // Must be 1, 2, 4 or 8. + 2 // For DFA misc options. (Currently unused.) + 2 // For start state. + 8 // For state count. + 8 // For max match state. + 8 // For byte class map. + 256 // For transition table. + self.trans().len(); let mut i = 0; let mut buf = vec![0; size]; // write label for &b in label { buf[i] = b; i += 1; } // endianness check A::write_u16(&mut buf[i..], 0xFEFF); i += 2; // version number A::write_u16(&mut buf[i..], 1); i += 2; // size of state ID let state_size = size_of::(); if ![1, 2, 4, 8].contains(&state_size) { return Err(Error::serialize(&format!( "state size of {} not supported, must be 1, 2, 4 or 8", state_size ))); } A::write_u16(&mut buf[i..], state_size as u16); i += 2; // DFA misc options let mut options = 0u16; if self.anchored { options |= dense::MASK_ANCHORED; } A::write_u16(&mut buf[i..], options); i += 2; // start state A::write_u64(&mut buf[i..], self.start.to_usize() as u64); i += 8; // state count A::write_u64(&mut buf[i..], self.state_count as u64); i += 8; // max match state A::write_u64( &mut buf[i..], self.max_match.to_usize() as u64, ); i += 8; // byte class map for b in (0..256).map(|b| b as u8) { buf[i] = self.byte_classes.get(b); i += 1; } // transition table for (_, state) in self.states() { A::write_u16(&mut buf[i..], state.ntrans as u16); i += 2; buf[i..i + (state.ntrans * 2)].copy_from_slice(state.input_ranges); i += state.ntrans * 2; for j in 0..state.ntrans { write_state_id_bytes::(&mut buf[i..], state.next_at(j)); i += size_of::(); } } assert_eq!(size, i, "expected to consume entire buffer"); Ok(buf) } } impl<'a, S: StateID> Repr<&'a [u8], S> { /// The implementation for deserializing a sparse DFA from raw bytes. 
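///
/// The layout read here mirrors the one written by `to_bytes`: a NUL
/// terminated label, a `0xFEFF` endianness check, a format version, the
/// size of the state identifier representation in bytes, miscellaneous
/// options, the start state, the state count, the maximum match state,
/// 256 bytes of byte class mappings and, finally, the transition table
/// itself.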
unsafe fn from_bytes(mut buf: &'a [u8]) -> Repr<&'a [u8], S> { // skip over label match buf.iter().position(|&b| b == b'\x00') { None => panic!("could not find label"), Some(i) => buf = &buf[i+1..], } // check that current endianness is same as endianness of DFA let endian_check = NativeEndian::read_u16(buf); buf = &buf[2..]; if endian_check != 0xFEFF { panic!( "endianness mismatch, expected 0xFEFF but got 0x{:X}. \ are you trying to load a SparseDFA serialized with a \ different endianness?", endian_check, ); } // check that the version number is supported let version = NativeEndian::read_u16(buf); buf = &buf[2..]; if version != 1 { panic!( "expected version 1, but found unsupported version {}", version, ); } // read size of state let state_size = NativeEndian::read_u16(buf) as usize; if state_size != size_of::() { panic!( "state size of SparseDFA ({}) does not match \ requested state size ({})", state_size, size_of::(), ); } buf = &buf[2..]; // read miscellaneous options let opts = NativeEndian::read_u16(buf); buf = &buf[2..]; // read start state let start = S::from_usize(NativeEndian::read_u64(buf) as usize); buf = &buf[8..]; // read state count let state_count = NativeEndian::read_u64(buf) as usize; buf = &buf[8..]; // read max match state let max_match = S::from_usize(NativeEndian::read_u64(buf) as usize); buf = &buf[8..]; // read byte classes let byte_classes = ByteClasses::from_slice(&buf[..256]); buf = &buf[256..]; Repr { anchored: opts & dense::MASK_ANCHORED > 0, start, state_count, max_match, byte_classes, trans: buf, } } } #[cfg(feature = "std")] impl Repr, S> { /// The implementation for constructing a sparse DFA from a dense DFA. fn from_dense_sized, A: StateID>( dfa: &dense::Repr, ) -> Result, A>> { // In order to build the transition table, we need to be able to write // state identifiers for each of the "next" transitions in each state. // Our state identifiers correspond to the byte offset in the // transition table at which the state is encoded. Therefore, we do not // actually know what the state identifiers are until we've allocated // exactly as much space as we need for each state. Thus, construction // of the transition table happens in two passes. // // In the first pass, we fill out the shell of each state, which // includes the transition count, the input byte ranges and zero-filled // space for the transitions. In this first pass, we also build up a // map from the state identifier index of the dense DFA to the state // identifier in this sparse DFA. // // In the second pass, we fill in the transitions based on the map // built in the first pass. 
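// As a hypothetical illustration: if the dense DFA's third state ends
// up encoded at byte offset 20 of the sparse transition table, then the
// first pass records that offset in `remap`, and the second pass writes
// `20` for every transition that targets the third state.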
let mut trans = Vec::with_capacity(size_of::() * dfa.state_count()); let mut remap: Vec = vec![dead_id(); dfa.state_count()]; for (old_id, state) in dfa.states() { let pos = trans.len(); remap[dfa.state_id_to_index(old_id)] = usize_to_state_id(pos)?; // zero-filled space for the transition count trans.push(0); trans.push(0); let mut trans_count = 0; for (b1, b2, _) in state.sparse_transitions() { trans_count += 1; trans.push(b1); trans.push(b2); } // fill in the transition count NativeEndian::write_u16(&mut trans[pos..], trans_count); // zero-fill the actual transitions let zeros = trans_count as usize * size_of::(); trans.extend(iter::repeat(0).take(zeros)); } let mut new = Repr { anchored: dfa.is_anchored(), start: remap[dfa.state_id_to_index(dfa.start_state())], state_count: dfa.state_count(), max_match: remap[dfa.state_id_to_index(dfa.max_match_state())], byte_classes: dfa.byte_classes().clone(), trans: trans, }; for (old_id, old_state) in dfa.states() { let new_id = remap[dfa.state_id_to_index(old_id)]; let mut new_state = new.state_mut(new_id); let sparse = old_state.sparse_transitions(); for (i, (_, _, next)) in sparse.enumerate() { let next = remap[dfa.state_id_to_index(next)]; new_state.set_next_at(i, next); } } Ok(new) } /// Return a convenient mutable representation of the given state. fn state_mut<'a>(&'a mut self, id: S) -> StateMut<'a, S> { let mut pos = id.to_usize(); let ntrans = NativeEndian::read_u16(&self.trans[pos..]) as usize; pos += 2; let size = (ntrans * 2) + (ntrans * size_of::()); let ranges_and_next = &mut self.trans[pos..pos + size]; let (input_ranges, next) = ranges_and_next.split_at_mut(ntrans * 2); StateMut { _state_id_repr: PhantomData, ntrans, input_ranges, next } } } #[cfg(feature = "std")] impl, S: StateID> fmt::Debug for Repr { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn state_status, S: StateID>( dfa: &Repr, id: S, ) -> &'static str { if id == dead_id() { if dfa.is_match_state(id) { "D*" } else { "D " } } else if id == dfa.start_state() { if dfa.is_match_state(id) { ">*" } else { "> " } } else { if dfa.is_match_state(id) { " *" } else { " " } } } writeln!(f, "SparseDFA(")?; for (id, state) in self.states() { let status = state_status(self, id); writeln!(f, "{}{:04}: {:?}", status, id.to_usize(), state)?; } writeln!(f, ")")?; Ok(()) } } /// An iterator over all states in a sparse DFA. /// /// This iterator yields tuples, where the first element is the state ID and /// the second element is the state itself. #[cfg(feature = "std")] #[derive(Debug)] struct StateIter<'a, T: AsRef<[u8]> + 'a, S: StateID + 'a = usize> { dfa: &'a Repr, id: S, } #[cfg(feature = "std")] impl<'a, T: AsRef<[u8]>, S: StateID> Iterator for StateIter<'a, T, S> { type Item = (S, State<'a, S>); fn next(&mut self) -> Option<(S, State<'a, S>)> { if self.id.to_usize() >= self.dfa.trans().len() { return None; } let id = self.id; let state = self.dfa.state(id); self.id = S::from_usize(self.id.to_usize() + state.bytes()); Some((id, state)) } } /// A representation of a sparse DFA state that can be cheaply materialized /// from a state identifier. #[derive(Clone)] struct State<'a, S: StateID = usize> { /// The state identifier representation used by the DFA from which this /// state was extracted. Since our transition table is compacted in a /// &[u8], we don't actually use the state ID type parameter explicitly /// anywhere, so we fake it. This prevents callers from using an incorrect /// state ID representation to read from this state. 
_state_id_repr: PhantomData, /// The number of transitions in this state. ntrans: usize, /// Pairs of input ranges, where there is one pair for each transition. /// Each pair specifies an inclusive start and end byte range for the /// corresponding transition. input_ranges: &'a [u8], /// Transitions to the next state. This slice contains native endian /// encoded state identifiers, with `S` as the representation. Thus, there /// are `ntrans * size_of::()` bytes in this slice. next: &'a [u8], } impl<'a, S: StateID> State<'a, S> { /// Searches for the next transition given an input byte. If no such /// transition could be found, then a dead state is returned. fn next(&self, input: u8) -> S { // This straight linear search was observed to be much better than // binary search on ASCII haystacks, likely because a binary search // visits the ASCII case last but a linear search sees it first. A // binary search does do a little better on non-ASCII haystacks, but // not by much. There might be a better trade off lurking here. for i in 0..self.ntrans { let (start, end) = self.range(i); if start <= input && input <= end { return self.next_at(i) } // We could bail early with an extra branch: if input < b1, then // we know we'll never find a matching transition. Interestingly, // this extra branch seems to not help performance, or will even // hurt it. It's likely very dependent on the DFA itself and what // is being searched. } dead_id() } /// Returns the inclusive input byte range for the ith transition in this /// state. fn range(&self, i: usize) -> (u8, u8) { (self.input_ranges[i * 2], self.input_ranges[i * 2 + 1]) } /// Returns the next state for the ith transition in this state. fn next_at(&self, i: usize) -> S { S::read_bytes(&self.next[i * size_of::()..]) } /// Return the total number of bytes that this state consumes in its /// encoded form. #[cfg(feature = "std")] fn bytes(&self) -> usize { 2 + (self.ntrans * 2) + (self.ntrans * size_of::()) } } #[cfg(feature = "std")] impl<'a, S: StateID> fmt::Debug for State<'a, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut transitions = vec![]; for i in 0..self.ntrans { let next = self.next_at(i); if next == dead_id() { continue; } let (start, end) = self.range(i); if start == end { transitions.push( format!("{} => {}", escape(start), next.to_usize()), ); } else { transitions.push( format!( "{}-{} => {}", escape(start), escape(end), next.to_usize(), ), ); } } write!(f, "{}", transitions.join(", ")) } } /// A representation of a mutable sparse DFA state that can be cheaply /// materialized from a state identifier. #[cfg(feature = "std")] struct StateMut<'a, S: StateID = usize> { /// The state identifier representation used by the DFA from which this /// state was extracted. Since our transition table is compacted in a /// &[u8], we don't actually use the state ID type parameter explicitly /// anywhere, so we fake it. This prevents callers from using an incorrect /// state ID representation to read from this state. _state_id_repr: PhantomData, /// The number of transitions in this state. ntrans: usize, /// Pairs of input ranges, where there is one pair for each transition. /// Each pair specifies an inclusive start and end byte range for the /// corresponding transition. input_ranges: &'a mut [u8], /// Transitions to the next state. This slice contains native endian /// encoded state identifiers, with `S` as the representation. Thus, there /// are `ntrans * size_of::()` bytes in this slice. 
next: &'a mut [u8], } #[cfg(feature = "std")] impl<'a, S: StateID> StateMut<'a, S> { /// Sets the ith transition to the given state. fn set_next_at(&mut self, i: usize, next: S) { next.write_bytes(&mut self.next[i * size_of::<S>()..]); } } #[cfg(feature = "std")] impl<'a, S: StateID> fmt::Debug for StateMut<'a, S> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let state = State { _state_id_repr: self._state_id_repr, ntrans: self.ntrans, input_ranges: self.input_ranges, next: self.next, }; fmt::Debug::fmt(&state, f) } } /// Return the given byte as its escaped string form. #[cfg(feature = "std")] fn escape(b: u8) -> String { use std::ascii; String::from_utf8(ascii::escape_default(b).collect::<Vec<u8>>()).unwrap() } /// A binary search routine specialized to a sparse DFA state's /// transitions. Specifically, the transitions are defined as a set of pairs /// of input bytes that delineate an inclusive range of bytes. If the input /// byte is in the range, then the corresponding transition is a match. /// /// This binary search accepts a slice of these pairs and returns the position /// of the matching pair (the ith transition), or `None` if no matching pair /// could be found. /// /// Note that this routine is not currently used, since it was observed /// either to decrease performance when searching ASCII or to not provide /// enough of a boost on non-ASCII haystacks to be worth it. However, we /// leave it here for posterity in case we can find a way to use it. /// /// In theory, we could use the standard library's search routine if we could /// cast a `&[u8]` to a `&[(u8, u8)]`, but I don't believe this is currently /// guaranteed to be safe and is thus UB (since I don't think the in-memory /// representation of `(u8, u8)` has been nailed down). #[inline(always)] #[allow(dead_code)] fn binary_search_ranges(ranges: &[u8], needle: u8) -> Option<usize> { debug_assert!(ranges.len() % 2 == 0, "ranges must have even length"); debug_assert!(ranges.len() <= 512, "ranges should be short"); let (mut left, mut right) = (0, ranges.len() / 2); while left < right { let mid = (left + right) / 2; let (b1, b2) = (ranges[mid * 2], ranges[mid * 2 + 1]); if needle < b1 { right = mid; } else if needle > b2 { left = mid + 1; } else { return Some(mid); } } None } regex-automata-0.1.8/src/sparse_set.rs010064400017500000144000000033771341354362300162010ustar0000000000000000use std::slice; /// A sparse set used for representing ordered NFA states. /// /// This supports constant time addition and membership testing. Clearing an /// entire set can also be done in constant time. Iteration yields elements /// in the order in which they were inserted. /// /// The data structure is based on: http://research.swtch.com/sparse. Note /// though that we don't actually use uninitialized memory. We generally reuse /// sparse sets, so the initial allocation cost is bearable. However, its other /// properties listed above are extremely useful. #[derive(Clone, Debug)] pub struct SparseSet { /// Dense contains the instruction pointers in the order in which they /// were inserted. dense: Vec<usize>, /// Sparse maps instruction pointers to their location in dense. /// /// An instruction pointer is in the set if and only if /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
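///
/// As a concrete illustration: inserting `3` into an empty set pushes
/// `3` onto `dense` and records `sparse[3] = 0`, so the membership test
/// above succeeds for `3` and fails for every other value.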
regex-automata-0.1.8/src/sparse_set.rs010064400017500000144000000033771341354362300162010ustar0000000000000000use std::slice;

/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: http://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However,
/// its other properties listed above are extremely useful.
#[derive(Clone, Debug)]
pub struct SparseSet {
    /// Dense contains the instruction pointers in the order in which they
    /// were inserted.
    dense: Vec<usize>,
    /// Sparse maps instruction pointers to their location in dense.
    ///
    /// An instruction pointer is in the set if and only if
    /// sparse[ip] < dense.len() && ip == dense[sparse[ip]].
    sparse: Box<[usize]>,
}

impl SparseSet {
    pub fn new(size: usize) -> SparseSet {
        SparseSet {
            dense: Vec::with_capacity(size),
            sparse: vec![0; size].into_boxed_slice(),
        }
    }

    pub fn len(&self) -> usize {
        self.dense.len()
    }

    pub fn insert(&mut self, value: usize) {
        let i = self.len();
        assert!(i < self.dense.capacity());
        self.dense.push(value);
        self.sparse[value] = i;
    }

    pub fn contains(&self, value: usize) -> bool {
        let i = self.sparse[value];
        self.dense.get(i) == Some(&value)
    }

    pub fn clear(&mut self) {
        self.dense.clear();
    }
}

impl<'a> IntoIterator for &'a SparseSet {
    type Item = &'a usize;
    type IntoIter = slice::Iter<'a, usize>;
    fn into_iter(self) -> Self::IntoIter {
        self.dense.iter()
    }
}
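// An added sanity check, not in the original crate, for the properties
// documented above: constant time membership plus insertion-order iteration.
#[cfg(test)]
mod sparse_set_tests {
    use super::SparseSet;

    #[test]
    fn insertion_order_and_membership() {
        let mut set = SparseSet::new(10);
        set.insert(3);
        set.insert(7);
        assert!(set.contains(3));
        assert!(set.contains(7));
        assert!(!set.contains(5));

        // Iteration yields elements in insertion order.
        let got: Vec<usize> = (&set).into_iter().cloned().collect();
        assert_eq!(got, vec![3, 7]);

        // Clearing empties the set without reallocating.
        set.clear();
        assert_eq!(set.len(), 0);
        assert!(!set.contains(3));
    }
}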
regex-automata-0.1.8/src/state_id.rs010064400017500000144000000201121341324247500156120ustar0000000000000000use core::fmt::Debug;
use core::hash::Hash;
use core::mem::size_of;

use byteorder::{ByteOrder, NativeEndian};

#[cfg(feature = "std")]
pub use self::std::*;

#[cfg(feature = "std")]
mod std {
    use core::mem::size_of;

    use byteorder::ByteOrder;

    use error::{Error, Result};
    use super::StateID;

    /// Check that the premultiplication of the given state identifier can
    /// fit into the representation indicated by `S`. If it cannot, or if it
    /// overflows `usize` itself, then an error is returned.
    pub fn premultiply_overflow_error<S: StateID>(
        last_state: S,
        alphabet_len: usize,
    ) -> Result<()> {
        let requested = match last_state.to_usize().checked_mul(alphabet_len) {
            Some(requested) => requested,
            None => return Err(Error::premultiply_overflow(0, 0)),
        };
        if requested > S::max_id() {
            return Err(Error::premultiply_overflow(S::max_id(), requested));
        }
        Ok(())
    }

    /// Allocate the next sequential identifier for a fresh state given
    /// the previously constructed state identified by `current`. If the
    /// next sequential identifier would overflow `usize` or the chosen
    /// representation indicated by `S`, then an error is returned.
    pub fn next_state_id<S: StateID>(current: S) -> Result<S> {
        let next = match current.to_usize().checked_add(1) {
            Some(next) => next,
            None => return Err(Error::state_id_overflow(::std::usize::MAX)),
        };
        if next > S::max_id() {
            return Err(Error::state_id_overflow(S::max_id()));
        }
        Ok(S::from_usize(next))
    }

    /// Convert the given `usize` to the chosen state identifier
    /// representation. If the given value cannot fit in the chosen
    /// representation, then an error is returned.
    pub fn usize_to_state_id<S: StateID>(value: usize) -> Result<S> {
        if value > S::max_id() {
            Err(Error::state_id_overflow(S::max_id()))
        } else {
            Ok(S::from_usize(value))
        }
    }

    /// Write the given identifier to the given slice of bytes using the
    /// specified endianness. The given slice must have length at least
    /// `size_of::<S>()`.
    ///
    /// The given state identifier representation must have size 1, 2, 4 or 8.
    pub fn write_state_id_bytes<E: ByteOrder, S: StateID>(
        slice: &mut [u8],
        id: S,
    ) {
        assert!(
            1 == size_of::<S>()
            || 2 == size_of::<S>()
            || 4 == size_of::<S>()
            || 8 == size_of::<S>()
        );

        match size_of::<S>() {
            1 => slice[0] = id.to_usize() as u8,
            2 => E::write_u16(slice, id.to_usize() as u16),
            4 => E::write_u32(slice, id.to_usize() as u32),
            8 => E::write_u64(slice, id.to_usize() as u64),
            _ => unreachable!(),
        }
    }
}

/// Return the unique identifier for a DFA's dead state in the chosen
/// representation indicated by `S`.
pub fn dead_id<S: StateID>() -> S {
    S::from_usize(0)
}

/// A trait describing the representation of a DFA's state identifier.
///
/// The purpose of this trait is to safely express both the possible state
/// identifier representations that can be used in a DFA and to convert
/// between state identifier representations and types that can be used to
/// efficiently index memory (such as `usize`).
///
/// In general, one should not need to implement this trait explicitly. In
/// particular, this crate provides implementations for `u8`, `u16`, `u32`,
/// `u64` and `usize`. (`u32` and `u64` are only provided for targets that can
/// represent all corresponding values in a `usize`.)
///
/// # Safety
///
/// This trait is unsafe because the correctness of its implementations may be
/// relied upon by other unsafe code. For example, one possible way to
/// implement this trait incorrectly would be to return a maximum identifier
/// in `max_id` that is greater than the real maximum identifier. This will
/// likely result in wrap-on-overflow semantics in release mode, which can in
/// turn produce incorrect state identifiers. Those state identifiers may then
/// in turn access out-of-bounds memory in a DFA's search routine, where
/// bounds checks are explicitly elided for performance reasons.
pub unsafe trait StateID:
    Clone + Copy + Debug + Eq + Hash + PartialEq + PartialOrd + Ord
{
    /// Convert from a `usize` to this implementation's representation.
    ///
    /// Implementors may assume that `n <= Self::max_id`. That is, implementors
    /// do not need to check whether `n` can fit inside this implementation's
    /// representation.
    fn from_usize(n: usize) -> Self;

    /// Convert this implementation's representation to a `usize`.
    ///
    /// Implementors must not return a `usize` value greater than
    /// `Self::max_id` and must not permit overflow when converting between
    /// the implementor's representation and `usize`. In general, the
    /// preferred way for implementors to achieve this is to simply not
    /// provide implementations of `StateID` that cannot fit into the target
    /// platform's `usize`.
    fn to_usize(self) -> usize;

    /// Return the maximum state identifier supported by this representation.
    ///
    /// Implementors must return a correct bound. Doing otherwise may result
    /// in memory unsafety.
    fn max_id() -> usize;

    /// Read a single state identifier from the given slice of bytes in
    /// native endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn read_bytes(slice: &[u8]) -> Self;

    /// Write this state identifier to the given slice of bytes in native
    /// endian format.
    ///
    /// Implementors may assume that the given slice has length at least
    /// `size_of::<Self>()`.
    fn write_bytes(self, slice: &mut [u8]);
}
unsafe impl StateID for usize {
    #[inline]
    fn from_usize(n: usize) -> usize { n }

    #[inline]
    fn to_usize(self) -> usize { self }

    #[inline]
    fn max_id() -> usize { ::core::usize::MAX }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self {
        NativeEndian::read_uint(slice, size_of::<usize>()) as usize
    }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_uint(slice, self as u64, size_of::<usize>())
    }
}

unsafe impl StateID for u8 {
    #[inline]
    fn from_usize(n: usize) -> u8 { n as u8 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u8::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self { slice[0] }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) { slice[0] = self; }
}

unsafe impl StateID for u16 {
    #[inline]
    fn from_usize(n: usize) -> u16 { n as u16 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u16::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self { NativeEndian::read_u16(slice) }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u16(slice, self)
    }
}

#[cfg(any(target_pointer_width = "32", target_pointer_width = "64"))]
unsafe impl StateID for u32 {
    #[inline]
    fn from_usize(n: usize) -> u32 { n as u32 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u32::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self { NativeEndian::read_u32(slice) }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u32(slice, self)
    }
}

#[cfg(target_pointer_width = "64")]
unsafe impl StateID for u64 {
    #[inline]
    fn from_usize(n: usize) -> u64 { n as u64 }

    #[inline]
    fn to_usize(self) -> usize { self as usize }

    #[inline]
    fn max_id() -> usize { ::core::u64::MAX as usize }

    #[inline]
    fn read_bytes(slice: &[u8]) -> Self { NativeEndian::read_u64(slice) }

    #[inline]
    fn write_bytes(self, slice: &mut [u8]) {
        NativeEndian::write_u64(slice, self)
    }
}
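// A brief added sanity check, not from the original crate, that the native
// endian read/write methods above round trip, using `u16` as the
// representation.
#[cfg(test)]
mod state_id_tests {
    use super::{dead_id, StateID};

    #[test]
    fn u16_read_write_roundtrip() {
        let mut buf = [0u8; 2];
        let id: u16 = 0x1234;
        id.write_bytes(&mut buf);
        assert_eq!(u16::read_bytes(&buf), id);
        // The dead state is always assigned identifier 0.
        assert_eq!(dead_id::<u16>(), 0);
    }
}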
regex-automata-0.1.8/tests/collection.rs010064400017500000144000000320121341325532100165210ustar0000000000000000use std::collections::BTreeMap;
use std::env;
use std::fmt::{self, Write};
use std::thread;

use regex;
use regex_automata::{DFA, DenseDFA, ErrorKind, Regex, RegexBuilder, StateID};
use serde_bytes;
use toml;

macro_rules! load {
    ($col:ident, $path:expr) => {
        $col.extend(RegexTests::load(
            concat!("../data/tests/", $path),
            include_bytes!(concat!("../data/tests/", $path))
        ));
    }
}

lazy_static! {
    pub static ref SUITE: RegexTestCollection = {
        let mut col = RegexTestCollection::new();
        load!(col, "fowler/basic.toml");
        load!(col, "fowler/nullsubexpr.toml");
        load!(col, "fowler/repetition.toml");
        load!(col, "fowler/repetition-long.toml");
        load!(col, "crazy.toml");
        load!(col, "flags.toml");
        load!(col, "iter.toml");
        load!(col, "no-unicode.toml");
        load!(col, "unicode.toml");
        col
    };
}

#[derive(Clone, Debug)]
pub struct RegexTestCollection {
    pub by_name: BTreeMap<String, RegexTest>,
}

#[derive(Clone, Debug, Deserialize)]
pub struct RegexTests {
    pub tests: Vec<RegexTest>,
}

#[derive(Clone, Debug, Deserialize)]
pub struct RegexTest {
    pub name: String,
    #[serde(default)]
    pub options: Vec<RegexTestOption>,
    pub pattern: String,
    #[serde(with = "serde_bytes")]
    pub input: Vec<u8>,
    #[serde(rename = "matches")]
    pub matches: Vec<Match>,
    #[serde(default)]
    pub captures: Vec<Option<Match>>,
    #[serde(default)]
    pub fowler_line_number: Option<u64>,
}

#[derive(Clone, Copy, Debug, Deserialize, Eq, PartialEq)]
#[serde(rename_all = "kebab-case")]
pub enum RegexTestOption {
    Anchored,
    CaseInsensitive,
    NoUnicode,
    Escaped,
    #[serde(rename = "invalid-utf8")]
    InvalidUTF8,
}

#[derive(Clone, Copy, Deserialize, Eq, PartialEq)]
pub struct Match {
    pub start: usize,
    pub end: usize,
}

impl RegexTestCollection {
    fn new() -> RegexTestCollection {
        RegexTestCollection { by_name: BTreeMap::new() }
    }

    fn extend(&mut self, tests: RegexTests) {
        for test in tests.tests {
            let name = test.name.clone();
            if self.by_name.contains_key(&name) {
                panic!("found duplicate test {}", name);
            }
            self.by_name.insert(name, test);
        }
    }

    pub fn tests(&self) -> Vec<&RegexTest> {
        self.by_name.values().collect()
    }
}

impl RegexTests {
    fn load(path: &str, slice: &[u8]) -> RegexTests {
        let mut data: RegexTests = toml::from_slice(slice)
            .expect(&format!("failed to load {}", path));
        for test in &mut data.tests {
            if test.options.contains(&RegexTestOption::Escaped) {
                test.input = unescape_bytes(&test.input);
            }
        }
        data
    }
}
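// For orientation, an entry in one of the TOML files loaded above is
// expected to look roughly like the following. This snippet is purely
// illustrative; the name and values are invented here rather than taken
// from the crate's data files:
//
//     [[tests]]
//     name = "example-literal"
//     pattern = "a+"
//     input = "xaaab"
//     matches = [[1, 4]]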
#[derive(Debug)]
pub struct RegexTester {
    asserted: bool,
    results: RegexTestResults,
    skip_expensive: bool,
    whitelist: Vec<regex::Regex>,
    blacklist: Vec<regex::Regex>,
}

impl Drop for RegexTester {
    fn drop(&mut self) {
        // If we haven't asserted yet, then the test is probably buggy, so
        // fail it. But if we're already panicking (e.g., a bug in the regex
        // engine), then don't double-panic, which causes an immediate abort.
        if !thread::panicking() && !self.asserted {
            panic!("must call RegexTester::assert at end of test");
        }
    }
}

impl RegexTester {
    pub fn new() -> RegexTester {
        let mut tester = RegexTester {
            asserted: false,
            results: RegexTestResults::default(),
            skip_expensive: false,
            whitelist: vec![],
            blacklist: vec![],
        };
        for x in env::var("REGEX_TEST").unwrap_or("".to_string()).split(",") {
            let x = x.trim();
            if x.is_empty() {
                continue;
            }
            if x.starts_with("-") {
                tester = tester.blacklist(&x[1..]);
            } else {
                tester = tester.whitelist(x);
            }
        }
        tester
    }

    pub fn skip_expensive(mut self) -> RegexTester {
        self.skip_expensive = true;
        self
    }

    pub fn whitelist(mut self, name: &str) -> RegexTester {
        self.whitelist.push(regex::Regex::new(name).unwrap());
        self
    }

    pub fn blacklist(mut self, name: &str) -> RegexTester {
        self.blacklist.push(regex::Regex::new(name).unwrap());
        self
    }

    pub fn assert(&mut self) {
        self.asserted = true;
        self.results.assert();
    }

    pub fn build_regex<S: StateID>(
        &self,
        mut builder: RegexBuilder,
        test: &RegexTest,
    ) -> Option<Regex<DenseDFA<Vec<S>, S>>> {
        if self.skip(test) {
            return None;
        }
        self.apply_options(test, &mut builder);

        match builder.build_with_size::<S>(&test.pattern) {
            Ok(re) => Some(re),
            Err(err) => {
                if let ErrorKind::Unsupported(_) = *err.kind() {
                    None
                } else {
                    panic!(
                        "failed to build {:?} with pattern '{:?}': {}",
                        test.name, test.pattern, err
                    );
                }
            }
        }
    }

    pub fn test_all<'a, I, T>(
        &mut self,
        builder: RegexBuilder,
        tests: I,
    ) where I: IntoIterator<Item=&'a RegexTest, IntoIter=T>,
            T: Iterator<Item=&'a RegexTest>
    {
        for test in tests {
            let builder = builder.clone();
            let re: Regex = match self.build_regex(builder, test) {
                None => continue,
                Some(re) => re,
            };
            self.test(test, &re);
        }
    }
    pub fn test<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.test_is_match(test, re);
        self.test_find(test, re);
        // Some tests (namely, fowler) are designed only to detect the
        // first match even if there are more subsequent matches. To that
        // end, we only test match iteration when the number of matches
        // expected is not 1, or if the test name has 'iter' in it.
        if test.name.contains("iter") || test.matches.len() != 1 {
            self.test_find_iter(test, re);
        }
    }

    pub fn test_is_match<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;

        let got = re.is_match(&test.input);
        let expected = test.matches.len() >= 1;
        if got == expected {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::IsMatch,
        });
    }

    pub fn test_find<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;

        let got = re
            .find(&test.input)
            .map(|(start, end)| Match { start, end });
        if got == test.matches.get(0).map(|&m| m) {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::Find { got },
        });
    }

    pub fn test_find_iter<'a, D: DFA>(
        &mut self,
        test: &RegexTest,
        re: &Regex<D>,
    ) {
        self.asserted = false;

        let got: Vec<Match> = re
            .find_iter(&test.input)
            .map(|(start, end)| Match { start, end })
            .collect();
        if got == test.matches {
            self.results.succeeded.push(test.clone());
            return;
        }
        self.results.failed.push(RegexTestFailure {
            test: test.clone(),
            kind: RegexTestFailureKind::FindIter { got },
        });
    }

    fn skip(&self, test: &RegexTest) -> bool {
        if self.skip_expensive {
            if test.name.starts_with("repetition-long") {
                return true;
            }
        }
        if !self.blacklist.is_empty() {
            if self.blacklist.iter().any(|re| re.is_match(&test.name)) {
                return true;
            }
        }
        if !self.whitelist.is_empty() {
            if !self.whitelist.iter().any(|re| re.is_match(&test.name)) {
                return true;
            }
        }
        false
    }

    fn apply_options(&self, test: &RegexTest, builder: &mut RegexBuilder) {
        for opt in &test.options {
            match *opt {
                RegexTestOption::Anchored => {
                    builder.anchored(true);
                }
                RegexTestOption::CaseInsensitive => {
                    builder.case_insensitive(true);
                }
                RegexTestOption::NoUnicode => {
                    builder.unicode(false);
                }
                RegexTestOption::Escaped => {}
                RegexTestOption::InvalidUTF8 => {
                    builder.allow_invalid_utf8(true);
                }
            }
        }
    }
}
#[derive(Clone, Debug, Default)]
pub struct RegexTestResults {
    /// Tests that succeeded.
    pub succeeded: Vec<RegexTest>,
    /// Tests that failed.
    pub failed: Vec<RegexTestFailure>,
}

#[derive(Clone, Debug)]
pub struct RegexTestFailure {
    test: RegexTest,
    kind: RegexTestFailureKind,
}

#[derive(Clone, Debug)]
pub enum RegexTestFailureKind {
    IsMatch,
    Find { got: Option<Match> },
    FindIter { got: Vec<Match> },
}

impl RegexTestResults {
    fn new() -> RegexTestResults {
        RegexTestResults { succeeded: vec![], failed: vec![] }
    }

    pub fn assert(&self) {
        if self.failed.is_empty() {
            return;
        }
        let failures = self
            .failed
            .iter()
            .map(|f| f.to_string())
            .collect::<Vec<String>>()
            .join("\n\n");
        panic!(
            "found {} failures:\n{}\n{}\n{}\n\n\
             Set the REGEX_TEST environment variable to filter tests, \n\
             e.g., REGEX_TEST=crazy-misc,-crazy-misc2 runs every test \n\
             whose name contains crazy-misc but not crazy-misc2\n\n",
            self.failed.len(),
            "~".repeat(79),
            failures.trim(),
            "~".repeat(79)
        )
    }
}

impl fmt::Display for RegexTestFailure {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(
            f,
            "{}: {}\n    \
             options: {:?}\n    \
             pattern: {}\n    \
             pattern (escape): {}\n    \
             input: {}\n    \
             input (escape): {}\n    \
             input (hex): {}",
            self.test.name,
            self.kind.fmt(&self.test)?,
            self.test.options,
            self.test.pattern,
            escape_default(&self.test.pattern),
            nice_raw_bytes(&self.test.input),
            escape_bytes(&self.test.input),
            hex_bytes(&self.test.input)
        )
    }
}

impl RegexTestFailureKind {
    fn fmt(&self, test: &RegexTest) -> Result<String, fmt::Error> {
        let mut buf = String::new();
        match *self {
            RegexTestFailureKind::IsMatch => {
                if let Some(&m) = test.matches.get(0) {
                    write!(buf, "expected match (at {}), but none found", m)?
                } else {
                    write!(buf, "expected no match, but found a match")?
                }
            }
            RegexTestFailureKind::Find { got } => {
                write!(
                    buf,
                    "expected {:?}, but found {:?}",
                    test.matches.get(0),
                    got
                )?
            }
            RegexTestFailureKind::FindIter { ref got } => {
                write!(
                    buf,
                    "expected {:?}, but found {:?}",
                    test.matches,
                    got
                )?
            }
        }
        Ok(buf)
    }
}

impl fmt::Display for Match {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.start, self.end)
    }
}

impl fmt::Debug for Match {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "({}, {})", self.start, self.end)
    }
}

fn nice_raw_bytes(bytes: &[u8]) -> String {
    use std::str;

    match str::from_utf8(bytes) {
        Ok(s) => s.to_string(),
        Err(_) => escape_bytes(bytes),
    }
}

fn escape_bytes(bytes: &[u8]) -> String {
    use std::ascii;

    let escaped = bytes
        .iter()
        .flat_map(|&b| ascii::escape_default(b))
        .collect::<Vec<u8>>();
    String::from_utf8(escaped).unwrap()
}

fn hex_bytes(bytes: &[u8]) -> String {
    bytes.iter().map(|&b| format!(r"\x{:02X}", b)).collect()
}

fn escape_default(s: &str) -> String {
    s.chars().flat_map(|c| c.escape_default()).collect()
}

fn unescape_bytes(bytes: &[u8]) -> Vec<u8> {
    use std::str;
    use unescape::unescape;

    unescape(&str::from_utf8(bytes).expect("all input must be valid UTF-8"))
}
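// An added spot check, not part of the original crate, for the byte
// formatting helpers above.
#[cfg(test)]
mod helper_tests {
    use super::{escape_bytes, hex_bytes, nice_raw_bytes};

    #[test]
    fn byte_formatting() {
        assert_eq!(escape_bytes(b"a\n"), r"a\n");
        assert_eq!(hex_bytes(b"\x00A"), r"\x00\x41");
        assert_eq!(nice_raw_bytes(b"abc"), "abc");
    }
}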
r"(?x) (?: \p{gcb=Prepend}* (?: (?: (?: \p{gcb=L}* (?:\p{gcb=V}+|\p{gcb=LV}\p{gcb=V}*|\p{gcb=LVT}) \p{gcb=T}* ) | \p{gcb=L}+ | \p{gcb=T}+ ) | \p{Extended_Pictographic} (?:\p{gcb=Extend}*\p{gcb=ZWJ}\p{Extended_Pictographic})* | [^\p{gcb=Control}\p{gcb=CR}\p{gcb=LF}] ) [\p{gcb=Extend}\p{gcb=ZWJ}\p{gcb=SpacingMark}]* ) "; let dfa = dense::Builder::new() .minimize(true) .anchored(true) .build(pattern) .unwrap(); assert_eq!(None, dfa.find(b"\xE2")); } regex-automata-0.1.8/tests/suite.rs010064400017500000144000000153011341300704400155150ustar0000000000000000use regex_automata::{DenseDFA, Regex, RegexBuilder, SparseDFA}; use collection::{SUITE, RegexTester}; #[test] fn unminimized_standard() { let mut builder = RegexBuilder::new(); builder.minimize(false).premultiply(false).byte_classes(false); let mut tester = RegexTester::new().skip_expensive(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn unminimized_premultiply() { let mut builder = RegexBuilder::new(); builder.minimize(false).premultiply(true).byte_classes(false); let mut tester = RegexTester::new().skip_expensive(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn unminimized_byte_class() { let mut builder = RegexBuilder::new(); builder.minimize(false).premultiply(false).byte_classes(true); let mut tester = RegexTester::new(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn unminimized_premultiply_byte_class() { let mut builder = RegexBuilder::new(); builder.minimize(false).premultiply(true).byte_classes(true); let mut tester = RegexTester::new(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn minimized_standard() { let mut builder = RegexBuilder::new(); builder.minimize(true).premultiply(false).byte_classes(false); let mut tester = RegexTester::new().skip_expensive(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn minimized_premultiply() { let mut builder = RegexBuilder::new(); builder.minimize(true).premultiply(true).byte_classes(false); let mut tester = RegexTester::new().skip_expensive(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn minimized_byte_class() { let mut builder = RegexBuilder::new(); builder.minimize(true).premultiply(false).byte_classes(true); let mut tester = RegexTester::new(); tester.test_all(builder, SUITE.tests()); tester.assert(); } #[test] fn minimized_premultiply_byte_class() { let mut builder = RegexBuilder::new(); builder.minimize(true).premultiply(true).byte_classes(true); let mut tester = RegexTester::new(); tester.test_all(builder, SUITE.tests()); tester.assert(); } // A basic sanity test that checks we can convert a regex to a smaller // representation and that the resulting regex still passes our tests. // // If tests grow minimal regexes that cannot be represented in 16 bits, then // we'll either want to skip those or increase the size to test to u32. #[test] fn u16() { let mut builder = RegexBuilder::new(); builder.minimize(true).premultiply(false).byte_classes(true); let mut tester = RegexTester::new().skip_expensive(); for test in SUITE.tests() { let builder = builder.clone(); let re: Regex = match tester.build_regex(builder, test) { None => continue, Some(re) => re, }; let small_re = Regex::from_dfas( re.forward().to_u16().unwrap(), re.reverse().to_u16().unwrap(), ); tester.test(test, &small_re); } tester.assert(); } // Test that sparse DFAs work using the standard configuration. 
// Test that sparse DFAs work using the standard configuration.
#[test]
fn sparse_unminimized_standard() {
    let mut builder = RegexBuilder::new();
    builder.minimize(false).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        let fwd = re.forward().to_sparse().unwrap();
        let rev = re.reverse().to_sparse().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);
        tester.test(test, &sparse_re);
    }
    tester.assert();
}

// Test that sparse DFAs work after converting them to a different state ID
// representation.
#[test]
fn sparse_u16() {
    let mut builder = RegexBuilder::new();
    builder.minimize(true).premultiply(false).byte_classes(false);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };
        let fwd = re.forward().to_sparse().unwrap().to_u16().unwrap();
        let rev = re.reverse().to_sparse().unwrap().to_u16().unwrap();
        let sparse_re = Regex::from_dfas(fwd, rev);
        tester.test(test, &sparse_re);
    }
    tester.assert();
}

// Another basic sanity test that checks we can serialize and then deserialize
// a regex, and that the resulting regex can be used for searching correctly.
#[test]
fn serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.premultiply(false).byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };

        let fwd_bytes = re.forward().to_bytes_native_endian().unwrap();
        let rev_bytes = re.reverse().to_bytes_native_endian().unwrap();
        let fwd: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&fwd_bytes) };
        let rev: DenseDFA<&[usize], usize> =
            unsafe { DenseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);

        tester.test(test, &re);
    }
    tester.assert();
}

// A basic sanity test that checks we can serialize and then deserialize a
// regex using sparse DFAs, and that the resulting regex can be used for
// searching correctly.
#[test]
fn sparse_serialization_roundtrip() {
    let mut builder = RegexBuilder::new();
    builder.byte_classes(true);

    let mut tester = RegexTester::new().skip_expensive();
    for test in SUITE.tests() {
        let builder = builder.clone();
        let re: Regex = match tester.build_regex(builder, test) {
            None => continue,
            Some(re) => re,
        };

        let fwd_bytes = re
            .forward()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let rev_bytes = re
            .reverse()
            .to_sparse()
            .unwrap()
            .to_bytes_native_endian()
            .unwrap();
        let fwd: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&fwd_bytes) };
        let rev: SparseDFA<&[u8], usize> =
            unsafe { SparseDFA::from_bytes(&rev_bytes) };
        let re = Regex::from_dfas(fwd, rev);

        tester.test(test, &re);
    }
    tester.assert();
}
regex-automata-0.1.8/tests/tests.rs010064400017500000144000000004211341442251200155270ustar0000000000000000#![allow(dead_code)]

#[macro_use]
extern crate lazy_static;
extern crate regex;
extern crate regex_automata;
extern crate serde;
extern crate serde_bytes;
#[macro_use]
extern crate serde_derive;
extern crate toml;

mod collection;
mod regression;
mod suite;
mod unescape;
regex-automata-0.1.8/tests/unescape.rs010064400017500000144000000045511351510014500161730ustar0000000000000000#[derive(Clone, Copy, Eq, PartialEq)]
enum State {
    /// The state after seeing a `\`.
    Escape,
    /// The state after seeing a `\x`.
    HexFirst,
    /// The state after seeing a `\x[0-9A-Fa-f]`.
    HexSecond(char),
    /// Default state.
    Literal,
}

pub fn unescape(s: &str) -> Vec<u8> {
    use self::State::*;

    let mut bytes = vec![];
    let mut state = Literal;
    for c in s.chars() {
        match state {
            Escape => {
                match c {
                    '\\' => { bytes.push(b'\\'); state = Literal; }
                    'n' => { bytes.push(b'\n'); state = Literal; }
                    'r' => { bytes.push(b'\r'); state = Literal; }
                    't' => { bytes.push(b'\t'); state = Literal; }
                    'x' => { state = HexFirst; }
                    c => {
                        bytes.extend(format!(r"\{}", c).into_bytes());
                        state = Literal;
                    }
                }
            }
            HexFirst => {
                match c {
                    '0'..='9' | 'A'..='F' | 'a'..='f' => {
                        state = HexSecond(c);
                    }
                    c => {
                        bytes.extend(format!(r"\x{}", c).into_bytes());
                        state = Literal;
                    }
                }
            }
            HexSecond(first) => {
                match c {
                    '0'..='9' | 'A'..='F' | 'a'..='f' => {
                        let ordinal = format!("{}{}", first, c);
                        let byte = u8::from_str_radix(&ordinal, 16).unwrap();
                        bytes.push(byte);
                        state = Literal;
                    }
                    c => {
                        let original = format!(r"\x{}{}", first, c);
                        bytes.extend(original.into_bytes());
                        state = Literal;
                    }
                }
            }
            Literal => {
                match c {
                    '\\' => { state = Escape; }
                    c => { bytes.extend(c.to_string().as_bytes()); }
                }
            }
        }
    }
    match state {
        Escape => bytes.push(b'\\'),
        HexFirst => bytes.extend(b"\\x"),
        HexSecond(c) => bytes.extend(format!("\\x{}", c).into_bytes()),
        Literal => {}
    }
    bytes
}
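// An added spot check, not part of the original crate, for the unescaper
// above: recognized hex and control escapes are decoded, while a trailing
// incomplete escape is passed through literally.
#[cfg(test)]
mod unescape_tests {
    use super::unescape;

    #[test]
    fn decodes_and_passes_through() {
        assert_eq!(unescape(r"a\x41\n"), vec![b'a', 0x41, b'\n']);
        // An unfinished escape sequence is emitted as-is.
        assert_eq!(unescape(r"\x4"), b"\\x4".to_vec());
    }
}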
regex-automata-0.1.8/.cargo_vcs_info.json0000644000000001120000000000000140160ustar00{
  "git": {
    "sha1": "58516d01ae12c6cae59b53c1326b06a950be6326"
  }
}