hyphenation_commons-0.8.4/.cargo_vcs_info.json0000644000000001120000000000100151200ustar { "git": { "sha1": "70bb1fb214358885a03dfed7db3a164f9ca87561" } } hyphenation_commons-0.8.4/Cargo.toml0000644000000016360000000000100131320ustar # THIS FILE IS AUTOMATICALLY GENERATED BY CARGO # # When uploading crates to the registry Cargo will automatically # "normalize" Cargo.toml files for maximal compatibility # with all versions of Cargo and also rewrite `path` dependencies # to registry (e.g., crates.io) dependencies. # # If you are reading this file be aware that the original Cargo.toml # will likely look very different (and much more reasonable). # See Cargo.toml.orig for the original contents. [package] edition = "2018" name = "hyphenation_commons" version = "0.8.4" authors = ["Andrew "] description = "Proemial code for the `hyphenation` library" homepage = "https://github.com/tapeinosyne/hyphenation" documentation = "https://docs.rs/hyphenation" license = "Apache-2.0/MIT" repository = "https://github.com/tapeinosyne/hyphenation" [dependencies.fst] version = "0.4" [dependencies.serde] version = "1.0" features = ["derive"] hyphenation_commons-0.8.4/Cargo.toml.orig000064400000000000000000000006750072674642500166450ustar 00000000000000[package] name = "hyphenation_commons" version = "0.8.4" edition = "2018" authors = ["Andrew "] license = "Apache-2.0/MIT" repository = "https://github.com/tapeinosyne/hyphenation" homepage = "https://github.com/tapeinosyne/hyphenation" documentation = "https://docs.rs/hyphenation" description = "Proemial code for the `hyphenation` library" [dependencies] fst = "0.4" serde = { version = "1.0", features = ["derive"] } hyphenation_commons-0.8.4/README.md000064400000000000000000000001610072674642500152230ustar 00000000000000# hyphenation commons Proemial code for `hyphenation`. Mostly internal, slightly haphazard, leastly dependable. hyphenation_commons-0.8.4/src/dictionary/extended.rs000064400000000000000000000073610072674642500210570ustar 00000000000000/*! Data structures for extended hyphenation[1]. [1]: [Automatic non-standard hyphenation in OpenOffice.org](https://www.tug.org/TUGboat/tb27-1/tb86nemeth.pdf) */ use std::collections::HashMap; use crate::dictionary::trie::{self, Trie}; use crate::dictionary::{uniques, Locus, PrefixTallies}; use crate::language::Language; use crate::parse::Parse; /// The partial score carried by an extended hyphenation pattern. #[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Tally { /// The pattern tally, equivalent to that found in standard patterns. pub standard : Vec, /// An optional subregion which may replace part of the string around the /// opportunity. pub subregion : Option<(Locus, Subregion)>, } /// Word alterations extending a standard Knuth–Liang pattern. #[derive(Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Subregion { /// The number of bytes that the substitution will replace before the break. pub left : usize, /// The number of bytes that the substitution will replace after the break. pub right : usize, /// The replacement for the substring to be altered around the break, as /// delimited by the `left` and `right` subregion boundaries. pub substitution : String, /// An index into the substitution, denoting the hyphenation opportunity /// within this subregion. pub breakpoint : usize, } /// A trie mapping hyphenation patterns to their extended tallies. #[derive(Debug, Default, Serialize, Deserialize)] pub struct Patterns { tallies : Vec, automaton : Trie, } impl Patterns { pub fn from_iter(iter : I) -> Result where I : IntoIterator::Tally)> { let (kvs, tallies) = uniques(iter.into_iter()); let automaton = Trie::from_iter(kvs.into_iter())?; Ok(Patterns { tallies, automaton }) } } /// A specialized hashmap associating words to their known hyphenation. #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct Exceptions(pub HashMap)>>); /// A dictionary for extended Knuth–Liang hyphenation, based on the strategy /// described by Németh in "Automatic non-standard hyphenation in /// OpenOffice.org". /// /// It comprises the working language, the set of extended patterns and /// exceptions, and the character boundaries for hyphenation. #[derive(Debug, Serialize, Deserialize)] pub struct Extended { language : Language, patterns : Patterns, pub exceptions : Exceptions, /// The minimum number of `char`s from the start and end of a word where /// breaks may not occur. pub minima : (usize, usize), } impl Extended { /// The language for which this dictionary can provide hyphenation. pub fn language(&self) -> Language { self.language } /// An iterator over the tallies associated to all prefixes of the query, /// including the query itself. pub fn prefix_tallies<'f, 'q>(&'f self, query : &'q [u8]) -> PrefixTallies<'f, 'q, Tally> { PrefixTallies { matches : self.patterns.automaton.get_prefixes(query), tallies : &self.patterns.tallies, } } } /// An intermediate dictionary builder, its primary purpose is visibility /// hygiene. #[derive(Debug)] pub struct Builder { pub language : Language, pub patterns : Patterns, pub exceptions : Exceptions, } impl From for Extended { fn from(b : Builder) -> Extended { Extended { language : b.language, patterns : b.patterns, exceptions : b.exceptions, minima : b.language.minima(), } } } hyphenation_commons-0.8.4/src/dictionary/mod.rs000064400000000000000000000072510072674642500200340ustar 00000000000000//! Data structures for the storage of hyphenation patterns and exceptions. pub mod extended; mod trie; use std::collections::HashMap; use std::hash::Hash; use crate::dictionary::trie::PrefixMatches; pub use crate::dictionary::trie::{Error, Trie}; use crate::language::Language; use crate::parse::Parse; #[derive(Copy, Clone, Debug, Default, PartialEq, Eq, Hash, Serialize, Deserialize)] pub struct Locus { pub index : u8, pub value : u8, } /// A trie mapping hyphenation patterns to their tallies. #[derive(Clone, Debug, Default, Serialize, Deserialize)] pub struct Patterns { tallies : Vec>, automaton : Trie, } impl Patterns { pub fn from_iter(iter : I) -> Result where I : IntoIterator::Tally)> { let (kvs, tallies) = uniques(iter.into_iter()); let automaton = Trie::from_iter(kvs.into_iter())?; Ok(Patterns { tallies, automaton }) } } /// A specialized hashmap associating words to their known hyphenation. #[derive(Clone, Debug, Default, PartialEq, Eq, Serialize, Deserialize)] pub struct Exceptions(pub HashMap>); /// A dictionary for standard Knuth–Liang hyphenation. /// /// It comprises the working language, the pattern and exception sets, /// and the character boundaries for hyphenation. #[derive(Clone, Debug, Serialize, Deserialize)] pub struct Standard { language : Language, patterns : Patterns, pub exceptions : Exceptions, /// The minimum number of `char`s from the start and end of a word where /// breaks may not occur. pub minima : (usize, usize), } impl Standard { /// The language for which this dictionary can provide hyphenation. pub fn language(&self) -> Language { self.language } /// An iterator over the tallies associated to all prefixes of the query, /// including the query itself. pub fn prefix_tallies<'f, 'q>(&'f self, query : &'q [u8]) -> PrefixTallies<'f, 'q, Vec> { PrefixTallies { matches : self.patterns.automaton.get_prefixes(query), tallies : &self.patterns.tallies, } } } pub struct PrefixTallies<'f, 'q, T> { tallies : &'f [T], matches : PrefixMatches<'f, 'q>, } impl<'f, 'q, T> Iterator for PrefixTallies<'f, 'q, T> { type Item = &'f T; fn next(&mut self) -> Option { self.matches .next() .and_then(|i| self.tallies.get(i as usize)) } } /// An intermediate dictionary builder, primarily to retain field privacy in the /// dictionary. #[derive(Debug)] pub struct Builder { pub language : Language, pub patterns : Patterns, pub exceptions : Exceptions, } impl From for Standard { fn from(b : Builder) -> Standard { Standard { language : b.language, patterns : b.patterns, exceptions : b.exceptions, minima : b.language.minima(), } } } pub fn uniques(iter : I) -> (Vec<(String, u64)>, Vec) where T : Eq + Clone + Hash, I : Iterator { let mut pairs = Vec::new(); let mut tally_ids = HashMap::new(); let mut tallies : Vec = Vec::with_capacity(256); for (pattern, tally) in iter { match tally_ids.get(&tally) { Some(&id) => pairs.push((pattern, id)), None => { let id = tallies.len() as u64; tallies.push(tally.clone()); tally_ids.insert(tally, id); pairs.push((pattern, id)); } } } pairs.sort_by(|a, b| a.0.cmp(&b.0)); pairs.dedup_by(|a, b| a.0 == b.0); (pairs, tallies) } hyphenation_commons-0.8.4/src/dictionary/trie.rs000064400000000000000000000075010072674642500202160ustar 00000000000000use fst::raw; use fst::Map; use serde::de::{self, Deserialize, Deserializer, Visitor}; use serde::ser::{Serialize, Serializer}; use std::convert::From; use std::error; use std::fmt; use std::slice; #[derive(Clone, Debug, Default)] pub struct Trie(Map>); impl Trie { pub fn as_bytes(&self) -> &[u8] { self.0.as_fst().as_bytes() } pub fn from_bytes(bs : Vec) -> Result { let map = Map::new(bs)?; Ok(Trie(map)) } pub fn from_iter(iter : I) -> Result where I : Iterator { let m = fst::Map::from_iter(iter)?; Ok(Trie(m)) } pub fn get_prefixes<'f, 'q>(&'f self, query : &'q [u8]) -> PrefixMatches<'f, 'q> { let fst = self.0.as_fst(); PrefixMatches { fst, node : fst.root(), output : raw::Output::zero(), query : query.iter() } } } #[derive(Clone)] pub struct PrefixMatches<'f, 'q> { fst : &'f raw::Fst>, node : raw::Node<'f>, output : raw::Output, query : slice::Iter<'q, u8>, } impl<'f, 'q> Iterator for PrefixMatches<'f, 'q> { type Item = u64; fn next(&mut self) -> Option { let prefix = &mut self.query; for &b in prefix { match self.node.find_input(b) { None => return None, Some(i) => { let t = self.node.transition(i); self.output = self.output.cat(t.out); self.node = self.fst.node(t.addr); if self.node.is_final() { let final_output = self.output.cat(self.node.final_output()); return Some(final_output.value()); } } } } None } } impl AsRef>> for Trie { fn as_ref(&self) -> &fst::Map> { &self.0 } } impl AsMut>> for Trie { fn as_mut(&mut self) -> &mut fst::Map> { &mut self.0 } } impl From>> for Trie { fn from(m : fst::Map>) -> Self { Trie(m) } } /* Serialization */ #[derive(Copy, Clone, Debug)] struct FstVisitor; const NOM_DE_SER : &str = "Trie"; impl<'de> Visitor<'de> for FstVisitor { type Value = Trie; fn expecting(&self, f : &mut fmt::Formatter) -> fmt::Result { f.write_str("the internal trie of a hyphenation dictionary") } fn visit_bytes(self, bs : &[u8]) -> Result where E : de::Error { Trie::from_bytes(bs.to_vec()).map_err(E::custom) } fn visit_byte_buf(self, bs : Vec) -> Result where E : de::Error { Trie::from_bytes(bs).map_err(E::custom) } fn visit_newtype_struct(self, de : D) -> Result where D : Deserializer<'de> { de.deserialize_bytes(FstVisitor) } } impl Serialize for Trie { fn serialize(&self, ser : S) -> Result where S : Serializer { ser.serialize_newtype_struct(NOM_DE_SER, self.as_bytes()) } } impl<'de> Deserialize<'de> for Trie { fn deserialize(de : D) -> Result where D : Deserializer<'de> { de.deserialize_newtype_struct(NOM_DE_SER, FstVisitor) } } #[derive(Debug)] pub struct Error(pub fst::Error); impl fmt::Display for Error { fn fmt(&self, f : &mut fmt::Formatter<'_>) -> fmt::Result { let message = format!("The dictionary's internal trie could not be built:\n{}", &self.0); f.write_str(&message) } } impl error::Error for Error { fn source(&self) -> Option<&(dyn error::Error + 'static)> { Some(&self.0) } } impl From for Error { fn from(err : fst::Error) -> Self { Error(err) } } hyphenation_commons-0.8.4/src/language.rs000064400000000000000000000135010072674642500166660ustar 00000000000000//! Available languages and related data. use std::fmt; macro_rules! fiant_linguae { ( $($lang:ident, $bounds:expr, $code:expr;)* ) => { fiant_linguae! { $($lang, $bounds, $code);* } }; ( $($lang:ident, $bounds:expr, $code:expr);* ) => { /// The set of languages available for hyphenation. #[derive(Copy, Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash, Serialize, Deserialize)] pub enum Language { $( $lang, )* } impl Language { /// The default number of characters from the start and end of a word /// where breaks may not occur. pub fn minima(&self) -> (usize, usize) { match *self { $( Language::$lang => $bounds, )* } } /// The [BCP 47](https://tools.ietf.org/html/bcp47) tag for this language. pub fn code(&self) -> &'static str { match *self { $( Language::$lang => $code, )* } } /// Try and construct a Language from a [BCP 47](https://tools.ietf.org/html/bcp47) tag. pub fn try_from_code>(code: T) -> Option { match code.as_ref() { $( $code => Some(Language::$lang), )* _ => None } } } impl fmt::Display for Language { fn fmt(&self, f : &mut fmt::Formatter) -> fmt::Result { write!(f, "{:?}", *self) } } } } // NOTE: These hyphenation bounds were taken directly from the relevant TeX // packages, but it is not entirely clear how well they map to the notion of // Unicode `char` in Rust. // // In the worst case, a language featuring graphemes larger than 1 `char` may // set boundaries mid-grapheme. This should be of no practical consequence, // since well-formed hyphenation patterns only match full graphemes; moreover, // well-behaved hyphenators are expected to validate hyphenation opportunities, // discarding any which arise outside `char` boundaries. fiant_linguae! { Afrikaans, (1, 2), "af"; Albanian, (2, 2), "sq"; Armenian, (1, 2), "hy"; Assamese, (1, 1), "as"; Basque, (2, 2), "eu"; Belarusian, (2, 2), "be"; Bengali, (1, 1), "bn"; Bulgarian, (2, 2), "bg"; Catalan, (2, 2), "ca"; Chinese, (1, 1), "zh-latn-pinyin"; Coptic, (1, 1), "cop"; Croatian, (2, 2), "hr"; Czech, (2, 3), "cs"; Danish, (2, 2), "da"; Dutch, (2, 2), "nl"; EnglishGB, (2, 3), "en-gb"; EnglishUS, (2, 3), "en-us"; Esperanto, (2, 2), "eo"; Estonian, (2, 3), "et"; Ethiopic, (1, 1), "mul-ethi"; Finnish, (2, 2), "fi"; FinnishScholastic, (1, 1), "fi-x-school"; French, (2, 3), "fr"; Friulan, (2, 2), "fur"; Galician, (2, 2), "gl"; Georgian, (1, 2), "ka"; German1901, (2, 2), "de-1901"; German1996, (2, 2), "de-1996"; GermanSwiss, (2, 2), "de-ch-1901"; GreekAncient, (1, 1), "grc"; GreekMono, (1, 1), "el-monoton"; GreekPoly, (1, 1), "el-polyton"; Gujarati, (1, 1), "gu"; Hindi, (1, 1), "hi"; Hungarian, (2, 2), "hu"; Icelandic, (2, 2), "is"; Indonesian, (2, 2), "id"; Interlingua, (2, 2), "ia"; Irish, (2, 3), "ga"; Italian, (2, 2), "it"; Kannada, (1, 1), "kn"; Kurmanji, (2, 2), "kmr"; Latin, (2, 2), "la"; LatinClassic, (2, 2), "la-x-classic"; LatinLiturgical, (2, 2), "la-x-liturgic"; Latvian, (2, 2), "lv"; Lithuanian, (2, 2), "lt"; Macedonian, (2, 2), "mk"; Malayalam, (1, 1), "ml"; Marathi, (1, 1), "mr"; Mongolian, (2, 2), "mn-cyrl"; NorwegianBokmal, (2, 2), "nb"; NorwegianNynorsk, (2, 2), "nn"; Occitan, (2, 2), "oc"; Oriya, (1, 1), "or"; Pali, (1, 2), "pi"; Panjabi, (1, 1), "pa"; Piedmontese, (2, 2), "pms"; Polish, (2, 2), "pl"; Portuguese, (2, 3), "pt"; Romanian, (2, 2), "ro"; Romansh, (2, 2), "rm"; Russian, (2, 2), "ru"; Sanskrit, (1, 3), "sa"; SerbianCyrillic, (2, 2), "sr-cyrl"; SerbocroatianCyrillic, (2, 2), "sh-cyrl"; SerbocroatianLatin, (2, 2), "sh-latn"; SlavonicChurch, (1, 2), "cu"; Slovak, (2, 3), "sk"; Slovenian, (2, 2), "sl"; Spanish, (2, 2), "es"; Swedish, (2, 2), "sv"; Tamil, (1, 1), "ta"; Telugu, (1, 1), "te"; Thai, (2, 3), "th"; Turkish, (2, 2), "tr"; Turkmen, (2, 2), "tk"; Ukrainian, (2, 2), "uk"; Uppersorbian, (2, 2), "hsb"; Welsh, (2, 3), "cy"; } hyphenation_commons-0.8.4/src/lib.rs000064400000000000000000000002060072674642500156470ustar 00000000000000/* Hyphenation internals */ #[macro_use] extern crate serde; pub mod dictionary; mod language; pub mod parse; pub use language::*; hyphenation_commons-0.8.4/src/parse.rs000064400000000000000000000117750072674642500162300ustar 00000000000000//! Pattern and exception parsing. use crate::dictionary::extended::{self as ext, Subregion}; use crate::dictionary::*; // TODO: make parsing fallible pub trait Parse { type Tally: Eq; fn value(v : char) -> Option; fn tally(s : &str) -> Self::Tally; fn alphabetical(s : &str) -> String { s.chars() .filter(|c| Self::value(*c) == None) .collect() } fn pair(str_klp : &str, normalize : N) -> (String, Self::Tally) where N : Fn(&str) -> String { let normalized = normalize(str_klp); (Self::alphabetical(&normalized), Self::tally(&normalized)) } } impl<'a> Parse for Patterns { type Tally = Vec; #[inline] fn value(c : char) -> Option { c.to_digit(10).map(|n| n as u8) } fn tally(pattern : &str) -> Self::Tally { pattern.bytes() .enumerate() .filter_map(|(i, b)| Self::value(b as char).map(|v| (i, v))) .enumerate() .map(|(j, (i, v))| Locus { index : (i - j) as u8, value : v, }) .collect() } } impl<'a> Parse for Exceptions { type Tally = Vec; #[inline] fn value(c : char) -> Option { match c { '-' => Some(2), _ => None, } } fn tally(exception : &str) -> Self::Tally { exception.bytes() .enumerate() .filter_map(|(i, b)| Self::value(b as char).map(|_| i)) .enumerate() .map(|(j, i)| i - j) .collect() } } impl<'a> Parse for ext::Patterns { type Tally = ext::Tally; #[inline] fn value(c : char) -> Option { c.to_digit(10).map(|n| n as u8) } fn alphabetical(s : &str) -> String { match s.find('/') { None => Patterns::alphabetical(s), Some(i) => Patterns::alphabetical(&s[.. i]), } } fn tally(pattern : &str) -> Self::Tally { use std::str::FromStr; // TODO: refactor match pattern.find('/') { None => ext::Tally { standard : Patterns::tally(pattern), subregion : None, }, Some(i) => { // Exoneration: we unwrap liberally within this match arm, since failure // would denote a malformed pattern. let err = &["Malformed extended hyphenation pattern: ", pattern].concat(); let (standard, extension) = (&pattern[.. i], &pattern[i + 1 ..]); let breakpoint = extension.find('=').expect(err); let sub_pattern_end = extension.find(',').expect(err); let sub_pattern = &extension[.. sub_pattern_end]; let sub_idxs = &extension[sub_pattern_end + 1 ..]; let dot_offset = if standard.starts_with('.') { 1 } else { 0 }; let (chars_to_op, span) = { let v : Vec<_> = sub_idxs.split(',') .map(|s| usize::from_str(s).expect(err)) .collect(); assert!(v.len() == 2, "Malformed extended hyphenation pattern: {}", pattern); (v[0] + dot_offset, v[1]) }; let tally = Patterns::tally(standard); let alphabetical = Patterns::alphabetical(standard); let substitution = sub_pattern.chars().filter(|&c| c.is_alphabetic()).collect(); // Németh always starts the subregion at the character immediately preceding // the opportunity. let chars_to_start = chars_to_op.saturating_sub(1); let start = alphabetical.char_indices() .nth(chars_to_start) .expect(err) .0; let end = alphabetical.char_indices() .nth(chars_to_start + span) .expect(err) .0; let index = alphabetical.char_indices().nth(chars_to_op).expect(err).0 as u8; let (left, right) = (index as usize - start, end - index as usize); let value = tally.iter() .find(|&&locus| locus.index == index) .map(|&locus| locus.value) .expect(err); ext::Tally { standard : tally, subregion : (Locus { index, value }, Subregion { left, right, substitution, breakpoint }) .into(), } } } } }