regexp_property_values-1.5.2/0000755000004100000410000000000014632134735016374 5ustar www-datawww-dataregexp_property_values-1.5.2/bin/0000755000004100000410000000000014632134735017144 5ustar www-datawww-dataregexp_property_values-1.5.2/bin/setup0000755000004100000410000000020314632134735020225 0ustar www-datawww-data#!/usr/bin/env bash set -euo pipefail IFS=$'\n\t' set -vx bundle install # Do any other automated setup that you need to do here regexp_property_values-1.5.2/bin/console0000755000004100000410000000062414632134735020536 0ustar www-datawww-data#!/usr/bin/env ruby require "bundler/setup" require "regexp_property_values" require "benchmark" # You can add fixtures and/or initialization code here to make experimenting # with your gem easier. You can also use a different console, if you like. # (If you use this, don't forget to add pry to your Gemfile!) # require "pry" # Pry.start PV = RegexpPropertyValues require "irb" IRB.start(__FILE__) regexp_property_values-1.5.2/.gitignore0000644000004100000410000000057514632134735020373 0ustar www-datawww-data*.bundle *.gem *.iml *.stTheme.cache *.sublime-project *.sublime-workspace *.swp *.tmlanguage.cache *.tmPreferences.cache *~ .byebug_history .DS_Store .idea/ .ruby-gemset .ruby-version .tags .tags1 .tool-versions bbin/ binstubs/* bundler_stubs/*/.yardoc Gemfile.lock /.bundle/ /.vscode/ /_yardoc/ /coverage/ /doc/ /pkg/ /spec/reports/ /tmp/ # rspec failure tracking .rspec_status regexp_property_values-1.5.2/.github/0000755000004100000410000000000014632134735017734 5ustar www-datawww-dataregexp_property_values-1.5.2/.github/workflows/0000755000004100000410000000000014632134735021771 5ustar www-datawww-dataregexp_property_values-1.5.2/.github/workflows/tests.yml0000644000004100000410000000103114632134735023651 0ustar www-datawww-dataname: tests on: [push, pull_request] jobs: build: runs-on: ubuntu-latest strategy: matrix: ruby: [ '2.3', '3.1', 'jruby-head' ] # TODO: 'ruby-head' after https://github.com/knu/sorted_set/issues/11 steps: - uses: actions/checkout@v2 - name: Set up Ruby ${{ matrix.ruby }} uses: ruby/setup-ruby@v1 with: ruby-version: ${{ matrix.ruby }} - name: Install dependencies run: bundle install --jobs 4 - name: Test with Rake run: bundle exec rake regexp_property_values-1.5.2/lib/0000755000004100000410000000000014632134735017142 5ustar www-datawww-dataregexp_property_values-1.5.2/lib/regexp_property_values/0000755000004100000410000000000014632134735023757 5ustar www-datawww-dataregexp_property_values-1.5.2/lib/regexp_property_values/value/0000755000004100000410000000000014632134735025073 5ustar www-datawww-dataregexp_property_values-1.5.2/lib/regexp_property_values/value/ext_adapter.rb0000644000004100000410000000112114632134735027713 0ustar www-datawww-datamodule RegexpPropertyValues class Value module ExtAdapter def matched_characters acc = [] matched_codepoints.each do |cp| acc << cp.chr('utf-8') if cp < 0xD800 || cp > 0xDFFF end acc end def matched_codepoints OnigRegexpPropertyHelper.matched_codepoints(name) rescue ArgumentError raise_unsupported_or_unknown_error end def matched_ranges OnigRegexpPropertyHelper.matched_ranges(name) rescue ArgumentError raise_unsupported_or_unknown_error end end end end regexp_property_values-1.5.2/lib/regexp_property_values/value/shared_methods.rb0000644000004100000410000000261614632134735030416 0ustar www-datawww-datamodule RegexpPropertyValues class Value module SharedMethods attr_reader :name def initialize(name) @name = name end def supported_by_current_ruby? !!regexp rescue false end def ==(other) identifier == other.identifier end alias eql? == def hash @hash ||= identifier.hash end def identifier @identifier ||= name.to_s.downcase.gsub(/[^0-9a-z=.]/, '') end alias to_s identifier def full_name (original = find_original) ? original.name : raise_unknown_error end def character_set require 'character_set' CharacterSet.from_ranges(*matched_ranges) end private def regexp @regexp ||= /\p{#{identifier}}/u rescue RegexpError, SyntaxError raise_unsupported_or_unknown_error end def find_original RegexpPropertyValues.all.find { |orig| orig.eql?(self) } || RegexpPropertyValues.alias_hash[self] end def raise_unsupported_or_unknown_error find_original ? raise_unsupported_error : raise_unknown_error end def raise_unsupported_error raise Error, "Property name `#{name}` is known, but not in this Ruby" end def raise_unknown_error raise Error, "Property name `#{name}` is not known in any Ruby" end end end end regexp_property_values-1.5.2/lib/regexp_property_values/value/ruby_fallback.rb0000644000004100000410000000115714632134735030224 0ustar www-datawww-datamodule RegexpPropertyValues class Value module RubyFallback def matched_characters matched_codepoints.map { |cp| cp.chr('utf-8') } end def matched_codepoints # turns out scanning one big string is the least slow way to do this @@test_str ||= (0..0xD7FF).map { |cp| cp.chr('utf-8') }.join << (0xE000..0x10FFFF).map { |cp| cp.chr('utf-8') }.join @@test_str.scan(regexp).flat_map(&:codepoints) end def matched_ranges require 'range_compressor' RangeCompressor.compress(matched_codepoints) end end end end regexp_property_values-1.5.2/lib/regexp_property_values/updater.rb0000644000004100000410000000740314632134735025754 0ustar www-datawww-datamodule RegexpPropertyValues module Updater module_function require 'fileutils' require 'set' BASE_URL = 'https://www.unicode.org/Public/UCD/latest/ucd' UCD_FILES = %w[ Blocks.txt DerivedAge.txt DerivedCoreProperties.txt PropertyAliases.txt PropertyValueAliases.txt PropList.txt Scripts.txt auxiliary/GraphemeBreakProperty.txt ] EMOJI_FILES = %w[ emoji-data.txt ] TMP_DIR = File.join(__dir__, 'tmp_ucd') def call(ucd_path: nil, emoji_path: nil) prepare_tmp_dir download_ucd_files(ucd_path: ucd_path, emoji_path: emoji_path) write_values write_aliases remove_tmp_dir print_stats end def prepare_tmp_dir FileUtils.rm_rf(TMP_DIR) if File.exist?(TMP_DIR) FileUtils.mkdir(TMP_DIR) end def download_ucd_files(ucd_path: nil, emoji_path: nil) puts 'This will try to load the latest UCD data. Continue? [y/n]' return puts 'download skipped.' unless $stdin.gets =~ /^y/i ucd_path ||= ENV['RPV_UCD_PATH'] || BASE_URL emoji_path ||= ENV['RPV_EMOJI_PATH'] || "#{BASE_URL}/emoji/" Dir.chdir(TMP_DIR) do UCD_FILES.each { |f| `wget #{ucd_path}/#{f}` } EMOJI_FILES.each { |f| `wget #{emoji_path}/#{f}` } end end def write_values @values = Set.new # posix properties @values += %w[ Alpha Blank Cntrl Digit Graph Lower Print Punct Space Upper XDigit Word Alnum ASCII XPosixPunct ] # special properties @values += %w[ Any Assigned Extended_Pictographic In_No_Block Unknown ] # legacy properties @values += %w[Newline] regexp = /^[0-9a-fA-F]+(?:\.\.[0-9a-fA-F]+)? *; (?\w+) +# / %w[ DerivedCoreProperties.txt PropList.txt Scripts.txt emoji-data.txt ].each { |file| scan(file, regexp) { |caps| @values << caps[:prop_name] } } scan('PropertyValueAliases.txt', /^gc ; \w+ *; (?\w+)/) do |caps| @values << caps[:prop_name] end scan('Blocks.txt', /^[\dA-F.]+ *; (?[-\w ]+)/) do |caps| @values << 'In_' + caps[:block_name].gsub(/\W/, '_') end scan('DerivedAge.txt', /^[\dA-F.]+ *; (?[\d.]+)/) do |caps| @values << 'Age=' + caps[:age_num] end scan('GraphemeBreakProperty.txt', /; *(?\w+) *#/) do |caps| @values << 'Grapheme_Cluster_Break=' + caps[:name] end File.write(RegexpPropertyValues::VALUES_PATH, @values.sort.join("\n")) end def write_aliases @aliases = Set.new scan('PropertyAliases.txt', /^(?\w+) *; (?\w+)/) do |caps| if in_values?(caps[:name]) && !in_values?(caps[:alias]) @aliases << [caps[:alias], caps[:name]] end end scan('PropertyValueAliases.txt', /^[gs]c ; (?\w+) *; (?\w+)(?: *; (?\w+))?/) do |caps| if in_values?(caps[:name]) && !in_values?(caps[:alias1]) @aliases << [caps[:alias1], caps[:name]] end if in_values?(caps[:name]) && caps[:alias2] && !in_values?(caps[:alias2]) @aliases << [caps[:alias2], caps[:name]] end end File.write(RegexpPropertyValues::ALIASES_PATH, @aliases.sort.map { |pair| pair.join(';') }.join("\n")) end def in_values?(string) @values.any? { |value| value.casecmp?(string) } end def scan(file, pattern) path = File.join(TMP_DIR, file) File.read(path).scan(pattern) { yield(Regexp.last_match) } end def remove_tmp_dir FileUtils.rm_rf(TMP_DIR) end def print_stats print "\nFetched #{@values.size} values and #{@aliases.size} aliases.\n\n" end end end regexp_property_values-1.5.2/lib/regexp_property_values/version.rb0000644000004100000410000000006414632134735025771 0ustar www-datawww-datamodule RegexpPropertyValues VERSION = '1.5.2' end regexp_property_values-1.5.2/lib/regexp_property_values/value.rb0000644000004100000410000000051114632134735025415 0ustar www-datawww-datamodule RegexpPropertyValues class Value require_relative 'value/shared_methods' include SharedMethods if const_defined?(:OnigRegexpPropertyHelper) require_relative 'value/ext_adapter' include ExtAdapter else require_relative 'value/ruby_fallback' include RubyFallback end end end regexp_property_values-1.5.2/lib/aliases0000644000004100000410000001004614632134735020507 0ustar www-datawww-dataAHex;ASCII_Hex_Digit Adlm;Adlam Aghb;Caucasian_Albanian Arab;Arabic Armi;Imperial_Aramaic Armn;Armenian Avst;Avestan Bali;Balinese Bamu;Bamum Bass;Bassa_Vah Batk;Batak Beng;Bengali Bhks;Bhaiksuki Bidi_C;Bidi_Control Bopo;Bopomofo Brah;Brahmi Brai;Braille Bugi;Buginese Buhd;Buhid C;Other CI;Case_Ignorable CWCF;Changes_When_Casefolded CWCM;Changes_When_Casemapped CWL;Changes_When_Lowercased CWT;Changes_When_Titlecased CWU;Changes_When_Uppercased Cakm;Chakma Cans;Canadian_Aboriginal Cari;Carian Cc;Control Cf;Format Cher;Cherokee Chrs;Chorasmian Cn;Unassigned Co;Private_Use Combining_Mark;Mark Copt;Coptic Cpmn;Cypro_Minoan Cprt;Cypriot Cs;Surrogate Cyrl;Cyrillic DI;Default_Ignorable_Code_Point Dep;Deprecated Deva;Devanagari Dia;Diacritic Diak;Dives_Akuru Dogr;Dogra Dsrt;Deseret Dupl;Duployan EBase;Emoji_Modifier_Base EComp;Emoji_Component EMod;Emoji_Modifier EPres;Emoji_Presentation Egyp;Egyptian_Hieroglyphs Elba;Elbasan Elym;Elymaic Ethi;Ethiopic Ext;Extender ExtPict;Extended_Pictographic Geor;Georgian Glag;Glagolitic Gong;Gunjala_Gondi Gonm;Masaram_Gondi Goth;Gothic Gr_Base;Grapheme_Base Gr_Ext;Grapheme_Extend Gr_Link;Grapheme_Link Gran;Grantha Grek;Greek Gujr;Gujarati Guru;Gurmukhi Hang;Hangul Hani;Han Hano;Hanunoo Hatr;Hatran Hebr;Hebrew Hex;Hex_Digit Hira;Hiragana Hluw;Anatolian_Hieroglyphs Hmng;Pahawh_Hmong Hmnp;Nyiakeng_Puachue_Hmong Hung;Old_Hungarian IDC;ID_Continue IDS;ID_Start IDSB;IDS_Binary_Operator IDST;IDS_Trinary_Operator IDSU;IDS_Unary_Operator Ideo;Ideographic Ital;Old_Italic Java;Javanese Join_C;Join_Control Kali;Kayah_Li Kana;Katakana Khar;Kharoshthi Khmr;Khmer Khoj;Khojki Kits;Khitan_Small_Script Knda;Kannada Kthi;Kaithi L;Letter LC;Cased_Letter LOE;Logical_Order_Exception Lana;Tai_Tham Laoo;Lao Latn;Latin Lepc;Lepcha Limb;Limbu Lina;Linear_A Linb;Linear_B Ll;Lowercase_Letter Lm;Modifier_Letter Lo;Other_Letter Lt;Titlecase_Letter Lu;Uppercase_Letter Lyci;Lycian Lydi;Lydian M;Mark Mahj;Mahajani Maka;Makasar Mand;Mandaic Mani;Manichaean Marc;Marchen Mc;Spacing_Mark Me;Enclosing_Mark Medf;Medefaidrin Mend;Mende_Kikakui Merc;Meroitic_Cursive Mero;Meroitic_Hieroglyphs Mlym;Malayalam Mn;Nonspacing_Mark Mong;Mongolian Mroo;Mro Mtei;Meetei_Mayek Mult;Multani Mymr;Myanmar N;Number NChar;Noncharacter_Code_Point Nagm;Nag_Mundari Nand;Nandinagari Narb;Old_North_Arabian Nbat;Nabataean Nd;Decimal_Number Nkoo;Nko Nl;Letter_Number No;Other_Number Nshu;Nushu OAlpha;Other_Alphabetic ODI;Other_Default_Ignorable_Code_Point OGr_Ext;Other_Grapheme_Extend OIDC;Other_ID_Continue OIDS;Other_ID_Start OLower;Other_Lowercase OMath;Other_Math OUpper;Other_Uppercase Ogam;Ogham Olck;Ol_Chiki Orkh;Old_Turkic Orya;Oriya Osge;Osage Osma;Osmanya Ougr;Old_Uyghur P;Punctuation PCM;Prepended_Concatenation_Mark Palm;Palmyrene Pat_Syn;Pattern_Syntax Pat_WS;Pattern_White_Space Pauc;Pau_Cin_Hau Pc;Connector_Punctuation Pd;Dash_Punctuation Pe;Close_Punctuation Perm;Old_Permic Pf;Final_Punctuation Phag;Phags_Pa Phli;Inscriptional_Pahlavi Phlp;Psalter_Pahlavi Phnx;Phoenician Pi;Initial_Punctuation Plrd;Miao Po;Other_Punctuation Prti;Inscriptional_Parthian Ps;Open_Punctuation QMark;Quotation_Mark Qaac;Coptic Qaai;Inherited RI;Regional_Indicator Rjng;Rejang Rohg;Hanifi_Rohingya Runr;Runic S;Symbol SD;Soft_Dotted STerm;Sentence_Terminal Samr;Samaritan Sarb;Old_South_Arabian Saur;Saurashtra Sc;Currency_Symbol Sgnw;SignWriting Shaw;Shavian Shrd;Sharada Sidd;Siddham Sind;Khudawadi Sinh;Sinhala Sk;Modifier_Symbol Sm;Math_Symbol So;Other_Symbol Sogd;Sogdian Sogo;Old_Sogdian Sora;Sora_Sompeng Soyo;Soyombo Sund;Sundanese Sylo;Syloti_Nagri Syrc;Syriac Tagb;Tagbanwa Takr;Takri Tale;Tai_Le Talu;New_Tai_Lue Taml;Tamil Tang;Tangut Tavt;Tai_Viet Telu;Telugu Term;Terminal_Punctuation Tfng;Tifinagh Tglg;Tagalog Thaa;Thaana Tibt;Tibetan Tirh;Tirhuta Tnsa;Tangsa UIdeo;Unified_Ideograph Ugar;Ugaritic VS;Variation_Selector Vaii;Vai Vith;Vithkuqi WSpace;White_Space Wara;Warang_Citi Wcho;Wancho XIDC;XID_Continue XIDS;XID_Start Xpeo;Old_Persian Xsux;Cuneiform Yezi;Yezidi Yiii;Yi Z;Separator Zanb;Zanabazar_Square Zinh;Inherited Zl;Line_Separator Zp;Paragraph_Separator Zs;Space_Separator Zyyy;Common Zzzz;Unknownregexp_property_values-1.5.2/lib/regexp_property_values.rb0000644000004100000410000000173114632134735024306 0ustar www-datawww-databegin require 'regexp_property_values/regexp_property_values' rescue LoadError warn 'regexp_property_values could not load C extension, using slower Ruby' end require 'regexp_property_values/updater' require 'regexp_property_values/value' require 'regexp_property_values/version' module RegexpPropertyValues Error = Class.new(StandardError) VALUES_PATH = File.join(__dir__, 'values') ALIASES_PATH = File.join(__dir__, 'aliases') def self.[](name) Value.new(name) end def self.all_for_current_ruby @all_for_current_ruby ||= all.select(&:supported_by_current_ruby?) end def self.all @all ||= File.readlines(VALUES_PATH).map { |line| Value.new(line.chomp) } end def self.alias_hash @alias_hash ||= File.readlines(ALIASES_PATH).map do |line| line.chomp.split(';').map { |name| Value.new(name) } end.to_h end def self.update(ucd_path: nil, emoji_path: nil) Updater.call(ucd_path: ucd_path, emoji_path: emoji_path) end end regexp_property_values-1.5.2/lib/values0000644000004100000410000002405114632134735020366 0ustar www-datawww-dataASCII ASCII_Hex_Digit Adlam Age=1.1 Age=10.0 Age=11.0 Age=12.0 Age=12.1 Age=13.0 Age=14.0 Age=15.0 Age=15.1 Age=2.0 Age=2.1 Age=3.0 Age=3.1 Age=3.2 Age=4.0 Age=4.1 Age=5.0 Age=5.1 Age=5.2 Age=6.0 Age=6.1 Age=6.2 Age=6.3 Age=7.0 Age=8.0 Age=9.0 Ahom Alnum Alpha Alphabetic Anatolian_Hieroglyphs Any Arabic Armenian Assigned Avestan Balinese Bamum Bassa_Vah Batak Bengali Bhaiksuki Bidi_Control Blank Bopomofo Brahmi Braille Buginese Buhid Canadian_Aboriginal Carian Case_Ignorable Cased Cased_Letter Caucasian_Albanian Chakma Cham Changes_When_Casefolded Changes_When_Casemapped Changes_When_Lowercased Changes_When_Titlecased Changes_When_Uppercased Cherokee Chorasmian Close_Punctuation Cntrl Common Connector_Punctuation Control Coptic Cuneiform Currency_Symbol Cypriot Cypro_Minoan Cyrillic Dash Dash_Punctuation Decimal_Number Default_Ignorable_Code_Point Deprecated Deseret Devanagari Diacritic Digit Dives_Akuru Dogra Duployan Egyptian_Hieroglyphs Elbasan Elymaic Emoji Emoji_Component Emoji_Modifier Emoji_Modifier_Base Emoji_Presentation Enclosing_Mark Ethiopic Extended_Pictographic Extender Final_Punctuation Format Georgian Glagolitic Gothic Grantha Graph Grapheme_Base Grapheme_Cluster_Break=CR Grapheme_Cluster_Break=Control Grapheme_Cluster_Break=Extend Grapheme_Cluster_Break=L Grapheme_Cluster_Break=LF Grapheme_Cluster_Break=LV Grapheme_Cluster_Break=LVT Grapheme_Cluster_Break=Prepend Grapheme_Cluster_Break=Regional_Indicator Grapheme_Cluster_Break=SpacingMark Grapheme_Cluster_Break=T Grapheme_Cluster_Break=V Grapheme_Cluster_Break=ZWJ Grapheme_Extend Grapheme_Link Greek Gujarati Gunjala_Gondi Gurmukhi Han Hangul Hanifi_Rohingya Hanunoo Hatran Hebrew Hex_Digit Hiragana Hyphen IDS_Binary_Operator IDS_Trinary_Operator IDS_Unary_Operator ID_Compat_Math_Continue ID_Compat_Math_Start ID_Continue ID_Start Ideographic Imperial_Aramaic In_Adlam In_Aegean_Numbers In_Ahom In_Alchemical_Symbols In_Alphabetic_Presentation_Forms In_Anatolian_Hieroglyphs In_Ancient_Greek_Musical_Notation In_Ancient_Greek_Numbers In_Ancient_Symbols In_Arabic In_Arabic_Extended_A In_Arabic_Extended_B In_Arabic_Extended_C In_Arabic_Mathematical_Alphabetic_Symbols In_Arabic_Presentation_Forms_A In_Arabic_Presentation_Forms_B In_Arabic_Supplement In_Armenian In_Arrows In_Avestan In_Balinese In_Bamum In_Bamum_Supplement In_Basic_Latin In_Bassa_Vah In_Batak In_Bengali In_Bhaiksuki In_Block_Elements In_Bopomofo In_Bopomofo_Extended In_Box_Drawing In_Brahmi In_Braille_Patterns In_Buginese In_Buhid In_Byzantine_Musical_Symbols In_CJK_Compatibility In_CJK_Compatibility_Forms In_CJK_Compatibility_Ideographs In_CJK_Compatibility_Ideographs_Supplement In_CJK_Radicals_Supplement In_CJK_Strokes In_CJK_Symbols_and_Punctuation In_CJK_Unified_Ideographs In_CJK_Unified_Ideographs_Extension_A In_CJK_Unified_Ideographs_Extension_B In_CJK_Unified_Ideographs_Extension_C In_CJK_Unified_Ideographs_Extension_D In_CJK_Unified_Ideographs_Extension_E In_CJK_Unified_Ideographs_Extension_F In_CJK_Unified_Ideographs_Extension_G In_CJK_Unified_Ideographs_Extension_H In_CJK_Unified_Ideographs_Extension_I In_Carian In_Caucasian_Albanian In_Chakma In_Cham In_Cherokee In_Cherokee_Supplement In_Chess_Symbols In_Chorasmian In_Combining_Diacritical_Marks In_Combining_Diacritical_Marks_Extended In_Combining_Diacritical_Marks_Supplement In_Combining_Diacritical_Marks_for_Symbols In_Combining_Half_Marks In_Common_Indic_Number_Forms In_Control_Pictures In_Coptic In_Coptic_Epact_Numbers In_Counting_Rod_Numerals In_Cuneiform In_Cuneiform_Numbers_and_Punctuation In_Currency_Symbols In_Cypriot_Syllabary In_Cypro_Minoan In_Cyrillic In_Cyrillic_Extended_A In_Cyrillic_Extended_B In_Cyrillic_Extended_C In_Cyrillic_Extended_D In_Cyrillic_Supplement In_Deseret In_Devanagari In_Devanagari_Extended In_Devanagari_Extended_A In_Dingbats In_Dives_Akuru In_Dogra In_Domino_Tiles In_Duployan In_Early_Dynastic_Cuneiform In_Egyptian_Hieroglyph_Format_Controls In_Egyptian_Hieroglyphs In_Elbasan In_Elymaic In_Emoticons In_Enclosed_Alphanumeric_Supplement In_Enclosed_Alphanumerics In_Enclosed_CJK_Letters_and_Months In_Enclosed_Ideographic_Supplement In_Ethiopic In_Ethiopic_Extended In_Ethiopic_Extended_A In_Ethiopic_Extended_B In_Ethiopic_Supplement In_General_Punctuation In_Geometric_Shapes In_Geometric_Shapes_Extended In_Georgian In_Georgian_Extended In_Georgian_Supplement In_Glagolitic In_Glagolitic_Supplement In_Gothic In_Grantha In_Greek_Extended In_Greek_and_Coptic In_Gujarati In_Gunjala_Gondi In_Gurmukhi In_Halfwidth_and_Fullwidth_Forms In_Hangul_Compatibility_Jamo In_Hangul_Jamo In_Hangul_Jamo_Extended_A In_Hangul_Jamo_Extended_B In_Hangul_Syllables In_Hanifi_Rohingya In_Hanunoo In_Hatran In_Hebrew In_High_Private_Use_Surrogates In_High_Surrogates In_Hiragana In_IPA_Extensions In_Ideographic_Description_Characters In_Ideographic_Symbols_and_Punctuation In_Imperial_Aramaic In_Indic_Siyaq_Numbers In_Inscriptional_Pahlavi In_Inscriptional_Parthian In_Javanese In_Kaithi In_Kaktovik_Numerals In_Kana_Extended_A In_Kana_Extended_B In_Kana_Supplement In_Kanbun In_Kangxi_Radicals In_Kannada In_Katakana In_Katakana_Phonetic_Extensions In_Kawi In_Kayah_Li In_Kharoshthi In_Khitan_Small_Script In_Khmer In_Khmer_Symbols In_Khojki In_Khudawadi In_Lao In_Latin_1_Supplement In_Latin_Extended_A In_Latin_Extended_Additional In_Latin_Extended_B In_Latin_Extended_C In_Latin_Extended_D In_Latin_Extended_E In_Latin_Extended_F In_Latin_Extended_G In_Lepcha In_Letterlike_Symbols In_Limbu In_Linear_A In_Linear_B_Ideograms In_Linear_B_Syllabary In_Lisu In_Lisu_Supplement In_Low_Surrogates In_Lycian In_Lydian In_Mahajani In_Mahjong_Tiles In_Makasar In_Malayalam In_Mandaic In_Manichaean In_Marchen In_Masaram_Gondi In_Mathematical_Alphanumeric_Symbols In_Mathematical_Operators In_Mayan_Numerals In_Medefaidrin In_Meetei_Mayek In_Meetei_Mayek_Extensions In_Mende_Kikakui In_Meroitic_Cursive In_Meroitic_Hieroglyphs In_Miao In_Miscellaneous_Mathematical_Symbols_A In_Miscellaneous_Mathematical_Symbols_B In_Miscellaneous_Symbols In_Miscellaneous_Symbols_and_Arrows In_Miscellaneous_Symbols_and_Pictographs In_Miscellaneous_Technical In_Modi In_Modifier_Tone_Letters In_Mongolian In_Mongolian_Supplement In_Mro In_Multani In_Musical_Symbols In_Myanmar In_Myanmar_Extended_A In_Myanmar_Extended_B In_NKo In_Nabataean In_Nag_Mundari In_Nandinagari In_New_Tai_Lue In_Newa In_No_Block In_Number_Forms In_Nushu In_Nyiakeng_Puachue_Hmong In_Ogham In_Ol_Chiki In_Old_Hungarian In_Old_Italic In_Old_North_Arabian In_Old_Permic In_Old_Persian In_Old_Sogdian In_Old_South_Arabian In_Old_Turkic In_Old_Uyghur In_Optical_Character_Recognition In_Oriya In_Ornamental_Dingbats In_Osage In_Osmanya In_Ottoman_Siyaq_Numbers In_Pahawh_Hmong In_Palmyrene In_Pau_Cin_Hau In_Phags_pa In_Phaistos_Disc In_Phoenician In_Phonetic_Extensions In_Phonetic_Extensions_Supplement In_Playing_Cards In_Private_Use_Area In_Psalter_Pahlavi In_Rejang In_Rumi_Numeral_Symbols In_Runic In_Samaritan In_Saurashtra In_Sharada In_Shavian In_Shorthand_Format_Controls In_Siddham In_Sinhala In_Sinhala_Archaic_Numbers In_Small_Form_Variants In_Small_Kana_Extension In_Sogdian In_Sora_Sompeng In_Soyombo In_Spacing_Modifier_Letters In_Specials In_Sundanese In_Sundanese_Supplement In_Superscripts_and_Subscripts In_Supplemental_Arrows_A In_Supplemental_Arrows_B In_Supplemental_Arrows_C In_Supplemental_Mathematical_Operators In_Supplemental_Punctuation In_Supplemental_Symbols_and_Pictographs In_Supplementary_Private_Use_Area_A In_Supplementary_Private_Use_Area_B In_Sutton_SignWriting In_Syloti_Nagri In_Symbols_and_Pictographs_Extended_A In_Symbols_for_Legacy_Computing In_Syriac In_Syriac_Supplement In_Tagalog In_Tagbanwa In_Tags In_Tai_Le In_Tai_Tham In_Tai_Viet In_Tai_Xuan_Jing_Symbols In_Takri In_Tamil In_Tamil_Supplement In_Tangsa In_Tangut In_Tangut_Components In_Tangut_Supplement In_Telugu In_Thaana In_Thai In_Tibetan In_Tifinagh In_Tirhuta In_Toto In_Transport_and_Map_Symbols In_Ugaritic In_Unified_Canadian_Aboriginal_Syllabics In_Unified_Canadian_Aboriginal_Syllabics_Extended In_Unified_Canadian_Aboriginal_Syllabics_Extended_A In_Vai In_Variation_Selectors In_Variation_Selectors_Supplement In_Vedic_Extensions In_Vertical_Forms In_Vithkuqi In_Wancho In_Warang_Citi In_Yezidi In_Yi_Radicals In_Yi_Syllables In_Yijing_Hexagram_Symbols In_Zanabazar_Square In_Znamenny_Musical_Notation Inherited Initial_Punctuation Inscriptional_Pahlavi Inscriptional_Parthian Javanese Join_Control Kaithi Kannada Katakana Kawi Kayah_Li Kharoshthi Khitan_Small_Script Khmer Khojki Khudawadi Lao Latin Lepcha Letter Letter_Number Limbu Line_Separator Linear_A Linear_B Lisu Logical_Order_Exception Lower Lowercase Lowercase_Letter Lycian Lydian Mahajani Makasar Malayalam Mandaic Manichaean Marchen Mark Masaram_Gondi Math Math_Symbol Medefaidrin Meetei_Mayek Mende_Kikakui Meroitic_Cursive Meroitic_Hieroglyphs Miao Modi Modifier_Letter Modifier_Symbol Mongolian Mro Multani Myanmar Nabataean Nag_Mundari Nandinagari New_Tai_Lue Newa Newline Nko Noncharacter_Code_Point Nonspacing_Mark Number Nushu Nyiakeng_Puachue_Hmong Ogham Ol_Chiki Old_Hungarian Old_Italic Old_North_Arabian Old_Permic Old_Persian Old_Sogdian Old_South_Arabian Old_Turkic Old_Uyghur Open_Punctuation Oriya Osage Osmanya Other Other_Alphabetic Other_Default_Ignorable_Code_Point Other_Grapheme_Extend Other_ID_Continue Other_ID_Start Other_Letter Other_Lowercase Other_Math Other_Number Other_Punctuation Other_Symbol Other_Uppercase Pahawh_Hmong Palmyrene Paragraph_Separator Pattern_Syntax Pattern_White_Space Pau_Cin_Hau Phags_Pa Phoenician Prepended_Concatenation_Mark Print Private_Use Psalter_Pahlavi Punct Punctuation Quotation_Mark Radical Regional_Indicator Rejang Runic Samaritan Saurashtra Sentence_Terminal Separator Sharada Shavian Siddham SignWriting Sinhala Soft_Dotted Sogdian Sora_Sompeng Soyombo Space Space_Separator Spacing_Mark Sundanese Surrogate Syloti_Nagri Symbol Syriac Tagalog Tagbanwa Tai_Le Tai_Tham Tai_Viet Takri Tamil Tangsa Tangut Telugu Terminal_Punctuation Thaana Thai Tibetan Tifinagh Tirhuta Titlecase_Letter Toto Ugaritic Unassigned Unified_Ideograph Unknown Upper Uppercase Uppercase_Letter Vai Variation_Selector Vithkuqi Wancho Warang_Citi White_Space Word XDigit XID_Continue XID_Start XPosixPunct Yezidi Yi Zanabazar_Squareregexp_property_values-1.5.2/LICENSE.txt0000644000004100000410000000207414632134735020222 0ustar www-datawww-dataThe MIT License (MIT) Copyright (c) 2018 Jannosch Müller Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. regexp_property_values-1.5.2/regexp_property_values.gemspec0000644000004100000410000000175414632134735024565 0ustar www-datawww-datalib = File.expand_path("../lib", __FILE__) $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'regexp_property_values/version' Gem::Specification.new do |s| s.name = 'regexp_property_values' s.version = RegexpPropertyValues::VERSION s.authors = ['Janosch Müller'] s.email = ['janosch84@gmail.com'] s.summary = "Inspect property values supported by Ruby's regex engine" s.description = 'This small library lets you see which property values '\ 'are supported by the regular expression engine of the '\ 'Ruby version you are running, and what they match.' s.homepage = 'https://github.com/jaynetics/regexp_property_values' s.license = 'MIT' s.files = `git ls-files -z`.split("\x0").reject do |f| f.match(%r{^(test|spec|features)/}) end s.require_paths = ['lib'] s.extensions = %w[ext/regexp_property_values/extconf.rb] s.required_ruby_version = '>= 2.1.0' end regexp_property_values-1.5.2/.rspec0000644000004100000410000000006514632134735017512 0ustar www-datawww-data--format documentation --color --require spec_helper regexp_property_values-1.5.2/Rakefile0000644000004100000410000000160114632134735020037 0ustar www-datawww-datarequire 'bundler/gem_tasks' require 'rubygems/package_task' require 'rspec/core/rake_task' RSpec::Core::RakeTask.new(:spec) task :default => :spec require 'rake/extensiontask' Rake::ExtensionTask.new('regexp_property_values') do |ext| ext.lib_dir = 'lib/regexp_property_values' end namespace :java do java_gemspec = eval File.read('./regexp_property_values.gemspec') java_gemspec.platform = 'java' java_gemspec.extensions = [] java_gemspec.add_runtime_dependency 'range_compressor', '~> 1.0' Gem::PackageTask.new(java_gemspec) do |pkg| pkg.need_zip = true pkg.need_tar = true pkg.package_dir = 'pkg' end end task package: 'java:gem' if RUBY_PLATFORM !~ /java/i # recompile before running specs task(:spec).enhance([:compile]) end desc 'Update property lists' task :update do require_relative 'lib/regexp_property_values' RegexpPropertyValues.update end regexp_property_values-1.5.2/Gemfile0000644000004100000410000000047514632134735017675 0ustar www-datawww-datasource "https://rubygems.org" git_source(:github) {|repo_name| "https://github.com/#{repo_name}" } # Specify your gem's dependencies in regexp_property_values.gemspec gemspec gem 'character_set', '~> 1.8.0' gem 'rake', '~> 13.0' gem 'rake-compiler', '~> 1.0' gem 'range_compressor', '~> 1.0' gem 'rspec', '~> 3.0' regexp_property_values-1.5.2/ext/0000755000004100000410000000000014632134735017174 5ustar www-datawww-dataregexp_property_values-1.5.2/ext/regexp_property_values/0000755000004100000410000000000014632134735024011 5ustar www-datawww-dataregexp_property_values-1.5.2/ext/regexp_property_values/regexp_property_values.c0000644000004100000410000000426014632134735030774 0ustar www-datawww-data#include "ruby.h" #include "ruby/encoding.h" #include "ruby/oniguruma.h" // still in recent rubies f. backwards compatibility static int prop_name_to_ctype(VALUE arg, rb_encoding *enc) { char *name; UChar *uname; int ctype; name = StringValueCStr(arg); uname = (UChar *)name; ctype = ONIGENC_PROPERTY_NAME_TO_CTYPE(enc, uname, uname + strlen(name)); if (ctype < 0) rb_raise(rb_eArgError, "Unknown property name `%s`", name); return ctype; } const OnigCodePoint *get_onig_ranges(VALUE prop_name) { int ctype; const OnigCodePoint *ranges; OnigCodePoint sb_out; rb_encoding *enc; enc = rb_utf8_encoding(); ctype = prop_name_to_ctype(prop_name, enc); ONIGENC_GET_CTYPE_CODE_RANGE(enc, ctype, &sb_out, &ranges); return ranges; } VALUE onig_ranges_to_rb_ranges(const OnigCodePoint *onig_ranges) { unsigned int range_count, i; VALUE result, sub_range; range_count = onig_ranges[0]; result = rb_ary_new_capa(range_count); for (i = 0; i < range_count; i++) { sub_range = rb_range_new(INT2FIX(onig_ranges[(i * 2) + 1]), INT2FIX(onig_ranges[(i * 2) + 2]), 0); rb_ary_store(result, i, sub_range); } return result; } VALUE onig_ranges_to_rb_integers(const OnigCodePoint *onig_ranges) { unsigned int range_count, i, beg, end, j; VALUE result; range_count = onig_ranges[0]; result = rb_ary_new(); for (i = 0; i < range_count; i++) { beg = onig_ranges[(i * 2) + 1]; end = onig_ranges[(i * 2) + 2]; for (j = beg; j <= end; j++) { rb_ary_push(result, INT2FIX(j)); } } return result; } VALUE method_matched_ranges(VALUE self, VALUE arg) { return onig_ranges_to_rb_ranges(get_onig_ranges(arg)); } VALUE method_matched_codepoints(VALUE self, VALUE arg) { return onig_ranges_to_rb_integers(get_onig_ranges(arg)); } void Init_regexp_property_values() { #ifdef HAVE_RB_EXT_RACTOR_SAFE rb_ext_ractor_safe(true); #endif VALUE module; module = rb_define_module("OnigRegexpPropertyHelper"); rb_define_singleton_method(module, "matched_ranges", method_matched_ranges, 1); rb_define_singleton_method(module, "matched_codepoints", method_matched_codepoints, 1); } regexp_property_values-1.5.2/ext/regexp_property_values/extconf.rb0000644000004100000410000000012414632134735026001 0ustar www-datawww-datarequire 'mkmf' name = 'regexp_property_values' create_makefile("#{name}/#{name}") regexp_property_values-1.5.2/README.md0000644000004100000410000000366514632134735017665 0ustar www-datawww-data# RegexpPropertyValues [![Gem Version](https://badge.fury.io/rb/regexp_property_values.svg)](http://badge.fury.io/rb/regexp_property_values) [![Build Status](https://github.com/jaynetics/regexp_property_values/workflows/tests/badge.svg)](https://github.com/jaynetics/regexp_property_values/actions) This small library lets you see which property values are supported by the regular expression engine of the Ruby version you are running and directly reads out their codepoint ranges from there. That is, it determines all supported values for `\p{value}` expressions and what they match. ## Usage ##### Browse all property values (supported by any Ruby, ever) ```ruby require 'regexp_property_values' PV = RegexpPropertyValues PV.all # => [, , ...] ``` ##### Browse property values supported by the Ruby you are running ```ruby PV.all_for_current_ruby # => [, , ...] ``` ##### Inspect property values ```ruby PV['alpha'].supported_by_current_ruby? # => true PV['foobar'].supported_by_current_ruby? # => false PV['AHex'].matched_characters # => %w[0 1 2 3 4 5 6 7 8 9 A B C ...] PV['AHex'].matched_codepoints # => [48, 49, 50, ...] PV['AHex'].matched_ranges # => [48..57, 65..70, 97..102] # Note: #matched_characters is slow for large properties and you # may not want to use it in time-critical code. It also omits surrogates. PV['foobar'].matched_ranges # => RegexpPropertyValues::Error ``` If [`character_set`](https://github.com/jaynetics/character_set) is installed, you can also do this: ```ruby PV['AHex'].character_set # => # ``` ##### Utility methods ```ruby # get a Hash of aliases for property names PV.alias_hash # => { => , ... } # download a list of possible properties for the running Ruby version # (only used for .all and .alias_hash, not needed for prop lookup via .[]) PV.update ``` regexp_property_values-1.5.2/CHANGELOG.md0000644000004100000410000000260714632134735020212 0ustar www-datawww-data# Changelog All notable changes to this project will be documented in this file. The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [1.5.2] - 2024-05-20 ### Fixed - fixed `#matched_characters` for properties containing surrogates - improved speed of `#matched_codepoints` ## [1.5.1] - 2024-01-06 ### Fixed - added missing Grapheme_Cluster_Break properties ## [1.5.0] - 2024-01-06 ### Added - new properties of future Ruby 3.3.x / Unicode 15.1 ## [1.4.0] - 2023-06-10 ### Added - new properties of Ruby 3.2 / Unicode 15.0 ## [1.3.0] - 2022-04-07 ### Added - new properties of Ruby 3.2 / Unicode 14.0 ## [1.2.0] - 2021-12-31 ### Added - support for usage in Ractors ## [1.1.0] - 2021-12-05 ### Added - added new properties from Ruby `3.1.0` to output of `::all`, `::all_for_current_ruby` - added options to run `::update` with custom ucd/emoji source paths ## [1.0.0] - 2019-06-16 ### Changed - removed `::by_category`, `::by_matched_codepoints`, `::short_and_long_names` - return values are now always of a custom `Value` class, no longer extended `Strings` - unknown properties now raise `RegexpPropertyValues::Error`, no longer an `ArgumentError` ### Added - `Value#identifier` - `Value#full_name` ### Fixed - better codepoint determination speed for non-C Rubies (still slow)