From b526e4dc49bae08bc6d2ebde873a0553a33dbea8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Janosch=20Mu=CC=88ller?= Date: Sun, 7 Jan 2024 11:33:12 +0100 Subject: [PATCH] Add support for GCB properties --- CHANGELOG.md | 5 +++++ Gemfile | 8 ++++---- .../expression/classes/unicode_property.rb | 11 ++++++----- lib/regexp_parser/parser.rb | 1 + lib/regexp_parser/scanner/properties/long.csv | 18 ++++++++++++++++++ lib/regexp_parser/scanner/properties/short.csv | 1 + .../syntax/token/unicode_property.rb | 18 ++++++++++++++++++ spec/parser/properties_spec.rb | 10 ++++++---- 8 files changed, 59 insertions(+), 13 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2d75202..5ee6c2c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - implemented `#negative?` / `#negated?` for more applicable expressions - `\B`, `\D`, `\H`, `\S`, `\W`, `(?!...)`, `(? 2.0' - gem 'rake', '~> 13.0' - gem 'regexp_property_values', '~> 1.4' + gem 'leto', '~> 2.1' + gem 'rake', '~> 13.1' + gem 'regexp_property_values', '~> 1.5' gem 'rspec', '~> 3.10' if RUBY_VERSION.to_f >= 2.7 gem 'benchmark-ips', '~> 2.1' gem 'gouteur', '~> 1.1' - gem 'rubocop', '~> 1.7' + gem 'rubocop', '~> 1.59' end end diff --git a/lib/regexp_parser/expression/classes/unicode_property.rb b/lib/regexp_parser/expression/classes/unicode_property.rb index 4ac0606..fe90b55 100644 --- a/lib/regexp_parser/expression/classes/unicode_property.rb +++ b/lib/regexp_parser/expression/classes/unicode_property.rb @@ -105,11 +105,12 @@ class PrivateUse < Codepoint::Base; end class Unassigned < Codepoint::Base; end end - class Age < UnicodeProperty::Base; end - class Derived < UnicodeProperty::Base; end - class Emoji < UnicodeProperty::Base; end - class Script < UnicodeProperty::Base; end - class Block < UnicodeProperty::Base; end + class Age < UnicodeProperty::Base; end + class Block < UnicodeProperty::Base; end + class Derived < UnicodeProperty::Base; end + class Emoji < UnicodeProperty::Base; end + class Enumerated < UnicodeProperty::Base; end + class Script < UnicodeProperty::Base; end end # alias for symmetry between token symbol and Expression class name diff --git a/lib/regexp_parser/parser.rb b/lib/regexp_parser/parser.rb index ef833ce..91154f5 100644 --- a/lib/regexp_parser/parser.rb +++ b/lib/regexp_parser/parser.rb @@ -467,6 +467,7 @@ def property(token) when *UPTokens::Age; node << UP::Age.new(token, active_opts) when *UPTokens::Derived; node << UP::Derived.new(token, active_opts) when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts) + when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts) when *UPTokens::Script; node << UP::Script.new(token, active_opts) when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts) diff --git a/lib/regexp_parser/scanner/properties/long.csv b/lib/regexp_parser/scanner/properties/long.csv index 7e644ae..777158b 100644 --- a/lib/regexp_parser/scanner/properties/long.csv +++ b/lib/regexp_parser/scanner/properties/long.csv @@ -8,6 +8,7 @@ age=12.1,age=12.1 age=13.0,age=13.0 age=14.0,age=14.0 age=15.0,age=15.0 +age=15.1,age=15.1 age=2.0,age=2.0 age=2.1,age=2.1 age=3.0,age=3.0 @@ -108,6 +109,19 @@ gothic,gothic grantha,grantha graph,graph graphemebase,grapheme_base +graphemeclusterbreak=control,grapheme_cluster_break=control +graphemeclusterbreak=cr,grapheme_cluster_break=cr +graphemeclusterbreak=extend,grapheme_cluster_break=extend +graphemeclusterbreak=l,grapheme_cluster_break=l +graphemeclusterbreak=lf,grapheme_cluster_break=lf +graphemeclusterbreak=lv,grapheme_cluster_break=lv +graphemeclusterbreak=lvt,grapheme_cluster_break=lvt +graphemeclusterbreak=prepend,grapheme_cluster_break=prepend +graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator +graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark +graphemeclusterbreak=t,grapheme_cluster_break=t +graphemeclusterbreak=v,grapheme_cluster_break=v +graphemeclusterbreak=zwj,grapheme_cluster_break=zwj graphemeextend,grapheme_extend graphemelink,grapheme_link greek,greek @@ -123,11 +137,14 @@ hebrew,hebrew hexdigit,hex_digit hiragana,hiragana hyphen,hyphen +idcompatmathcontinue,id_compat_math_continue +idcompatmathstart,id_compat_math_start idcontinue,id_continue ideographic,ideographic idsbinaryoperator,ids_binary_operator idstart,id_start idstrinaryoperator,ids_trinary_operator +idsunaryoperator,ids_unary_operator imperialaramaic,imperial_aramaic inadlam,in_adlam inaegeannumbers,in_aegean_numbers @@ -190,6 +207,7 @@ incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h +incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i incombiningdiacriticalmarks,in_combining_diacritical_marks incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols diff --git a/lib/regexp_parser/scanner/properties/short.csv b/lib/regexp_parser/scanner/properties/short.csv index 05e0f74..be4ead1 100644 --- a/lib/regexp_parser/scanner/properties/short.csv +++ b/lib/regexp_parser/scanner/properties/short.csv @@ -86,6 +86,7 @@ ideo,ideographic ids,id_start idsb,ids_binary_operator idst,ids_trinary_operator +idsu,ids_unary_operator ital,old_italic java,javanese joinc,join_control diff --git a/lib/regexp_parser/syntax/token/unicode_property.rb b/lib/regexp_parser/syntax/token/unicode_property.rb index 09f7cb1..57da9ab 100644 --- a/lib/regexp_parser/syntax/token/unicode_property.rb +++ b/lib/regexp_parser/syntax/token/unicode_property.rb @@ -703,6 +703,24 @@ module Category extended_pictographic ] + Enumerated_V2_4_0 = %i[ + grapheme_cluster_break=control + grapheme_cluster_break=cr + grapheme_cluster_break=extend + grapheme_cluster_break=l + grapheme_cluster_break=lf + grapheme_cluster_break=lv + grapheme_cluster_break=lvt + grapheme_cluster_break=prepend + grapheme_cluster_break=regional_indicator + grapheme_cluster_break=spacingmark + grapheme_cluster_break=t + grapheme_cluster_break=v + grapheme_cluster_break=zwj + ] + + Enumerated = all[:Enumerated_V] + Emoji = all[:Emoji_V] V1_9_0 = Category::All + POSIX + all[:V1_9_0] diff --git a/spec/parser/properties_spec.rb b/spec/parser/properties_spec.rb index 30e5e0e..1799eff 100644 --- a/spec/parser/properties_spec.rb +++ b/spec/parser/properties_spec.rb @@ -35,10 +35,12 @@ include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil] # test classification - include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age] - include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived] - include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script] - include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block] + include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age] + include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block] + include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived] + include_examples 'parse', '\p{Emoji}', 0 => [UnicodeProperty::Emoji] + include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated] + include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script] specify('parse abandoned newline property') do root = RP.parse('\p{newline}', 'ruby/1.9')