From b526e4dc49bae08bc6d2ebde873a0553a33dbea8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Janosch=20Mu=CC=88ller?= <janosch84@gmail.com>
Date: Sun, 7 Jan 2024 11:33:12 +0100
Subject: [PATCH] Add support for GCB (Grapheme_Cluster_Break) properties

---
 CHANGELOG.md                                   |  5 +++++
 Gemfile                                        |  8 ++++----
 .../expression/classes/unicode_property.rb     | 11 ++++++-----
 lib/regexp_parser/parser.rb                    |  1 +
 lib/regexp_parser/scanner/properties/long.csv  | 18 ++++++++++++++++++
 lib/regexp_parser/scanner/properties/short.csv |  1 +
 .../syntax/token/unicode_property.rb           | 18 ++++++++++++++++++
 spec/parser/properties_spec.rb                 | 10 ++++++----
 8 files changed, 59 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 2d75202..5ee6c2c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -14,6 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 - implemented `#negative?` / `#negated?` for more applicable expressions
   - `\B`, `\D`, `\H`, `\S`, `\W`, `(?!...)`, `(?<!...)`
 
+### Fixed
+
+- fixed missing support for grapheme cluster break Unicode properties
+  - e.g. `/\p{Grapheme_Cluster_Break=Extend}/`
+
 ## [2.8.3] - 2023-12-04 - Janosch Müller
 
 ### Fixed
diff --git a/Gemfile b/Gemfile
index f4fc4c3..ade3eb1 100644
--- a/Gemfile
+++ b/Gemfile
@@ -3,13 +3,13 @@ source 'https://rubygems.org'
 gemspec
 
 group :development, :test do
-  gem 'leto', '~> 2.0'
-  gem 'rake', '~> 13.0'
-  gem 'regexp_property_values', '~> 1.4'
+  gem 'leto', '~> 2.1'
+  gem 'rake', '~> 13.1'
+  gem 'regexp_property_values', '~> 1.5'
   gem 'rspec', '~> 3.10'
   if RUBY_VERSION.to_f >= 2.7
     gem 'benchmark-ips', '~> 2.1'
     gem 'gouteur', '~> 1.1'
-    gem 'rubocop', '~> 1.7'
+    gem 'rubocop', '~> 1.59'
   end
 end
diff --git a/lib/regexp_parser/expression/classes/unicode_property.rb b/lib/regexp_parser/expression/classes/unicode_property.rb
index 4ac0606..fe90b55 100644
--- a/lib/regexp_parser/expression/classes/unicode_property.rb
+++ b/lib/regexp_parser/expression/classes/unicode_property.rb
@@ -105,11 +105,12 @@ class PrivateUse  < Codepoint::Base; end
       class Unassigned  < Codepoint::Base; end
     end
 
-    class Age     < UnicodeProperty::Base; end
-    class Derived < UnicodeProperty::Base; end
-    class Emoji   < UnicodeProperty::Base; end
-    class Script  < UnicodeProperty::Base; end
-    class Block   < UnicodeProperty::Base; end
+    class Age        < UnicodeProperty::Base; end
+    class Block      < UnicodeProperty::Base; end
+    class Derived    < UnicodeProperty::Base; end
+    class Emoji      < UnicodeProperty::Base; end
+    class Enumerated < UnicodeProperty::Base; end
+    class Script     < UnicodeProperty::Base; end
   end
 
   # alias for symmetry between token symbol and Expression class name
diff --git a/lib/regexp_parser/parser.rb b/lib/regexp_parser/parser.rb
index ef833ce..91154f5 100644
--- a/lib/regexp_parser/parser.rb
+++ b/lib/regexp_parser/parser.rb
@@ -467,6 +467,7 @@ def property(token)
     when *UPTokens::Age;          node << UP::Age.new(token, active_opts)
     when *UPTokens::Derived;      node << UP::Derived.new(token, active_opts)
     when *UPTokens::Emoji;        node << UP::Emoji.new(token, active_opts)
+    when *UPTokens::Enumerated;   node << UP::Enumerated.new(token, active_opts)
     when *UPTokens::Script;       node << UP::Script.new(token, active_opts)
     when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)
 
diff --git a/lib/regexp_parser/scanner/properties/long.csv b/lib/regexp_parser/scanner/properties/long.csv
index 7e644ae..777158b 100644
--- a/lib/regexp_parser/scanner/properties/long.csv
+++ b/lib/regexp_parser/scanner/properties/long.csv
@@ -8,6 +8,7 @@ age=12.1,age=12.1
 age=13.0,age=13.0
 age=14.0,age=14.0
 age=15.0,age=15.0
+age=15.1,age=15.1
 age=2.0,age=2.0
 age=2.1,age=2.1
 age=3.0,age=3.0
@@ -108,6 +109,19 @@ gothic,gothic
 grantha,grantha
 graph,graph
 graphemebase,grapheme_base
+graphemeclusterbreak=control,grapheme_cluster_break=control
+graphemeclusterbreak=cr,grapheme_cluster_break=cr
+graphemeclusterbreak=extend,grapheme_cluster_break=extend
+graphemeclusterbreak=l,grapheme_cluster_break=l
+graphemeclusterbreak=lf,grapheme_cluster_break=lf
+graphemeclusterbreak=lv,grapheme_cluster_break=lv
+graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
+graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
+graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
+graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
+graphemeclusterbreak=t,grapheme_cluster_break=t
+graphemeclusterbreak=v,grapheme_cluster_break=v
+graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
 graphemeextend,grapheme_extend
 graphemelink,grapheme_link
 greek,greek
@@ -123,11 +137,14 @@ hebrew,hebrew
 hexdigit,hex_digit
 hiragana,hiragana
 hyphen,hyphen
+idcompatmathcontinue,id_compat_math_continue
+idcompatmathstart,id_compat_math_start
 idcontinue,id_continue
 ideographic,ideographic
 idsbinaryoperator,ids_binary_operator
 idstart,id_start
 idstrinaryoperator,ids_trinary_operator
+idsunaryoperator,ids_unary_operator
 imperialaramaic,imperial_aramaic
 inadlam,in_adlam
 inaegeannumbers,in_aegean_numbers
@@ -190,6 +207,7 @@ incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
 incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
 incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
 incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
+incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
 incombiningdiacriticalmarks,in_combining_diacritical_marks
 incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
 incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
diff --git a/lib/regexp_parser/scanner/properties/short.csv b/lib/regexp_parser/scanner/properties/short.csv
index 05e0f74..be4ead1 100644
--- a/lib/regexp_parser/scanner/properties/short.csv
+++ b/lib/regexp_parser/scanner/properties/short.csv
@@ -86,6 +86,7 @@ ideo,ideographic
 ids,id_start
 idsb,ids_binary_operator
 idst,ids_trinary_operator
+idsu,ids_unary_operator
 ital,old_italic
 java,javanese
 joinc,join_control
diff --git a/lib/regexp_parser/syntax/token/unicode_property.rb b/lib/regexp_parser/syntax/token/unicode_property.rb
index 09f7cb1..57da9ab 100644
--- a/lib/regexp_parser/syntax/token/unicode_property.rb
+++ b/lib/regexp_parser/syntax/token/unicode_property.rb
@@ -703,6 +703,24 @@ module Category
         extended_pictographic
       ]
 
+      Enumerated_V2_4_0 = %i[
+        grapheme_cluster_break=control
+        grapheme_cluster_break=cr
+        grapheme_cluster_break=extend
+        grapheme_cluster_break=l
+        grapheme_cluster_break=lf
+        grapheme_cluster_break=lv
+        grapheme_cluster_break=lvt
+        grapheme_cluster_break=prepend
+        grapheme_cluster_break=regional_indicator
+        grapheme_cluster_break=spacingmark
+        grapheme_cluster_break=t
+        grapheme_cluster_break=v
+        grapheme_cluster_break=zwj
+      ] # grapheme_cluster_break=<value> tokens; NOTE(review): "V2_4_0" presumably names the first supporting Ruby version - confirm
+
+      Enumerated = all[:Enumerated_V] # built the same way as Emoji below; presumably merges every Enumerated_V* list - verify against `all`
+
       Emoji = all[:Emoji_V]
 
       V1_9_0 = Category::All + POSIX + all[:V1_9_0]
diff --git a/spec/parser/properties_spec.rb b/spec/parser/properties_spec.rb
index 30e5e0e..1799eff 100644
--- a/spec/parser/properties_spec.rb
+++ b/spec/parser/properties_spec.rb
@@ -35,10 +35,12 @@
   include_examples 'parse', '\p{in_bengali}',   0 => [:property, :in_bengali, shortcut: nil]
 
   # test classification
-  include_examples 'parse', '\p{age=5.2}',      0 => [UnicodeProperty::Age]
-  include_examples 'parse', '\p{Math}',         0 => [UnicodeProperty::Derived]
-  include_examples 'parse', '\p{Hiragana}',     0 => [UnicodeProperty::Script]
-  include_examples 'parse', '\p{InArmenian}',   0 => [UnicodeProperty::Block]
+  include_examples 'parse', '\p{age=5.2}',                     0 => [UnicodeProperty::Age]
+  include_examples 'parse', '\p{InArmenian}',                  0 => [UnicodeProperty::Block]
+  include_examples 'parse', '\p{Math}',                        0 => [UnicodeProperty::Derived]
+  include_examples 'parse', '\p{Emoji}',                       0 => [UnicodeProperty::Emoji]
+  include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated]
+  include_examples 'parse', '\p{Hiragana}',                    0 => [UnicodeProperty::Script]
 
   specify('parse abandoned newline property') do
     root = RP.parse('\p{newline}', 'ruby/1.9')