Skip to content

Commit

Permalink
Add support for GCB properties
Browse files Browse the repository at this point in the history
  • Loading branch information
jaynetics committed Jan 7, 2024
1 parent be5ce5b commit b526e4d
Show file tree
Hide file tree
Showing 8 changed files with 59 additions and 13 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Expand Up @@ -14,6 +14,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- implemented `#negative?` / `#negated?` for more applicable expressions
  - `\B`, `\D`, `\H`, `\S`, `\W`, `(?!...)`, `(?<!...)`

### Fixed

- fixed missing support for grapheme cluster break Unicode properties
  - e.g. `/\p{Grapheme_Cluster_Break=Extend}/`

## [2.8.3] - 2023-12-04 - Janosch Müller

### Fixed
Expand Down
8 changes: 4 additions & 4 deletions Gemfile
Expand Up @@ -3,13 +3,13 @@ source 'https://rubygems.org'
gemspec

group :development, :test do
gem 'leto', '~> 2.0'
gem 'rake', '~> 13.0'
gem 'regexp_property_values', '~> 1.4'
gem 'leto', '~> 2.1'
gem 'rake', '~> 13.1'
gem 'regexp_property_values', '~> 1.5'
gem 'rspec', '~> 3.10'
if RUBY_VERSION.to_f >= 2.7
gem 'benchmark-ips', '~> 2.1'
gem 'gouteur', '~> 1.1'
gem 'rubocop', '~> 1.7'
gem 'rubocop', '~> 1.59'
end
end
11 changes: 6 additions & 5 deletions lib/regexp_parser/expression/classes/unicode_property.rb
Expand Up @@ -105,11 +105,12 @@ class PrivateUse < Codepoint::Base; end
class Unassigned < Codepoint::Base; end
end

class Age < UnicodeProperty::Base; end
class Derived < UnicodeProperty::Base; end
class Emoji < UnicodeProperty::Base; end
class Script < UnicodeProperty::Base; end
class Block < UnicodeProperty::Base; end
class Age < UnicodeProperty::Base; end
class Block < UnicodeProperty::Base; end
class Derived < UnicodeProperty::Base; end
class Emoji < UnicodeProperty::Base; end
class Enumerated < UnicodeProperty::Base; end
class Script < UnicodeProperty::Base; end
end

# alias for symmetry between token symbol and Expression class name
Expand Down
1 change: 1 addition & 0 deletions lib/regexp_parser/parser.rb
Expand Up @@ -467,6 +467,7 @@ def property(token)
when *UPTokens::Age; node << UP::Age.new(token, active_opts)
when *UPTokens::Derived; node << UP::Derived.new(token, active_opts)
when *UPTokens::Emoji; node << UP::Emoji.new(token, active_opts)
when *UPTokens::Enumerated; node << UP::Enumerated.new(token, active_opts)
when *UPTokens::Script; node << UP::Script.new(token, active_opts)
when *UPTokens::UnicodeBlock; node << UP::Block.new(token, active_opts)

Expand Down
18 changes: 18 additions & 0 deletions lib/regexp_parser/scanner/properties/long.csv
Expand Up @@ -8,6 +8,7 @@ age=12.1,age=12.1
age=13.0,age=13.0
age=14.0,age=14.0
age=15.0,age=15.0
age=15.1,age=15.1
age=2.0,age=2.0
age=2.1,age=2.1
age=3.0,age=3.0
Expand Down Expand Up @@ -108,6 +109,19 @@ gothic,gothic
grantha,grantha
graph,graph
graphemebase,grapheme_base
graphemeclusterbreak=control,grapheme_cluster_break=control
graphemeclusterbreak=cr,grapheme_cluster_break=cr
graphemeclusterbreak=extend,grapheme_cluster_break=extend
graphemeclusterbreak=l,grapheme_cluster_break=l
graphemeclusterbreak=lf,grapheme_cluster_break=lf
graphemeclusterbreak=lv,grapheme_cluster_break=lv
graphemeclusterbreak=lvt,grapheme_cluster_break=lvt
graphemeclusterbreak=prepend,grapheme_cluster_break=prepend
graphemeclusterbreak=regionalindicator,grapheme_cluster_break=regional_indicator
graphemeclusterbreak=spacingmark,grapheme_cluster_break=spacingmark
graphemeclusterbreak=t,grapheme_cluster_break=t
graphemeclusterbreak=v,grapheme_cluster_break=v
graphemeclusterbreak=zwj,grapheme_cluster_break=zwj
graphemeextend,grapheme_extend
graphemelink,grapheme_link
greek,greek
Expand All @@ -123,11 +137,14 @@ hebrew,hebrew
hexdigit,hex_digit
hiragana,hiragana
hyphen,hyphen
idcompatmathcontinue,id_compat_math_continue
idcompatmathstart,id_compat_math_start
idcontinue,id_continue
ideographic,ideographic
idsbinaryoperator,ids_binary_operator
idstart,id_start
idstrinaryoperator,ids_trinary_operator
idsunaryoperator,ids_unary_operator
imperialaramaic,imperial_aramaic
inadlam,in_adlam
inaegeannumbers,in_aegean_numbers
Expand Down Expand Up @@ -190,6 +207,7 @@ incjkunifiedideographsextensione,in_cjk_unified_ideographs_extension_e
incjkunifiedideographsextensionf,in_cjk_unified_ideographs_extension_f
incjkunifiedideographsextensiong,in_cjk_unified_ideographs_extension_g
incjkunifiedideographsextensionh,in_cjk_unified_ideographs_extension_h
incjkunifiedideographsextensioni,in_cjk_unified_ideographs_extension_i
incombiningdiacriticalmarks,in_combining_diacritical_marks
incombiningdiacriticalmarksextended,in_combining_diacritical_marks_extended
incombiningdiacriticalmarksforsymbols,in_combining_diacritical_marks_for_symbols
Expand Down
1 change: 1 addition & 0 deletions lib/regexp_parser/scanner/properties/short.csv
Expand Up @@ -86,6 +86,7 @@ ideo,ideographic
ids,id_start
idsb,ids_binary_operator
idst,ids_trinary_operator
idsu,ids_unary_operator
ital,old_italic
java,javanese
joinc,join_control
Expand Down
18 changes: 18 additions & 0 deletions lib/regexp_parser/syntax/token/unicode_property.rb
Expand Up @@ -703,6 +703,24 @@ module Category
extended_pictographic
]

Enumerated_V2_4_0 = %i[
grapheme_cluster_break=control
grapheme_cluster_break=cr
grapheme_cluster_break=extend
grapheme_cluster_break=l
grapheme_cluster_break=lf
grapheme_cluster_break=lv
grapheme_cluster_break=lvt
grapheme_cluster_break=prepend
grapheme_cluster_break=regional_indicator
grapheme_cluster_break=spacingmark
grapheme_cluster_break=t
grapheme_cluster_break=v
grapheme_cluster_break=zwj
]

Enumerated = all[:Enumerated_V]

Emoji = all[:Emoji_V]

V1_9_0 = Category::All + POSIX + all[:V1_9_0]
Expand Down
10 changes: 6 additions & 4 deletions spec/parser/properties_spec.rb
Expand Up @@ -35,10 +35,12 @@
include_examples 'parse', '\p{in_bengali}', 0 => [:property, :in_bengali, shortcut: nil]

# test classification
include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age]
include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived]
include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script]
include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block]
include_examples 'parse', '\p{age=5.2}', 0 => [UnicodeProperty::Age]
include_examples 'parse', '\p{InArmenian}', 0 => [UnicodeProperty::Block]
include_examples 'parse', '\p{Math}', 0 => [UnicodeProperty::Derived]
include_examples 'parse', '\p{Emoji}', 0 => [UnicodeProperty::Emoji]
include_examples 'parse', '\p{GraphemeClusterBreak=Extend}', 0 => [UnicodeProperty::Enumerated]
include_examples 'parse', '\p{Hiragana}', 0 => [UnicodeProperty::Script]

specify('parse abandoned newline property') do
root = RP.parse('\p{newline}', 'ruby/1.9')
Expand Down

0 comments on commit b526e4d

Please sign in to comment.