From 8649a3b57106a72f334cf0503dc9dde45e1098ea Mon Sep 17 00:00:00 2001 From: Ammar Ali Date: Sat, 6 Jun 2020 16:17:10 +0300 Subject: [PATCH 1/2] Support informal delimiter literals --- README.md | 7 +-- lib/regexp_parser/scanner/scanner.rl | 21 ++++++++- spec/lexer/delimiters_spec.rb | 65 ++++++++++++++++++++++++++++ spec/scanner/delimiters_spec.rb | 49 +++++++++++++++++++++ spec/scanner/errors_spec.rb | 1 - 5 files changed, 135 insertions(+), 8 deletions(-) create mode 100644 spec/lexer/delimiters_spec.rb create mode 100644 spec/scanner/delimiters_spec.rb diff --git a/README.md b/README.md index 982f518..5485257 100644 --- a/README.md +++ b/README.md @@ -136,11 +136,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]} to the lexer. * The MRI implementation may accept expressions that either conflict with - the documentation or are undocumented. The scanner does not support such - implementation quirks. - _(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and - [#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_ - + the documentation or are undocumented, like `{}` and `]` _(unescaped)_. + The scanner will try to support as many of these cases as possible. --- ### Syntax diff --git a/lib/regexp_parser/scanner/scanner.rl b/lib/regexp_parser/scanner/scanner.rl index 39f6995..5a0edfe 100644 --- a/lib/regexp_parser/scanner/scanner.rl +++ b/lib/regexp_parser/scanner/scanner.rl @@ -62,9 +62,15 @@ quantifier_possessive = '?+' | '*+' | '++'; quantifier_mode = '?' | '+'; - quantifier_interval = range_open . (digit+)? . ','? . (digit+)? . + quantifier_exact = range_open . (digit+) . range_close . quantifier_mode?; + quantifier_minimum = range_open . (digit+) . ',' . range_close . quantifier_mode?; + quantifier_maximum = range_open . ',' . (digit+) . range_close . quantifier_mode?; + quantifier_range = range_open . (digit+) . ',' . (digit+) . range_close . quantifier_mode?; + quantifier_interval = quantifier_exact | quantifier_minimum | + quantifier_maximum | quantifier_range; + quantifiers = quantifier_greedy | quantifier_reluctant | quantifier_possessive | quantifier_interval; @@ -114,6 +120,8 @@ curlies | parantheses | brackets | line_anchor | quantifier_greedy; + literal_delimiters = ']' | '}' | '{}'; + ascii_print = ((0x20..0x7e) - meta_char); ascii_nonprint = (0x01..0x1f | 0x7f); @@ -417,6 +425,10 @@ end }; + literal_delimiters { + append_literal(data, ts, te) + }; + # Character sets # ------------------------------------------------------------------------ set_open >set_opened { @@ -620,10 +632,15 @@ end }; - quantifier_interval @err(premature_end_error) { + quantifier_interval { emit(:quantifier, :interval, *text(data, ts, te)) }; + # Catch unmatched curly braces as literals + range_open { + append_literal(data, ts, te) + }; + # Escaped sequences # ------------------------------------------------------------------------ backslash > (backslashed, 1) { diff --git a/spec/lexer/delimiters_spec.rb b/spec/lexer/delimiters_spec.rb new file mode 100644 index 0000000..e83964e --- /dev/null +++ b/spec/lexer/delimiters_spec.rb @@ -0,0 +1,65 @@ +require 'spec_helper' + +RSpec.describe('Literal delimiter lexing') do + include_examples 'lex', '}', + 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0] + + include_examples 'lex', '}}', + 0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0] + + include_examples 'lex', '{', + 0 => [:literal, :literal, '{', 0, 1, 0, 0, 0] + + include_examples 'lex', '{{', + 0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0] + + include_examples 'lex', '{}', + 0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0] + + include_examples 'lex', '}{', + 0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0] + + include_examples 'lex', '}{+', + 0 => [:literal, :literal, '}', 0, 1, 0, 0, 0], + 1 => [:literal, :literal, '{', 1, 2, 0, 0, 0], + 2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0] + + include_examples 'lex', '{{var}}', + 0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0] + + include_examples 'lex', 'a{b}c', + 0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0] + + include_examples 'lex', '({.+})', + 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], + 1 => [:literal, :literal, '{', 1, 2, 1, 0, 0], + 2 => [:meta, :dot, '.', 2, 3, 1, 0, 0], + 3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0], + 4 => [:literal, :literal, '}', 4, 5, 1, 0, 0], + 5 => [:group, :close, ')', 5, 6, 0, 0, 0] + + include_examples 'lex', ']', + 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0] + + include_examples 'lex', ']]', + 0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0] + + include_examples 'lex', ']\[', + 0 => [:literal, :literal, ']', 0, 1, 0, 0, 0], + 1 => [:escape, :set_open, '\[', 1, 3, 0, 0, 0] + + include_examples 'lex', '()', + 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], + 1 => [:group, :close, ')', 1, 2, 0, 0, 0] + + include_examples 'lex', '{abc:.+}}}[^}]]}', + 0 => [:literal, :literal, '{abc:', 0, 5, 0, 0, 0], + 1 => [:meta, :dot, '.', 5, 6, 0, 0, 0], + 2 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0], + 3 => [:literal, :literal, '}}}', 7, 10, 0, 0, 0], + 4 => [:set, :open, '[', 10, 11, 0, 0, 0], + 5 => [:set, :negate, '^', 11, 12, 0, 1, 0], + 6 => [:literal, :literal, '}', 12, 13, 0, 1, 0], + 7 => [:set, :close, ']', 13, 14, 0, 0, 0], + 8 => [:literal, :literal, ']}', 14, 16, 0, 0, 0] +end diff --git a/spec/scanner/delimiters_spec.rb b/spec/scanner/delimiters_spec.rb new file mode 100644 index 0000000..e96d957 --- /dev/null +++ b/spec/scanner/delimiters_spec.rb @@ -0,0 +1,49 @@ +require 'spec_helper' + +RSpec.describe('Literal delimiter scanning') do + include_examples 'scan', '}', + 0 => [:literal, :literal, '}', 0, 1] + + include_examples 'scan', '}}', + 0 => [:literal, :literal, '}}', 0, 2] + + include_examples 'scan', '{', + 0 => [:literal, :literal, '{', 0, 1] + + include_examples 'scan', '{{', + 0 => [:literal, :literal, '{{', 0, 2] + + include_examples 'scan', '{}', + 0 => [:literal, :literal, '{}', 0, 2] + + include_examples 'scan', '}{', + 0 => [:literal, :literal, '}{', 0, 2] + + include_examples 'scan', '}{+', + 0 => [:literal, :literal, '}{', 0, 2] + + include_examples 'scan', '{{var}}', + 0 => [:literal, :literal, '{{var}}', 0, 7] + + include_examples 'scan', '({.+})', + 0 => [:group, :capture, '(', 0, 1], + 1 => [:literal, :literal, '{', 1, 2], + 2 => [:meta, :dot, '.', 2, 3], + 3 => [:quantifier, :one_or_more, '+', 3, 4], + 4 => [:literal, :literal, '}', 4, 5], + 5 => [:group, :close, ')', 5, 6] + + include_examples 'scan', ']', + 0 => [:literal, :literal, ']', 0, 1] + + include_examples 'scan', ']]', + 0 => [:literal, :literal, ']]', 0, 2] + + include_examples 'scan', ']\[', + 0 => [:literal, :literal, ']', 0, 1], + 1 => [:escape, :set_open, '\[', 1, 3] + + include_examples 'scan', '()', + 0 => [:group, :capture, '(', 0, 1], + 1 => [:group, :close, ')', 1, 2] +end diff --git a/spec/scanner/errors_spec.rb b/spec/scanner/errors_spec.rb index ff1e716..6158896 100644 --- a/spec/scanner/errors_spec.rb +++ b/spec/scanner/errors_spec.rb @@ -10,7 +10,6 @@ include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[a' include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[[:alpha:]' include_examples 'scan error', RS::PrematureEndError, 'unbalanced group', '(abc' - include_examples 'scan error', RS::PrematureEndError, 'unbalanced interval', 'a{1,2' include_examples 'scan error', RS::PrematureEndError, 'eof in property', '\p{asci' include_examples 'scan error', RS::PrematureEndError, 'incomplete property', '\p{ascii abc' include_examples 'scan error', RS::PrematureEndError, 'eof options', '(?mix' From f94d9a3359c6c236ff81e2d2d34bafddbb1e1ec4 Mon Sep 17 00:00:00 2001 From: Ammar Ali Date: Sun, 7 Jun 2020 17:49:56 +0300 Subject: [PATCH 2/2] Scanner tweaks and added more tests --- lib/regexp_parser/scanner/scanner.rl | 18 ++++++++---------- spec/lexer/delimiters_spec.rb | 3 +++ spec/parser/quantifiers_spec.rb | 1 + spec/scanner/delimiters_spec.rb | 3 +++ 4 files changed, 15 insertions(+), 10 deletions(-) diff --git a/lib/regexp_parser/scanner/scanner.rl b/lib/regexp_parser/scanner/scanner.rl index 5a0edfe..c535947 100644 --- a/lib/regexp_parser/scanner/scanner.rl +++ b/lib/regexp_parser/scanner/scanner.rl @@ -62,19 +62,17 @@ quantifier_possessive = '?+' | '*+' | '++'; quantifier_mode = '?' | '+'; - quantifier_exact = range_open . (digit+) . range_close . quantifier_mode?; - quantifier_minimum = range_open . (digit+) . ',' . range_close . quantifier_mode?; - quantifier_maximum = range_open . ',' . (digit+) . range_close . quantifier_mode?; - quantifier_range = range_open . (digit+) . ',' . (digit+) . - range_close . quantifier_mode?; - - quantifier_interval = quantifier_exact | quantifier_minimum | - quantifier_maximum | quantifier_range; + quantity_exact = (digit+); + quantity_minimum = (digit+) . ','; + quantity_maximum = ',' . (digit+); + quantity_range = (digit+) . ',' . (digit+); + quantifier_interval = range_open . ( quantity_exact | quantity_minimum | + quantity_maximum | quantity_range ) . range_close . + quantifier_mode?; quantifiers = quantifier_greedy | quantifier_reluctant | quantifier_possessive | quantifier_interval; - conditional = '(?('; group_comment = '?#' . [^)]* . group_close; @@ -120,7 +118,7 @@ curlies | parantheses | brackets | line_anchor | quantifier_greedy; - literal_delimiters = ']' | '}' | '{}'; + literal_delimiters = ']' | '}'; ascii_print = ((0x20..0x7e) - meta_char); ascii_nonprint = (0x01..0x1f | 0x7f); diff --git a/spec/lexer/delimiters_spec.rb b/spec/lexer/delimiters_spec.rb index e83964e..5ee5111 100644 --- a/spec/lexer/delimiters_spec.rb +++ b/spec/lexer/delimiters_spec.rb @@ -30,6 +30,9 @@ include_examples 'lex', 'a{b}c', 0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0] + include_examples 'lex', 'a{1,2', + 0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0] + include_examples 'lex', '({.+})', 0 => [:group, :capture, '(', 0, 1, 0, 0, 0], 1 => [:literal, :literal, '{', 1, 2, 1, 0, 0], diff --git a/spec/parser/quantifiers_spec.rb b/spec/parser/quantifiers_spec.rb index 3d0cf2c..6e08e3c 100644 --- a/spec/parser/quantifiers_spec.rb +++ b/spec/parser/quantifiers_spec.rb @@ -35,6 +35,7 @@ include_examples 'quantifier', /a{4}b/, '{4}', :greedy, :interval, 4, 4 include_examples 'quantifier', /a{4}?b/, '{4}?', :reluctant, :interval, 4, 4 include_examples 'quantifier', /a{4}+b/, '{4}+', :possessive, :interval, 4, 4 + include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval, 4, 4 specify('mode-checking methods') do exp = RP.parse(/a??/).first diff --git a/spec/scanner/delimiters_spec.rb b/spec/scanner/delimiters_spec.rb index e96d957..8f31011 100644 --- a/spec/scanner/delimiters_spec.rb +++ b/spec/scanner/delimiters_spec.rb @@ -25,6 +25,9 @@ include_examples 'scan', '{{var}}', 0 => [:literal, :literal, '{{var}}', 0, 7] + include_examples 'scan', 'a{1,2', + 0 => [:literal, :literal, 'a{1,2', 0, 5] + include_examples 'scan', '({.+})', 0 => [:group, :capture, '(', 0, 1], 1 => [:literal, :literal, '{', 1, 2],