Skip to content

Commit

Permalink
Support informal delimiter literals (#64)
Browse files Browse the repository at this point in the history
* Support informal delimiter literals
* Scanner tweaks and added more tests
  • Loading branch information
ammar committed Jun 7, 2020
1 parent a7ea2da commit ae68d66
Show file tree
Hide file tree
Showing 6 changed files with 142 additions and 10 deletions.
7 changes: 2 additions & 5 deletions README.md
Expand Up @@ -136,11 +136,8 @@ Regexp::Scanner.scan( /(cat?([bhm]at)){3,5}/ ).map {|token| token[2]}
to the lexer.

* The MRI implementation may accept expressions that either conflict with
the documentation or are undocumented. The scanner does not support such
implementation quirks.
_(See issues [#3](https://github.com/ammar/regexp_parser/issues/3) and
[#15](https://github.com/ammar/regexp_parser/issues/15) for examples)_

the documentation or are undocumented, like `{}` and `]` _(unescaped)_.
The scanner will try to support as many of these cases as possible.

---
### Syntax
Expand Down
23 changes: 19 additions & 4 deletions lib/regexp_parser/scanner/scanner.rl
Expand Up @@ -62,13 +62,17 @@
quantifier_possessive = '?+' | '*+' | '++';
quantifier_mode = '?' | '+';

quantifier_interval = range_open . (digit+)? . ','? . (digit+)? .
range_close . quantifier_mode?;
quantity_exact = (digit+);
quantity_minimum = (digit+) . ',';
quantity_maximum = ',' . (digit+);
quantity_range = (digit+) . ',' . (digit+);
quantifier_interval = range_open . ( quantity_exact | quantity_minimum |
quantity_maximum | quantity_range ) . range_close .
quantifier_mode?;

quantifiers = quantifier_greedy | quantifier_reluctant |
quantifier_possessive | quantifier_interval;


conditional = '(?(';

group_comment = '?#' . [^)]* . group_close;
Expand Down Expand Up @@ -114,6 +118,8 @@
curlies | parantheses | brackets |
line_anchor | quantifier_greedy;

literal_delimiters = ']' | '}';

ascii_print = ((0x20..0x7e) - meta_char);
ascii_nonprint = (0x01..0x1f | 0x7f);

Expand Down Expand Up @@ -417,6 +423,10 @@
end
};

literal_delimiters {
append_literal(data, ts, te)
};

# Character sets
# ------------------------------------------------------------------------
set_open >set_opened {
Expand Down Expand Up @@ -620,10 +630,15 @@
end
};

quantifier_interval @err(premature_end_error) {
quantifier_interval {
emit(:quantifier, :interval, *text(data, ts, te))
};

# Catch unmatched curly braces as literals
range_open {
append_literal(data, ts, te)
};

# Escaped sequences
# ------------------------------------------------------------------------
backslash > (backslashed, 1) {
Expand Down
68 changes: 68 additions & 0 deletions spec/lexer/delimiters_spec.rb
@@ -0,0 +1,68 @@
require 'spec_helper'

# Lexer expectations for unescaped delimiter characters ('{', '}', ']') that
# do not form an interval quantifier or close a character set.
#
# Every example hands a pattern string and a hash of
# `token index => expected token` to the 'lex' shared examples, which are
# defined outside this file.
#
# Expected token layout, as far as the values below show:
#   [type, token, text, start offset, end offset, <three counters>]
# NOTE(review): the 6th value is 1 only inside '(...)' and the 7th is 1 only
# inside '[...]' below, so they look like group and set nesting depth; the 8th
# is always 0 here (presumably conditional depth) - confirm against the
# shared example definition.
RSpec.describe('Literal delimiter lexing') do
# Lone or repeated closing braces are expected as a single literal token.
include_examples 'lex', '}',
0 => [:literal, :literal, '}', 0, 1, 0, 0, 0]

include_examples 'lex', '}}',
0 => [:literal, :literal, '}}', 0, 2, 0, 0, 0]

# Opening braces that do not start a valid interval are literals too.
include_examples 'lex', '{',
0 => [:literal, :literal, '{', 0, 1, 0, 0, 0]

include_examples 'lex', '{{',
0 => [:literal, :literal, '{{', 0, 2, 0, 0, 0]

# An empty pair of braces is not an interval; it stays one literal.
include_examples 'lex', '{}',
0 => [:literal, :literal, '{}', 0, 2, 0, 0, 0]

include_examples 'lex', '}{',
0 => [:literal, :literal, '}{', 0, 2, 0, 0, 0]

# With a quantifier following, the expected tokens are split so that the
# last character before '+' stands alone as its own literal.
include_examples 'lex', '}{+',
0 => [:literal, :literal, '}', 0, 1, 0, 0, 0],
1 => [:literal, :literal, '{', 1, 2, 0, 0, 0],
2 => [:quantifier, :one_or_more, '+', 2, 3, 0, 0, 0]

# Braces mixed with ordinary characters merge into one literal run.
include_examples 'lex', '{{var}}',
0 => [:literal, :literal, '{{var}}', 0, 7, 0, 0, 0]

include_examples 'lex', 'a{b}c',
0 => [:literal, :literal, 'a{b}c', 0, 5, 0, 0, 0]

# An interval that is never closed is expected to lex as plain literal text.
include_examples 'lex', 'a{1,2',
0 => [:literal, :literal, 'a{1,2', 0, 5, 0, 0, 0]

# Literal braces inside a capture group; the 6th value is 1 between the
# group's open and close tokens.
include_examples 'lex', '({.+})',
0 => [:group, :capture, '(', 0, 1, 0, 0, 0],
1 => [:literal, :literal, '{', 1, 2, 1, 0, 0],
2 => [:meta, :dot, '.', 2, 3, 1, 0, 0],
3 => [:quantifier, :one_or_more, '+', 3, 4, 1, 0, 0],
4 => [:literal, :literal, '}', 4, 5, 1, 0, 0],
5 => [:group, :close, ')', 5, 6, 0, 0, 0]

# A closing bracket with no open set is a literal.
include_examples 'lex', ']',
0 => [:literal, :literal, ']', 0, 1, 0, 0, 0]

include_examples 'lex', ']]',
0 => [:literal, :literal, ']]', 0, 2, 0, 0, 0]

# An escaped opening bracket is an escape token, separate from the
# preceding literal ']'.
include_examples 'lex', ']\[',
0 => [:literal, :literal, ']', 0, 1, 0, 0, 0],
1 => [:escape, :set_open, '\[', 1, 3, 0, 0, 0]

# Parentheses are still expected as group tokens, not literals.
include_examples 'lex', '()',
0 => [:group, :capture, '(', 0, 1, 0, 0, 0],
1 => [:group, :close, ')', 1, 2, 0, 0, 0]

# Combined case: literal braces around a quantified dot, a real negated set
# containing '}', then stray ']' and '}' after the set has closed.
include_examples 'lex', '{abc:.+}}}[^}]]}',
0 => [:literal, :literal, '{abc:', 0, 5, 0, 0, 0],
1 => [:meta, :dot, '.', 5, 6, 0, 0, 0],
2 => [:quantifier, :one_or_more, '+', 6, 7, 0, 0, 0],
3 => [:literal, :literal, '}}}', 7, 10, 0, 0, 0],
4 => [:set, :open, '[', 10, 11, 0, 0, 0],
5 => [:set, :negate, '^', 11, 12, 0, 1, 0],
6 => [:literal, :literal, '}', 12, 13, 0, 1, 0],
7 => [:set, :close, ']', 13, 14, 0, 0, 0],
8 => [:literal, :literal, ']}', 14, 16, 0, 0, 0]
end
1 change: 1 addition & 0 deletions spec/parser/quantifiers_spec.rb
Expand Up @@ -35,6 +35,7 @@
include_examples 'quantifier', /a{4}b/, '{4}', :greedy, :interval, 4, 4
include_examples 'quantifier', /a{4}?b/, '{4}?', :reluctant, :interval, 4, 4
include_examples 'quantifier', /a{4}+b/, '{4}+', :possessive, :interval, 4, 4
include_examples 'quantifier', /a{004}+b/, '{004}+', :possessive, :interval, 4, 4

specify('mode-checking methods') do
exp = RP.parse(/a??/).first
Expand Down
52 changes: 52 additions & 0 deletions spec/scanner/delimiters_spec.rb
@@ -0,0 +1,52 @@
require 'spec_helper'

# Scanner expectations for unescaped delimiter characters ('{', '}', ']')
# that do not form an interval quantifier or close a character set.
#
# Every example hands a pattern string and a hash of
# `token index => expected token` to the 'scan' shared examples, which are
# defined outside this file.
#
# Expected token layout, as far as the values below show:
#   [type, token, text, start offset, end offset]
RSpec.describe('Literal delimiter scanning') do
# Lone or repeated closing braces are expected as a single literal token.
include_examples 'scan', '}',
0 => [:literal, :literal, '}', 0, 1]

include_examples 'scan', '}}',
0 => [:literal, :literal, '}}', 0, 2]

# Opening braces that do not start a valid interval are literals too.
include_examples 'scan', '{',
0 => [:literal, :literal, '{', 0, 1]

include_examples 'scan', '{{',
0 => [:literal, :literal, '{{', 0, 2]

# An empty pair of braces is not an interval; it stays one literal.
include_examples 'scan', '{}',
0 => [:literal, :literal, '{}', 0, 2]

include_examples 'scan', '}{',
0 => [:literal, :literal, '}{', 0, 2]

# Only token 0 is asserted here: the two braces are expected as one merged
# literal. The token for the trailing '+' is not checked by this example.
include_examples 'scan', '}{+',
0 => [:literal, :literal, '}{', 0, 2]

# Braces mixed with ordinary characters merge into one literal run.
include_examples 'scan', '{{var}}',
0 => [:literal, :literal, '{{var}}', 0, 7]

# An interval that is never closed is expected to scan as plain literal text.
include_examples 'scan', 'a{1,2',
0 => [:literal, :literal, 'a{1,2', 0, 5]

# Literal braces inside a capture group, around a quantified dot.
include_examples 'scan', '({.+})',
0 => [:group, :capture, '(', 0, 1],
1 => [:literal, :literal, '{', 1, 2],
2 => [:meta, :dot, '.', 2, 3],
3 => [:quantifier, :one_or_more, '+', 3, 4],
4 => [:literal, :literal, '}', 4, 5],
5 => [:group, :close, ')', 5, 6]

# A closing bracket with no open set is a literal.
include_examples 'scan', ']',
0 => [:literal, :literal, ']', 0, 1]

include_examples 'scan', ']]',
0 => [:literal, :literal, ']]', 0, 2]

# An escaped opening bracket is an escape token, separate from the
# preceding literal ']'.
include_examples 'scan', ']\[',
0 => [:literal, :literal, ']', 0, 1],
1 => [:escape, :set_open, '\[', 1, 3]

# Parentheses are still expected as group tokens, not literals.
include_examples 'scan', '()',
0 => [:group, :capture, '(', 0, 1],
1 => [:group, :close, ')', 1, 2]
end
1 change: 0 additions & 1 deletion spec/scanner/errors_spec.rb
Expand Up @@ -10,7 +10,6 @@
include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[a'
include_examples 'scan error', RS::PrematureEndError, 'unbalanced set', '[[:alpha:]'
include_examples 'scan error', RS::PrematureEndError, 'unbalanced group', '(abc'
include_examples 'scan error', RS::PrematureEndError, 'unbalanced interval', 'a{1,2'
include_examples 'scan error', RS::PrematureEndError, 'eof in property', '\p{asci'
include_examples 'scan error', RS::PrematureEndError, 'incomplete property', '\p{ascii abc'
include_examples 'scan error', RS::PrematureEndError, 'eof options', '(?mix'
Expand Down

0 comments on commit ae68d66

Please sign in to comment.