# lexer.rb (forked from ammar/regexp_parser)
# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth,
# normalizes tokens for the parser, and checks that they are implemented by
# the given syntax flavor.
class Regexp::Lexer
  # Group/assertion tokens after which the nesting level is incremented.
  OPENING_TOKENS = [
    :capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
    :atomic, :options, :options_switch, :named, :absence
  ].freeze

  # Group/assertion tokens at which the nesting level is decremented.
  CLOSING_TOKENS = [:close].freeze

  # Conditional tokens that are merged into the token emitted just before them.
  CONDITION_TOKENS = [:condition, :condition_close].freeze

  # Convenience entry point: lexes +input+ with a fresh lexer instance.
  # Takes the same arguments and returns the same value as the instance
  # method #lex.
  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
    new.lex(input, syntax, options: options, &block)
  end

  # Scans +input+ and collects the emitted tokens.
  #
  # input   - handed unchanged to Regexp::Scanner.scan.
  # syntax  - handed to Regexp::Syntax.new; the resulting object normalizes
  #           and checks every scanned type/token pair.
  # options - forwarded to Regexp::Scanner.scan.
  # block   - optional; when given, it is called once per collected token.
  #
  # Returns the array of tokens, or, when a block is given, the array of the
  # block's results. Consecutive tokens in the array are connected to each
  # other through their #previous and #next accessors.
  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
    syntax = Regexp::Syntax.new(syntax)

    self.tokens = []
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    self.shift = 0

    last = nil
    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      # closing tokens lower the level before the token itself is created
      ascend(type, token)

      if type == :quantifier && last
        break_literal(last) if last.type == :literal
        break_codepoint_list(last) if last.token == :codepoint_list
        # a split replaces the end of the token array; link the quantifier
        # to the token that is really in front of it, not to the removed one
        last = tokens.last
      end

      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      if type == :conditional && CONDITION_TOKENS.include?(token)
        current = merge_condition(current)
        # the merge removed the former last token from the array
        last = tokens.last
      end

      link(last, current)
      tokens << current
      last = current

      # opening tokens raise the level only for the tokens that follow them
      descend(type, token)
    end

    if block_given?
      tokens.map { |t| block.call(t) }
    else
      tokens
    end
  end

  class << self
    alias :scan :lex
  end

  private

  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift

  # Decrements the matching nesting counter when +token+ closes a group or
  # assertion, a set, or a conditional.
  def ascend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting - 1 if token == :close
    when :conditional
      self.conditional_nesting = conditional_nesting - 1 if token == :close
    end
  end

  # Increments the matching nesting counter when +token+ opens a group or
  # assertion, a set, or a conditional.
  def descend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting + 1 if token == :open
    when :conditional
      self.conditional_nesting = conditional_nesting + 1 if token == :open
    end
  end

  # Connects two adjacent tokens through their #next and #previous accessors.
  # Does nothing when there is no earlier token.
  def link(earlier, later)
    return unless earlier

    earlier.next = later
    later.previous = earlier
  end

  # Swaps the last collected token for the two tokens it was split into and
  # links them to each other and to the token in front of them.
  def replace_last_token(first, second)
    tokens.pop
    link(tokens.last, first)
    link(first, second)
    tokens << first << second
  end

  # called by lex to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier, so that
  # only the final character remains directly in front of the quantifier
  def break_literal(token)
    lead, last, _ = token.text.partition(/.\z/mu)
    return if lead.empty?

    # NOTE(review): the offsets are adjusted by bytesize, which presumes
    # the scanner reports byte positions -- confirm against the scanner.
    head = Regexp::Token.new(:literal, :literal, lead,
                             token.ts, (token.te - last.bytesize),
                             nesting, set_nesting, conditional_nesting)
    tail = Regexp::Token.new(:literal, :literal, last,
                             (token.ts + lead.bytesize), token.te,
                             nesting, set_nesting, conditional_nesting)

    replace_last_token(head, tail)
  end

  # called by lex to break a space-separated codepoint list into two lists
  # when it is followed by a quantifier, so that only the final codepoint
  # remains directly in front of the quantifier
  def break_codepoint_list(token)
    lead, _, tail = token.text.rpartition(' ')
    return if lead.empty?

    head = Regexp::Token.new(:escape, :codepoint_list, lead + '}',
                             token.ts, (token.te - tail.length),
                             nesting, set_nesting, conditional_nesting)
    rest = Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
                             (token.ts + lead.length + 1), (token.te + 3),
                             nesting, set_nesting, conditional_nesting)

    replace_last_token(head, rest)

    # the two new texts are longer than the scanned one, so the offsets of
    # every later token move along
    self.shift = shift + 3 # one space less, but extra \, u, {, and }
  end

  # Removes the last collected token and returns a new :condition token
  # that covers its text and offsets plus those of +current+. The caller
  # links and appends the returned token.
  def merge_condition(current)
    last = tokens.pop
    Regexp::Token.new(:conditional, :condition, last.text + current.text,
                      last.ts, current.te, nesting, set_nesting, conditional_nesting)
  end
end # class Regexp::Lexer