lib/regexp_parser/lexer.rb

# A very thin wrapper around the scanner that breaks quantified literal runs,
# collects emitted tokens into an array, calculates their nesting depth, and
# normalizes tokens for the parser, and checks if they are implemented by the
# given syntax flavor.
class Regexp::Lexer

  OPENING_TOKENS = [
    :capture, :passive, :lookahead, :nlookahead, :lookbehind, :nlookbehind,
    :atomic, :options, :options_switch, :named, :absence
  ].freeze

  CLOSING_TOKENS = [:close].freeze

  def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
    new.lex(input, syntax, options: options, &block)
  end

  def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
    syntax = Regexp::Syntax.new(syntax)

    self.tokens = []
    self.nesting = 0
    self.set_nesting = 0
    self.conditional_nesting = 0
    self.shift = 0

    last = nil
    Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
      type, token = *syntax.normalize(type, token)
      syntax.check! type, token

      ascend(type, token)

      if type == :quantifier and last
        break_literal(last)        if last.type == :literal
        break_codepoint_list(last) if last.token == :codepoint_list
      end

      current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                  nesting, set_nesting, conditional_nesting)

      current = merge_condition(current) if type == :conditional and
        [:condition, :condition_close].include?(token)

      last.next = current if last
      current.previous = last if last

      tokens << current
      last = current

      descend(type, token)
    end

    if block_given?
      tokens.map { |t| block.call(t) }
    else
      tokens
    end
  end

  class << self
    alias :scan :lex
  end

  private

  attr_accessor :tokens, :nesting, :set_nesting, :conditional_nesting, :shift

  def ascend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting - 1 if CLOSING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting - 1 if token == :close
    when :conditional
      self.conditional_nesting = conditional_nesting - 1 if token == :close
    end
  end

  def descend(type, token)
    case type
    when :group, :assertion
      self.nesting = nesting + 1 if OPENING_TOKENS.include?(token)
    when :set
      self.set_nesting = set_nesting + 1 if token == :open
    when :conditional
      self.conditional_nesting = conditional_nesting + 1 if token == :open
    end
  end

  # called by scan to break a literal run that is longer than one character
  # into two separate tokens when it is followed by a quantifier
  def break_literal(token)
    lead, last, _ = token.text.partition(/.\z/mu)
    return if lead.empty?

    tokens.pop
    tokens << Regexp::Token.new(:literal, :literal, lead,
              token.ts, (token.te - last.bytesize),
              nesting, set_nesting, conditional_nesting)
    tokens << Regexp::Token.new(:literal, :literal, last,
              (token.ts + lead.bytesize), token.te,
              nesting, set_nesting, conditional_nesting)
  end

  def break_codepoint_list(token)
    lead, _, tail = token.text.rpartition(' ')
    return if lead.empty?

    tokens.pop
    tokens << Regexp::Token.new(:escape, :codepoint_list, lead + '}',
              token.ts, (token.te - tail.length),
              nesting, set_nesting, conditional_nesting)
    tokens << Regexp::Token.new(:escape, :codepoint_list, '\u{' + tail,
              (token.ts + lead.length + 1), (token.te + 3),
              nesting, set_nesting, conditional_nesting)

    self.shift = shift + 3 # one space less, but extra \, u, {, and }
  end

  def merge_condition(current)
    last = tokens.pop
    Regexp::Token.new(:conditional, :condition, last.text + current.text,
      last.ts, current.te, nesting, set_nesting, conditional_nesting)
  end

end # module Regexp::Lexer