diff --git a/.github/workflows/rubocop.yml b/.github/workflows/rubocop.yml index 35927de4c..f9d181be7 100644 --- a/.github/workflows/rubocop.yml +++ b/.github/workflows/rubocop.yml @@ -74,7 +74,7 @@ jobs: run: bundle exec rake spec - name: internal investigation if: matrix.internal_investigation - run: bundle exec rake internal_investigation + run: bundle exec rake generate internal_investigation rubocop_specs: name: >- Main Gem Specs | RuboCop: ${{ matrix.rubocop }} | ${{ matrix.ruby }} (${{ matrix.os }}) @@ -98,6 +98,8 @@ jobs: ruby-version: ${{ matrix.ruby }} - name: install dependencies run: bundle install --jobs 3 --retry 3 + - name: generate lexer and parser + run: bundle exec rake generate - name: clone rubocop from source for full specs -- master if: matrix.rubocop == 'master' run: git clone --branch ${{ matrix.rubocop }} https://github.com/rubocop-hq/rubocop.git ../rubocop diff --git a/.gitignore b/.gitignore index be1d4d69e..778367d25 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,8 @@ +# generated parser / lexer +/lib/rubocop/ast/node_pattern/parser.racc.rb +/lib/rubocop/ast/node_pattern/parser.output +/lib/rubocop/ast/node_pattern/lexer.rex.rb + # rcov generated coverage coverage.data diff --git a/.rubocop.yml b/.rubocop.yml index 018e4ba80..0be2dabd2 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -13,6 +13,9 @@ AllCops: - 'spec/fixtures/**/*' - 'tmp/**/*' - '.git/**/*' + - 'lib/rubocop/ast/node_pattern/parser.racc.rb' + - 'lib/rubocop/ast/node_pattern/lexer.rex.rb' + - 'spec/rubocop/ast/node_pattern/parse_helper.rb' TargetRubyVersion: 2.4 Naming/PredicateName: diff --git a/.rubocop_todo.yml b/.rubocop_todo.yml index 99518ef9b..2aa934c16 100644 --- a/.rubocop_todo.yml +++ b/.rubocop_todo.yml @@ -32,7 +32,7 @@ Metrics/MethodLength: # Offense count: 1 # Configuration parameters: CountComments. 
Metrics/ModuleLength: - Max: 101 + Max: 108 # Offense count: 1 # Configuration parameters: ExpectMatchingDefinition, Regex, IgnoreExecutableScripts, AllowedAcronyms. @@ -65,6 +65,7 @@ RSpec/ContextWording: - 'spec/rubocop/ast/resbody_node_spec.rb' - 'spec/rubocop/ast/token_spec.rb' - 'spec/spec_helper.rb' + - 'spec/rubocop/ast/node_pattern/helper.rb' # Offense count: 6 # Configuration parameters: Max. @@ -73,6 +74,7 @@ RSpec/ExampleLength: - 'spec/rubocop/ast/node_pattern_spec.rb' - 'spec/rubocop/ast/processed_source_spec.rb' - 'spec/rubocop/ast/send_node_spec.rb' + - 'spec/rubocop/ast/node_pattern/parser_spec.rb' # Offense count: 6 RSpec/LeakyConstantDeclaration: diff --git a/Gemfile b/Gemfile index b56eb4d4e..a25fac139 100644 --- a/Gemfile +++ b/Gemfile @@ -5,8 +5,10 @@ source 'https://rubygems.org' gemspec gem 'bump', require: false +gem 'oedipus_lex', require: false gem 'pry' -gem 'rake', '~> 12.0' +gem 'racc' +gem 'rake', '~> 13.0' gem 'rspec', '~> 3.7' local_ast = File.expand_path('../rubocop', __dir__) if Dir.exist? local_ast diff --git a/Rakefile b/Rakefile index adba5a44f..53c412254 100644 --- a/Rakefile +++ b/Rakefile @@ -15,7 +15,7 @@ end require 'rspec/core/rake_task' -RSpec::Core::RakeTask.new(:spec) do |spec| +RSpec::Core::RakeTask.new(spec: :generate) do |spec| spec.pattern = FileList['spec/**/*_spec.rb'] end diff --git a/docs/modules/ROOT/pages/node_pattern_compiler.adoc b/docs/modules/ROOT/pages/node_pattern_compiler.adoc new file mode 100644 index 000000000..1d8a1bbb8 --- /dev/null +++ b/docs/modules/ROOT/pages/node_pattern_compiler.adoc @@ -0,0 +1,240 @@ += Hacker's guide to the `NodePattern` compiler + +This documentation is aimed at anyone wanting to understand / modify the `NodePattern` compiler. +It assumes some familiarity with the syntax of https://github.com/rubocop-hq/rubocop-ast/blob/master/doc/modules/ROOT/pages/node_pattern.md[`NodePattern`], as well as the AST produced by the `parser` gem. 
+ +== High level view + +The `NodePattern` compiler uses the same techniques as the `parser` gem: + +* a `Lexer` that breaks source into tokens +* a `Parser` that uses tokens and a `Builder` to emit an AST +* a `Compiler` that converts this AST into Ruby code + +Example: + +* Pattern: `+(send nil? {:puts :p} $...)+` +* Tokens: `+'(', [:tNODE_TYPE, :send], [:tPREDICATE, :nil?], '{', ...+` +* AST: `+s(:sequence, s(:node_type, :send), s(:predicate, :nil?), s(:union, ...+` +* Ruby code: ++ +[source,ruby] +---- +node.is_a?(::RuboCop::AST::Node) && node.children.size >= 2 && +node.send_type? && +node.children[0].nil?() && +(union2 = node.children[1]; ... +---- + +The different parts are described below. + +== Vocabulary + +*"node pattern"*: something that can be matched against a single `AST::Node`. +While `(int 42)` and `#is_fun?` both correspond to node patterns, `+...+` (without the parentheses) is not a node pattern. + +*"sequence"*: a node pattern that describes the sequence of children of a node (and its type): `+(type first_child second_child ...)+` + +*"variadic"*: element of a sequence that can match a variable number of children. +`+(send _ int* ...)+` has two variadic elements (`int*` and `+...+`). +`(send _ :name)` contains no variadic element. +Note that a sequence is itself never variadic. + +*"atom"*: element of a pattern that corresponds to a simple Ruby object. +`(send nil? +:puts (str 'hello'))` has two atoms: `:puts` and `'hello'`. + +== Lexer + +The `lexer.rb` defines `Lexer` and has the few definitions needed for the lexer to work. +The bulk of the processing is in the inherited class that is generated by https://github.com/seattlerb/oedipus_lex[`oedipus_lex`]. + +[discrete] +==== Rules + +https://github.com/seattlerb/oedipus_lex[`oedipus_lex`] generates the Ruby file `lexer.rex.rb` from the rules defined in `lexer.rex`. + +These rules map a Regexp to code that emits a token. + +`oedipus_lex` aims to be simple and the generated file is readable.
+It uses https://ruby-doc.org/stdlib-2.7.1/libdoc/strscan/rdoc/StringScanner.html[`StringScanner`] behind the scenes. +It selects the first rule that matches, contrary to many lexing tools that prioritize longest match. + +[discrete] +==== Tokens + +The `Lexer` emits tokens with types that are: + +* strings for the syntactic symbols (e.g. +`'('`, `'$'`, `+'...'+`) +* symbols of the form `:tTOKEN_TYPE` for the rest (e.g. +`:tPREDICATE`) + +Tokens are stored as `[type, value]`. + +[discrete] +==== Generation + +Use `rake generate:lexer` to generate `lexer.rex.rb` from the `lexer.rex` file. +This is done automatically by `rake spec`. + +NOTE: the `lexer.rex.rb` is not under source control, but is included in the gem. + +== Parser + +Similarly to the `Lexer`, the `parser.rb` defines `Parser` and has the few definitions needed for the parser to work. +The bulk of the processing is in the inherited class `parser.racc.rb` that is generated by https://ruby-doc.org/stdlib-2.7.0/libdoc/racc/parser/rdoc/Racc.html#module-Racc-label-Writing+A+Racc+Grammar+File[`racc`] from the rules in `parser.y`. + +[discrete] +==== Nodes + +The `Parser` emits `NodePattern::Node` objects, which are similar to RuboCop's nodes. +They both inherit from ``parser``'s `Parser::AST::Node`, and share additional methods too. + +Like for RuboCop's nodes, some nodes have specialized classes (e.g. +`Sequence`) while other nodes use the base class directly (e.g. +`s(:number, 42)`) + +[discrete] +==== Rules + +The rules follow closely the definitions above. +In particular, a distinction is made between `node_pattern_list`, which is a list of node patterns (each term can match a single node), and the more generic `variadic_pattern_list`, which is a list of elements, some of which could be variadic, others simple node patterns. + +[discrete] +==== Generation + +Similarly to the lexer, use `rake generate:parser` to generate `parser.racc.rb` from the `parser.y` file. +This is done automatically by `rake spec`.
+ +NOTE: the `parser.racc.rb` is not under source control, but is included in the gem. + +== Compiler + +The compiler's core is the `Compiler` class. +It holds the global state (e.g. +references to named arguments). +The goal of the compiler is to produce `matching_code`, Ruby code that can be run against an `AST::Node`, or any Ruby object for that matter. + +Packaging of that `matching_code` into code for a `lambda`, or method `def` is handled separately by the `MethodDefiner` module. + +The compilation itself is handled by three subcompilers: + +* `NodePatternSubcompiler` +* `AtomSubcompiler` +* `SequenceSubcompiler` + +=== Visitors + +The subcompilers use the https://en.wikipedia.org/wiki/Visitor_pattern[visitor pattern]. + +The methods starting with `visit_` are used to process the different types of nodes. +For a node of type `:capture`, the method `visit_capture` will be called, or if none is defined then `visit_other_type` will be called. + +No argument is passed, as the visited node is accessible with the `node` attribute reader. + +=== NodePatternSubcompiler + +Given any `NodePattern::Node`, it generates the Ruby code that can return `true` or `false` for the given node, or node type for sequence head. + +==== `var` vs `access` + +The subcompiler can be called with the current node stored either in a variable (provided with the `var:` keyword argument) or via a Ruby expression (e.g. +`access: 'current_node.children[2]'`). + +The subcompiler will not generate code that executes this `access` expression more than once or twice. +If it might access the node more than that, `multiple_access` will store the result in a temporary variable (e.g. +`union`). + +==== Sequences + +Sequences are the most difficult elements to handle and are deferred to the `SequenceSubcompiler`.
+ +==== Atoms + +Atoms are handled with `visit_other_type`, which defers to the `AtomSubcompiler` and converts that result to a node pattern by appending `=== cur_node` (or `=== cur_node.type` if in sequence head). + +This way, the two arguments in `(_ #func?(%1) %2)` would be compiled differently; +`%1` would be compiled as `param1`, while `%2` gets compiled as `param2 === node.children[1]`. + +==== Precedence + +The code generated has precedence higher than or equal to that of `&&`, so as to make chaining convenient. + +=== AtomSubcompiler + +This subcompiler produces Ruby code that gets evaluated to a Ruby object. +E.g. +`"42"`, `:a_symbol`, `param1`. + +A good way to think about it is as code that can be passed as an argument to a function call. +For example: + +[source,ruby] +---- +# Pattern '#func(42, %1)' compiles to +func(node, 42, param1) +---- + +Note that any node pattern can be output by this subcompiler, but those that don't correspond to a Ruby literal will be output as a lambda so they can be combined. +For example: + +[source,ruby] +---- +# Pattern '#func(int)' compiles to +func(node, ->(compare) { compare.is_a?(::RuboCop::AST::Node) && compare.int_type? }) +---- + +=== SequenceSubcompiler + +The subcompiler compiles the sequences' terms in turn, keeping track of which children of the `AST::Node` are being matched. + +==== Variadic terms + +The complexity comes from variadic elements, which have complex processing _and_ may make it impossible to know at compile time which children are matched by the subsequent terms. + +*Example* (no variadic terms) + +---- +(_type int _ str) +---- + +First child must match `int`, third child must match `str`. +The subcompiler will use `children[0]` and `children[2]`. + +*Example* (one variadic term) + +---- +(_type int _* str) +---- + +First child must match `int` and _last_ child must match `str`. +The subcompiler will use `children[0]` and `children[-1]`.
+ +*Example* (multiple variadic terms) + +---- +(_type int+ sym str+) +---- + +The subcompiler cannot use a fixed integer index with `children[]` to match `sym`. +This must be tracked at runtime in a variable (`cur_index`). + +The subcompiler will use fixed indices before the first variadic element and after the last one. + +==== Node pattern terms + +The node pattern terms are delegated to the `NodePatternSubcompiler`. + +In the pattern `(:sym :sym)`, the two `:sym` will be compiled differently because the first `:sym` is in "sequence head": `:sym === node.type` and `:sym == node.children[0]` respectively. +The subcompiler indicates if the pattern is in "sequence head" or not, so the `NodePatternSubcompiler` can produce the right code. + +Variadic elements may not (currently) cover the sequence head. +As a convenience, `+(...)+` is understood as `+(_ ...)+`. +Other types of nodes will raise an error (e.g. +`()`; +see `Node#in_sequence_head`). + +==== Precedence + +Like the node pattern subcompiler, it generates code that has precedence higher than or equal to that of `&&`, so as to make chaining convenient.
diff --git a/lib/rubocop/ast.rb b/lib/rubocop/ast.rb index a3e78f5c1..fb7e0ee02 100644 --- a/lib/rubocop/ast.rb +++ b/lib/rubocop/ast.rb @@ -6,8 +6,20 @@ require_relative 'ast/ext/range' require_relative 'ast/ext/set' +require_relative 'ast/node_pattern/method_definer' require_relative 'ast/node_pattern' require_relative 'ast/node/mixin/descendence' +require_relative 'ast/node_pattern/builder' +require_relative 'ast/node_pattern/comment' +require_relative 'ast/node_pattern/compiler' +require_relative 'ast/node_pattern/compiler/subcompiler' +require_relative 'ast/node_pattern/compiler/atom_subcompiler' +require_relative 'ast/node_pattern/compiler/binding' +require_relative 'ast/node_pattern/compiler/node_pattern_subcompiler' +require_relative 'ast/node_pattern/compiler/sequence_subcompiler' +require_relative 'ast/node_pattern/lexer' +require_relative 'ast/node_pattern/node' +require_relative 'ast/node_pattern/parser' require_relative 'ast/sexp' require_relative 'ast/node' require_relative 'ast/node/mixin/method_identifier_predicates' diff --git a/lib/rubocop/ast/node_pattern.rb b/lib/rubocop/ast/node_pattern.rb index c95b32f66..1254a5c35 100644 --- a/lib/rubocop/ast/node_pattern.rb +++ b/lib/rubocop/ast/node_pattern.rb @@ -1,13 +1,13 @@ # frozen_string_literal: true require 'delegate' -require 'erb' -# rubocop:disable Metrics/ClassLength, Metrics/CyclomaticComplexity module RuboCop module AST # This class performs a pattern-matching operation on an AST node. # + # Detailed syntax: /doc/modules/ROOT/pages/node_pattern.md + # # Initialize a new `NodePattern` with `NodePattern.new(pattern_string)`, then # pass an AST node to `NodePattern#match`. Alternatively, use one of the class # macros in `NodePattern::Macros` to define your own pattern-matching method. @@ -23,838 +23,7 @@ module AST # - With no block, but multiple captures: captures are returned as an array. # - With no block and no captures: #match returns `true`. 
# - # ## Pattern string format examples - # - # ':sym' # matches a literal symbol - # '1' # matches a literal integer - # 'nil' # matches a literal nil - # 'send' # matches (send ...) - # '(send)' # matches (send) - # '(send ...)' # matches (send ...) - # '(op-asgn)' # node types with hyphenated names also work - # '{send class}' # matches (send ...) or (class ...) - # '({send class})' # matches (send) or (class) - # '(send const)' # matches (send (const ...)) - # '(send _ :new)' # matches (send :new) - # '(send $_ :new)' # as above, but whatever matches the $_ is captured - # '(send $_ $_)' # you can use as many captures as you want - # '(send !const ...)' # ! negates the next part of the pattern - # '$(send const ...)' # arbitrary matching can be performed on a capture - # '(send _recv _msg)' # wildcards can be named (for readability) - # '(send ... :new)' # you can match against the last children - # '(array )' # you can match children in any order. This - # # would match `['x', :y]` as well as `[:y, 'x'] - # '(_ )' # will match if arguments have at least a `str` and - # # a `sym` node, but can have more. - # '(array <$str $_>)' # captures are in the order of the pattern, - # # irrespective of the actual order of the children - # '(array int*)' # will match an array of 0 or more integers - # '(array int ?)' # will match 0 or 1 integer. - # # Note: Space needed to distinguish from int? - # '(array int+)' # will match an array of 1 or more integers - # '(array (int $_)+)' # as above and will capture the numbers in an array - # '(send $...)' # capture all the children as an array - # '(send $... int)' # capture all children but the last as an array - # '(send _x :+ _x)' # unification is performed on named wildcards - # # (like Prolog variables...) - # # (#== is used to see if values unify) - # '(int odd?)' # words which end with a ? 
are predicate methods, - # # are are called on the target to see if it matches - # # any Ruby method which the matched object supports - # # can be used - # # if a truthy value is returned, the match succeeds - # '(int [!1 !2])' # [] contains multiple patterns, ALL of which must - # # match in that position - # # in other words, while {} is pattern union (logical - # # OR), [] is intersection (logical AND) - # '(send %1 _)' # % stands for a parameter which must be supplied to - # # #match at matching time - # # it will be compared to the corresponding value in - # # the AST using #=== so you can pass Procs, Regexp, - # # etc. in addition to Nodes or literals. - # # `Array#===` will never match a node element, but - # # `Set#===` is an alias to `Set#include?` (Ruby 2.5+ - # # only), and so can be very useful to match within - # # many possible literals / Nodes. - # # a bare '%' is the same as '%1' - # # the number of extra parameters passed to #match - # # must equal the highest % value in the pattern - # # for consistency, %0 is the 'root node' which is - # # passed as the 1st argument to #match, where the - # # matching process starts - # '(send _ %named)' # arguments can also be passed as named - # # parameters (see `%1`) - # # Note that the macros `def_node_matcher` and - # # `def_node_search` accept default values for these. - # '(send _ %CONST)' # the named constant will act like `%1` and `%named`. 
- # '^^send' # each ^ ascends one level in the AST - # # so this matches against the grandparent node - # '`send' # descends any number of level in the AST - # # so this matches against any descendant node - # '#method' # we call this a 'funcall'; it calls a method in the - # # context where a pattern-matching method is defined - # # if that returns a truthy value, the match succeeds - # 'equal?(%1)' # predicates can be given 1 or more extra args - # '#method(%0, 1)' # funcalls can also be given 1 or more extra args - # # These arguments can be patterns themselves, in - # # which case a matcher responding to === will be - # # passed. - # '# comment' # comments are accepted at the end of lines - # - # You can nest arbitrarily deep: - # - # # matches node parsed from 'Const = Class.new' or 'Const = Module.new': - # '(casgn nil? :Const (send (const nil? {:Class :Module}) :new))' - # # matches a node parsed from an 'if', with a '==' comparison, - # # and no 'else' branch: - # '(if (send _ :== _) _ nil?)' - # - # Note that patterns like 'send' are implemented by calling `#send_type?` on - # the node being matched, 'const' by `#const_type?`, 'int' by `#int_type?`, - # and so on. Therefore, if you add methods which are named like - # `#prefix_type?` to the AST node class, then 'prefix' will become usable as - # a pattern. class NodePattern - # @private - Invalid = Class.new(StandardError) - - # @private - # Builds Ruby code which implements a pattern - class Compiler - SYMBOL = %r{:(?:[\w+@*/?!<>=~|%^-]+|\[\]=?)}.freeze - IDENTIFIER = /[a-zA-Z_][a-zA-Z0-9_-]*/.freeze - COMMENT = /#\s.*$/.freeze - - META = Regexp.union( - %w"( ) { } [ ] $< < > $... $ ! ^ ` ... + * ?" 
- ).freeze - NUMBER = /-?\d+(?:\.\d+)?/.freeze - STRING = /".+?"/.freeze - METHOD_NAME = /\#?#{IDENTIFIER}[!?]?\(?/.freeze - PARAM_CONST = /%[A-Z:][a-zA-Z_:]+/.freeze - KEYWORD_NAME = /%[a-z_]+/.freeze - PARAM_NUMBER = /%\d*/.freeze - - SEPARATORS = /\s+/.freeze - ONLY_SEPARATOR = /\A#{SEPARATORS}\Z/.freeze - - TOKENS = Regexp.union(META, PARAM_CONST, KEYWORD_NAME, PARAM_NUMBER, NUMBER, - METHOD_NAME, SYMBOL, STRING) - - TOKEN = /\G(?:#{SEPARATORS}|#{TOKENS}|.)/.freeze - - NODE = /\A#{IDENTIFIER}\Z/.freeze - PREDICATE = /\A#{IDENTIFIER}\?\(?\Z/.freeze - WILDCARD = /\A_(?:#{IDENTIFIER})?\Z/.freeze - - FUNCALL = /\A\##{METHOD_NAME}/.freeze - LITERAL = /\A(?:#{SYMBOL}|#{NUMBER}|#{STRING})\Z/.freeze - PARAM = /\A#{PARAM_NUMBER}\Z/.freeze - CONST = /\A#{PARAM_CONST}\Z/.freeze - KEYWORD = /\A#{KEYWORD_NAME}\Z/.freeze - CLOSING = /\A(?:\)|\}|\])\Z/.freeze - - REST = '...' - CAPTURED_REST = '$...' - - attr_reader :match_code, :tokens, :captures - - SEQ_HEAD_INDEX = -1 - - # Placeholders while compiling, see with_..._context methods - CUR_PLACEHOLDER = '@@@cur' - CUR_NODE = "#{CUR_PLACEHOLDER} node@@@" - CUR_ELEMENT = "#{CUR_PLACEHOLDER} element@@@" - SEQ_HEAD_GUARD = '@@@seq guard head@@@' - MULTIPLE_CUR_PLACEHOLDER = /#{CUR_PLACEHOLDER}.*#{CUR_PLACEHOLDER}/.freeze - - line = __LINE__ - ANY_ORDER_TEMPLATE = ERB.new <<~RUBY.gsub("-%>\n", '%>') - <% if capture_rest %>(<%= capture_rest %> = []) && <% end -%> - <% if capture_all %>(<%= capture_all %> = <% end -%> - <%= CUR_NODE %>.children[<%= range %>]<% if capture_all %>)<% end -%> - .each_with_object({}) { |<%= child %>, <%= matched %>| - case - <% patterns.each_with_index do |pattern, i| -%> - when !<%= matched %>[<%= i %>] && <%= - with_context(pattern, child, use_temp_node: false) - %> then <%= matched %>[<%= i %>] = true - <% end -%> - <% if !rest %> else break({}) - <% elsif capture_rest %> else <%= capture_rest %> << <%= child %> - <% end -%> - end - }.size == <%= patterns.size -%> - RUBY - ANY_ORDER_TEMPLATE.location 
= [__FILE__, line + 1] - - line = __LINE__ - REPEATED_TEMPLATE = ERB.new <<~RUBY.gsub("-%>\n", '%>') - <% if captured %>(<%= accumulate %> = Array.new) && <% end %> - <%= CUR_NODE %>.children[<%= range %>].all? do |<%= child %>| - <%= with_context(expr, child, use_temp_node: false) %><% if captured %>&& - <%= accumulate %>.push(<%= captured %>)<% end %> - end <% if captured %>&& - (<%= captured %> = if <%= accumulate %>.empty? - <%= captured %>.map{[]} # Transpose hack won't work for empty case - else - <%= accumulate %>.transpose - end) <% end -%> - RUBY - REPEATED_TEMPLATE.location = [__FILE__, line + 1] - - def initialize(str, root = 'node0', node_var = root) - @string = str - # For def_node_matcher, root == node_var - # For def_node_search, root is the root node to search on, - # and node_var is the current descendant being searched. - @root = root - @node_var = node_var - - @temps = 0 # avoid name clashes between temp variables - @captures = 0 # number of captures seen - @unify = {} # named wildcard -> temp variable - @params = 0 # highest % (param) number seen - @keywords = Set[] # keyword parameters seen - run - end - - def run - @tokens = Compiler.tokens(@string) - - @match_code = with_context(compile_expr, @node_var, use_temp_node: false) - @match_code.prepend("(captures = Array.new(#{@captures})) && ") \ - if @captures.positive? - - fail_due_to('unbalanced pattern') unless tokens.empty? 
- end - - # rubocop:disable Metrics/MethodLength, Metrics/AbcSize - def compile_expr(token = tokens.shift) - # read a single pattern-matching expression from the token stream, - # return Ruby code which performs the corresponding matching operation - # - # the 'pattern-matching' expression may be a composite which - # contains an arbitrary number of sub-expressions, but that composite - # must all have precedence higher or equal to that of `&&` - # - # Expressions may use placeholders like: - # CUR_NODE: Ruby code that evaluates to an AST node - # CUR_ELEMENT: Either the node or the type if in first element of - # a sequence (aka seq_head, e.g. "(seq_head first_node_arg ...") - if (atom = compile_atom(token)) - return atom_to_expr(atom) - end - - case token - when '(' then compile_seq - when '{' then compile_union - when '[' then compile_intersect - when '!' then compile_negation - when '$' then compile_capture - when '^' then compile_ascend - when '`' then compile_descend - when WILDCARD then compile_new_wildcard(token[1..-1]) - when FUNCALL then compile_funcall(token) - when PREDICATE then compile_predicate(token) - when NODE then compile_nodetype(token) - else fail_due_to("invalid token #{token.inspect}") - end - end - # rubocop:enable Metrics/MethodLength, Metrics/AbcSize - - def tokens_until(stop, what) - return to_enum __method__, stop, what unless block_given? 
- - fail_due_to("empty #{what}") if tokens.first == stop - yield until tokens.first == stop - tokens.shift - end - - def compile_seq - terms = tokens_until(')', 'sequence').map { variadic_seq_term } - Sequence.new(self, *terms).compile - end - - def compile_guard_clause - "#{CUR_NODE}.is_a?(RuboCop::AST::Node)" - end - - def variadic_seq_term - token = tokens.shift - case token - when CAPTURED_REST then compile_captured_ellipsis - when REST then compile_ellipsis - when '$<' then compile_any_order(next_capture) - when '<' then compile_any_order - else compile_repeated_expr(token) - end - end - - def compile_repeated_expr(token) - before = @captures - expr = compile_expr(token) - min, max = parse_repetition_token - return [1, expr] if min.nil? - - if @captures != before - captured = "captures[#{before}...#{@captures}]" - accumulate = next_temp_variable(:accumulate) - end - arity = min..max || Float::INFINITY - - [arity, repeated_generator(expr, captured, accumulate)] - end - - def repeated_generator(expr, captured, accumulate) - with_temp_variables do |child| - lambda do |range| - fail_due_to 'repeated pattern at beginning of sequence' if range.begin == SEQ_HEAD_INDEX - REPEATED_TEMPLATE.result(binding) - end - end - end - - def parse_repetition_token - case tokens.first - when '*' then min = 0 - when '+' then min = 1 - when '?' then min = 0 - max = 1 - else return - end - tokens.shift - [min, max] - end - - # @private - # Builds Ruby code for a sequence - # (head *first_terms variadic_term *last_terms) - class Sequence - extend Forwardable - def_delegators :@compiler, :compile_guard_clause, :with_seq_head_context, - :with_child_context, :fail_due_to - - def initialize(compiler, *arity_term_list) - @arities, @terms = arity_term_list.transpose - - @compiler = compiler - @variadic_index = @arities.find_index { |a| a.is_a?(Range) } - fail_due_to 'multiple variable patterns in same sequence' \ - if @variadic_index && !@arities.one? 
{ |a| a.is_a?(Range) } - end - - def compile - [ - compile_guard_clause, - compile_child_nb_guard, - compile_seq_head, - *compile_first_terms, - compile_variadic_term, - *compile_last_terms - ].compact.join(" &&\n") << SEQ_HEAD_GUARD - end - - private - - def first_terms_arity - first_terms_range { |r| @arities[r].inject(0, :+) } || 0 - end - - def last_terms_arity - last_terms_range { |r| @arities[r].inject(0, :+) } || 0 - end - - def variadic_term_min_arity - @variadic_index ? @arities[@variadic_index].begin : 0 - end - - def first_terms_range - yield 1..(@variadic_index || @terms.size) - 1 if seq_head? - end - - def last_terms_range - yield @variadic_index + 1...@terms.size if @variadic_index - end - - def seq_head? - @variadic_index != 0 - end - - def compile_child_nb_guard - fixed = first_terms_arity + last_terms_arity - min = fixed + variadic_term_min_arity - op = if @variadic_index - max_variadic = @arities[@variadic_index].end - if max_variadic != Float::INFINITY - range = min..fixed + max_variadic - return "(#{range}).cover?(#{CUR_NODE}.children.size)" - end - '>=' - else - '==' - end - "#{CUR_NODE}.children.size #{op} #{min}" - end - - def term(index, range) - t = @terms[index] - if t.respond_to? :call - t.call(range) - else - with_child_context(t, range.begin) - end - end - - def compile_seq_head - return unless seq_head? - - fail_due_to 'sequences cannot start with <' \ - if @terms[0].respond_to? 
:call - - with_seq_head_context(@terms[0]) - end - - def compile_first_terms - first_terms_range { |range| compile_terms(range, 0) } - end - - def compile_last_terms - last_terms_range { |r| compile_terms(r, -last_terms_arity) } - end - - def compile_terms(index_range, start) - index_range.map do |i| - current = start - start += @arities.fetch(i) - term(i, current..start - 1) - end - end - - def compile_variadic_term - variadic_arity { |arity| term(@variadic_index, arity) } - end - - def variadic_arity - return unless @variadic_index - - first = @variadic_index.positive? ? first_terms_arity : SEQ_HEAD_INDEX - yield first..-last_terms_arity - 1 - end - end - private_constant :Sequence - - def compile_captured_ellipsis - capture = next_capture - block = lambda { |range| - # Consider ($...) like (_ $...): - range = 0..range.end if range.begin == SEQ_HEAD_INDEX - "(#{capture} = #{CUR_NODE}.children[#{range}])" - } - [0..Float::INFINITY, block] - end - - def compile_ellipsis - [0..Float::INFINITY, 'true'] - end - - # rubocop:disable Metrics/MethodLength - def compile_any_order(capture_all = nil) - rest = capture_rest = nil - patterns = [] - with_temp_variables do |child, matched| - tokens_until('>', 'any child') do - fail_due_to 'ellipsis must be at the end of <>' if rest - token = tokens.shift - case token - when CAPTURED_REST then rest = capture_rest = next_capture - when REST then rest = true - else patterns << compile_expr(token) - end - end - [rest ? patterns.size..Float::INFINITY : patterns.size, - ->(range) { ANY_ORDER_TEMPLATE.result(binding) }] - end - end - # rubocop:enable Metrics/MethodLength - - def insure_same_captures(enum, what) - return to_enum __method__, enum, what unless block_given? 
- - captures_before = captures_after = nil - enum.each do - captures_before ||= @captures - @captures = captures_before - yield - captures_after ||= @captures - fail_due_to("each #{what} must have same # of captures") if captures_after != @captures - end - end - - def access_unify(name) - var = @unify[name] - - if var == :forbidden_unification - fail_due_to "Wildcard #{name} was first seen in a subset of a" \ - " union and can't be used outside that union" - end - var - end - - def forbid_unification(*names) - names.each do |name| - @unify[name] = :forbidden_unification - end - end - - # rubocop:disable Metrics/MethodLength, Metrics/AbcSize - def unify_in_union(enum) - # We need to reset @unify before each branch is processed. - # Moreover we need to keep track of newly encountered wildcards. - # Var `new_unify_intersection` will hold those that are encountered - # in all branches; these are not a problem. - # Var `partial_unify` will hold those encountered in only a subset - # of the branches; these can't be used outside of the union. - - return to_enum __method__, enum unless block_given? - - new_unify_intersection = nil - partial_unify = [] - unify_before = @unify.dup - - result = enum.each do |e| - @unify = unify_before.dup if new_unify_intersection - yield e - new_unify = @unify.keys - unify_before.keys - if new_unify_intersection.nil? 
- # First iteration - new_unify_intersection = new_unify - else - union = new_unify_intersection | new_unify - new_unify_intersection &= new_unify - partial_unify |= union - new_unify_intersection - end - end - - # At this point, all members of `new_unify_intersection` can be used - # for unification outside of the union, but partial_unify may not - - forbid_unification(*partial_unify) - - result - end - # rubocop:enable Metrics/MethodLength, Metrics/AbcSize - - def compile_union - # we need to ensure that each branch of the {} contains the same - # number of captures (since only one branch of the {} can actually - # match, the same variables are used to hold the captures for each - # branch) - enum = tokens_until('}', 'union') - enum = unify_in_union(enum) - terms = insure_same_captures(enum, 'branch of {}') - .map { compile_expr } - - "(#{terms.join(' || ')})" - end - - def compile_intersect - tokens_until(']', 'intersection') - .map { compile_expr } - .join(' && ') - end - - def compile_capture - "(#{next_capture} = #{CUR_ELEMENT}; #{compile_expr})" - end - - def compile_negation - "!(#{compile_expr})" - end - - def compile_ascend - with_context("#{CUR_NODE} && #{compile_expr}", "#{CUR_NODE}.parent") - end - - def compile_descend - with_temp_variables do |descendant| - pattern = with_context(compile_expr, descendant, - use_temp_node: false) - [ - "RuboCop::AST::NodePattern.descend(#{CUR_ELEMENT}).", - "any? do |#{descendant}|", - " #{pattern}", - 'end' - ].join("\n") - end - end - - # Known wildcards are considered atoms, see `compile_atom` - def compile_new_wildcard(name) - return 'true' if name.empty? - - n = @unify[name] = "unify_#{name.gsub('-', '__')}" - # double assign to avoid "assigned but unused variable" - "(#{n} = #{CUR_ELEMENT}; #{n} = #{n}; true)" - end - - def compile_predicate(predicate) - if predicate.end_with?('(') # is there an arglist? 
- args = compile_args - predicate = predicate[0..-2] # drop the trailing ( - "#{CUR_ELEMENT}.#{predicate}(#{args.join(',')})" - else - "#{CUR_ELEMENT}.#{predicate}" - end - end - - def compile_funcall(method) - # call a method in the context which this pattern-matching - # code is used in. pass target value as an argument - method = method[1..-1] # drop the leading # - if method.end_with?('(') # is there an arglist? - args = compile_args - method = method[0..-2] # drop the trailing ( - "#{method}(#{CUR_ELEMENT},#{args.join(',')})" - else - "#{method}(#{CUR_ELEMENT})" - end - end - - def compile_nodetype(type) - "#{compile_guard_clause} && #{CUR_NODE}.#{type.tr('-', '_')}_type?" - end - - def compile_args - tokens_until(')', 'call arguments').map do - arg = compile_arg - tokens.shift if tokens.first == ',' - arg - end - end - - def atom_to_expr(atom) - "#{atom} === #{CUR_ELEMENT}" - end - - def expr_to_atom(expr) - with_temp_variables do |compare| - in_context = with_context(expr, compare, use_temp_node: false) - "::RuboCop::AST::NodePattern::Matcher.new{|#{compare}| #{in_context}}" - end - end - - # @return compiled atom (e.g. ":literal" or "SOME_CONST") - # or nil if not a simple atom (unknown wildcard, other tokens) - def compile_atom(token) - case token - when WILDCARD then access_unify(token[1..-1]) # could be nil - when LITERAL then token - when KEYWORD then get_keyword(token[1..-1]) - when CONST then get_const(token[1..-1]) - when PARAM then get_param(token[1..-1]) - when CLOSING then fail_due_to("#{token} in invalid position") - when nil then fail_due_to('pattern ended prematurely') - end - end - - def compile_arg - token = tokens.shift - compile_atom(token) || expr_to_atom(compile_expr(token)) - end - - def next_capture - index = @captures - @captures += 1 - "captures[#{index}]" - end - - def get_param(number) - number = number.empty? ? 1 : Integer(number) - @params = number if number > @params - number.zero? ? 
@root : "param#{number}" - end - - def get_keyword(name) - @keywords << name - name - end - - def get_const(const) - const # Output the constant exactly as given - end - - def emit_yield_capture(when_no_capture = '') - yield_val = if @captures.zero? - when_no_capture - elsif @captures == 1 - 'captures[0]' # Circumvent https://github.com/jruby/jruby/issues/5710 - else - '*captures' - end - "yield(#{yield_val})" - end - - def emit_retval - if @captures.zero? - 'true' - elsif @captures == 1 - 'captures[0]' - else - 'captures' - end - end - - def emit_param_list - (1..@params).map { |n| "param#{n}" }.join(',') - end - - def emit_keyword_list(forwarding: false) - pattern = "%s: #{'%s' if forwarding}" - @keywords.map { |k| format(pattern, keyword: k) }.join(',') - end - - def emit_params(*first, forwarding: false) - params = emit_param_list - keywords = emit_keyword_list(forwarding: forwarding) - [*first, params, keywords].reject(&:empty?).join(',') - end - - def emit_method_code - <<~RUBY - return unless #{@match_code} - block_given? ? #{emit_yield_capture} : (return #{emit_retval}) - RUBY - end - - def fail_due_to(message) - raise Invalid, "Couldn't compile due to #{message}. Pattern: #{@string}" - end - - def with_temp_node(cur_node) - with_temp_variables do |node| - yield "(#{node} = #{cur_node})", node - end - .gsub("\n", "\n ") # Nicer indent for debugging - end - - def with_temp_variables(&block) - names = block.parameters.map { |_, name| next_temp_variable(name) } - yield(*names) - end - - def next_temp_variable(name) - "#{name}#{next_temp_value}" - end - - def next_temp_value - @temps += 1 - end - - def auto_use_temp_node?(code) - code.match?(MULTIPLE_CUR_PLACEHOLDER) - end - - # with_<...>_context methods are used whenever the context, - # i.e the current node or the current element can be determined. 
- - def with_child_context(code, child_index) - with_context(code, "#{CUR_NODE}.children[#{child_index}]") - end - - def with_context(code, cur_node, - use_temp_node: auto_use_temp_node?(code)) - if use_temp_node - with_temp_node(cur_node) do |init, temp_var| - substitute_cur_node(code, temp_var, first_cur_node: init) - end - else - substitute_cur_node(code, cur_node) - end - end - - def with_seq_head_context(code) - fail_due_to('parentheses at sequence head') if code.include?(SEQ_HEAD_GUARD) - - code.gsub CUR_ELEMENT, "#{CUR_NODE}.type" - end - - def substitute_cur_node(code, cur_node, first_cur_node: cur_node) - iter = 0 - code - .gsub(CUR_ELEMENT, CUR_NODE) - .gsub(CUR_NODE) do - iter += 1 - iter == 1 ? first_cur_node : cur_node - end - .gsub(SEQ_HEAD_GUARD, '') - end - - def self.tokens(pattern) - pattern.gsub(COMMENT, '').scan(TOKEN).grep_v(ONLY_SEPARATOR) - end - - # This method minimizes the closure for our method - def wrapping_block(method_name, **defaults) - proc do |*args, **values| - send method_name, *args, **defaults, **values - end - end - - def def_helper(base, method_name, **defaults) - location = caller_locations(3, 1).first - unless defaults.empty? 
- call = :"without_defaults_#{method_name}" - base.send :define_method, method_name, &wrapping_block(call, **defaults) - method_name = call - end - src = yield method_name - base.class_eval(src, location.path, location.lineno) - end - - def def_node_matcher(base, method_name, **defaults) - def_helper(base, method_name, **defaults) do |name| - <<~RUBY - def #{name}(#{emit_params('node = self')}) - #{emit_method_code} - end - RUBY - end - end - - def def_node_search(base, method_name, **defaults) - def_helper(base, method_name, **defaults) do |name| - emit_node_search(name) - end - end - - def emit_node_search(method_name) - if method_name.to_s.end_with?('?') - on_match = 'return true' - else - args = emit_params(":#{method_name}", @root, forwarding: true) - prelude = "return enum_for(#{args}) unless block_given?\n" - on_match = emit_yield_capture(@node_var) - end - emit_node_search_body(method_name, prelude: prelude, on_match: on_match) - end - - def emit_node_search_body(method_name, prelude:, on_match:) - <<~RUBY - def #{method_name}(#{emit_params(@root)}) - #{prelude} - #{@root}.each_node do |#{@node_var}| - if #{match_code} - #{on_match} - end - end - nil - end - RUBY - end - end - private_constant :Compiler - # Helpers for defining methods based on a pattern string module Macros # Define a method which applies a pattern to an AST node @@ -865,8 +34,7 @@ module Macros # If the node matches, and no block is provided, the new method will # return the captures, or `true` if there were none. def def_node_matcher(method_name, pattern_str, **keyword_defaults) - Compiler.new(pattern_str, 'node') - .def_node_matcher(self, method_name, **keyword_defaults) + NodePattern.new(pattern_str).def_node_matcher(self, method_name, **keyword_defaults) end # Define a method which recurses over the descendants of an AST node, @@ -876,48 +44,60 @@ def def_node_matcher(method_name, pattern_str, **keyword_defaults) # as soon as it finds a descendant which matches. 
Otherwise, it will # yield all descendants which match. def def_node_search(method_name, pattern_str, **keyword_defaults) - Compiler.new(pattern_str, 'node0', 'node') - .def_node_search(self, method_name, **keyword_defaults) + NodePattern.new(pattern_str).def_node_search(self, method_name, **keyword_defaults) end end - attr_reader :pattern + extend Forwardable + include MethodDefiner + Invalid = Class.new(StandardError) - def initialize(str) + VAR = 'node' + + attr_reader :pattern, :ast, :match_code + + def_delegators :@compiler, :captures, :named_parameters, :positional_parameters + + def initialize(str, compiler: Compiler.new) @pattern = str - compiler = Compiler.new(str, 'node0') - src = "def match(#{compiler.emit_params('node0')});" \ - "#{compiler.emit_method_code}end" - instance_eval(src, __FILE__, __LINE__ + 1) + @ast = compiler.parser.new.parse(str) + @compiler = compiler + @match_code = @compiler.compile_as_node_pattern(@ast, var: VAR) + @cache = {} end - def match(*args, **rest) - # If we're here, it's because the singleton method has not been defined, - # either because we've been dup'ed or serialized through YAML - initialize(pattern) - if rest.empty? - match(*args) - else - match(*args, **rest) - end + def match(*args, **rest, &block) + @cache[:lambda] ||= as_lambda + @cache[:lambda].call(*args, block: block, **rest) + end + + def ==(other) + other.is_a?(NodePattern) && other.ast == ast + end + alias eql? == + + def to_s + "#<#{self.class} #{pattern}>" end - def marshal_load(pattern) + def marshal_load(pattern) #:nodoc: initialize pattern end - def marshal_dump + def marshal_dump #:nodoc: pattern end - def ==(other) - other.is_a?(NodePattern) && - Compiler.tokens(other.pattern) == Compiler.tokens(pattern) + def as_json(_options = nil) #:nodoc: + pattern end - alias eql? 
== - def to_s - "#<#{self.class} #{pattern}>" + def encode_with(coder) #:nodoc: + coder['pattern'] = pattern + end + + def init_with(coder) #:nodoc: + initialize(coder['pattern']) end # Yields its argument and any descendants, depth-first. @@ -936,17 +116,11 @@ def self.descend(element, &block) nil end - # @api private - class Matcher - def initialize(&block) - @block = block - end - - def ===(compare) - @block.call(compare) - end + def freeze + @match_code.freeze + @compiler.freeze + super end end end end -# rubocop:enable Metrics/ClassLength, Metrics/CyclomaticComplexity diff --git a/lib/rubocop/ast/node_pattern/builder.rb b/lib/rubocop/ast/node_pattern/builder.rb new file mode 100644 index 000000000..5d4212f02 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/builder.rb @@ -0,0 +1,42 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + # Responsible to build the AST nodes for `NodePattern` + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class Builder + def emit_capture(capture_token, node) + return node if capture_token.nil? 
+ + emit_unary_op(:capture, capture_token, node) + end + + def emit_atom(type, value) + n(type, [value]) + end + + def emit_unary_op(type, _operator = nil, *children) + n(type, children) + end + + def emit_list(type, _begin, children, _end) + n(type, children) + end + + def emit_call(type, selector, args = nil) + _begin_t, arg_nodes, _end_t = args + n(type, [selector, *arg_nodes]) + end + + private + + def n(type, *args) + Node::MAP[type].new(type, *args) + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/comment.rb b/lib/rubocop/ast/node_pattern/comment.rb new file mode 100644 index 000000000..9dc6f53f0 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/comment.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + # A NodePattern comment, simplified version of ::Parser::Source::Comment + class Comment + attr_reader :location + alias loc location + + ## + # @param [Parser::Source::Range] range + # + def initialize(range) + @location = ::Parser::Source::Map.new(range) + freeze + end + + # @return [String] + def text + loc.expression.source.freeze + end + + ## + # Compares comments. Two comments are equal if they + # correspond to the same source range. 
+ # + # @param [Object] other + # @return [Boolean] + # + def ==(other) + other.is_a?(Comment) && + @location == other.location + end + + ## + # @return [String] a human-readable representation of this comment + # + def inspect + "#" + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler.rb b/lib/rubocop/ast/node_pattern/compiler.rb new file mode 100644 index 000000000..56df01691 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler.rb @@ -0,0 +1,104 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + # The top-level compiler holding the global state + # Defers work to its subcompilers + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class Compiler + extend Forwardable + attr_reader :captures, :named_parameters, :positional_parameters, :binding + + def initialize + @temp_depth = 0 # avoid name clashes between temp variables + @captures = 0 # number of captures seen + @positional_parameters = 0 # highest % (param) number seen + @named_parameters = Set[] # keyword parameters + @binding = Binding.new # bound variables + @atom_subcompiler = self.class::AtomSubcompiler.new(self) + end + + def_delegators :binding, :bind + + def positional_parameter(number) + @positional_parameters = number if number > @positional_parameters + "param#{number}" + end + + def named_parameter(name) + @named_parameters << name + name + end + + # Enumerates `enum` while keeping track of state across + # union branches (captures and unification). 
+ def each_union(enum, &block) + enforce_same_captures(binding.union_bind(enum), &block) + end + + def compile_as_atom(node) + @atom_subcompiler.compile(node) + end + + def compile_as_node_pattern(node, **options) + self.class::NodePatternSubcompiler.new(self, **options).compile(node) + end + + def compile_sequence(sequence, var:) + self.class::SequenceSubcompiler.new(self, sequence: sequence, var: var).compile_sequence + end + + def parser + Parser + end + + # Utilities + + def with_temp_variables(*names, &block) + @temp_depth += 1 + suffix = @temp_depth if @temp_depth > 1 + names = block.parameters.map(&:last) if names.empty? + names.map! { |name| "#{name}#{suffix}" } + yield(*names) + ensure + @temp_depth -= 1 + end + + def next_capture + "captures[#{new_capture}]" + end + + def freeze + @named_parameters.freeze + super + end + + private + + def enforce_same_captures(enum) + return to_enum __method__, enum unless block_given? + + captures_before = captures_after = nil + enum.each do |node| + captures_before ||= @captures + @captures = captures_before + yield node + captures_after ||= @captures + if captures_after != @captures + raise Invalid, 'each branch must have same number of captures' + end + end + end + + def new_capture + @captures + ensure + @captures += 1 + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler/atom_subcompiler.rb b/lib/rubocop/ast/node_pattern/compiler/atom_subcompiler.rb new file mode 100644 index 000000000..bca1875fb --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler/atom_subcompiler.rb @@ -0,0 +1,50 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + class Compiler + # Generates code that evaluates to a value (Ruby object) + # This value responds to `===`. 
+ # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class AtomSubcompiler < Subcompiler + private + + def visit_unify + compiler.bind(node.child) do + raise Invalid, 'unified variables can not appear first as argument' + end + end + + def visit_symbol + node.child.inspect + end + alias visit_number visit_symbol + alias visit_string visit_symbol + + def visit_const + node.child + end + + def visit_named_parameter + compiler.named_parameter(node.child) + end + + def visit_positional_parameter + compiler.positional_parameter(node.child) + end + + # Assumes other types are node patterns. + def visit_other_type + compiler.with_temp_variables do |compare| + code = compiler.compile_as_node_pattern(node, var: compare) + "->(#{compare}) { #{code} }" + end + end + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler/binding.rb b/lib/rubocop/ast/node_pattern/compiler/binding.rb new file mode 100644 index 000000000..7359b014b --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler/binding.rb @@ -0,0 +1,78 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + class Compiler + # Holds the list of bound variable names + class Binding + def initialize + @bound = {} + end + + # Yields the first time a given name is bound + # + # @return [String] bound variable name + def bind(name) + var = @bound.fetch(name) do + yield n = @bound[name] = "unify_#{name.gsub('-', '__')}" + n + end + + if var == :forbidden_unification + raise Invalid, "Wildcard #{name} was first seen in a subset of a" \ + " union and can't be used outside that union" + end + var + end + + # rubocop:disable Metrics/MethodLength, Metrics/AbcSize + def union_bind(enum) + # We need to reset @bound before each branch is processed. + # Moreover we need to keep track of newly encountered wildcards. + # Var `newly_bound_intersection` will hold those that are encountered + # in all branches; these are not a problem. 
+ # Var `partially_bound` will hold those encountered in only a subset + # of the branches; these can't be used outside of the union. + + return to_enum __method__, enum unless block_given? + + newly_bound_intersection = nil + partially_bound = [] + bound_before = @bound.dup + + result = enum.each do |e| + @bound = bound_before.dup if newly_bound_intersection + yield e + newly_bound = @bound.keys - bound_before.keys + if newly_bound_intersection.nil? + # First iteration + newly_bound_intersection = newly_bound + else + union = newly_bound_intersection | newly_bound + newly_bound_intersection &= newly_bound + partially_bound |= union - newly_bound_intersection + end + end + + # At this point, all members of `newly_bound_intersection` can be used + # for unification outside of the union, but partially_bound may not + + forbid(partially_bound) + + result + end + # rubocop:enable Metrics/MethodLength, Metrics/AbcSize + + private + + def forbid(names) + names.each do |name| + @bound[name] = :forbidden_unification + end + end + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler/node_pattern_subcompiler.rb b/lib/rubocop/ast/node_pattern/compiler/node_pattern_subcompiler.rb new file mode 100644 index 000000000..5b1c98a10 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler/node_pattern_subcompiler.rb @@ -0,0 +1,146 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + class Compiler + # Compiles code that evaluates to true or false + # for a given value `var` (typically a RuboCop::AST::Node) + # or its `node.type` if `seq_head` is true + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class NodePatternSubcompiler < Subcompiler + attr_reader :access, :seq_head + + def initialize(compiler, var: nil, access: var, seq_head: false) + super(compiler) + @var = var + @access = access + @seq_head = seq_head + end + + private + + def visit_negation + expr = 
compile(node.child) + "!(#{expr})" + end + + def visit_ascend + compiler.with_temp_variables do |ascend| + expr = compiler.compile_as_node_pattern(node.child, var: ascend) + "(#{ascend} = #{access_node}) && (#{ascend} = #{ascend}.parent) && #{expr}" + end + end + + def visit_descend + compiler.with_temp_variables { |descendant| <<~RUBY.chomp } + ::RuboCop::AST::NodePattern.descend(#{access}).any? do |#{descendant}| + #{compiler.compile_as_node_pattern(node.child, var: descendant)} + end + RUBY + end + + def visit_wildcard + 'true' + end + + def visit_unify + name = compiler.bind(node.child) do |unify_name| + # double assign to avoid "assigned but unused variable" + return "(#{unify_name} = #{access_element}; #{unify_name} = #{unify_name}; true)" + end + + compile_value_match(name) + end + + def visit_capture + "(#{compiler.next_capture} = #{access_element}; #{compile(node.child)})" + end + + ### Lists + + def visit_union + multiple_access(:union) do + terms = compiler.each_union(node.children) + .map { |child| compile(child) } + + "(#{terms.join(' || ')})" + end + end + + def visit_intersection + multiple_access(:intersection) do + node.children.map { |child| compile(child) } + .join(' && ') + end + end + + def visit_predicate + "#{access_element}.#{node.method_name}#{compile_args(node.arg_list)}" + end + + def visit_function_call + "#{node.method_name}#{compile_args(node.arg_list, first: access_element)}" + end + + def visit_node_type + "#{access_node}.#{node.child.to_s.tr('-', '_')}_type?" + end + + def visit_sequence + multiple_access(:sequence) do |var| + term = compiler.compile_sequence(node, var: var) + "#{compile_guard_clause} && #{term}" + end + end + + # Assumes other types are atoms. 
+ def visit_other_type + value = compiler.compile_as_atom(node) + compile_value_match(value) + end + + # Compiling helpers + + def compile_value_match(value) + "#{value} === #{access_element}" + end + + # @param [Array, nil] + # @return [String, nil] + def compile_args(arg_list, first: nil) + args = arg_list&.map { |arg| compiler.compile_as_atom(arg) } + args = [first, *args] if first + "(#{args.join(', ')})" if args + end + + def access_element + seq_head ? "#{access}.type" : access + end + + def access_node + return access if seq_head + + "#{compile_guard_clause} && #{access}" + end + + def compile_guard_clause + "#{access}.is_a?(::RuboCop::AST::Node)" + end + + def multiple_access(kind) + return yield @var if @var + + compiler.with_temp_variables(kind) do |var| + memo = "#{var} = #{access}" + @var = @access = var + "(#{memo}; #{yield @var})" + end + end + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler/sequence_subcompiler.rb b/lib/rubocop/ast/node_pattern/compiler/sequence_subcompiler.rb new file mode 100644 index 000000000..27e86d5ba --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler/sequence_subcompiler.rb @@ -0,0 +1,338 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + class Compiler + # Compiles terms within a sequence to code that evaluates to true or false. + # Compilation of the nodes that can match only a single term is deferred to + # `NodePatternSubcompiler`; only nodes that can match multiple terms are + # compiled here. + # Assumes the given `var` is a `::RuboCop::AST::Node` + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + # + # rubocop:disable Metrics/ClassLength + class SequenceSubcompiler < Subcompiler + DELTA = 1 + # Calls `compile_sequence`; the actual `compile` method + # will be used for the different terms of the sequence. 
+ # The only case of re-entrant call to `compile` is `visit_capture` + def initialize(compiler, sequence:, var:) + @seq = sequence # The node to be compiled + @seq_var = var # Holds the name of the variable holding the AST::Node we are matching + super(compiler) + end + + def compile_sequence + # rubocop:disable Layout/CommentIndentation + compiler.with_temp_variables do |cur_child, cur_index, previous_index| + @cur_child_var = cur_child # To hold the current child node + @cur_index_var = cur_index # To hold the current child index (always >= 0) + @prev_index_var = previous_index # To hold the child index before we enter the + # variadic nodes + @cur_index = :seq_head # Can be any of: + # :seq_head : when the current child is actually the + # sequence head + # :variadic_mode : child index held by @cur_index_var + # >= 0 : when the current child index is known + # (from the beginning) + # < 0 : when the index is known from the end, + # where -1 is *past the end*, + # -2 is the last child, etc... + # This shift of 1 from standard Ruby indices + # is stored in DELTA + @in_sync = false # `true` iff `@cur_child_var` and `@cur_index_var` + # correspond to `@cur_index` + # Must be true if `@cur_index` is `:variadic_mode` + compile_terms + end + # rubocop:enable Layout/CommentIndentation + end + + private + + private :compile # Not meant to be called from outside + + # Single node patterns are all handled here + def visit_other_type + access = case @cur_index + when :seq_head + { var: @seq_var, + seq_head: true } + when :variadic_mode + { var: @cur_child_var } + else + idx = @cur_index + (@cur_index.negative? ? DELTA : 0) + { access: "#{@seq_var}.children[#{idx}]" } + end + + term = compiler.compile_as_node_pattern(node, **access) + compile_and_advance(term) + end + + def visit_repetition + within_loop do + child_captures = node.child.nb_captures + child_code = compile(node.child) + next compile_loop(child_code) if child_captures.zero? 
+ + compile_captured_repetition(child_code, child_captures) + end + end + + def visit_any_order + within_loop do + compiler.with_temp_variables do |matched| + case_terms = compile_any_order_branches(matched) + else_code, init = compile_any_order_else + term = "#{compile_case(case_terms, else_code)} && #{compile_loop_advance}" + + all_matched_check = "&&\n#{matched}.size == #{node.term_nodes.size}" if node.rest_node + <<~RUBY + (#{init}#{matched} = {}; true) && + #{compile_loop(term)} #{all_matched_check} \\ + RUBY + end + end + end + + def compile_case(when_branches, else_code) + <<~RUBY + case + #{when_branches.join(' ')} + else #{else_code} + end \\ + RUBY + end + + def compile_any_order_branches(matched_var) + node.term_nodes.map.with_index do |node, i| + code = compiler.compile_as_node_pattern(node, var: @cur_child_var, seq_head: false) + var = "#{matched_var}[#{i}]" + "when !#{var} && #{code} then #{var} = true" + end + end + + # @return [Array] Else code, and init code (if any) + def compile_any_order_else + rest = node.rest_node + if !rest + 'false' + elsif rest.capture? 
+ capture_rest = compiler.next_capture + init = "#{capture_rest} = [];" + ["#{capture_rest} << #{@cur_child_var}", init] + else + 'true' + end + end + + def visit_capture + return visit_other_type if node.child.arity == 1 + + storage = compiler.next_capture + term = compile(node.child) + capture = "#{@seq_var}.children[#{compile_matched(:range)}]" + "#{term} && (#{storage} = #{capture})" + end + + def visit_rest + empty_loop + end + + # Compilation helpers + + def compile_and_advance(term) + case @cur_index + when :variadic_mode + "#{term} && #{compile_loop_advance}" + when :seq_head + # @in_sync = false # already the case + @cur_index = 0 + term + else + @in_sync = false + @cur_index += 1 + term + end + end + + def compile_captured_repetition(child_code, child_captures) + captured_range = "#{compiler.captures - child_captures}...#{compiler.captures}" + captured = "captures[#{captured_range}]" + compiler.with_temp_variables do |accumulate| + code = "#{child_code} && #{accumulate}.push(#{captured})" + <<~RUBY + (#{accumulate} = Array.new) && + #{compile_loop(code)} && + (#{captured} = if #{accumulate}.empty? 
+ (#{captured_range}).map{[]} # Transpose hack won't work for empty case + else + #{accumulate}.transpose + end) \\ + RUBY + end + end + + # Assumes `@cur_index` is already updated + def compile_matched(kind) + to = compile_cur_index + from = if @prev_index == :variadic_mode + @prev_index_used = true + @prev_index_var + else + compile_index(@prev_index) + end + case kind + when :range + "#{from}...#{to}" + when :length + "#{to} - #{from}" + end + end + + def handle_prev + @prev_index = @cur_index + @prev_index_used = false + code = yield + if @prev_index_used + @prev_index_used = false + code = "(#{@prev_index_var} = #{@cur_index_var}; true) && #{code}" + end + + code + end + + def compile_terms(children = @seq.children, last_arity = 0..0) + arities = remaining_arities(children, last_arity) + total_arity = arities.shift + guard = compile_child_nb_guard(total_arity) + return guard if children.empty? + + @remaining_arity = total_arity + terms = children.map do |child| + use_index_from_end + @remaining_arity = arities.shift + handle_prev { compile(child) } + end + [guard, terms].join(" &&\n") + end + + # yield `sync_code` iff not already in sync + def sync + return if @in_sync + + code = compile_loop_advance("= #{compile_cur_index}") + @in_sync = true + yield code + end + + # @return [Array] total arities (as Ranges) of remaining children nodes + # E.g. For sequence `(_ _? <_ _>)`, arities are: 1, 0..1, 2 + # and remaining arities are: 3..4, 2..3, 2..2, 0..0 + def remaining_arities(children, last_arity) + last = last_arity + arities = children + .reverse + .map(&:arity_range) + .map { |r| last = last.begin + r.begin..last.max + r.max } + .reverse! + arities.push last_arity + end + + # @return [String] code that evaluates to `false` if the matched arity is too small + def compile_min_check + return 'false' unless node.variadic? + + unless @remaining_arity.end.infinite? 
+ not_too_much_remaining = "#{compile_remaining} <= #{@remaining_arity.max}" + end + min_to_match = node.arity_range.begin + if min_to_match.positive? + enough_matched = "#{compile_matched(:length)} >= #{min_to_match}" + end + return 'true' unless not_too_much_remaining || enough_matched + + [not_too_much_remaining, enough_matched].compact.join(' && ') + end + + def compile_remaining + "#{@seq_var}.children.size - #{@cur_index_var}" + end + + def compile_max_matched + return node.arity unless node.variadic? + + min_remaining_children = "#{compile_remaining} - #{@remaining_arity.begin}" + return min_remaining_children if node.arity.end.infinite? + + "[#{min_remaining_children}, #{node.arity.max}].min" + end + + def empty_loop + @cur_index = -@remaining_arity.begin - DELTA + @in_sync = false + 'true' + end + + def compile_cur_index + return @cur_index_var if @in_sync + + compile_index + end + + def compile_index(cur = @cur_index) + return cur if cur >= 0 + + "#{@seq_var}.children.size - #{-(cur + DELTA)}" + end + + # Note: assumes `@cur_index != :seq_head`. Node types using `within_loop` must + # have `def in_sequence_head; :raise; end` + def within_loop + sync do |sync_code| + @cur_index = :variadic_mode + "#{sync_code} && #{yield}" + end || yield + end + + # returns truthy iff `@cur_index` switched to relative from end mode (i.e. 
< 0) + def use_index_from_end + return if @cur_index == :seq_head || @remaining_arity.begin != @remaining_arity.max + + @cur_index = -@remaining_arity.begin - DELTA + end + + def compile_loop_advance(to = '+=1') + # The `#{@cur_child_var} ||` is just to avoid unused variable warning + "(#{@cur_child_var} = #{@seq_var}.children[#{@cur_index_var} #{to}]; " \ + "#{@cur_child_var} || true)" + end + + def compile_loop(term) + <<~RUBY + (#{compile_max_matched}).times do + break #{compile_min_check} unless #{term} + end \\ + RUBY + end + + def compile_child_nb_guard(arity_range) + # The -1 are because of seq_head + case arity_range.max + when Float::INFINITY + "#{@seq_var}.children.size >= #{arity_range.begin - 1}" + when arity_range.begin + "#{@seq_var}.children.size == #{arity_range.begin - 1}" + else + "(#{arity_range.begin - 1}..#{arity_range.max - 1}).cover?(#{@seq_var}.children.size)" + end + end + end + # rubocop:enable Metrics/ClassLength + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/compiler/subcompiler.rb b/lib/rubocop/ast/node_pattern/compiler/subcompiler.rb new file mode 100644 index 000000000..299d2c5ae --- /dev/null +++ b/lib/rubocop/ast/node_pattern/compiler/subcompiler.rb @@ -0,0 +1,57 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + class Compiler + # Base class for subcompilers + # Implements visitor pattern + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class Subcompiler + attr_reader :compiler + + def initialize(compiler) + @compiler = compiler + @node = nil + end + + def compile(node) + prev = @node + @node = node + do_compile + ensure + @node = prev + end + + # @api private + + private + + attr_reader :node + + def do_compile + send(self.class.registry.fetch(node.type, :visit_other_type)) + end + + @registry = {} + class << self + attr_reader :registry + + def method_added(method) + @registry[Regexp.last_match(1).to_sym] = method if method 
=~ /^visit_(.*)/ + super + end + + def inherited(base) + us = self + base.class_eval { @registry = us.registry.dup } + super + end + end + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/lexer.rb b/lib/rubocop/ast/node_pattern/lexer.rb new file mode 100644 index 000000000..698ee0912 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/lexer.rb @@ -0,0 +1,54 @@ +# frozen_string_literal: true + +begin + require_relative 'lexer.rex' +rescue LoadError + msg = '*** You must run `rake generate` to generate the lexer and the parser ***' + puts '*' * msg.length, msg, '*' * msg.length + raise +end + +module RuboCop + module AST + class NodePattern + # Lexer class for `NodePattern` + # + # Doc on how this fits in the compiling process: + # /doc/modules/ROOT/pages/node_pattern.md + class Lexer < LexerRex + Error = ScanError + + attr_reader :source_buffer, :comments, :tokens + + def initialize(source) + @tokens = [] + super() + parse(source) + end + + private + + # @return [token] + def emit(type) + value = ss.captures.first || ss.matched + value = yield value if block_given? + token = token(type, value) + @tokens << token + token + end + + def emit_comment + nil + end + + def do_parse + # Called by the generated `parse` method, do nothing here. + end + + def token(type, value) + [type, value] + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/lexer.rex b/lib/rubocop/ast/node_pattern/lexer.rex new file mode 100644 index 000000000..03e2b6261 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/lexer.rex @@ -0,0 +1,36 @@ +# The only difficulty is to distinguish: `fn(argument)` from `fn (sequence)`. +# The presence of the whitespace determines if it is an _argument_ to the +# function call `fn` or if a _sequence_ follows the function call. +# +# If there is the potential for an argument list, the lexer enters the state `:ARG`. +# The rest of the times, the state is `nil`. +# +# In case of an argument list, :tARG_LIST is emitted instead of a '('. 
+# Therefore, the token '(' always signals the beginning of a sequence. + +class RuboCop::AST::NodePattern::LexerRex + +macros + SYMBOL_NAME /[\w+@*\/?!<>=~|%^-]+|\[\]=?/ + IDENTIFIER /[a-zA-Z_][a-zA-Z0-9_-]*/ +rules + /\s+/ + /:(#{SYMBOL_NAME})/o { emit :tSYMBOL, &:to_sym } + /"(.+?)"/ { emit :tSTRING } + /[-+]?\d+\.\d+/ { emit :tNUMBER, &:to_f } + /[-+]?\d+/ { emit :tNUMBER, &:to_i } + /#{Regexp.union( + %w"( ) { } [ ] < > $ ! ^ ` ... + * ? ," + )}/o { emit ss.matched, &:to_sym } + /%([A-Z:][a-zA-Z_:]+)/ { emit :tPARAM_CONST } + /%([a-z_]+)/ { emit :tPARAM_NAMED } + /%(\d*)/ { emit(:tPARAM_NUMBER) { |s| s.empty? ? 1 : s.to_i } } # Map `%` to `%1` + /_(#{IDENTIFIER})/o { emit :tUNIFY } + /_/o { emit :tWILDCARD } + /\#(#{IDENTIFIER}[!?]?)/o { @state = :ARG; emit :tFUNCTION_CALL, &:to_sym } + /#{IDENTIFIER}\?/o { @state = :ARG; emit :tPREDICATE, &:to_sym } + /#{IDENTIFIER}/o { emit :tNODE_TYPE, &:to_sym } + :ARG /\(/ { @state = nil; emit :tARG_LIST } + :ARG // { @state = nil } + /\#.*/ { emit_comment } +end diff --git a/lib/rubocop/ast/node_pattern/method_definer.rb b/lib/rubocop/ast/node_pattern/method_definer.rb new file mode 100644 index 000000000..6caac18f8 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/method_definer.rb @@ -0,0 +1,143 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + # Functionality to turn `match_code` into methods/lambda + module MethodDefiner + def def_node_matcher(base, method_name, **defaults) + def_helper(base, method_name, **defaults) do |name| + params = emit_params('param0 = self') + <<~RUBY + def #{name}(#{params}) + #{VAR} = param0 + #{compile_init} + #{emit_method_code} + end + RUBY + end + end + + def def_node_search(base, method_name, **defaults) + def_helper(base, method_name, **defaults) do |name| + emit_node_search(name) + end + end + + def compile_as_lambda + <<~RUBY + ->(#{emit_params('param0')}, block: nil) do + #{VAR} = param0 + #{compile_init} + #{emit_lambda_code} + end + RUBY + end 
+ + def as_lambda + eval(compile_as_lambda) # rubocop:disable Security/Eval + end + + private + + # This method minimizes the closure for our method + def wrapping_block(method_name, **defaults) + proc do |*args, **values| + send method_name, *args, **defaults, **values + end + end + + def def_helper(base, method_name, **defaults) + location = caller_locations(3, 1).first + unless defaults.empty? + call = :"without_defaults_#{method_name}" + base.send :define_method, method_name, &wrapping_block(call, **defaults) + method_name = call + end + src = yield method_name + base.class_eval(src, location.path, location.lineno) + end + + def emit_node_search(method_name) + if method_name.to_s.end_with?('?') + on_match = 'return true' + else + args = emit_params(":#{method_name}", 'param0', forwarding: true) + prelude = "return enum_for(#{args}) unless block_given?\n" + on_match = emit_yield_capture(VAR) + end + emit_node_search_body(method_name, prelude: prelude, on_match: on_match) + end + + def emit_node_search_body(method_name, prelude:, on_match:) + <<~RUBY + def #{method_name}(#{emit_params('param0')}) + #{compile_init} + #{prelude} + param0.each_node do |#{VAR}| + if #{match_code} + #{on_match} + end + end + nil + end + RUBY + end + + def emit_yield_capture(when_no_capture = '', yield_with: 'yield') + yield_val = if captures.zero? + when_no_capture + elsif captures == 1 + 'captures[0]' # Circumvent https://github.com/jruby/jruby/issues/5710 + else + '*captures' + end + "#{yield_with}(#{yield_val})" + end + + def emit_retval + if captures.zero? 
+ 'true' + elsif captures == 1 + 'captures[0]' + else + 'captures' + end + end + + def emit_param_list + (1..positional_parameters).map { |n| "param#{n}" }.join(',') + end + + def emit_keyword_list(forwarding: false) + pattern = "%s: #{'%s' if forwarding}" + named_parameters.map { |k| format(pattern, keyword: k) }.join(',') + end + + def emit_params(*first, forwarding: false) + params = emit_param_list + keywords = emit_keyword_list(forwarding: forwarding) + [*first, params, keywords].reject(&:empty?).join(',') + end + + def emit_method_code + <<~RUBY + return unless #{match_code} + block_given? ? #{emit_yield_capture} : (return #{emit_retval}) + RUBY + end + + def emit_lambda_code + <<~RUBY + return unless #{match_code} + block ? #{emit_yield_capture(yield_with: 'block.call')} : (return #{emit_retval}) + RUBY + end + + def compile_init + "captures = Array.new(#{captures})" if captures.positive? + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/node.rb b/lib/rubocop/ast/node_pattern/node.rb new file mode 100644 index 000000000..86f9ab758 --- /dev/null +++ b/lib/rubocop/ast/node_pattern/node.rb @@ -0,0 +1,200 @@ +# frozen_string_literal: true + +module RuboCop + module AST + class NodePattern + # Base class for AST Nodes of a `NodePattern` + class Node < ::Parser::AST::Node + extend Forwardable + include ::RuboCop::AST::Descendence + + ### + # To be overriden by subclasses + ### + + def rest? + false + end + + def capture? + false + end + + # @return [Integer, Range] An Integer for fixed length terms, otherwise a Range. + # Note: `arity.end` may be `Float::INFINITY` + def arity + 1 + end + + # @return [Array, nil] replace node with result, or `nil` if no change requested. 
+ def in_sequence_head + nil + end + + ### + # Utilities + ### + + # @return [Array<Node>] + def children_nodes + children.grep(Node) + end + + # @return [Node] most nodes have only one child + def child + children[0] + end + + # @return [Integer] nb of captures of that node and its descendants + def nb_captures + children_nodes.sum(&:nb_captures) + end + + # @return [Boolean] returns true iff matches variable number of elements + def variadic? + arity.is_a?(Range) + end + + # @return [Range] arity as a Range + def arity_range + a = arity + a.is_a?(Range) ? a : INT_TO_RANGE[a] + end + + INT_TO_RANGE = Hash.new { |h, k| h[k] = k..k } + private_constant :INT_TO_RANGE + + # :nodoc: + module ForbidInSeqHead + def in_sequence_head + raise NodePattern::Invalid, "A sequence can not start with a #{type}" + end + end + + ### + # Subclasses for specific node types + ### + + # Node class for `$something` + class Capture < Node + # Delegate most introspection methods to its only child + def_delegators :child, :arity, :rest? + + def capture? + true + end + + def nb_captures + 1 + super + end + + def in_sequence_head + wildcard, original_child = child.in_sequence_head + return unless original_child + + [wildcard, self] # ($...) => (_ $...)
+ end + end + + # Node class for `(type first second ...)` + class Sequence < Node + include ForbidInSeqHead + + def initialize(type, children = [], properties = {}) + if (replace = children.first.in_sequence_head) + children = [*replace, *children[1..-1]] + end + + super + end + end + + # Node class for `predicate?(:arg, :list)` + class Predicate < Node + def method_name + children.first + end + + def arg_list + children[1..-1] + end + end + FunctionCall = Predicate + + # Node class for `int+` + class Repetition < Node + include ForbidInSeqHead + + def operator + children[1] + end + + ARITIES = { + '*': 0..Float::INFINITY, + '+': 1..Float::INFINITY, + '?': 0..1 + }.freeze + + def arity + ARITIES[operator] + end + end + + # Node class for `...` + class Rest < Node + ARITY = (0..Float::INFINITY).freeze + private_constant :ARITY + + def rest? + true + end + + def arity + ARITY + end + + def in_sequence_head + [Node.new(:wildcard), self] + end + end + + # Node class for `<int str ...>` + class AnyOrder < Node + include ForbidInSeqHead + + ARITIES = Hash.new { |h, k| h[k] = k - 1..Float::INFINITY } + private_constant :ARITIES + + def term_nodes + ends_with_rest? ? children[0...-1] : children + end + + def ends_with_rest? + children.last.rest? + end + + def rest_node + children.last if ends_with_rest? + end + + def arity + return children.size unless ends_with_rest?
+ + ARITIES[children.size] + end + end + + # Registry + MAP = Hash.new(Node).merge!( + sequence: Sequence, + repetition: Repetition, + rest: Rest, + capture: Capture, + predicate: Predicate, + any_order: AnyOrder, + function_call: FunctionCall + ).freeze + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/parser.rb b/lib/rubocop/ast/node_pattern/parser.rb new file mode 100644 index 000000000..96e75632f --- /dev/null +++ b/lib/rubocop/ast/node_pattern/parser.rb @@ -0,0 +1,60 @@ +# frozen_string_literal: true + +require_relative 'parser.racc' + +module RuboCop + module AST + class NodePattern + # Parser for NodePattern + # Note: class reopened in `parser.racc` + # + # Doc on how this fits in the compiling process: + # /docs/modules/ROOT/pages/node_pattern_compiler.adoc + class Parser < Racc::Parser + extend Forwardable + + Builder = NodePattern::Builder + Lexer = NodePattern::Lexer + + def initialize(builder = self.class::Builder.new) + super() + @builder = builder + end + + ## + # (Similar API to `parser` gem) + # Parses a source and returns the AST. + # + # @param [Parser::Source::Buffer, String] source The source to parse. + # @return [NodePattern::Node] + # + def parse(source) + @lexer = self.class::Lexer.new(source) + ast = do_parse + return ast unless block_given? + + yield ast, @lexer + rescue Lexer::Error => e + raise NodePattern::Invalid, e.message + ensure + @lexer = nil # Don't keep references + end + + def inspect + "<##{self.class}>" + end + + private + + def_delegators :@builder, :emit_list, :emit_unary_op, :emit_atom, :emit_capture, :emit_call + def_delegators :@lexer, :next_token + + # Overrides Racc::Parser's method: + def on_error(token, val, _vstack) + detail = token_to_str(token) || '?'
+ raise NodePattern::Invalid, "parse error on value #{val.inspect} (#{detail})" + end + end + end + end +end diff --git a/lib/rubocop/ast/node_pattern/parser.y b/lib/rubocop/ast/node_pattern/parser.y new file mode 100644 index 000000000..14387069e --- /dev/null +++ b/lib/rubocop/ast/node_pattern/parser.y @@ -0,0 +1,87 @@ +class RuboCop::AST::NodePattern::Parser +options no_result_var +token tSYMBOL tNUMBER tSTRING tWILDCARD tPARAM_NAMED tPARAM_CONST tPARAM_NUMBER + tFUNCTION_CALL tPREDICATE tNODE_TYPE tARG_LIST tUNIFY +rule + node_pattern # @return Node + : '(' variadic_pattern_list ')' { emit_list :sequence, *val } + | '{' node_pattern_list '}' { emit_list :union, *val } + | '[' node_pattern_list ']' { emit_list :intersection, *val } + | '!' node_pattern { emit_unary_op :negation, *val } + | '^' node_pattern { emit_unary_op :ascend, *val } + | '`' node_pattern { emit_unary_op :descend, *val } + | '$' node_pattern { emit_capture(*val) } + | tFUNCTION_CALL args { emit_call :function_call, *val } + | tPREDICATE args { emit_call :predicate, *val } + | tNODE_TYPE { emit_call :node_type, *val } + | atom + ; + + atom # @return Node + : tSYMBOL { emit_atom :symbol, *val } + | tNUMBER { emit_atom :number, *val } + | tSTRING { emit_atom :string, *val } + | tPARAM_CONST { emit_atom :const, *val } + | tPARAM_NAMED { emit_atom :named_parameter, *val } + | tPARAM_NUMBER { emit_atom :positional_parameter, *val } + | tWILDCARD { emit_atom :wildcard, *val } + | tUNIFY { emit_atom :unify, *val } + ; + + variadic_pattern # @return Node + : node_pattern + | node_pattern repetition + { + main, repeat_t = val + emit_unary_op(:repetition, repeat_t, main, repeat_t) + } + | opt_capture '<' node_pattern_list opt_rest '>' + { + opt_capture, bracket, node_pattern_list, opt_rest, close_bracket = val + node_pattern_list << opt_rest if opt_rest + main = emit_list :any_order, bracket, node_pattern_list, close_bracket + emit_capture(opt_capture, main) + } + | rest + ; + + repetition # @return 
Token + : '?' + | '*' + | '+' + ; + + opt_capture # @return Token | nil + : + | '$' + ; + + rest # @return Node + : opt_capture '...' { emit_capture(val[0], emit_atom(:rest, val[1])) } + ; + + opt_rest # @return Node | nil + : + | rest + ; + + args # @return [Token, Array, Token] | nil + : + | tARG_LIST arg_list ')' { val } + ; + + arg_list # @return Array + : node_pattern { val } + | arg_list ',' node_pattern { val[0] << val[2] } + ; + + node_pattern_list # @return Array + : node_pattern { val } + | node_pattern_list node_pattern { val[0] << val[1] } + ; + + variadic_pattern_list # @return Array + : variadic_pattern { val } + | variadic_pattern_list variadic_pattern { val[0] << val[1] } + ; +end diff --git a/rubocop-ast.gemspec b/rubocop-ast.gemspec index d0d84e852..cef4a1d51 100644 --- a/rubocop-ast.gemspec +++ b/rubocop-ast.gemspec @@ -15,7 +15,10 @@ Gem::Specification.new do |s| s.email = 'rubocop@googlegroups.com' s.files = `git ls-files lib LICENSE.txt README.md` - .split($RS) + .split($RS) + %w[ + lib/rubocop/ast/node_pattern/parser.racc.rb + lib/rubocop/ast/node_pattern/lexer.rex.rb + ] s.extra_rdoc_files = ['LICENSE.txt', 'README.md'] s.homepage = 'https://github.com/rubocop-hq/rubocop-ast' s.licenses = ['MIT'] @@ -30,6 +33,7 @@ Gem::Specification.new do |s| } s.add_runtime_dependency('parser', '>= 2.7.1.4') + s.add_runtime_dependency('strscan', '>= 1.0.0') # Ruby 2.4 doesn't provide `captures` s.add_development_dependency('bundler', '>= 1.15.0', '< 3.0') diff --git a/spec/rubocop/ast/node_pattern/helper.rb b/spec/rubocop/ast/node_pattern/helper.rb new file mode 100644 index 000000000..86245c3ce --- /dev/null +++ b/spec/rubocop/ast/node_pattern/helper.rb @@ -0,0 +1,5 @@ +# frozen_string_literal: true + +RSpec.shared_context 'parser' do + let(:parser) { RuboCop::AST::NodePattern::Parser.new } +end diff --git a/spec/rubocop/ast/node_pattern/lexer_spec.rb b/spec/rubocop/ast/node_pattern/lexer_spec.rb new file mode 100644 index 000000000..66b2e1e59 --- 
/dev/null +++ b/spec/rubocop/ast/node_pattern/lexer_spec.rb @@ -0,0 +1,38 @@ +# frozen_string_literal: true + +RSpec.describe RuboCop::AST::NodePattern::Lexer do + let(:source) { '(send nil? #func(:foo) #func (bar))' } + let(:lexer) { RuboCop::AST::NodePattern::Parser::Lexer.new(source) } + let(:tokens) do + tokens = [] + while (token = lexer.next_token) + tokens << token + end + tokens + end + + it 'provides tokens via next_token' do # rubocop:disable RSpec/ExampleLength + type, (text, _range) = tokens[3] + expect(type).to eq :tFUNCTION_CALL + expect(text).to eq :func + + expect(tokens.map(&:first)).to eq [ + '(', + :tNODE_TYPE, + :tPREDICATE, + :tFUNCTION_CALL, :tARG_LIST, :tSYMBOL, ')', + :tFUNCTION_CALL, + '(', :tNODE_TYPE, ')', + ')' + ] + end + + context 'with $type+' do + let(:source) { '(array sym $int+ x)' } + + it 'works' do + expect(tokens.map(&:last)).to eq \ + %i[( array sym $ int + x )] + end + end +end diff --git a/spec/rubocop/ast/node_pattern/parser_spec.rb b/spec/rubocop/ast/node_pattern/parser_spec.rb new file mode 100644 index 000000000..0a6b315a3 --- /dev/null +++ b/spec/rubocop/ast/node_pattern/parser_spec.rb @@ -0,0 +1,15 @@ +# frozen_string_literal: true + +require_relative 'helper' + +RSpec.describe RuboCop::AST::NodePattern::Parser do + include_context 'parser' + + describe 'sequences' do + it 'generates specialized nodes' do + ast = parser.parse('($_)') + expect(ast.class).to eq ::RuboCop::AST::NodePattern::Node::Sequence + expect(ast.child.class).to eq ::RuboCop::AST::NodePattern::Node::Capture + end + end +end diff --git a/spec/rubocop/ast/node_pattern_spec.rb b/spec/rubocop/ast/node_pattern_spec.rb index 79c073481..52c5b8cd4 100644 --- a/spec/rubocop/ast/node_pattern_spec.rb +++ b/spec/rubocop/ast/node_pattern_spec.rb @@ -1647,14 +1647,22 @@ def withargs(foo, bar, qux) context 'with an ellipsis inside and outside' do let(:pattern) { '(array <(str $_) (sym $_) ...> ...)' } + let(:captured_vals) { ['world', :hello] } - it_behaves_like 
'invalid' + it_behaves_like 'multiple capture' end context 'doubled with ellipsis' do - let(:pattern) { '(array <(str $_) ...> <(str $_) ...>)' } + let(:pattern) { '(array <(sym $_) ...> <(int $_) ...>)' } + let(:captured_vals) { [:hello, 3] } - it_behaves_like 'invalid' + it_behaves_like 'multiple capture' + end + + context 'doubled with ellipsis in wrong order' do + let(:pattern) { '(array <(int $_) ...> <(sym $_) ...>)' } + + it_behaves_like 'nonmatching' end context 'nested' do @@ -1684,9 +1692,9 @@ def withargs(foo, bar, qux) end context 'with an ellipsis in the same sequence' do - let(:pattern) { "(array int #{symbol} ...)" } + let(:pattern) { "(array sym #{symbol} ...)" } - it_behaves_like 'invalid' + it { expect(pattern).to match_code(ruby) } end end @@ -1892,9 +1900,10 @@ def withargs(foo, bar, qux) end context 'with doubled ellipsis' do + let(:ruby) { 'foo' } let(:pattern) { '(send ... ...)' } - it_behaves_like 'invalid' + it { expect(pattern).to match_code(ruby) } # yet silly end context 'with doubled comma in arg list' do diff --git a/tasks/compile.rake b/tasks/compile.rake new file mode 100644 index 000000000..095609ff3 --- /dev/null +++ b/tasks/compile.rake @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +Rake.application.rake_require 'oedipus_lex' + +# Patch gem, see https://github.com/seattlerb/oedipus_lex/pull/15 +class OedipusLex + remove_const :RE + RE = %r{(/(?:\\.|[^/])*/[ion]?)}.freeze +end + +def update_file(path) + content = File.read(path) + File.write(path, yield(content)) +end + +ENCODING_COMMENT = '# frozen_string_literal: true' +GENERATED_FILES = %w[ + lib/rubocop/ast/node_pattern/parser.racc.rb + lib/rubocop/ast/node_pattern/lexer.rex.rb +].freeze +desc 'Generate the lexer and parser files.' 
+task generate: %w[generate:lexer generate:parser] + +files = { + lexer: 'lib/rubocop/ast/node_pattern/lexer.rex.rb', + parser: 'lib/rubocop/ast/node_pattern/parser.racc.rb' +} + +CLEAN.include(files.values) +namespace :generate do + files.each do |kind, filename| + desc "Generate just the #{kind}" + task kind => filename do + update_file(filename) do |content| + content.prepend ENCODING_COMMENT, "\n" unless content.start_with?(ENCODING_COMMENT) + content.gsub 'module NodePattern', 'class NodePattern' + end + end + end +end + +rule '.racc.rb' => '.y' do |t| + cmd = "bundle exec racc -l -v -o #{t.name} #{t.source}" + sh cmd +end