From 269ac4c3f027eb60ce814e0c24633c4237ea174a Mon Sep 17 00:00:00 2001
From: Owen Stephens <owen@owenstephens.co.uk>
Date: Sun, 4 Oct 2020 11:33:29 +0100
Subject: [PATCH] Use `regexp_parser` to improve `Style/RedundantRegexp...`
 cops (#8625)

As mentioned in #8593 by @marcandre
---
 CHANGELOG.md                                  |  1 +
 lib/rubocop.rb                                |  1 -
 lib/rubocop/cop/mixin/regexp_literal_help.rb  | 43 ------------
 .../style/redundant_regexp_character_class.rb | 63 +++++++++++-------
 .../cop/style/redundant_regexp_escape.rb      | 23 +++----
 .../redundant_regexp_character_class_spec.rb  | 65 +++++++++++++++++++
 .../cop/style/redundant_regexp_escape_spec.rb | 36 +++++++++-
 7 files changed, 148 insertions(+), 84 deletions(-)
 delete mode 100644 lib/rubocop/cop/mixin/regexp_literal_help.rb

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 460a457cf2a..897aada697d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -20,6 +20,7 @@
 ### Changes
 
 * [#8803](https://github.com/rubocop-hq/rubocop/pull/8803): **(Breaking)** `RegexpNode#parsed_tree` now processes regexps including interpolation (by blanking the interpolation before parsing, rather than skipping). ([@owst][])
+* [#8625](https://github.com/rubocop-hq/rubocop/pull/8625): Improve `Style/RedundantRegexpCharacterClass` and `Style/RedundantRegexpEscape` by using the `regexp_parser` gem. ([@owst][])
 * [#8646](https://github.com/rubocop-hq/rubocop/issues/8646): Faster find of all files in `TargetFinder` class which improves initial startup speed. ([@tleish][])
 * [#8102](https://github.com/rubocop-hq/rubocop/issues/8102): Consider class length instead of block length for `Struct.new`. ([@tejasbubane][])
 
diff --git a/lib/rubocop.rb b/lib/rubocop.rb
index 88984b1e257..5dc90731237 100644
--- a/lib/rubocop.rb
+++ b/lib/rubocop.rb
@@ -105,7 +105,6 @@
 require_relative 'rubocop/cop/mixin/preceding_following_alignment'
 require_relative 'rubocop/cop/mixin/preferred_delimiters'
 require_relative 'rubocop/cop/mixin/rational_literal'
-require_relative 'rubocop/cop/mixin/regexp_literal_help'
 require_relative 'rubocop/cop/mixin/rescue_node'
 require_relative 'rubocop/cop/mixin/safe_assignment'
 require_relative 'rubocop/cop/mixin/space_after_punctuation'
diff --git a/lib/rubocop/cop/mixin/regexp_literal_help.rb b/lib/rubocop/cop/mixin/regexp_literal_help.rb
deleted file mode 100644
index 5f00caa6b7b..00000000000
--- a/lib/rubocop/cop/mixin/regexp_literal_help.rb
+++ /dev/null
@@ -1,43 +0,0 @@
-# frozen_string_literal: true
-
-module RuboCop
-  module Cop
-    # Common functionality for handling Regexp literals.
-    module RegexpLiteralHelp
-      private
-
-      def freespace_mode_regexp?(node)
-        regopt = node.children.find(&:regopt_type?)
-
-        regopt.children.include?(:x)
-      end
-
-      def pattern_source(node)
-        freespace_mode = freespace_mode_regexp?(node)
-
-        node.children.reject(&:regopt_type?).map do |child|
-          source_with_comments_and_interpolations_blanked(child, freespace_mode)
-        end.join
-      end
-
-      def source_with_comments_and_interpolations_blanked(child, freespace_mode)
-        source = child.source
-
-        # We don't want to consider the contents of interpolations or free-space mode comments as
-        # part of the pattern source, but need to preserve their width, to allow offsets to
-        # correctly line up with the original source: spaces have no effect, and preserve width.
-        if child.begin_type?
-          replace_match_with_spaces(source, /.*/m) # replace all content
-        elsif freespace_mode
-          replace_match_with_spaces(source, /(?<!\\)#.*/) # replace any comments
-        else
-          source
-        end
-      end
-
-      def replace_match_with_spaces(source, pattern)
-        source.sub(pattern) { ' ' * Regexp.last_match[0].length }
-      end
-    end
-  end
-end
diff --git a/lib/rubocop/cop/style/redundant_regexp_character_class.rb b/lib/rubocop/cop/style/redundant_regexp_character_class.rb
index 369b8130295..375afc116b0 100644
--- a/lib/rubocop/cop/style/redundant_regexp_character_class.rb
+++ b/lib/rubocop/cop/style/redundant_regexp_character_class.rb
@@ -22,32 +22,14 @@ module Style
       #   # good
       #   r = /[ab]/
       class RedundantRegexpCharacterClass < Base
-        include MatchRange
-        include RegexpLiteralHelp
         extend AutoCorrector
 
+        REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS = '.*+?{}()|$'.chars.freeze
         MSG_REDUNDANT_CHARACTER_CLASS = 'Redundant single-element character class, ' \
         '`%<char_class>s` can be replaced with `%<element>s`.'
 
-        PATTERN = /
-          (
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \[                # Literal [
-            (?!\#\{)          # Not (the start of) an interpolation
-            (?:               # Either...
-             \\[^b] |         # Any escaped character except b (which would change behaviour)
-             [^.*+?{}()|$] |  # or one that doesn't require escaping outside the character class
-             \\[upP]\{[^}]+\} # or a unicode code-point or property
-            )
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \]                # Literal ]
-          )
-        /x.freeze
-
         def on_regexp(node)
           each_redundant_character_class(node) do |loc|
-            next if whitespace_in_free_space_mode?(node, loc)
-
             add_offense(
               loc, message: format(
                 MSG_REDUNDANT_CHARACTER_CLASS,
@@ -63,19 +45,52 @@ def on_regexp(node)
         private
 
         def each_redundant_character_class(node)
-          pattern_source(node).scan(PATTERN) do
-            yield match_range(node.loc.begin.end, Regexp.last_match)
+          each_single_element_character_class(node) do |char_class|
+            next unless redundant_single_element_character_class?(node, char_class)
+
+            yield node.loc.begin.end.adjust(begin_pos: char_class.ts, end_pos: char_class.te)
+          end
+        end
+
+        def each_single_element_character_class(node)
+          node.parsed_tree&.each_expression do |expr|
+            next if expr.type != :set || expr.expressions.size != 1
+            next if expr.negative?
+            next if %i[set posixclass nonposixclass].include?(expr.expressions.first.type)
+
+            yield expr
           end
         end
 
+        def redundant_single_element_character_class?(node, char_class)
+          # Redundant only when the sole element means the same thing without the brackets;
+          # each guard below names a case where the brackets are load-bearing.
+          element = char_class.expressions.first.text
+
+          return false if whitespace_in_free_space_mode?(node, element)
+          return false if backslash_b?(element)
+
+          !requires_escape_outside_char_class?(element)
+        end
+
         def without_character_class(loc)
           loc.source[1..-2]
         end
 
-        def whitespace_in_free_space_mode?(node, loc)
-          return false unless freespace_mode_regexp?(node)
+        def whitespace_in_free_space_mode?(node, elem)
+          # Only extended-mode regexps can qualify; for those, report whether the element
+          # contains a whitespace character.
+          node.extended? ? /\s/.match?(elem) : false
+        end
+
+        def backslash_b?(elem)
+          # `\b` means different things inside and outside a character class: a word boundary
+          # outside, but a backspace (0x08) inside, so `[\b]` cannot be unwrapped.
+          elem == '\b'
+        end
 
-          /\[\s\]/.match?(loc.source)
+        def requires_escape_outside_char_class?(elem)
+          REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS.include?(elem)
         end
       end
     end
diff --git a/lib/rubocop/cop/style/redundant_regexp_escape.rb b/lib/rubocop/cop/style/redundant_regexp_escape.rb
index 43b4b56d328..4f13913d6ca 100644
--- a/lib/rubocop/cop/style/redundant_regexp_escape.rb
+++ b/lib/rubocop/cop/style/redundant_regexp_escape.rb
@@ -34,7 +34,6 @@ module Style
       #   /[+\-]\d/
       class RedundantRegexpEscape < Base
         include RangeHelp
-        include RegexpLiteralHelp
         extend AutoCorrector
 
         MSG_REDUNDANT_ESCAPE = 'Redundant escape inside regexp literal'
@@ -59,9 +58,9 @@ def on_regexp(node)
 
         def allowed_escape?(node, char, within_character_class)
           # Strictly speaking a few single-letter metachars are currently
-          # unnecessary to "escape", e.g. g, i, E, F, but enumerating them is
+          # unnecessary to "escape", e.g. i, E, F, but enumerating them is
           # rather difficult, and their behaviour could change over time with
-          # different versions of Ruby so that e.g. /\g/ != /g/
+          # different versions of Ruby so that e.g. /\i/ != /i/
           return true if /[[:alnum:]]/.match?(char)
           return true if ALLOWED_ALWAYS_ESCAPES.include?(char) || delimiter?(node, char)
 
@@ -82,19 +81,13 @@ def delimiter?(node, char)
         end
 
         def each_escape(node)
-          pattern_source(node).each_char.with_index.reduce(
-            [nil, 0]
-          ) do |(previous, char_class_depth), (current, index)|
-            if previous == '\\'
-              yield [current, index - 1, !char_class_depth.zero?]
-
-              [nil, char_class_depth]
-            elsif previous == '['
-              [current, char_class_depth + 1]
-            elsif current == ']'
-              [current, char_class_depth - 1]
+          node.parsed_tree&.traverse&.reduce(0) do |char_class_depth, (event, expr)|
+            yield(expr.text[1], expr.ts, !char_class_depth.zero?) if expr.type == :escape
+
+            if expr.type == :set
+              char_class_depth + (event == :enter ? 1 : -1)
             else
-              [current, char_class_depth]
+              char_class_depth
             end
           end
         end
diff --git a/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb b/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
index 5fdd836a79d..0b704ff52c8 100644
--- a/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
+++ b/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
@@ -30,6 +30,58 @@
     end
   end
 
+  context 'with a character class containing a single character inside a group' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~RUBY)
+        foo = /([a])/
+                ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~RUBY)
+        foo = /(a)/
+      RUBY
+    end
+  end
+
+  context 'with a character class containing a single range' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[a-z]/')
+    end
+  end
+
+  context 'with a character class containing a posix bracket expression' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]]/')
+    end
+  end
+
+  context 'with a character class containing a negated posix bracket expression' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:^alnum:]]/')
+    end
+  end
+
+  context 'with a character class containing set intersection' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]&&a-d]/')
+    end
+  end
+
+  context 'with a regexp containing invalid \g escape' do
+    it 'registers an offense and corrects' do
+      # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+      # \g should be \g<name>
+      expect_offense(<<~'RUBY')
+        foo = /[a]\g/
+               ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~'RUBY')
+        foo = /a\g/
+      RUBY
+    end
+  end
+
   context 'with a character class containing an escaped ]' do
     it 'registers an offense and corrects' do
       expect_offense(<<~'RUBY')
@@ -224,6 +276,19 @@
     end
   end
 
+  context 'with a redundant character class after an interpolation' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~'RUBY')
+        foo = /#{x}[a]/
+                   ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~'RUBY')
+        foo = /#{x}a/
+      RUBY
+    end
+  end
+
   context 'with a multi-line interpolation' do
     it 'ignores offenses in the interpolated expression' do
       expect_no_offenses(<<~'RUBY')
diff --git a/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb b/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
index f56596ec2b0..f081d983ded 100644
--- a/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
+++ b/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
@@ -29,7 +29,7 @@
     end
 
     [
-      ('a'..'z').to_a - %w[c n p u x],
+      ('a'..'z').to_a - %w[c g k n p u x],
       ('A'..'Z').to_a - %w[C M P],
       %w[n101 x41 u0041 u{0041} cc C-c p{alpha} P{alpha}]
     ].flatten.each do |escape|
@@ -46,6 +46,14 @@
       end
     end
 
+    context 'with an invalid \g escape' do
+      it 'does not register an offense' do
+        # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+        # \g should be \g<name>
+        expect_no_offenses('foo = /\g/')
+      end
+    end
+
     context "with an escaped 'M-a' outside a character class" do
       it 'does not register an offense' do
         expect_no_offenses('foo = /\\M-a/n')
@@ -79,6 +87,19 @@
       end
     end
 
+    context "with an escaped '+' inside a character class inside a group" do
+      it 'registers an offense and corrects' do
+        expect_offense(<<~'RUBY')
+          foo = /([\+])/
+                   ^^ Redundant escape inside regexp literal
+        RUBY
+
+        expect_correction(<<~RUBY)
+          foo = /([+])/
+        RUBY
+      end
+    end
+
     context 'with an escaped . inside a character class beginning with :' do
       it 'registers an offense and corrects' do
         expect_offense(<<~'RUBY')
@@ -421,6 +442,19 @@
       end
     end
 
+    context 'with a # inside a character class' do
+      it 'does not register an offense' do
+        # See https://github.com/rubocop-hq/rubocop/issues/8805 - the # inside the character class
+        # must not be treated as starting a comment (which makes the following \. redundant)
+        expect_no_offenses(<<~'RUBY')
+          regexp = %r{
+            \A[a-z#]            # letters or #
+            \.[a-z]\z           # dot + letters
+          }x
+        RUBY
+      end
+    end
+
     context 'with redundantly-escaped slashes' do
       it 'registers an offense and corrects' do
         expect_offense(<<~'RUBY')