Use regexp_parser to improve Style/RedundantRegexp... cops (#8625)

As mentioned in #8593 by @marcandre
rubocop · Oct 4, 2020 · 269ac4c · 269ac4c
1 parent 918f7bb
commit 269ac4c
Show file tree

Hide file tree

Showing 7 changed files with 148 additions and 84 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -20,6 +20,7 @@
 ### Changes
 
 * [#8803](https://github.com/rubocop-hq/rubocop/pull/8803): **(Breaking)** `RegexpNode#parsed_tree` now processes regexps including interpolation (by blanking the interpolation before parsing, rather than skipping). ([@owst][])
+* [#8625](https://github.com/rubocop-hq/rubocop/pull/8625): Improve `Style/RedundantRegexpCharacterClass` and `Style/RedundantRegexpEscape` by using `regexp_parser` gem. ([@owst][])
 * [#8646](https://github.com/rubocop-hq/rubocop/issues/8646): Faster find of all files in `TargetFinder` class which improves initial startup speed. ([@tleish][])
 * [#8102](https://github.com/rubocop-hq/rubocop/issues/8102): Consider class length instead of block length for `Struct.new`. ([@tejasbubane][])
 

diff --git a/lib/rubocop.rb b/lib/rubocop.rb
@@ -105,7 +105,6 @@
 require_relative 'rubocop/cop/mixin/preceding_following_alignment'
 require_relative 'rubocop/cop/mixin/preferred_delimiters'
 require_relative 'rubocop/cop/mixin/rational_literal'
-require_relative 'rubocop/cop/mixin/regexp_literal_help'
 require_relative 'rubocop/cop/mixin/rescue_node'
 require_relative 'rubocop/cop/mixin/safe_assignment'
 require_relative 'rubocop/cop/mixin/space_after_punctuation'

diff --git a/lib/rubocop/cop/mixin/regexp_literal_help.rb b/lib/rubocop/cop/mixin/regexp_literal_help.rb
diff --git a/lib/rubocop/cop/style/redundant_regexp_character_class.rb b/lib/rubocop/cop/style/redundant_regexp_character_class.rb
@@ -22,32 +22,14 @@ module Style
       #   # good
       #   r = /[ab]/
       class RedundantRegexpCharacterClass < Base
-        include MatchRange
-        include RegexpLiteralHelp
         extend AutoCorrector
 
+        REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS = '.*+?{}()|$'.chars.freeze
         MSG_REDUNDANT_CHARACTER_CLASS = 'Redundant single-element character class, ' \
         '`%<char_class>s` can be replaced with `%<element>s`.'
 
-        PATTERN = /
-          (
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \[                # Literal [
-            (?!\#\{)          # Not (the start of) an interpolation
-            (?:               # Either...
-             \\[^b] |         # Any escaped character except b (which would change behaviour)
-             [^.*+?{}()|$] |  # or one that doesn't require escaping outside the character class
-             \\[upP]\{[^}]+\} # or a unicode code-point or property
-            )
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \]                # Literal ]
-          )
-        /x.freeze
-
         def on_regexp(node)
           each_redundant_character_class(node) do |loc|
-            next if whitespace_in_free_space_mode?(node, loc)
-
             add_offense(
               loc, message: format(
                 MSG_REDUNDANT_CHARACTER_CLASS,
@@ -63,19 +45,52 @@ def on_regexp(node)
         private
 
         def each_redundant_character_class(node)
-          pattern_source(node).scan(PATTERN) do
-            yield match_range(node.loc.begin.end, Regexp.last_match)
+          each_single_element_character_class(node) do |char_class|
+            next unless redundant_single_element_character_class?(node, char_class)
+
+            yield node.loc.begin.adjust(begin_pos: 1 + char_class.ts, end_pos: char_class.te)
+          end
+        end
+
+        def each_single_element_character_class(node)
+          node.parsed_tree&.each_expression do |expr|
+            next if expr.type != :set || expr.expressions.size != 1
+            next if expr.negative?
+            next if %i[set posixclass nonposixclass].include?(expr.expressions.first.type)
+
+            yield expr
           end
         end
 
+        def redundant_single_element_character_class?(node, char_class)
+          class_elem = char_class.expressions.first.text
+
+          non_redundant =
+            whitespace_in_free_space_mode?(node, class_elem) ||
+            backslash_b?(class_elem) ||
+            requires_escape_outside_char_class?(class_elem)
+
+          !non_redundant
+        end
+
         def without_character_class(loc)
           loc.source[1..-2]
         end
 
-        def whitespace_in_free_space_mode?(node, loc)
-          return false unless freespace_mode_regexp?(node)
+        def whitespace_in_free_space_mode?(node, elem)
+          return false unless node.extended?
+
+          /\s/.match?(elem)
+        end
+
+        def backslash_b?(elem)
+          # \b's behaviour is different inside and outside of a character class, matching word
+          # boundaries outside but backspace (0x08) when inside.
+          elem == '\b'
+        end
 
-          /\[\s\]/.match?(loc.source)
+        def requires_escape_outside_char_class?(elem)
+          REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS.include?(elem)
         end
       end
     end

diff --git a/lib/rubocop/cop/style/redundant_regexp_escape.rb b/lib/rubocop/cop/style/redundant_regexp_escape.rb
@@ -34,7 +34,6 @@ module Style
       #   /[+\-]\d/
       class RedundantRegexpEscape < Base
         include RangeHelp
-        include RegexpLiteralHelp
         extend AutoCorrector
 
         MSG_REDUNDANT_ESCAPE = 'Redundant escape inside regexp literal'
@@ -59,9 +58,9 @@ def on_regexp(node)
 
         def allowed_escape?(node, char, within_character_class)
           # Strictly speaking a few single-letter metachars are currently
-          # unnecessary to "escape", e.g. g, i, E, F, but enumerating them is
+          # unnecessary to "escape", e.g. i, E, F, but enumerating them is
           # rather difficult, and their behaviour could change over time with
-          # different versions of Ruby so that e.g. /\g/ != /g/
+          # different versions of Ruby so that e.g. /\i/ != /i/
           return true if /[[:alnum:]]/.match?(char)
           return true if ALLOWED_ALWAYS_ESCAPES.include?(char) || delimiter?(node, char)
 
@@ -82,19 +81,13 @@ def delimiter?(node, char)
         end
 
         def each_escape(node)
-          pattern_source(node).each_char.with_index.reduce(
-            [nil, 0]
-          ) do |(previous, char_class_depth), (current, index)|
-            if previous == '\\'
-              yield [current, index - 1, !char_class_depth.zero?]
-
-              [nil, char_class_depth]
-            elsif previous == '['
-              [current, char_class_depth + 1]
-            elsif current == ']'
-              [current, char_class_depth - 1]
+          node.parsed_tree&.traverse&.reduce(0) do |char_class_depth, (event, expr)|
+            yield(expr.text[1], expr.ts, !char_class_depth.zero?) if expr.type == :escape
+
+            if expr.type == :set
+              char_class_depth + (event == :enter ? 1 : -1)
             else
-              [current, char_class_depth]
+              char_class_depth
             end
           end
         end

diff --git a/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb b/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
@@ -30,6 +30,58 @@
     end
   end
 
+  context 'with a character class containing a single character inside a group' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~RUBY)
+        foo = /([a])/
+                ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~RUBY)
+        foo = /(a)/
+      RUBY
+    end
+  end
+
+  context 'with a character class containing a single range' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[a-z]/')
+    end
+  end
+
+  context 'with a character class containing a posix bracket expression' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]]/')
+    end
+  end
+
+  context 'with a character class containing a negated posix bracket expression' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:^alnum:]]/')
+    end
+  end
+
+  context 'with a character class containing set intersection' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]&&a-d]/')
+    end
+  end
+
+  context "with a regexp containing invalid \g escape" do
+    it 'registers an offense and corrects' do
+      # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+      # \g should be \g<name>
+      expect_offense(<<~'RUBY')
+        foo = /[a]\g/
+               ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~'RUBY')
+        foo = /a\g/
+      RUBY
+    end
+  end
+
   context 'with a character class containing an escaped ]' do
     it 'registers an offense and corrects' do
       expect_offense(<<~'RUBY')
@@ -224,6 +276,19 @@
     end
   end
 
+  context 'with a redundant character class after an interpolation' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~'RUBY')
+        foo = /#{x}[a]/
+                   ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~'RUBY')
+        foo = /#{x}a/
+      RUBY
+    end
+  end
+
   context 'with a multi-line interpolation' do
     it 'ignores offenses in the interpolated expression' do
       expect_no_offenses(<<~'RUBY')

diff --git a/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb b/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
@@ -29,7 +29,7 @@
     end
 
     [
-      ('a'..'z').to_a - %w[c n p u x],
+      ('a'..'z').to_a - %w[c g k n p u x],
       ('A'..'Z').to_a - %w[C M P],
       %w[n101 x41 u0041 u{0041} cc C-c p{alpha} P{alpha}]
     ].flatten.each do |escape|
@@ -46,6 +46,14 @@
       end
     end
 
+    context "with an invalid \g escape" do
+      it 'does not register an offense' do
+        # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+        # \g should be \g<name>
+        expect_no_offenses('foo = /\g/')
+      end
+    end
+
     context "with an escaped 'M-a' outside a character class" do
       it 'does not register an offense' do
         expect_no_offenses('foo = /\\M-a/n')
@@ -79,6 +87,19 @@
       end
     end
 
+    context "with an escaped '+' inside a character class inside a group" do
+      it 'registers an offense and corrects' do
+        expect_offense(<<~'RUBY')
+          foo = /([\+])/
+                   ^^ Redundant escape inside regexp literal
+        RUBY
+
+        expect_correction(<<~RUBY)
+          foo = /([+])/
+        RUBY
+      end
+    end
+
     context 'with an escaped . inside a character class beginning with :' do
       it 'registers an offense and corrects' do
         expect_offense(<<~'RUBY')
@@ -421,6 +442,19 @@
       end
     end
 
+    context 'with a # inside a character class' do
+      it 'does not register an offense' do
+        # See https://github.com/rubocop-hq/rubocop/issues/8805 - the # inside the character class
+        # must not be treated as starting a comment (which makes the following \. redundant)
+        expect_no_offenses(<<~'RUBY')
+          regexp = %r{
+            \A[a-z#]            # letters or #
+            \.[a-z]\z           # dot + letters
+          }x
+        RUBY
+      end
+    end
+
     context 'with redundantly-escaped slashes' do
       it 'registers an offense and corrects' do
         expect_offense(<<~'RUBY')