Use regexp_parser to improve Style/RedundantRegexp... cops

As mentioned in #8593 by @marcandre
rubocop · Aug 31, 2020 · c474274 · c474274
1 parent 6cab599
commit c474274
Show file tree

Hide file tree

Showing 5 changed files with 128 additions and 37 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -44,6 +44,7 @@
 * [#8517](https://github.com/rubocop-hq/rubocop/pull/8517): Make `Style/HashTransformKeys` and `Style/HashTransformValues` aware of `to_h` with block. ([@eugeneius][])
 * [#8529](https://github.com/rubocop-hq/rubocop/pull/8529): Mark `Lint/FrozenStringLiteralComment` as `Safe`, but with unsafe auto-correction. ([@marcandre][])
 * [#8602](https://github.com/rubocop-hq/rubocop/pull/8602): Fix usage of `to_enum(:scan, regexp)` to work on TruffleRuby. ([@jaimerave][])
+* [#8625](https://github.com/rubocop-hq/rubocop/pull/8625): Improve `Style/RedundantRegexpCharacterClass` and `Style/RedundantRegexpEscape` by using `regexp_parser` gem. ([@owst][])
 
 ## 0.89.1 (2020-08-10)
 

diff --git a/lib/rubocop/cop/style/redundant_regexp_character_class.rb b/lib/rubocop/cop/style/redundant_regexp_character_class.rb
@@ -24,32 +24,15 @@ module Style
       #
       # @api private
       class RedundantRegexpCharacterClass < Base
-        include MatchRange
         include RegexpLiteralHelp
         extend AutoCorrector
 
+        REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS = '.*+?{}()|$'.chars.freeze
         MSG_REDUNDANT_CHARACTER_CLASS = 'Redundant single-element character class, ' \
         '`%<char_class>s` can be replaced with `%<element>s`.'
 
-        PATTERN = /
-          (
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \[                # Literal [
-            (?!\#\{)          # Not (the start of) an interpolation
-            (?:               # Either...
-             \\[^b] |         # Any escaped character except b (which would change behaviour)
-             [^.*+?{}()|$] |  # or one that doesn't require escaping outside the character class
-             \\[upP]\{[^}]+\} # or a unicode code-point or property
-            )
-            (?<!\\)           # No \-prefix (i.e. not escaped)
-            \]                # Literal ]
-          )
-        /x.freeze
-
         def on_regexp(node)
           each_redundant_character_class(node) do |loc|
-            next if whitespace_in_free_space_mode?(node, loc)
-
             add_offense(
               loc, message: format(
                 MSG_REDUNDANT_CHARACTER_CLASS,
@@ -65,19 +48,54 @@ def on_regexp(node)
         private
 
         def each_redundant_character_class(node)
-          pattern_source(node).scan(PATTERN) do
-            yield match_range(node.loc.begin.end, Regexp.last_match)
+          each_single_element_character_class(node) do |char_class|
+            next unless redundant_single_element_character_class?(node, char_class)
+
+            yield node.loc.begin.adjust(begin_pos: 1 + char_class.ts, end_pos: char_class.te)
+          end
+        end
+
+        def each_single_element_character_class(node)
+          Regexp::Parser.parse(pattern_source(node)).each_expression do |expr|
+            next if expr.type != :set || expr.expressions.size != 1
+            next if expr.negative? || %i[posixclass set].include?(expr.expressions.first.type)
+
+            yield expr
           end
+        rescue Regexp::Scanner::ScannerError
+          # Handle malformed patterns that are accepted by Ruby but cause the regexp_parser gem to
+          # error, see https://github.com/rubocop-hq/rubocop/issues/8083 for details
+        end
+
+        def redundant_single_element_character_class?(node, char_class)
+          class_elem = char_class.expressions.first.text
+
+          non_redundant =
+            whitespace_in_free_space_mode?(node, class_elem) ||
+            backslash_b?(class_elem) ||
+            requires_escape_outside_char_class?(class_elem)
+
+          !non_redundant
         end
 
         def without_character_class(loc)
           loc.source[1..-2]
         end
 
-        def whitespace_in_free_space_mode?(node, loc)
+        def whitespace_in_free_space_mode?(node, elem)
           return false unless freespace_mode_regexp?(node)
 
-          /\[\s\]/.match?(loc.source)
+          /\s/.match?(elem)
+        end
+
+        def backslash_b?(elem)
+          # \b's behaviour is different inside and outside of a character class, matching word
+          # boundaries outside but backspace (0x08) when inside.
+          elem == '\b'
+        end
+
+        def requires_escape_outside_char_class?(elem)
+          REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS.include?(elem)
         end
       end
     end

diff --git a/lib/rubocop/cop/style/redundant_regexp_escape.rb b/lib/rubocop/cop/style/redundant_regexp_escape.rb
@@ -61,9 +61,9 @@ def on_regexp(node)
 
         def allowed_escape?(node, char, within_character_class)
           # Strictly speaking a few single-letter metachars are currently
-          # unnecessary to "escape", e.g. g, i, E, F, but enumerating them is
+          # unnecessary to "escape", e.g. i, E, F, but enumerating them is
           # rather difficult, and their behaviour could change over time with
-          # different versions of Ruby so that e.g. /\g/ != /g/
+          # different versions of Ruby so that e.g. /\i/ != /i/
           return true if /[[:alnum:]]/.match?(char)
           return true if ALLOWED_ALWAYS_ESCAPES.include?(char) || delimiter?(node, char)
 
@@ -84,21 +84,20 @@ def delimiter?(node, char)
         end
 
         def each_escape(node)
-          pattern_source(node).each_char.with_index.reduce(
-            [nil, 0]
-          ) do |(previous, char_class_depth), (current, index)|
-            if previous == '\\'
-              yield [current, index - 1, !char_class_depth.zero?]
-
-              [nil, char_class_depth]
-            elsif previous == '['
-              [current, char_class_depth + 1]
-            elsif current == ']'
-              [current, char_class_depth - 1]
+          Regexp::Parser.parse(
+            pattern_source(node)
+          ).traverse.reduce(0) do |char_class_depth, (event, expr)|
+            yield(expr.text[1], expr.ts, !char_class_depth.zero?) if expr.type == :escape
+
+            if expr.type == :set
+              char_class_depth + (event == :enter ? 1 : -1)
             else
-              [current, char_class_depth]
+              char_class_depth
             end
           end
+        rescue Regexp::Scanner::ScannerError
+          # Handle malformed patterns that are accepted by Ruby but cause the regexp_parser gem to
+          # error, see https://github.com/rubocop-hq/rubocop/issues/8083 for details
         end
 
         def escape_range_at_index(node, index)

diff --git a/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb b/spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
@@ -30,6 +30,45 @@
     end
   end
 
+  context 'with a character class containing a single character inside a gruop' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~RUBY)
+        foo = /([a])/
+                ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~RUBY)
+        foo = /(a)/
+      RUBY
+    end
+  end
+
+  context 'with a character class containing a single range' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[a-z]/')
+    end
+  end
+
+  context 'with a character class containing a posix bracket expression' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]]/')
+    end
+  end
+
+  context 'with a character class containing set intersection' do
+    it 'does not register an offense' do
+      expect_no_offenses('foo = /[[:alnum:]&&a-d]/')
+    end
+  end
+
+  context "with a regexp containing invalid \g escape" do
+    it 'does not register an offense' do
+      # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+      # \g should be \g<name>
+      expect_no_offenses('foo = /[a]\g/')
+    end
+  end
+
   context 'with a character class containing an escaped ]' do
     it 'registers an offense and corrects' do
       expect_offense(<<~'RUBY')
@@ -224,6 +263,19 @@
     end
   end
 
+  context 'with a redundant character class after an interpolation' do
+    it 'registers an offense and corrects' do
+      expect_offense(<<~'RUBY')
+        foo = /#{x}[a]/
+                   ^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
+      RUBY
+
+      expect_correction(<<~'RUBY')
+        foo = /#{x}a/
+      RUBY
+    end
+  end
+
   context 'with a multi-line interpolation' do
     it 'ignores offenses in the interpolated expression' do
       expect_no_offenses(<<~'RUBY')

diff --git a/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb b/spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
@@ -29,7 +29,7 @@
     end
 
     [
-      ('a'..'z').to_a - %w[c n p u x],
+      ('a'..'z').to_a - %w[c g k n p u x],
       ('A'..'Z').to_a - %w[C M P],
       %w[n101 x41 u0041 u{0041} cc C-c p{alpha} P{alpha}]
     ].flatten.each do |escape|
@@ -46,6 +46,14 @@
       end
     end
 
+    context "with an invalid \g escape" do
+      it 'does not register an offense' do
+        # See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
+        # \g should be \g<name>
+        expect_no_offenses('foo = /\g/')
+      end
+    end
+
     context "with an escaped 'M-a' outside a character class" do
       it 'does not register an offense' do
         expect_no_offenses('foo = /\\M-a/n')
@@ -79,6 +87,19 @@
       end
     end
 
+      context "with an escaped '+' inside a character class inside a group" do
+        it 'registers an offense and corrects' do
+          expect_offense(<<~'RUBY')
+            foo = /([\+])/
+                     ^^ Redundant escape inside regexp literal
+          RUBY
+
+          expect_correction(<<~RUBY)
+            foo = /([+])/
+          RUBY
+        end
+      end
+
     context 'with an escaped . inside a character class beginning with :' do
       it 'registers an offense and corrects' do
         expect_offense(<<~'RUBY')