Skip to content

Commit

Permalink
Use regexp_parser to improve Style/RedundantRegexp... cops
Browse files Browse the repository at this point in the history
As mentioned in #8593 by @marcandre
  • Loading branch information
owst committed Aug 31, 2020
1 parent 6cab599 commit c474274
Show file tree
Hide file tree
Showing 5 changed files with 128 additions and 37 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Expand Up @@ -44,6 +44,7 @@
* [#8517](https://github.com/rubocop-hq/rubocop/pull/8517): Make `Style/HashTransformKeys` and `Style/HashTransformValues` aware of `to_h` with block. ([@eugeneius][])
* [#8529](https://github.com/rubocop-hq/rubocop/pull/8529): Mark `Lint/FrozenStringLiteralComment` as `Safe`, but with unsafe auto-correction. ([@marcandre][])
* [#8602](https://github.com/rubocop-hq/rubocop/pull/8602): Fix usage of `to_enum(:scan, regexp)` to work on TruffleRuby. ([@jaimerave][])
* [#8625](https://github.com/rubocop-hq/rubocop/pull/8625): Improve `Style/RedundantRegexpCharacterClass` and `Style/RedundantRegexpEscape` by using `regexp_parser` gem. ([@owst][])

## 0.89.1 (2020-08-10)

Expand Down
62 changes: 40 additions & 22 deletions lib/rubocop/cop/style/redundant_regexp_character_class.rb
Expand Up @@ -24,32 +24,15 @@ module Style
#
# @api private
class RedundantRegexpCharacterClass < Base
include MatchRange
include RegexpLiteralHelp
extend AutoCorrector

REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS = '.*+?{}()|$'.chars.freeze
MSG_REDUNDANT_CHARACTER_CLASS = 'Redundant single-element character class, ' \
'`%<char_class>s` can be replaced with `%<element>s`.'

PATTERN = /
(
(?<!\\) # No \-prefix (i.e. not escaped)
\[ # Literal [
(?!\#\{) # Not (the start of) an interpolation
(?: # Either...
\\[^b] | # Any escaped character except b (which would change behaviour)
[^.*+?{}()|$] | # or one that doesn't require escaping outside the character class
\\[upP]\{[^}]+\} # or a unicode code-point or property
)
(?<!\\) # No \-prefix (i.e. not escaped)
\] # Literal ]
)
/x.freeze

def on_regexp(node)
each_redundant_character_class(node) do |loc|
next if whitespace_in_free_space_mode?(node, loc)

add_offense(
loc, message: format(
MSG_REDUNDANT_CHARACTER_CLASS,
Expand All @@ -65,19 +48,54 @@ def on_regexp(node)
private

def each_redundant_character_class(node)
pattern_source(node).scan(PATTERN) do
yield match_range(node.loc.begin.end, Regexp.last_match)
each_single_element_character_class(node) do |char_class|
next unless redundant_single_element_character_class?(node, char_class)

yield node.loc.begin.adjust(begin_pos: 1 + char_class.ts, end_pos: char_class.te)
end
end

def each_single_element_character_class(node)
Regexp::Parser.parse(pattern_source(node)).each_expression do |expr|
next if expr.type != :set || expr.expressions.size != 1
next if expr.negative? || %i[posixclass set].include?(expr.expressions.first.type)

yield expr
end
rescue Regexp::Scanner::ScannerError
# Handle malformed patterns that are accepted by Ruby but cause the regexp_parser gem to
# error, see https://github.com/rubocop-hq/rubocop/issues/8083 for details
end

def redundant_single_element_character_class?(node, char_class)
class_elem = char_class.expressions.first.text

non_redundant =
whitespace_in_free_space_mode?(node, class_elem) ||
backslash_b?(class_elem) ||
requires_escape_outside_char_class?(class_elem)

!non_redundant
end

def without_character_class(loc)
loc.source[1..-2]
end

def whitespace_in_free_space_mode?(node, loc)
def whitespace_in_free_space_mode?(node, elem)
return false unless freespace_mode_regexp?(node)

/\[\s\]/.match?(loc.source)
/\s/.match?(elem)
end

def backslash_b?(elem)
# \b's behaviour is different inside and outside of a character class, matching word
# boundaries outside but backspace (0x08) when inside.
elem == '\b'
end

def requires_escape_outside_char_class?(elem)
REQUIRES_ESCAPE_OUTSIDE_CHAR_CLASS_CHARS.include?(elem)
end
end
end
Expand Down
27 changes: 13 additions & 14 deletions lib/rubocop/cop/style/redundant_regexp_escape.rb
Expand Up @@ -61,9 +61,9 @@ def on_regexp(node)

def allowed_escape?(node, char, within_character_class)
# Strictly speaking a few single-letter metachars are currently
# unnecessary to "escape", e.g. g, i, E, F, but enumerating them is
# unnecessary to "escape", e.g. i, E, F, but enumerating them is
# rather difficult, and their behaviour could change over time with
# different versions of Ruby so that e.g. /\g/ != /g/
# different versions of Ruby so that e.g. /\i/ != /i/
return true if /[[:alnum:]]/.match?(char)
return true if ALLOWED_ALWAYS_ESCAPES.include?(char) || delimiter?(node, char)

Expand All @@ -84,21 +84,20 @@ def delimiter?(node, char)
end

def each_escape(node)
pattern_source(node).each_char.with_index.reduce(
[nil, 0]
) do |(previous, char_class_depth), (current, index)|
if previous == '\\'
yield [current, index - 1, !char_class_depth.zero?]

[nil, char_class_depth]
elsif previous == '['
[current, char_class_depth + 1]
elsif current == ']'
[current, char_class_depth - 1]
Regexp::Parser.parse(
pattern_source(node)
).traverse.reduce(0) do |char_class_depth, (event, expr)|
yield(expr.text[1], expr.ts, !char_class_depth.zero?) if expr.type == :escape

if expr.type == :set
char_class_depth + (event == :enter ? 1 : -1)
else
[current, char_class_depth]
char_class_depth
end
end
rescue Regexp::Scanner::ScannerError
# Handle malformed patterns that are accepted by Ruby but cause the regexp_parser gem to
# error, see https://github.com/rubocop-hq/rubocop/issues/8083 for details
end

def escape_range_at_index(node, index)
Expand Down
52 changes: 52 additions & 0 deletions spec/rubocop/cop/style/redundant_regexp_character_class_spec.rb
Expand Up @@ -30,6 +30,45 @@
end
end

context 'with a character class containing a single character inside a gruop' do
it 'registers an offense and corrects' do
expect_offense(<<~RUBY)
foo = /([a])/
^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
RUBY

expect_correction(<<~RUBY)
foo = /(a)/
RUBY
end
end

context 'with a character class containing a single range' do
it 'does not register an offense' do
expect_no_offenses('foo = /[a-z]/')
end
end

context 'with a character class containing a posix bracket expression' do
it 'does not register an offense' do
expect_no_offenses('foo = /[[:alnum:]]/')
end
end

context 'with a character class containing set intersection' do
it 'does not register an offense' do
expect_no_offenses('foo = /[[:alnum:]&&a-d]/')
end
end

context "with a regexp containing invalid \g escape" do
it 'does not register an offense' do
# See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
# \g should be \g<name>
expect_no_offenses('foo = /[a]\g/')
end
end

context 'with a character class containing an escaped ]' do
it 'registers an offense and corrects' do
expect_offense(<<~'RUBY')
Expand Down Expand Up @@ -224,6 +263,19 @@
end
end

context 'with a redundant character class after an interpolation' do
it 'registers an offense and corrects' do
expect_offense(<<~'RUBY')
foo = /#{x}[a]/
^^^ Redundant single-element character class, `[a]` can be replaced with `a`.
RUBY

expect_correction(<<~'RUBY')
foo = /#{x}a/
RUBY
end
end

context 'with a multi-line interpolation' do
it 'ignores offenses in the interpolated expression' do
expect_no_offenses(<<~'RUBY')
Expand Down
23 changes: 22 additions & 1 deletion spec/rubocop/cop/style/redundant_regexp_escape_spec.rb
Expand Up @@ -29,7 +29,7 @@
end

[
('a'..'z').to_a - %w[c n p u x],
('a'..'z').to_a - %w[c g k n p u x],
('A'..'Z').to_a - %w[C M P],
%w[n101 x41 u0041 u{0041} cc C-c p{alpha} P{alpha}]
].flatten.each do |escape|
Expand All @@ -46,6 +46,14 @@
end
end

context "with an invalid \g escape" do
it 'does not register an offense' do
# See https://ruby-doc.org/core-2.7.1/Regexp.html#class-Regexp-label-Subexpression+Calls
# \g should be \g<name>
expect_no_offenses('foo = /\g/')
end
end

context "with an escaped 'M-a' outside a character class" do
it 'does not register an offense' do
expect_no_offenses('foo = /\\M-a/n')
Expand Down Expand Up @@ -79,6 +87,19 @@
end
end

context "with an escaped '+' inside a character class inside a group" do
it 'registers an offense and corrects' do
expect_offense(<<~'RUBY')
foo = /([\+])/
^^ Redundant escape inside regexp literal
RUBY

expect_correction(<<~RUBY)
foo = /([+])/
RUBY
end
end

context 'with an escaped . inside a character class beginning with :' do
it 'registers an offense and corrects' do
expect_offense(<<~'RUBY')
Expand Down

0 comments on commit c474274

Please sign in to comment.