+ lexer.rl: parse meta-control-hex chars in regexes starting from 3.1 (#828)

This commit tracks upstream commit ruby/ruby@11ae581.
whitequark · Nov 19, 2021 · 24d2f68 · 24d2f68
1 parent 547d731
commit 24d2f68
Show file tree

Hide file tree

Showing 3 changed files with 76 additions and 3 deletions.
diff --git a/lib/parser/lexer.rl b/lib/parser/lexer.rl
@@ -738,12 +738,14 @@ class Parser::Lexer
 
   maybe_escaped_char = (
         '\\' c_any      %unescape_char
+    |   '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
     | ( c_any - [\\] )  %read_post_meta_or_ctrl_char
   );
 
   maybe_escaped_ctrl_char = ( # why?!
         '\\' c_any      %unescape_char %slash_c_char
     |   '?'             % { @escape = "\x7f" }
+    |   '\\x' xdigit{1,2} % { @escape = encode_escape(tok(p - 2, p).to_i(16)) } %slash_c_char
     | ( c_any - [\\?] ) %read_post_meta_or_ctrl_char %slash_c_char
   );
 
@@ -935,7 +937,7 @@ class Parser::Lexer
         #   b"
         # must be parsed as "ab"
         current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)
-      elsif current_literal.regexp?
+      elsif current_literal.regexp? && @version < 31
         # Regular expressions should include escape sequences in their
         # escaped form. On the other hand, escaped newlines are removed (in cases like "\\C-\\\n\\M-x")
         current_literal.extend_string(tok.gsub("\\\n".freeze, ''.freeze), @ts, @te)

diff --git a/test/test_lexer.rb b/test/test_lexer.rb
@@ -3719,4 +3719,29 @@ def test_meta_escape_slash_u__after_30
     refute_scanned_meta_escape_slash_u('"\M-\u0000"')
     refute_scanned_meta_escape_slash_u('"\M-\U0000"')
   end
+
+  def test_meta_control_hex_escaped_char
+    setup_lexer(19)
+
+    assert_scanned("\"\\c\\xFF\"",
+                    :tSTRING, "\x9F", [0, 8])
+
+    assert_scanned("\"\\c\\M-\\xFF\"",
+                    :tSTRING, "\x9F", [0, 11])
+
+    assert_scanned("\"\\C-\\xFF\"",
+                    :tSTRING, "\x9F", [0, 9])
+
+    assert_scanned("\"\\C-\\M-\\xFF\"",
+                    :tSTRING, "\x9F", [0, 12])
+
+    assert_scanned("\"\\M-\\xFF\"",
+                    :tSTRING, "\x9F", [0, 9])
+
+    assert_scanned("\"\\M-\\C-\\xFF\"",
+                    :tSTRING, "\x9F", [0, 12])
+
+    assert_scanned("\"\\M-\\c\\xFF\"",
+                    :tSTRING, "\x9F", [0, 11])
+  end
 end
diff --git a/test/test_parser.rb b/test/test_parser.rb
@@ -5608,7 +5608,7 @@ def test_regexp_encoding
         s(:str, "")),
       %q{/\xa8/n =~ ""}.dup.force_encoding(Encoding::UTF_8),
       %{},
-      SINCE_1_9)
+      SINCE_3_1 - SINCE_1_9)
   end
 
   #
@@ -6513,7 +6513,7 @@ def test_parser_bug_198
         s(:str, "#")),
       %q{[/()\\1/, ?#]},
       %q{},
-      SINCE_1_9)
+      SINCE_3_1 - SINCE_1_9)
   end
 
   def test_parser_bug_272
@@ -10672,4 +10672,50 @@ def test_warn_on_duplicate_hash_key
       %q{               ~~~~~ location},
       SINCE_3_1)
   end
+
+  def test_control_meta_escape_chars_in_regexp
+    x9f = "\x9F".dup.force_encoding('ascii-8bit')
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\c\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\c\M-\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\C-\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\C-\M-\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\M-\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\M-\C-\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+
+    assert_parses(
+      s(:regexp, s(:str, x9f), s(:regopt)),
+      %q{/\M-\c\xFF/}.dup.force_encoding('ascii-8bit'),
+      %q{},
+      SINCE_3_1)
+  end
 end