From a48a8f6068217bc02557f9780681f11377fdbee8 Mon Sep 17 00:00:00 2001
From: Masataka Pocke Kuwabara <kuwabara@pocke.me>
Date: Mon, 12 Jul 2021 17:59:19 +0900
Subject: [PATCH] - lexer.rl: fix incompatible delimiters on percent literal
 (#808)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

CRuby accepts only ASCII characters other than `[A-Za-z0-9]` as the
delimiter of a percent literal, but the lexer accepts a different set of characters.
For example:

* CRuby accepts `%w^Dfoo^D`, but the parser does not (note: `^D` means 0x04)
* CRuby rejects `%w1foo1`, but the parser accepts it
* CRuby rejects `%w★foo★`, but the parser accepts it

This patch fixes the problems.
---
 lib/parser/lexer.rl |  9 +++++----
 test/test_lexer.rb  | 39 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 44 insertions(+), 4 deletions(-)
diff --git a/lib/parser/lexer.rl b/lib/parser/lexer.rl
index 8574f95cc..718c1d8bf 100644
--- a/lib/parser/lexer.rl
+++ b/lib/parser/lexer.rl
@@ -518,7 +518,8 @@ class Parser::Lexer
   c_nl_zlen  = c_nl | zlen;
   c_line     = any - c_nl_zlen;
 
-  c_unicode  = c_any - 0x00..0x7f;
+  c_ascii    = 0x00..0x7f;
+  c_unicode  = c_any - c_ascii;
   c_upper    = [A-Z];
   c_lower    = [a-z_]  | c_unicode;
   c_alpha    = c_lower | c_upper;
@@ -1406,7 +1407,7 @@ class Parser::Lexer
       ':'
       => { fhold; fgoto expr_beg; };
 
-      '%s' c_any
+      '%s' (c_ascii - [A-Za-z0-9])
       => {
         if version?(23)
           type, delimiter = tok[0..-2], tok[-1].chr
@@ -1758,14 +1759,14 @@ class Parser::Lexer
       };
 
       # %<string>
-      '%' ( any - [A-Za-z] )
+      '%' ( c_ascii - [A-Za-z0-9] )
       => {
         type, delimiter = @source_buffer.slice(@ts).chr, tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
       };
 
       # %w(we are the people)
-      '%' [A-Za-z]+ c_any
+      '%' [A-Za-z] (c_ascii - [A-Za-z0-9])
       => {
         type, delimiter = tok[0..-2], tok[-1].chr
         fgoto *push_literal(type, delimiter, @ts);
diff --git a/test/test_lexer.rb b/test/test_lexer.rb
index 9074ccc27..192247718 100644
--- a/test/test_lexer.rb
+++ b/test/test_lexer.rb
@@ -2246,6 +2246,25 @@ def test_string_pct_pct
                    :tSTRING_END,     '%',    [6, 7])
   end
 
+  def test_string_pct_null
+    assert_scanned("%\0blah\0",
+                   :tSTRING_BEG,     "%\0",  [0, 2],
+                   :tSTRING_CONTENT, "blah", [2, 6],
+                   :tSTRING_END,     "\0",    [6, 7])
+  end
+
+  def test_string_pct_non_ascii
+    refute_scanned("%★foo★")
+  end
+
+  def test_string_pct_alphabet
+    refute_scanned("%AfooA")
+  end
+
+  def test_string_pct_number
+    refute_scanned("%1foo1")
+  end
+
   def test_string_pct_w
     assert_scanned("%w[s1 s2 ]",
                    :tQWORDS_BEG,     "%w[", [0, 3],
@@ -2293,6 +2312,26 @@ def test_string_pct_w_tab
                    :tSTRING_END,     ']',   [10, 11])
   end
 
+  def test_string_pct_w_null
+    assert_scanned("%w\0abc\0",
+                   :tQWORDS_BEG,     "%w\0", [0, 3],
+                   :tSTRING_CONTENT, "abc",  [3, 6],
+                   :tSPACE,          nil,    [6, 6],
+                   :tSTRING_END,     "\0",   [6, 7])
+  end
+
+  def test_string_pct_w_non_ascii
+    refute_scanned("%w★foo★")
+  end
+
+  def test_string_pct_w_alphabet
+    refute_scanned("%wAfooA")
+  end
+
+  def test_string_pct_w_number
+    refute_scanned("%w1foo1")
+  end
+
   def test_string_pct_i
     assert_scanned("%i(s1 s2)",
                    :tQSYMBOLS_BEG,   "%i(", [0, 3],