Merge pull request #4090 from rmosolgo/full-unicode

Support full unicode range in source text
rmosolgo · Jun 7, 2022 · 10eff22 · 10eff22
2 parents d85d91a + a70dec1
commit 10eff22
Show file tree

Hide file tree

Showing 4 changed files with 92 additions and 32 deletions.
diff --git a/lib/graphql/language/lexer.rb b/lib/graphql/language/lexer.rb
diff --git a/lib/graphql/language/lexer.rl b/lib/graphql/language/lexer.rl
@@ -39,7 +39,10 @@
   BACKSLASH = '\\';
   # Could limit to hex here, but “bad unicode escape” on 0XXF is probably a
   # more helpful error than “unknown char”
-  UNICODE_ESCAPE = '\\u' [0-9A-Za-z]{4};
+  UNICODE_DIGIT = [0-9A-Za-z];
+  FOUR_DIGIT_UNICODE = UNICODE_DIGIT{4};
+  N_DIGIT_UNICODE = LCURLY UNICODE_DIGIT{4,} RCURLY;
+  UNICODE_ESCAPE = '\\u' (FOUR_DIGIT_UNICODE | N_DIGIT_UNICODE);
   # https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
   STRING_ESCAPE = '\\' [\\/bfnrt];
   BLOCK_QUOTE =   '"""';
@@ -131,7 +134,25 @@ module GraphQL
       # Replace escape sequences in `raw_string` with the characters they stand for.
       # To avoid allocating more strings, this modifies the string passed into it
       # (via `gsub!`) and always returns nil.
       #
       # Two passes run in order: first the ESCAPES pattern with ESCAPES_REPLACE (both defined
       # elsewhere), then the UTF_8 pattern, whose captures are $1 (four digits after `\u`),
       # $2 (digits inside `\u{...}`) and $3 (four digits of an optional, directly following
       # `\u` escape).
       #
       # NOTE(review): no upper bound is applied to the parsed code points; a very large
       # `\u{...}` value may make `pack('U')` raise RangeError -- confirm and guard if needed.
       # NOTE(review): $3 can also follow a braced escape, so `\u{D83D}\uDCA9` is combined like
       # a surrogate pair -- confirm this is intended.
       def self.replace_escaped_characters_in_place(raw_string)
         raw_string.gsub!(ESCAPES, ESCAPES_REPLACE)
-        raw_string.gsub!(UTF_8, &UTF_8_REPLACE)
+        raw_string.gsub!(UTF_8) do |_matched_str|
+          codepoint_1 = ($1 || $2).to_i(16) # first escape: digits from the plain ($1) or braced ($2) form
+          codepoint_2 = $3 # hex digits of a directly following \uXXXX, or nil when there is none
+
+          if codepoint_2
+            codepoint_2 = codepoint_2.to_i(16)
+            if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
+                (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
+              # A surrogate pair: merge the two UTF-16 halves into a single code point
+              combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
+              [combined].pack('U'.freeze)
+            else
+              # Two separate code points: encode each one on its own and concatenate
+              [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
+            end
+          else
+            [codepoint_1].pack('U'.freeze) # a single escape with nothing paired after it
+          end
+        end
         nil # always nil; the result is the in-place change to raw_string
       end
 
@@ -203,8 +224,8 @@ module GraphQL
         "\\t" => "\t",
       }
 
-      UTF_8 = /\\u[\dAa-f]{4}/i
-      UTF_8_REPLACE = ->(m) { [m[-4..-1].to_i(16)].pack('U'.freeze) }
+      UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i # $1: \uXXXX digits, $2: \u{...} digits (4 or more), $3: digits of an optional directly following \uXXXX
+      # NOTE(review): under the /i flag `[\dAa-f]` matches the same characters as `[\da-f]`; the extra `A` looks redundant.
 
       VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o
 
@@ -219,8 +240,7 @@ module GraphQL
           line_incr = value.count("\n")
           value = GraphQL::Language::BlockString.trim_whitespace(value)
         end
-        # TODO: replace with `String#match?` when we support only Ruby 2.4+
-        # (It's faster: https://bugs.ruby-lang.org/issues/8110)
+
         if !value.valid_encoding? || !value.match?(VALID_STRING)
           meta[:tokens] << token = GraphQL::Language::Token.new(
             :BAD_UNICODE_ESCAPE,

diff --git a/spec/graphql/language/lexer_spec.rb b/spec/graphql/language/lexer_spec.rb
@@ -78,6 +78,21 @@
 
     it "unescapes escaped unicode characters" do
       assert_equal "\t", subject.tokenize('"\\u0009"').first.to_s # plain four-hex-digit escape
+      assert_equal "\t", subject.tokenize('"\\u{0009}"').first.to_s # same code point in braced form
+      assert_equal "𐘑", subject.tokenize('"\\u{10611}"').first.to_s # braced form with five hex digits (above U+FFFF)
+      assert_equal "💩", subject.tokenize('"\\u{1F4A9}"').first.to_s
+      assert_equal "💩", subject.tokenize('"\\uD83D\\uDCA9"').first.to_s # surrogate pair D83D + DCA9, which combines to 0x1F4A9
+    end
+
+    it "accepts the full range of unicode" do
+      assert_equal "💩", subject.tokenize('"💩"').first.to_s # literal (unescaped) character inside a quoted string
+      assert_equal "⌱", subject.tokenize('"⌱"').first.to_s
+      assert_equal "🂡\n🂢", subject.tokenize('"""🂡
+🂢"""').first.to_s # literal characters inside a block string that spans two lines
+    end
+
+    it "doesn't accept unicode outside strings or comments" do
+      assert_equal :UNKNOWN_CHAR, GraphQL.scan('😘 ').first.name # a bare emoji outside any string is expected to lex as :UNKNOWN_CHAR
     end
 
     it "rejects bad unicode, even when there's good unicode in the string" do
@@ -92,7 +107,8 @@
     it "rejects unicode that's well-formed but results in invalidly-encoded strings" do
       # When the string here is tokenized into an actual `:STRING`, `valid_encoding?` is false for
       # the resulting Ruby string, so application code usually blows up trying to manipulate it.
-      assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\ud83c\\udf2c"').first.name
+      assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\udc00\\udf2c"').first.name # dc00 and df2c are both in the trailing-surrogate range, so they cannot form a pair (d83c + df2c would be a leading + trailing pair)
+      assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\u{dc00}\\u{df2c}"').first.name # the same two code points in braced form
     end
 
     it "clears the previous_token between runs" do

diff --git a/spec/graphql/language/parser_spec.rb b/spec/graphql/language/parser_spec.rb
@@ -12,6 +12,13 @@
     end
   end
 
+  it "raises an error when unicode is used as names" do
+    err = assert_raises(GraphQL::ParseError) {
+      GraphQL.parse('query 😘 { a b }') # emoji placed where an operation name would go
+    }
+    assert_equal "Parse error on \"\\xF0\" (error) at [1, 7]", err.message # \xF0 is the first byte of the emoji's UTF-8 encoding; [1, 7] is presumably [line, column]
+  end
+
   describe "anonymous fragment extension" do
     let(:document) { GraphQL.parse(query_string) }
     let(:query_string) {%|