Skip to content

Commit

Permalink
Merge pull request #4090 from rmosolgo/full-unicode
Browse files Browse the repository at this point in the history
Support full unicode range in source text
  • Loading branch information
rmosolgo committed Jun 7, 2022
2 parents d85d91a + a70dec1 commit 10eff22
Show file tree
Hide file tree
Showing 4 changed files with 92 additions and 32 deletions.
67 changes: 42 additions & 25 deletions lib/graphql/language/lexer.rb

Large diffs are not rendered by default.

32 changes: 26 additions & 6 deletions lib/graphql/language/lexer.rl
Expand Up @@ -39,7 +39,10 @@
BACKSLASH = '\\';
# Could limit to hex here, but “bad unicode escape” on 0XXF is probably a
# more helpful error than “unknown char”
UNICODE_ESCAPE = '\\u' [0-9A-Za-z]{4};
UNICODE_DIGIT = [0-9A-Za-z];
FOUR_DIGIT_UNICODE = UNICODE_DIGIT{4};
N_DIGIT_UNICODE = LCURLY UNICODE_DIGIT{4,} RCURLY;
UNICODE_ESCAPE = '\\u' (FOUR_DIGIT_UNICODE | N_DIGIT_UNICODE);
# https://graphql.github.io/graphql-spec/June2018/#sec-String-Value
STRING_ESCAPE = '\\' [\\/bfnrt];
BLOCK_QUOTE = '"""';
Expand Down Expand Up @@ -131,7 +134,25 @@ module GraphQL
# Decode the escape sequences in +raw_string+.
#
# To avoid allocating more strings, this modifies the string passed into it.
#
# Two passes are made:
# 1. matches of ESCAPES are substituted using the ESCAPES_REPLACE table;
# 2. matches of UTF_8 (`\uXXXX`, `\u{X...}`, optionally followed by a second
#    `\uXXXX`) are replaced by the code point(s) they name. A leading
#    surrogate followed by a trailing surrogate is combined into a single
#    supplementary-plane code point.
#
# The result is not guaranteed to be validly encoded: a lone surrogate or an
# out-of-range code point produces bytes that fail `valid_encoding?`, so
# callers should check that afterwards.
#
# @param raw_string [String] a mutable string; edited in place
# @return [nil]
def self.replace_escaped_characters_in_place(raw_string)
  raw_string.gsub!(ESCAPES, ESCAPES_REPLACE)
  raw_string.gsub!(UTF_8) do |_matched_str|
    # $1 is the four-digit form, $2 the braced form; exactly one is set.
    codepoint_1 = ($1 || $2).to_i(16)
    # The braced form has no upper bound on digits, and `pack('U')` raises
    # RangeError above 0x7FFFFFFF. Clamp so the result is an invalidly-encoded
    # string (which callers can reject) instead of an exception.
    codepoint_1 = 0x7FFFFFFF if codepoint_1 > 0x7FFFFFFF
    codepoint_2 = $3

    if codepoint_2
      codepoint_2 = codepoint_2.to_i(16)
      if (codepoint_1 >= 0xD800 && codepoint_1 <= 0xDBFF) && # leading surrogate
          (codepoint_2 >= 0xDC00 && codepoint_2 <= 0xDFFF) # trailing surrogate
        # A surrogate pair
        combined = ((codepoint_1 - 0xD800) * 0x400) + (codepoint_2 - 0xDC00) + 0x10000
        [combined].pack('U'.freeze)
      else
        # Two separate code points
        [codepoint_1].pack('U'.freeze) + [codepoint_2].pack('U'.freeze)
      end
    else
      [codepoint_1].pack('U'.freeze)
    end
  end
  nil
end

Expand Down Expand Up @@ -203,8 +224,8 @@ module GraphQL
"\\t" => "\t",
}

# Matches `\u` followed by exactly four hex digits (case-insensitive).
# NOTE(review): UTF_8 is assigned again two lines below with a wider pattern;
# this line looks like the pre-change version shown by the diff — confirm.
UTF_8 = /\\u[\dAa-f]{4}/i
# Decodes a match by reading its last four characters as hex and packing that
# code point as UTF-8.
UTF_8_REPLACE = ->(m) { [m[-4..-1].to_i(16)].pack('U'.freeze) }
# Matches a unicode escape (case-insensitive):
#   capture 1 — the four hex digits of `\uXXXX`, or
#   capture 2 — the four-or-more hex digits inside `\u{...}`,
#   capture 3 — the four hex digits of an optional immediately following
#               `\uXXXX` (lets a surrogate pair be seen as one match).
UTF_8 = /\\u(?:([\dAa-f]{4})|\{([\da-f]{4,})\})(?:\\u([\dAa-f]{4}))?/i


# Whole-string check (\A ... \z): every piece must be a non-backslash character,
# an ESCAPES match, or a UTF_8 match, so any other backslash sequence fails.
# The /o flag interpolates ESCAPES and UTF_8 only once.
VALID_STRING = /\A(?:[^\\]|#{ESCAPES}|#{UTF_8})*\z/o

Expand All @@ -219,8 +240,7 @@ module GraphQL
line_incr = value.count("\n")
value = GraphQL::Language::BlockString.trim_whitespace(value)
end
# TODO: replace with `String#match?` when we support only Ruby 2.4+
# (It's faster: https://bugs.ruby-lang.org/issues/8110)

if !value.valid_encoding? || !value.match?(VALID_STRING)
meta[:tokens] << token = GraphQL::Language::Token.new(
:BAD_UNICODE_ESCAPE,
Expand Down
18 changes: 17 additions & 1 deletion spec/graphql/language/lexer_spec.rb
Expand Up @@ -78,6 +78,21 @@

it "unescapes escaped unicode characters" do
  # The four-digit and braced forms name the same code point.
  assert_equal "\t", subject.tokenize('"\\u0009"').first.to_s
  assert_equal "\t", subject.tokenize('"\\u{0009}"').first.to_s
  # Braced escapes can name code points above U+FFFF.
  assert_equal "𐘑", subject.tokenize('"\\u{10611}"').first.to_s
  assert_equal "💩", subject.tokenize('"\\u{1F4A9}"').first.to_s
  # A leading + trailing surrogate pair decodes to a single code point.
  assert_equal "💩", subject.tokenize('"\\uD83D\\uDCA9"').first.to_s
end

it "accepts the full range of unicode" do
  # Literal (unescaped) non-ASCII characters inside quoted strings.
  assert_equal "💩", subject.tokenize('"💩"').first.to_s
  assert_equal "⌱", subject.tokenize('"⌱"').first.to_s
  # ...and inside a block string that spans two lines.
  assert_equal "🂡\n🂢", subject.tokenize('"""🂡
🂢"""').first.to_s
end

it "doesn't accept unicode outside strings or comments" do
  # A bare emoji at the top level should come back as an unknown character.
  first_token = GraphQL.scan('😘 ').first
  assert_equal :UNKNOWN_CHAR, first_token.name
end

it "rejects bad unicode, even when there's good unicode in the string" do
Expand All @@ -92,7 +107,8 @@
it "rejects unicode that's well-formed but results in invalidly-encoded strings" do
  # when the string here gets tokenized into an actual `:STRING`, it results in `valid_encoding?` being false for
  # the ruby string so application code usually blows up trying to manipulate it
  #
  # Both inputs are two *trailing* surrogates (0xDC00..0xDFFF), so they can't be
  # combined into a pair. (A leading + trailing pair such as \ud83c\udf2c is
  # valid and decodes to one code point, so it must not be asserted here.)
  assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\udc00\\udf2c"').first.name
  assert_equal :BAD_UNICODE_ESCAPE, subject.tokenize('"\\u{dc00}\\u{df2c}"').first.name
end

it "clears the previous_token between runs" do
Expand Down
7 changes: 7 additions & 0 deletions spec/graphql/language/parser_spec.rb
Expand Up @@ -12,6 +12,13 @@
end
end

it "raises an error when unicode is used as names" do
  # An emoji sits where the operation name would go; parsing must fail and
  # report the offending byte and its position.
  parse_error = assert_raises(GraphQL::ParseError) do
    GraphQL.parse('query 😘 { a b }')
  end
  assert_equal "Parse error on \"\\xF0\" (error) at [1, 7]", parse_error.message
end

describe "anonymous fragment extension" do
let(:document) { GraphQL.parse(query_string) }
let(:query_string) {%|
Expand Down

0 comments on commit 10eff22

Please sign in to comment.