From 7d8efae3b665a38118d2385dc56a32be97f72ef4 Mon Sep 17 00:00:00 2001 From: Michael Kruglos Date: Sun, 6 Dec 2015 18:39:10 +0200 Subject: [PATCH 1/3] Fixed crash with sigabort on invalid surrogate sequence --- ext/yajl/yajl_lex.c | 104 +++++++++++++++++++++++++++++++---- ext/yajl/yajl_lex.h | 3 +- lib/yajl/version.rb | 2 +- spec/parsing/one_off_spec.rb | 16 ++++++ 4 files changed, 112 insertions(+), 13 deletions(-) diff --git a/ext/yajl/yajl_lex.c b/ext/yajl/yajl_lex.c index 286132df..b7135453 100644 --- a/ext/yajl/yajl_lex.c +++ b/ext/yajl/yajl_lex.c @@ -271,6 +271,94 @@ if (*offset >= jsonTextLen) { \ goto finish_string_lex; \ } +/* helper to lex \u sequences, including surrogates pairs. + * surrogate pairs are unicode sequence of two codepoints, where first + * must be in range 0xD800..0xDBFF, and the second in range 0xDC00..0xDFFF. + */ +static yajl_tok +yajl_lex_escaped_unicode(yajl_lexer lexer, const unsigned char * jsonText, + unsigned int jsonTextLen, unsigned int * offset) +{ + yajl_tok tok = yajl_tok_error; + unsigned int i; + unsigned char curChar; + unsigned int pairs[2] = {0, 0}; + unsigned int pair_index = 0; + + for (;;) { + for (i=0;i<4;i++) { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if (!(charLookupTable[curChar] & VHC)) { + /* back up to offending char */ + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_hex_char; + goto finish_string_lex; + } + /* if we reached here, it means we passed the hex validation above */ + /* if it's 0-9 - convert by subtracting '0' */ + if(curChar>='0' && curChar<='9') { + pairs[pair_index] <<= 4; + pairs[pair_index] |= (curChar-'0'); + } + /* if it's a-f - convert by subtracting 'a' and adding 10 */ + if(curChar>='a' && curChar<='f') { + pairs[pair_index] <<= 4; + pairs[pair_index] |= ((curChar-'a') + 10); + } + /* if it's A-F - convert by subtracting 'A' and adding 10 */ + if(curChar>='A' && curChar<='F') { + pairs[pair_index] <<= 4; + pairs[pair_index] |= ((curChar-'A') + 10); + } + } + + /* We've read first pair candidate, and it's a surrogate - make sure we have a second in pair, starting with \u */ + if (pair_index == 0 && pairs[0] >= 0xd800 && pairs[0] <= 0xdbff) { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if(curChar == '\\') { + STR_CHECK_EOF; + curChar = readChar(lexer, jsonText, offset); + if(curChar == 'u') { + /* continue to the next one */ + pair_index++; + } + else { + unreadChar(lexer, offset); + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_surrogate; + goto finish_string_lex; + } + } + else { + unreadChar(lexer, offset); + lexer->error = yajl_lex_string_invalid_surrogate; + goto finish_string_lex; + } + } + /* We've read first pair candidate, and it's not a surrogate */ + else if (pair_index == 0 && !(pairs[pair_index] >= 0xd800 && pairs[pair_index] <= 0xdbff)) { + tok = yajl_tok_string_with_escapes; + goto finish_string_lex; + } + /* We've read second pair candidate, and it's a valid second in pair */ + else if (pair_index == 1 && pairs[pair_index] >= 0xdc00 && pairs[pair_index] <= 0xdfff) { + tok = yajl_tok_string_with_escapes; + goto finish_string_lex; + } + /* We've read second pair candidate, and it's an invalid second in pair */ + else if (pair_index == 1 && !(pairs[pair_index] >= 0xdc00 && pairs[pair_index] <= 0xdfff)) { + lexer->error = yajl_lex_string_invalid_surrogate; + goto finish_string_lex; + } + } + + + finish_string_lex: + return tok; +} + static yajl_tok yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, unsigned int jsonTextLen, unsigned int * offset) @@ -298,17 +386,9 @@ yajl_lex_string(yajl_lexer lexer, const unsigned char * jsonText, /* special case \u */ curChar = readChar(lexer, jsonText, offset); if (curChar == 'u') { - unsigned int i = 0; - - for (i=0;i<4;i++) { - STR_CHECK_EOF; - curChar = readChar(lexer, jsonText, offset); - if (!(charLookupTable[curChar] & VHC)) { - /* back up to offending char */ - unreadChar(lexer, offset); - lexer->error = yajl_lex_string_invalid_hex_char; - goto finish_string_lex; - } + tok = yajl_lex_escaped_unicode(lexer, jsonText, jsonTextLen, offset); + if (tok != yajl_tok_string_with_escapes) { + goto finish_string_lex; } } else if (!(charLookupTable[curChar] & VEC)) { /* back up to offending char */ @@ -688,6 +768,8 @@ yajl_lex_error_to_string(yajl_lex_error error) case yajl_lex_string_invalid_hex_char: return "invalid (non-hex) character occurs after '\\u' inside " "string."; + case yajl_lex_string_invalid_surrogate: + return "invalid surrogate pair occurs inside string."; case yajl_lex_invalid_char: return "invalid char in json text."; case yajl_lex_invalid_string: diff --git a/ext/yajl/yajl_lex.h b/ext/yajl/yajl_lex.h index 3cdeef11..b290bcb8 100644 --- a/ext/yajl/yajl_lex.h +++ b/ext/yajl/yajl_lex.h @@ -119,7 +119,8 @@ typedef enum { yajl_lex_missing_integer_after_decimal, yajl_lex_missing_integer_after_exponent, yajl_lex_missing_integer_after_minus, - yajl_lex_unallowed_comment + yajl_lex_unallowed_comment, + yajl_lex_string_invalid_surrogate, } yajl_lex_error; YAJL_API diff --git a/lib/yajl/version.rb b/lib/yajl/version.rb index f79d031d..caa30e13 100644 --- a/lib/yajl/version.rb +++ b/lib/yajl/version.rb @@ -1,3 +1,3 @@ module Yajl - VERSION = '1.2.1' + VERSION = '1.2.2' end diff --git a/spec/parsing/one_off_spec.rb b/spec/parsing/one_off_spec.rb index 9bc6b324..0492f75e 100644 --- a/spec/parsing/one_off_spec.rb +++ b/spec/parsing/one_off_spec.rb @@ -66,6 +66,22 @@ expect(Yajl::Parser.parse("{\"id\": 1046289770033519442869495707521600000000}")).to eql({"id" => 1046289770033519442869495707521600000000}) end + it "should not crash on JSON with broken surrogate pair, but should raise exception instead" do + expect { + Yajl::Parser.parse('["\\ud800\\\\a"]') + }.to raise_error(Yajl::ParseError) + expect { + Yajl::Parser.parse('["\\ud800\\\\u"]') + }.to raise_error(Yajl::ParseError) + end + + it "should accept correct surrogate pairs" do + Encoding.default_internal = Encoding.find('utf-8') + expect { + expect(Yajl::Parser.parse('["\\ud800\\udf02"]')).to eql(["𐌂"]) + }.not_to raise_error + end + if RUBY_VERSION =~ /^1.9/ it "should encode non-ascii symbols in utf-8" do parsed = Yajl::Parser.parse('{"曦": 1234}', :symbolize_keys => true) From 19564a09c6f7a2a77b1cb543d5e8ef44da6c40e1 Mon Sep 17 00:00:00 2001 From: Michael Kruglos Date: Sun, 6 Dec 2015 21:02:51 +0200 Subject: [PATCH 2/3] Removed unnecessary call to Encoding.default_internal from rspec --- spec/parsing/one_off_spec.rb | 1 - 1 file changed, 1 deletion(-) diff --git a/spec/parsing/one_off_spec.rb b/spec/parsing/one_off_spec.rb index 0492f75e..002c1fa0 100644 --- a/spec/parsing/one_off_spec.rb +++ b/spec/parsing/one_off_spec.rb @@ -76,7 +76,6 @@ end it "should accept correct surrogate pairs" do - Encoding.default_internal = Encoding.find('utf-8') expect { expect(Yajl::Parser.parse('["\\ud800\\udf02"]')).to eql(["𐌂"]) }.not_to raise_error From 0fb61f787a8d365fb5f571e4f794fd6efe186796 Mon Sep 17 00:00:00 2001 From: Michael Kruglos Date: Mon, 7 Dec 2015 23:42:45 +0200 Subject: [PATCH 3/3] Update version.rb --- lib/yajl/version.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/yajl/version.rb b/lib/yajl/version.rb index caa30e13..f79d031d 100644 --- a/lib/yajl/version.rb +++ b/lib/yajl/version.rb @@ -1,3 +1,3 @@ module Yajl - VERSION = '1.2.2' + VERSION = '1.2.1' end