From 46b3d4df6e1b344e332833adb6b54740b2737330 Mon Sep 17 00:00:00 2001 From: KJ Tsanaktsidis Date: Thu, 15 Feb 2024 09:22:24 +1100 Subject: [PATCH] Make invalid Unicode data raise when encoding through Oj::Rails::Encoder (#912) * Actually run activesupport7 tests with Oj These tests were not even loading Oj::Rails; they were definitely not actually testing the Oj rails shim. * Raise on invalid unicode in rails mode mimmicry Activesupport & JSON gem will raise an exception when trying to an encode an object containing a string with invalid byte sequences for the string's encoding. Oj correctly raises if escaspe_html_entites_in_json is enabled, but if that's disabled, the invalid byte sequence is copied directly to the output. Use the same logic to validate unicode in that case as well. --- ext/oj/dump.c | 48 +++++++++----- test/activesupport6/encoding_test.rb | 91 ++++++++++++++++++--------- test/activesupport7/encoding_test.rb | 94 +++++++++++++++++++++------- 3 files changed, 169 insertions(+), 64 deletions(-) diff --git a/ext/oj/dump.c b/ext/oj/dump.c index 1f5bfbe9..397604d4 100644 --- a/ext/oj/dump.c +++ b/ext/oj/dump.c @@ -198,7 +198,18 @@ inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) { } inline static size_t rails_friendly_size(const uint8_t *str, size_t len) { - return calculate_string_size(str, len, rails_friendly_chars); + long size = 0; + size_t i = len; + uint8_t hi = 0; + + for (; 0 < i; str++, i--) { + size += rails_friendly_chars[*str]; + hi |= *str & 0x80; + } + if (0 == hi) { + return size - len * (size_t)'0'; + } + return -(size - len * (size_t)'0'); } const char *oj_nan_str(VALUE obj, int opt, int mode, bool plus, int *lenp) { @@ -750,8 +761,9 @@ void oj_dump_raw_json(VALUE obj, int depth, Out out) { void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out out) { size_t size; char *cmap; - const char *orig = str; - bool has_hi = false; + const char *orig = str; + bool has_hi = false; + bool do_unicode_validation = false; switch (out->opts->escape_mode) { case NLEsc: @@ -772,8 +784,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou size = xss_friendly_size((uint8_t *)str, cnt); break; case JXEsc: - cmap = hixss_friendly_chars; - size = hixss_friendly_size((uint8_t *)str, cnt); + cmap = hixss_friendly_chars; + size = hixss_friendly_size((uint8_t *)str, cnt); + do_unicode_validation = true; break; case RailsXEsc: { long sz; @@ -786,12 +799,22 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou } else { size = (size_t)sz; } + do_unicode_validation = true; break; } - case RailsEsc: + case RailsEsc: { + long sz; cmap = rails_friendly_chars; - size = rails_friendly_size((uint8_t *)str, cnt); + sz = rails_friendly_size((uint8_t *)str, cnt); + if (sz < 0) { + has_hi = true; + size = (size_t)-sz; + } else { + size = (size_t)sz; + } + do_unicode_validation = true; break; + } case JSONEsc: default: cmap = hibit_friendly_chars; size = hibit_friendly_size((uint8_t *)str, cnt); } @@ -822,7 +845,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou for (; str < end; str++) { switch (cmap[(uint8_t)*str]) { case '1': - if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && check_start <= str) { + if (do_unicode_validation && check_start <= str) { if (0 != (0x80 & (uint8_t)*str)) { if (0xC0 == (0xC0 & (uint8_t)*str)) { check_start = check_unicode(str, end, orig); @@ -846,8 +869,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou } break; case '3': // Unicode - if (0xe2 == (uint8_t)*str && (JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && - 2 <= end - str) { + if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) { if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) { str = dump_unicode(str, end, out, orig); } else { @@ -866,8 +888,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou APPEND_CHARS(out->cur, "\\u00", 4); dump_hex((uint8_t)*str, out); } else { - if (0xe2 == (uint8_t)*str && - (JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 2 <= end - str) { + if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) { if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) { str = dump_unicode(str, end, out, orig); } else { @@ -884,8 +905,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou } *out->cur++ = '"'; } - if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 0 < str - orig && - 0 != (0x80 & *(str - 1))) { + if (do_unicode_validation && 0 < str - orig && 0 != (0x80 & *(str - 1))) { uint8_t c = (uint8_t) * (str - 1); int i; int scnt = (int)(str - orig); diff --git a/test/activesupport6/encoding_test.rb b/test/activesupport6/encoding_test.rb index 73bdeba8..b54be22b 100644 --- a/test/activesupport6/encoding_test.rb +++ b/test/activesupport6/encoding_test.rb @@ -74,42 +74,77 @@ def test_hash_keys_encoding ActiveSupport.escape_html_entities_in_json = false end - def test_utf8_string_encoded_properly - # The original test seems to expect that - # ActiveSupport.escape_html_entities_in_json reverts to true even after - # being set to false. I haven't been able to figure that out so the value is - # set to true, the default, before running the test. This might be wrong but - # for now it will have to do. - ActiveSupport.escape_html_entities_in_json = true - result = ActiveSupport::JSON.encode("€2.99") - assert_equal '"€2.99"', result - assert_equal(Encoding::UTF_8, result.encoding) - - result = ActiveSupport::JSON.encode("✎☺") - assert_equal '"✎☺"', result - assert_equal(Encoding::UTF_8, result.encoding) + def test_hash_keys_encoding_without_escaping + assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>") end - def test_non_utf8_string_transcodes - s = "二".encode("Shift_JIS") - result = ActiveSupport::JSON.encode(s) - assert_equal '"二"', result - assert_equal Encoding::UTF_8, result.encoding + module UnicodeTests + def test_utf8_string_encoded_properly + result = ActiveSupport::JSON.encode("€2.99") + assert_equal '"€2.99"', result + assert_equal(Encoding::UTF_8, result.encoding) + + result = ActiveSupport::JSON.encode("✎☺") + assert_equal '"✎☺"', result + assert_equal(Encoding::UTF_8, result.encoding) + end + + def test_non_utf8_string_transcodes + s = "二".encode("Shift_JIS") + result = ActiveSupport::JSON.encode(s) + assert_equal '"二"', result + assert_equal Encoding::UTF_8, result.encoding + end + + def test_wide_utf8_chars + w = "𠜎" + result = ActiveSupport::JSON.encode(w) + assert_equal '"𠜎"', result + end + + def test_wide_utf8_roundtrip + hash = { string: "𐒑" } + json = ActiveSupport::JSON.encode(hash) + decoded_hash = ActiveSupport::JSON.decode(json) + assert_equal "𐒑", decoded_hash["string"] + end + + def test_invalid_encoding_raises + s = "\xAE\xFF\x9F" + refute s.valid_encoding? + + # n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but, + # if you do that (even indirectly through Oj.optimize_rails), then this raises a + # JSON::GeneratorError instead of an EncodingError. + assert_raises(EncodingError) do + ActiveSupport::JSON.encode([s]) + end + end end - def test_wide_utf8_chars - w = "𠜎" - result = ActiveSupport::JSON.encode(w) - assert_equal '"𠜎"', result + module UnicodeTestsWithEscapingOn + def setup + ActiveSupport.escape_html_entities_in_json = true + end + + def teardown + ActiveSupport.escape_html_entities_in_json = false + end + + include UnicodeTests end - def test_wide_utf8_roundtrip - hash = { string: "𐒑" } - json = ActiveSupport::JSON.encode(hash) - decoded_hash = ActiveSupport::JSON.decode(json) - assert_equal "𐒑", decoded_hash["string"] + module UnicodeTestsWithEscapingOff + def setup + ActiveSupport.escape_html_entities_in_json = false + end + + include UnicodeTests end + include UnicodeTestsWithEscapingOn + include UnicodeTestsWithEscapingOff + def test_hash_key_identifiers_are_always_quoted values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" } assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values)) diff --git a/test/activesupport7/encoding_test.rb b/test/activesupport7/encoding_test.rb index 30bdedcf..a748b7f6 100644 --- a/test/activesupport7/encoding_test.rb +++ b/test/activesupport7/encoding_test.rb @@ -8,9 +8,18 @@ require_relative "time_zone_test_helpers" require_relative "encoding_test_cases" +require 'oj' +# Sets the ActiveSupport encoder to be Oj and also wraps the setting of globals. +Oj::Rails.set_encoder() +Oj::Rails.optimize() + class TestJSONEncoding < ActiveSupport::TestCase include TimeZoneTestHelpers + def test_is_actually_oj + assert_equal Oj::Rails::Encoder, ActiveSupport.json_encoder + end + def sorted_json(json) if json.start_with?("{") && json.end_with?("}") "{" + json[1..-2].split(",").sort.join(",") + "}" @@ -61,36 +70,77 @@ def test_hash_keys_encoding ActiveSupport.escape_html_entities_in_json = false end - def test_utf8_string_encoded_properly - result = ActiveSupport::JSON.encode("€2.99") - assert_equal '"€2.99"', result - assert_equal(Encoding::UTF_8, result.encoding) - - result = ActiveSupport::JSON.encode("✎☺") - assert_equal '"✎☺"', result - assert_equal(Encoding::UTF_8, result.encoding) + def test_hash_keys_encoding_without_escaping + assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>") end - def test_non_utf8_string_transcodes - s = "二".encode("Shift_JIS") - result = ActiveSupport::JSON.encode(s) - assert_equal '"二"', result - assert_equal Encoding::UTF_8, result.encoding + module UnicodeTests + def test_utf8_string_encoded_properly + result = ActiveSupport::JSON.encode("€2.99") + assert_equal '"€2.99"', result + assert_equal(Encoding::UTF_8, result.encoding) + + result = ActiveSupport::JSON.encode("✎☺") + assert_equal '"✎☺"', result + assert_equal(Encoding::UTF_8, result.encoding) + end + + def test_non_utf8_string_transcodes + s = "二".encode("Shift_JIS") + result = ActiveSupport::JSON.encode(s) + assert_equal '"二"', result + assert_equal Encoding::UTF_8, result.encoding + end + + def test_wide_utf8_chars + w = "𠜎" + result = ActiveSupport::JSON.encode(w) + assert_equal '"𠜎"', result + end + + def test_wide_utf8_roundtrip + hash = { string: "𐒑" } + json = ActiveSupport::JSON.encode(hash) + decoded_hash = ActiveSupport::JSON.decode(json) + assert_equal "𐒑", decoded_hash["string"] + end + + def test_invalid_encoding_raises + s = "\xAE\xFF\x9F" + refute s.valid_encoding? + + # n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but, + # if you do that (even indirectly through Oj.optimize_rails), then this raises a + # JSON::GeneratorError instead of an EncodingError. + assert_raises(EncodingError) do + ActiveSupport::JSON.encode([s]) + end + end end - def test_wide_utf8_chars - w = "𠜎" - result = ActiveSupport::JSON.encode(w) - assert_equal '"𠜎"', result + module UnicodeTestsWithEscapingOn + def setup + ActiveSupport.escape_html_entities_in_json = true + end + + def teardown + ActiveSupport.escape_html_entities_in_json = false + end + + include UnicodeTests end - def test_wide_utf8_roundtrip - hash = { string: "𐒑" } - json = ActiveSupport::JSON.encode(hash) - decoded_hash = ActiveSupport::JSON.decode(json) - assert_equal "𐒑", decoded_hash["string"] + module UnicodeTestsWithEscapingOff + def setup + ActiveSupport.escape_html_entities_in_json = false + end + + include UnicodeTests end + include UnicodeTestsWithEscapingOn + include UnicodeTestsWithEscapingOff + def test_hash_key_identifiers_are_always_quoted values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" } assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values))