From 46b3d4df6e1b344e332833adb6b54740b2737330 Mon Sep 17 00:00:00 2001
From: KJ Tsanaktsidis <kj@kjtsanaktsidis.id.au>
Date: Thu, 15 Feb 2024 09:22:24 +1100
Subject: [PATCH] Make invalid Unicode data raise when encoding through
 Oj::Rails::Encoder (#912)

* Actually run activesupport7 tests with Oj

These tests were not even loading Oj::Rails; they were definitely not
actually testing the Oj rails shim.

* Raise on invalid unicode in rails mode mimmicry

Activesupport & JSON gem will raise an exception when trying to an
encode an object containing a string with invalid byte sequences for the
string's encoding. Oj correctly raises if escaspe_html_entites_in_json
is enabled, but if that's disabled, the invalid byte sequence is copied
directly to the output.

Use the same logic to validate unicode in that case as well.
---
 ext/oj/dump.c                        | 48 +++++++++-----
 test/activesupport6/encoding_test.rb | 91 ++++++++++++++++++---------
 test/activesupport7/encoding_test.rb | 94 +++++++++++++++++++++-------
 3 files changed, 169 insertions(+), 64 deletions(-)

diff --git a/ext/oj/dump.c b/ext/oj/dump.c
index 1f5bfbe9..397604d4 100644
--- a/ext/oj/dump.c
+++ b/ext/oj/dump.c
@@ -198,7 +198,18 @@ inline static long rails_xss_friendly_size(const uint8_t *str, size_t len) {
 }
 
 inline static size_t rails_friendly_size(const uint8_t *str, size_t len) {
-    return calculate_string_size(str, len, rails_friendly_chars);
+    long    size = 0;
+    size_t  i    = len;
+    uint8_t hi   = 0;
+
+    for (; 0 < i; str++, i--) {
+        size += rails_friendly_chars[*str];
+        hi |= *str & 0x80;
+    }
+    if (0 == hi) {
+        return size - len * (size_t)'0';
+    }
+    return -(size - len * (size_t)'0');
 }
 
 const char *oj_nan_str(VALUE obj, int opt, int mode, bool plus, int *lenp) {
@@ -750,8 +761,9 @@ void oj_dump_raw_json(VALUE obj, int depth, Out out) {
 void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out out) {
     size_t      size;
     char       *cmap;
-    const char *orig   = str;
-    bool        has_hi = false;
+    const char *orig                  = str;
+    bool        has_hi                = false;
+    bool        do_unicode_validation = false;
 
     switch (out->opts->escape_mode) {
     case NLEsc:
@@ -772,8 +784,9 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         size = xss_friendly_size((uint8_t *)str, cnt);
         break;
     case JXEsc:
-        cmap = hixss_friendly_chars;
-        size = hixss_friendly_size((uint8_t *)str, cnt);
+        cmap                  = hixss_friendly_chars;
+        size                  = hixss_friendly_size((uint8_t *)str, cnt);
+        do_unicode_validation = true;
         break;
     case RailsXEsc: {
         long sz;
@@ -786,12 +799,22 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         } else {
             size = (size_t)sz;
         }
+        do_unicode_validation = true;
         break;
     }
-    case RailsEsc:
+    case RailsEsc: {
+        long sz;
         cmap = rails_friendly_chars;
-        size = rails_friendly_size((uint8_t *)str, cnt);
+        sz   = rails_friendly_size((uint8_t *)str, cnt);
+        if (sz < 0) {
+            has_hi = true;
+            size   = (size_t)-sz;
+        } else {
+            size = (size_t)sz;
+        }
+        do_unicode_validation = true;
         break;
+    }
     case JSONEsc:
     default: cmap = hibit_friendly_chars; size = hibit_friendly_size((uint8_t *)str, cnt);
     }
@@ -822,7 +845,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         for (; str < end; str++) {
             switch (cmap[(uint8_t)*str]) {
             case '1':
-                if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && check_start <= str) {
+                if (do_unicode_validation && check_start <= str) {
                     if (0 != (0x80 & (uint8_t)*str)) {
                         if (0xC0 == (0xC0 & (uint8_t)*str)) {
                             check_start = check_unicode(str, end, orig);
@@ -846,8 +869,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
                 }
                 break;
             case '3':  // Unicode
-                if (0xe2 == (uint8_t)*str && (JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) &&
-                    2 <= end - str) {
+                if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
                     if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
                         str = dump_unicode(str, end, out, orig);
                     } else {
@@ -866,8 +888,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
                     APPEND_CHARS(out->cur, "\\u00", 4);
                     dump_hex((uint8_t)*str, out);
                 } else {
-                    if (0xe2 == (uint8_t)*str &&
-                        (JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 2 <= end - str) {
+                    if (0xe2 == (uint8_t)*str && do_unicode_validation && 2 <= end - str) {
                         if (0x80 == (uint8_t)str[1] && (0xa8 == (uint8_t)str[2] || 0xa9 == (uint8_t)str[2])) {
                             str = dump_unicode(str, end, out, orig);
                         } else {
@@ -884,8 +905,7 @@ void oj_dump_cstr(const char *str, size_t cnt, bool is_sym, bool escape1, Out ou
         }
         *out->cur++ = '"';
     }
-    if ((JXEsc == out->opts->escape_mode || RailsXEsc == out->opts->escape_mode) && 0 < str - orig &&
-        0 != (0x80 & *(str - 1))) {
+    if (do_unicode_validation && 0 < str - orig && 0 != (0x80 & *(str - 1))) {
         uint8_t c = (uint8_t) * (str - 1);
         int     i;
         int     scnt = (int)(str - orig);
diff --git a/test/activesupport6/encoding_test.rb b/test/activesupport6/encoding_test.rb
index 73bdeba8..b54be22b 100644
--- a/test/activesupport6/encoding_test.rb
+++ b/test/activesupport6/encoding_test.rb
@@ -74,42 +74,77 @@ def test_hash_keys_encoding
     ActiveSupport.escape_html_entities_in_json = false
   end
 
-  def test_utf8_string_encoded_properly
-    # The original test seems to expect that
-    # ActiveSupport.escape_html_entities_in_json reverts to true even after
-    # being set to false. I haven't been able to figure that out so the value is
-    # set to true, the default, before running the test. This might be wrong but
-    # for now it will have to do.
-    ActiveSupport.escape_html_entities_in_json = true
-    result = ActiveSupport::JSON.encode("€2.99")
-    assert_equal '"€2.99"', result
-    assert_equal(Encoding::UTF_8, result.encoding)
-
-    result = ActiveSupport::JSON.encode("✎☺")
-    assert_equal '"✎☺"', result
-    assert_equal(Encoding::UTF_8, result.encoding)
+  def test_hash_keys_encoding_without_escaping
+    assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>")
   end
 
-  def test_non_utf8_string_transcodes
-    s = "二".encode("Shift_JIS")
-    result = ActiveSupport::JSON.encode(s)
-    assert_equal '"二"', result
-    assert_equal Encoding::UTF_8, result.encoding
+  module UnicodeTests
+    def test_utf8_string_encoded_properly
+      result = ActiveSupport::JSON.encode("€2.99")
+      assert_equal '"€2.99"', result
+      assert_equal(Encoding::UTF_8, result.encoding)
+
+      result = ActiveSupport::JSON.encode("✎☺")
+      assert_equal '"✎☺"', result
+      assert_equal(Encoding::UTF_8, result.encoding)
+    end
+
+    def test_non_utf8_string_transcodes
+      s = "二".encode("Shift_JIS")
+      result = ActiveSupport::JSON.encode(s)
+      assert_equal '"二"', result
+      assert_equal Encoding::UTF_8, result.encoding
+    end
+
+    def test_wide_utf8_chars
+      w = "𠜎"
+      result = ActiveSupport::JSON.encode(w)
+      assert_equal '"𠜎"', result
+    end
+
+    def test_wide_utf8_roundtrip
+      hash = { string: "𐒑" }
+      json = ActiveSupport::JSON.encode(hash)
+      decoded_hash = ActiveSupport::JSON.decode(json)
+      assert_equal "𐒑", decoded_hash["string"]
+    end
+
+    def test_invalid_encoding_raises
+      s = "\xAE\xFF\x9F"
+      refute s.valid_encoding?
+
+      # n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but,
+      # if you do that (even indirectly through Oj.optimize_rails), then this raises a
+      # JSON::GeneratorError instead of an EncodingError.
+      assert_raises(EncodingError) do
+        ActiveSupport::JSON.encode([s])
+      end
+    end
   end
 
-  def test_wide_utf8_chars
-    w = "𠜎"
-    result = ActiveSupport::JSON.encode(w)
-    assert_equal '"𠜎"', result
+  module UnicodeTestsWithEscapingOn
+    def setup
+      ActiveSupport.escape_html_entities_in_json = true
+    end
+
+    def teardown
+      ActiveSupport.escape_html_entities_in_json = false
+    end
+
+    include UnicodeTests
   end
 
-  def test_wide_utf8_roundtrip
-    hash = { string: "𐒑" }
-    json = ActiveSupport::JSON.encode(hash)
-    decoded_hash = ActiveSupport::JSON.decode(json)
-    assert_equal "𐒑", decoded_hash["string"]
+  module UnicodeTestsWithEscapingOff
+    def setup
+      ActiveSupport.escape_html_entities_in_json = false
+    end
+
+    include UnicodeTests
   end
 
+  include UnicodeTestsWithEscapingOn
+  include UnicodeTestsWithEscapingOff
+
   def test_hash_key_identifiers_are_always_quoted
     values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" }
     assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values))
diff --git a/test/activesupport7/encoding_test.rb b/test/activesupport7/encoding_test.rb
index 30bdedcf..a748b7f6 100644
--- a/test/activesupport7/encoding_test.rb
+++ b/test/activesupport7/encoding_test.rb
@@ -8,9 +8,18 @@
 require_relative "time_zone_test_helpers"
 require_relative "encoding_test_cases"
 
+require 'oj'
+# Sets the ActiveSupport encoder to be Oj and also wraps the setting of globals.
+Oj::Rails.set_encoder()
+Oj::Rails.optimize()
+
 class TestJSONEncoding < ActiveSupport::TestCase
   include TimeZoneTestHelpers
 
+  def test_is_actually_oj
+    assert_equal Oj::Rails::Encoder, ActiveSupport.json_encoder
+  end
+
   def sorted_json(json)
     if json.start_with?("{") && json.end_with?("}")
       "{" + json[1..-2].split(",").sort.join(",") + "}"
@@ -61,36 +70,77 @@ def test_hash_keys_encoding
     ActiveSupport.escape_html_entities_in_json = false
   end
 
-  def test_utf8_string_encoded_properly
-    result = ActiveSupport::JSON.encode("€2.99")
-    assert_equal '"€2.99"', result
-    assert_equal(Encoding::UTF_8, result.encoding)
-
-    result = ActiveSupport::JSON.encode("✎☺")
-    assert_equal '"✎☺"', result
-    assert_equal(Encoding::UTF_8, result.encoding)
+  def test_hash_keys_encoding_without_escaping
+    assert_equal "{\"<>\":\"<>\"}", ActiveSupport::JSON.encode("<>" => "<>")
   end
 
-  def test_non_utf8_string_transcodes
-    s = "二".encode("Shift_JIS")
-    result = ActiveSupport::JSON.encode(s)
-    assert_equal '"二"', result
-    assert_equal Encoding::UTF_8, result.encoding
+  module UnicodeTests
+    def test_utf8_string_encoded_properly
+      result = ActiveSupport::JSON.encode("€2.99")
+      assert_equal '"€2.99"', result
+      assert_equal(Encoding::UTF_8, result.encoding)
+
+      result = ActiveSupport::JSON.encode("✎☺")
+      assert_equal '"✎☺"', result
+      assert_equal(Encoding::UTF_8, result.encoding)
+    end
+
+    def test_non_utf8_string_transcodes
+      s = "二".encode("Shift_JIS")
+      result = ActiveSupport::JSON.encode(s)
+      assert_equal '"二"', result
+      assert_equal Encoding::UTF_8, result.encoding
+    end
+
+    def test_wide_utf8_chars
+      w = "𠜎"
+      result = ActiveSupport::JSON.encode(w)
+      assert_equal '"𠜎"', result
+    end
+
+    def test_wide_utf8_roundtrip
+      hash = { string: "𐒑" }
+      json = ActiveSupport::JSON.encode(hash)
+      decoded_hash = ActiveSupport::JSON.decode(json)
+      assert_equal "𐒑", decoded_hash["string"]
+    end
+
+    def test_invalid_encoding_raises
+      s = "\xAE\xFF\x9F"
+      refute s.valid_encoding?
+
+      # n.b. this raises EncodingError, because we didn't call Oj.mimic_JSON in the test setup; but,
+      # if you do that (even indirectly through Oj.optimize_rails), then this raises a
+      # JSON::GeneratorError instead of an EncodingError.
+      assert_raises(EncodingError) do
+        ActiveSupport::JSON.encode([s])
+      end
+    end
   end
 
-  def test_wide_utf8_chars
-    w = "𠜎"
-    result = ActiveSupport::JSON.encode(w)
-    assert_equal '"𠜎"', result
+  module UnicodeTestsWithEscapingOn
+    def setup
+      ActiveSupport.escape_html_entities_in_json = true
+    end
+
+    def teardown
+      ActiveSupport.escape_html_entities_in_json = false
+    end
+
+    include UnicodeTests
   end
 
-  def test_wide_utf8_roundtrip
-    hash = { string: "𐒑" }
-    json = ActiveSupport::JSON.encode(hash)
-    decoded_hash = ActiveSupport::JSON.decode(json)
-    assert_equal "𐒑", decoded_hash["string"]
+  module UnicodeTestsWithEscapingOff
+    def setup
+      ActiveSupport.escape_html_entities_in_json = false
+    end
+
+    include UnicodeTests
   end
 
+  include UnicodeTestsWithEscapingOn
+  include UnicodeTestsWithEscapingOff
+
   def test_hash_key_identifiers_are_always_quoted
     values = { 0 => 0, 1 => 1, :_ => :_, "$" => "$", "a" => "a", :A => :A, :A0 => :A0, "A0B" => "A0B" }
     assert_equal %w( "$" "A" "A0" "A0B" "_" "a" "0" "1" ).sort, object_keys(ActiveSupport::JSON.encode(values))