From abc7a7d1c9382a13265f4136c14e59f194f4fcc7 Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Sun, 17 Apr 2022 03:25:19 +0000
Subject: [PATCH] Fix handling of surrogates on encoding

This allows surrogates anywhere in the input, compatible with the json module from the standard library.

This also refactors two interfaces:
- The `PyUnicode` to `char*` conversion is moved into its own function, separated from the `JSONTypeContext` handling, so it can be reused for other things in the future (e.g. indentation and separators) which don't have a type context.
- Converting the `char*` output to a Python string with surrogates intact requires the string length for `PyUnicode_Decode` & Co. While `strlen` could be used, the length is already known inside the encoder, so the encoder function now also takes an extra `size_t` pointer argument to return that and no longer NUL-terminates the string. This also permits output that contains NUL bytes (even though that would be invalid JSON), e.g. if an object's `__json__` method return value were to contain them.

Fixes #156
Fixes #447
Supersedes #284
---
 lib/ultrajson.h     |  7 +++++--
 lib/ultrajsonenc.c  |  5 ++---
 python/objToJSON.c  | 47 +++++++++++++++++++++++----------------------
 tests/test_ujson.py | 16 +++++++++++----
 4 files changed, 43 insertions(+), 32 deletions(-)

diff --git a/lib/ultrajson.h b/lib/ultrajson.h
index c5d75b17..a0744fae 100644
--- a/lib/ultrajson.h
+++ b/lib/ultrajson.h
@@ -300,9 +300,10 @@ obj - An anonymous type representing the object
 enc - Function definitions for querying JSOBJ type
 buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
 cbBuffer - Length of buffer (ignored if buffer is NULL)
+outLen - Will store the length of the encoded string
 
 Returns:
-Encoded JSON object as a null terminated char string.
+Encoded JSON object as a char string of outLen bytes; it is NOT null terminated.
 
 NOTE:
 If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
@@ -310,8 +311,10 @@ Life cycle of the provided buffer must still be handled by caller.
 
 If the return value doesn't equal the specified buffer caller must release the memory using
 JSONObjectEncoder.free or free() as specified when calling this function.
+
+If an error occurs during encoding, NULL is returned and no outLen is stored.
 */
-EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
+EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer, size_t *outLen);
 
 typedef struct __JSONObjectDecoder
 {
diff --git a/lib/ultrajsonenc.c b/lib/ultrajsonenc.c
index c8756136..7de88241 100644
--- a/lib/ultrajsonenc.c
+++ b/lib/ultrajsonenc.c
@@ -905,7 +905,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
   enc->level--;
 }
 
-char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer)
+char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer, size_t *_outLen)
 {
   enc->malloc = enc->malloc ? enc->malloc : malloc;
   enc->free =  enc->free ? enc->free : free;
@@ -941,12 +941,11 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t
 
   encode (obj, enc, NULL, 0);
 
-  Buffer_Reserve(enc, 1);
   if (enc->errorMsg)
   {
     return NULL;
   }
-  Buffer_AppendCharUnchecked(enc, '\0');
 
+  *_outLen = enc->offset - enc->start;
   return enc->start;
 }
diff --git a/python/objToJSON.c b/python/objToJSON.c
index d7f6cb92..e23c80a5 100644
--- a/python/objToJSON.c
+++ b/python/objToJSON.c
@@ -114,29 +114,29 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, siz
   return PyBytes_AsString(obj);
 }
 
-static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
-{
+static char *PyUnicodeToUTF8Raw(JSOBJ _obj, size_t *_outLen, PyObject **pBytesObj)
+{
+  /*
+  Converts the PyUnicode object to char* whose size is stored in _outLen.
+  This conversion requires the creation of an intermediate PyBytes object,
+  which is stored in *pBytesObj; the returned char* is its internal buffer.
+  When the char* buffer is no longer needed, *pBytesObj must be Py_DECREF'd.
+  */
   PyObject *obj = (PyObject *) _obj;
-  PyObject *newObj;
-#ifndef Py_LIMITED_API
-  if (PyUnicode_IS_COMPACT_ASCII(obj))
-  {
-    Py_ssize_t len;
-    char *data = PyUnicode_AsUTF8AndSize(obj, &len);
-    *_outLen = len;
-    return data;
-  }
-#endif
-  newObj = PyUnicode_AsUTF8String(obj);
-  if(!newObj)
+  PyObject *bytesObj = PyUnicode_AsEncodedString (obj, "utf-8", "surrogatepass");
+  if (!bytesObj)
   {
     return NULL;
   }
 
-  GET_TC(tc)->newObj = newObj;
+  *pBytesObj = bytesObj;
+  *_outLen = PyBytes_Size(bytesObj);
+  return PyBytes_AsString(bytesObj);
+}
 
-  *_outLen = PyBytes_Size(newObj);
-  return PyBytes_AsString(newObj);
+static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
+{
+  return PyUnicodeToUTF8Raw(_obj, _outLen, &(GET_TC(tc)->newObj));
 }
 
 static void *PyRawJSONToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
@@ -240,7 +240,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
   if (PyUnicode_Check(GET_TC(tc)->itemName))
   {
     itemNameTmp = GET_TC(tc)->itemName;
-    GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
+    GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
     Py_DECREF(itemNameTmp);
   }
   else
@@ -263,7 +263,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
       return -1;
     }
     itemNameTmp = GET_TC(tc)->itemName;
-    GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
+    GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, "utf-8", "surrogatepass");
     Py_DECREF(itemNameTmp);
   }
   PRINTMARK();
@@ -332,7 +332,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
       // Subject the key to the same type restrictions and conversions as in Dict_iterGetValue.
       if (PyUnicode_Check(key))
       {
-        key = PyUnicode_AsUTF8String(key);
+        key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
       }
       else if (!PyBytes_Check(key))
       {
@@ -342,7 +342,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
           goto error;
         }
         keyTmp = key;
-        key = PyUnicode_AsUTF8String(key);
+        key = PyUnicode_AsEncodedString(key, "utf-8", "surrogatepass");
         Py_DECREF(keyTmp);
       }
       else
@@ -770,6 +770,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
   PyObject *odefaultFn = NULL;
   int allowNan = -1;
   int orejectBytes = -1;
+  size_t retLen;
 
   JSONObjectEncoder encoder =
   {
@@ -853,7 +854,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
                  csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0);
 
   PRINTMARK();
-  ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
+  ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer), &retLen);
   PRINTMARK();
 
   dconv_d2s_free(&encoder.d2s);
@@ -874,7 +875,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
     return NULL;
   }
 
-  newobj = PyUnicode_FromString (ret);
+  newobj = PyUnicode_DecodeUTF8(ret, retLen, "surrogatepass");
 
   if (ret != buffer)
   {
diff --git a/tests/test_ujson.py b/tests/test_ujson.py
index 3e3f3f1b..fcba959c 100644
--- a/tests/test_ujson.py
+++ b/tests/test_ujson.py
@@ -498,10 +498,18 @@ def test_decode_array_empty():
     assert [] == obj
 
 
-def test_encoding_invalid_unicode_character():
-    s = "\udc7f"
-    with pytest.raises(UnicodeEncodeError):
-        ujson.dumps(s)
+def test_encode_surrogate_characters():
+    assert ujson.dumps("\udc7f") == r'"\udc7f"'
+    out = r'{"\ud800":"\udfff"}'
+    assert ujson.dumps({"\ud800": "\udfff"}) == out
+    assert ujson.dumps({"\ud800": "\udfff"}, sort_keys=True) == out
+    o = {b"\xed\xa0\x80": b"\xed\xbf\xbf"}
+    assert ujson.dumps(o, reject_bytes=False) == out
+    assert ujson.dumps(o, reject_bytes=False, sort_keys=True) == out
+
+    out2 = '{"\ud800":"\udfff"}'
+    assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False) == out2
+    assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2
 
 
 def test_sort_keys():