Skip to content

Commit

Permalink
Merge pull request #530 from JustAnotherArchivist/fix-encode-surrogates
Browse files Browse the repository at this point in the history
  • Loading branch information
hugovk committed Jun 1, 2022
2 parents b300d64 + 59aa3bf commit 66bb6e0
Show file tree
Hide file tree
Showing 4 changed files with 45 additions and 24 deletions.
7 changes: 5 additions & 2 deletions lib/ultrajson.h
Expand Up @@ -300,18 +300,21 @@ obj - An anonymous type representing the object
enc - Function definitions for querying JSOBJ type
buffer - Preallocated buffer to store result in. If NULL function allocates own buffer
cbBuffer - Length of buffer (ignored if buffer is NULL)
outLen - Will store the length of the encoded string
Returns:
Encoded JSON object as a null terminated char string.
Encoded JSON object as a char string.
NOTE:
If the supplied buffer wasn't enough to hold the result the function will allocate a new buffer.
Life cycle of the provided buffer must still be handled by caller.
If the return value doesn't equal the specified buffer caller must release the memory using
JSONObjectEncoder.free or free() as specified when calling this function.
If an error occurs during encoding, NULL is returned and no outLen is stored.
*/
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer);
EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *buffer, size_t cbBuffer, size_t *outLen);

typedef struct __JSONObjectDecoder
{
Expand Down
5 changes: 2 additions & 3 deletions lib/ultrajsonenc.c
Expand Up @@ -948,7 +948,7 @@ static void encode(JSOBJ obj, JSONObjectEncoder *enc, const char *name, size_t c
enc->level--;
}

char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer)
char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t _cbBuffer, size_t *_outLen)
{
enc->malloc = enc->malloc ? enc->malloc : malloc;
enc->free = enc->free ? enc->free : free;
Expand Down Expand Up @@ -984,12 +984,11 @@ char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *_buffer, size_t

encode (obj, enc, NULL, 0);

Buffer_Reserve(enc, 1);
if (enc->errorMsg)
{
return NULL;
}
Buffer_AppendCharUnchecked(enc, '\0');

*_outLen = enc->offset - enc->start;
return enc->start;
}
41 changes: 26 additions & 15 deletions python/objToJSON.c
Expand Up @@ -114,10 +114,16 @@ static void *PyStringToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, siz
return PyBytes_AsString(obj);
}

static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
{
static char *PyUnicodeToUTF8Raw(JSOBJ _obj, size_t *_outLen, PyObject **pBytesObj)
{
/*
Converts the PyUnicode object to char* whose size is stored in _outLen.
This conversion may require the creation of an intermediate PyBytes object.
In that case, the returned char* is in fact the internal buffer of that PyBytes object,
and when the char* buffer is no longer needed, the bytesObj must be DECREF'd.
*/
PyObject *obj = (PyObject *) _obj;
PyObject *newObj;

#ifndef Py_LIMITED_API
if (PyUnicode_IS_COMPACT_ASCII(obj))
{
Expand All @@ -127,16 +133,20 @@ static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, si
return data;
}
#endif
newObj = PyUnicode_AsUTF8String(obj);
if(!newObj)

PyObject *bytesObj = *pBytesObj = PyUnicode_AsEncodedString (obj, NULL, "surrogatepass");
if (!bytesObj)
{
return NULL;
}

GET_TC(tc)->newObj = newObj;
*_outLen = PyBytes_Size(bytesObj);
return PyBytes_AsString(bytesObj);
}

*_outLen = PyBytes_Size(newObj);
return PyBytes_AsString(newObj);
/*
String-conversion callback for PyUnicode objects.
Delegates the actual conversion to PyUnicodeToUTF8Raw, handing it the address of
the type context's newObj field as the slot for any intermediate bytes object.
_outLen is passed through unchanged and outValue is not used.
NOTE(review): presumably newObj is released when the type context ends -- confirm
against the context teardown code, which is not visible here.
*/
static void *PyUnicodeToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
{
  void *utf8;

  (void) outValue;
  utf8 = PyUnicodeToUTF8Raw(_obj, _outLen, &(GET_TC(tc)->newObj));
  return utf8;
}

static void *PyRawJSONToUTF8(JSOBJ _obj, JSONTypeContext *tc, void *outValue, size_t *_outLen)
Expand Down Expand Up @@ -240,7 +250,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
if (PyUnicode_Check(GET_TC(tc)->itemName))
{
itemNameTmp = GET_TC(tc)->itemName;
GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, NULL, "surrogatepass");
Py_DECREF(itemNameTmp);
}
else
Expand All @@ -263,7 +273,7 @@ static int Dict_iterNext(JSOBJ obj, JSONTypeContext *tc)
return -1;
}
itemNameTmp = GET_TC(tc)->itemName;
GET_TC(tc)->itemName = PyUnicode_AsUTF8String (GET_TC(tc)->itemName);
GET_TC(tc)->itemName = PyUnicode_AsEncodedString (GET_TC(tc)->itemName, NULL, "surrogatepass");
Py_DECREF(itemNameTmp);
}
PRINTMARK();
Expand Down Expand Up @@ -332,7 +342,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
// Subject the key to the same type restrictions and conversions as in Dict_iterGetValue.
if (PyUnicode_Check(key))
{
key = PyUnicode_AsUTF8String(key);
key = PyUnicode_AsEncodedString(key, NULL, "surrogatepass");
}
else if (!PyBytes_Check(key))
{
Expand All @@ -342,7 +352,7 @@ static int SortedDict_iterNext(JSOBJ obj, JSONTypeContext *tc)
goto error;
}
keyTmp = key;
key = PyUnicode_AsUTF8String(key);
key = PyUnicode_AsEncodedString(key, NULL, "surrogatepass");
Py_DECREF(keyTmp);
}
else
Expand Down Expand Up @@ -674,7 +684,7 @@ static void Object_beginTypeContext (JSOBJ _obj, JSONTypeContext *tc, JSONObject
{
goto INVALID;
}
PyObject* str = PyUnicode_AsEncodedString(objRepr, "utf-8", "strict");
PyObject* str = PyUnicode_AsEncodedString(objRepr, NULL, "strict");
if (str)
{
PyErr_Format (PyExc_TypeError, "%s is not JSON serializable", PyBytes_AsString(str));
Expand Down Expand Up @@ -777,6 +787,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
PyObject *odefaultFn = NULL;
int allowNan = -1;
int orejectBytes = -1;
size_t retLen;

JSONObjectEncoder encoder =
{
Expand Down Expand Up @@ -860,7 +871,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
csInf, csNan, 'e', DCONV_DECIMAL_IN_SHORTEST_LOW, DCONV_DECIMAL_IN_SHORTEST_HIGH, 0, 0);

PRINTMARK();
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer));
ret = JSON_EncodeObject (oinput, &encoder, buffer, sizeof (buffer), &retLen);
PRINTMARK();

dconv_d2s_free(&encoder.d2s);
Expand All @@ -881,7 +892,7 @@ PyObject* objToJSON(PyObject* self, PyObject *args, PyObject *kwargs)
return NULL;
}

newobj = PyUnicode_FromString (ret);
newobj = PyUnicode_DecodeUTF8(ret, retLen, "surrogatepass");

if (ret != buffer)
{
Expand Down
16 changes: 12 additions & 4 deletions tests/test_ujson.py
Expand Up @@ -498,10 +498,18 @@ def test_decode_array_empty():
assert [] == obj


def test_encoding_invalid_unicode_character():
s = "\udc7f"
with pytest.raises(UnicodeEncodeError):
ujson.dumps(s)
def test_encode_surrogate_characters():
    """Lone surrogate code points are encoded by ujson.dumps instead of raising.

    Checks a bare string, str keys/values, and bytes keys/values (with
    reject_bytes=False), each with and without sort_keys. By default the
    surrogates come out as escape sequences; with ensure_ascii=False they
    are emitted as the surrogate characters themselves.
    """
    assert ujson.dumps("\udc7f") == r'"\udc7f"'

    escaped = r'{"\ud800":"\udfff"}'
    str_obj = {"\ud800": "\udfff"}
    # ED A0 80 / ED BF BF are the three-byte UTF-8 forms of U+D800 / U+DFFF.
    bytes_obj = {b"\xed\xa0\x80": b"\xed\xbf\xbf"}

    assert ujson.dumps(str_obj) == escaped
    assert ujson.dumps(str_obj, sort_keys=True) == escaped
    assert ujson.dumps(bytes_obj, reject_bytes=False) == escaped
    assert ujson.dumps(bytes_obj, reject_bytes=False, sort_keys=True) == escaped

    # Without ASCII escaping the output contains the surrogates unescaped.
    unescaped = '{"\ud800":"\udfff"}'
    assert ujson.dumps(str_obj, ensure_ascii=False) == unescaped
    assert ujson.dumps(str_obj, ensure_ascii=False, sort_keys=True) == unescaped


def test_sort_keys():
Expand Down

0 comments on commit 66bb6e0

Please sign in to comment.