From bc7bdff051e19188edf121c216b8250110abf39a Mon Sep 17 00:00:00 2001 From: JustAnotherArchivist Date: Sat, 18 Jun 2022 22:54:44 +0000 Subject: [PATCH] Replace wchar_t string decoding implementation with a uint32_t-based one This fixes character handling on platforms with 16-bit wchar_t (notably, Windows), which was broken (in different ways) on both CPython and PyPy. Fixes #552 --- lib/ultrajson.h | 3 +- lib/ultrajsondec.c | 70 ++++++++++++++++----------------------------- python/JSONtoObj.c | 13 +++++++-- tests/test_ujson.py | 9 ------ 4 files changed, 37 insertions(+), 58 deletions(-) diff --git a/lib/ultrajson.h b/lib/ultrajson.h index 170214ff..eda221aa 100644 --- a/lib/ultrajson.h +++ b/lib/ultrajson.h @@ -54,7 +54,6 @@ tree doesn't have cyclic references. #define __ULTRAJSON_H__ #include -#include // Max decimals to encode double floating point numbers with #ifndef JSON_DOUBLE_MAX_DECIMALS @@ -318,7 +317,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char * typedef struct __JSONObjectDecoder { - JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end); + JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end); void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value); JSOBJ (*newTrue)(void *prv); diff --git a/lib/ultrajsondec.c b/lib/ultrajsondec.c index caf15ccc..47485a6b 100644 --- a/lib/ultrajsondec.c +++ b/lib/ultrajsondec.c @@ -41,7 +41,6 @@ Numeric decoder derived from from TCL library #include #include #include -#include #include #include #include @@ -58,8 +57,8 @@ struct DecoderState { char *start; char *end; - wchar_t *escStart; - wchar_t *escEnd; + JSUINT32 *escStart; + JSUINT32 *escEnd; int escHeap; int lastType; JSUINT32 objDepth; @@ -361,14 +360,12 @@ static const JSUINT8 g_decoderLookup[256] = static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) { int index; - wchar_t *escOffset; - wchar_t *escStart; + 
JSUINT32 *escOffset; + JSUINT32 *escStart; size_t escLen = (ds->escEnd - ds->escStart); JSUINT8 *inputOffset; JSUTF16 ch = 0; -#if WCHAR_MAX >= 0x10FFFF JSUINT8 *lastHighSurrogate = NULL; -#endif JSUINT8 oct; JSUTF32 ucs; ds->lastType = JT_INVALID; @@ -380,11 +377,11 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds if (ds->escHeap) { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) + if (newSize > (SIZE_MAX / sizeof(JSUINT32))) { return SetError(ds, -1, "Could not reserve memory block"); } - escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); + escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32)); if (!escStart) { ds->dec->free(ds->escStart); @@ -394,18 +391,18 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds } else { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) + JSUINT32 *oldStart = ds->escStart; + if (newSize > (SIZE_MAX / sizeof(JSUINT32))) { return SetError(ds, -1, "Could not reserve memory block"); } - ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t)); + ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32)); if (!ds->escStart) { return SetError(ds, -1, "Could not reserve memory block"); } ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32)); } ds->escEnd = ds->escStart + newSize; @@ -438,14 +435,14 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds inputOffset ++; switch (*inputOffset) { - case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; - case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; - case '/': *(escOffset++) = L'/'; inputOffset++; continue; - case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; - case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; - case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; 
- case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; - case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + case '\\': *(escOffset++) = '\\'; inputOffset++; continue; + case '\"': *(escOffset++) = '\"'; inputOffset++; continue; + case '/': *(escOffset++) = '/'; inputOffset++; continue; + case 'b': *(escOffset++) = '\b'; inputOffset++; continue; + case 'f': *(escOffset++) = '\f'; inputOffset++; continue; + case 'n': *(escOffset++) = '\n'; inputOffset++; continue; + case 'r': *(escOffset++) = '\r'; inputOffset++; continue; + case 't': *(escOffset++) = '\t'; inputOffset++; continue; case 'u': { @@ -494,7 +491,6 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds inputOffset ++; } -#if WCHAR_MAX >= 0x10FFFF if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset)) { // Low surrogate immediately following a high surrogate @@ -502,16 +498,13 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000; } else -#endif { - *(escOffset++) = (wchar_t) ch; + *(escOffset++) = (JSUINT32) ch; } -#if WCHAR_MAX >= 0x10FFFF if ((ch & 0xfc00) == 0xd800) { lastHighSurrogate = inputOffset; } -#endif break; } @@ -523,7 +516,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds case 1: { - *(escOffset++) = (wchar_t) (*inputOffset++); + *(escOffset++) = (JSUINT32) (*inputOffset++); break; } @@ -537,7 +530,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds } ucs |= (*inputOffset++) & 0x3f; if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); - *(escOffset++) = (wchar_t) ucs; + *(escOffset++) = (JSUINT32) ucs; break; } @@ -560,7 +553,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds } if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 
sequence detected when encoding string"); - *(escOffset++) = (wchar_t) ucs; + *(escOffset++) = (JSUINT32) ucs; break; } @@ -584,20 +577,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); -#if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) - { - ucs -= 0x10000; - *(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00; - } - else - { - *(escOffset++) = (wchar_t) ucs; - } -#else - *(escOffset++) = (wchar_t) ucs; -#endif + *(escOffset++) = (JSUINT32) ucs; break; } } @@ -810,14 +790,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf /* FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))]; JSOBJ ret; ds.start = (char *) buffer; ds.end = ds.start + cbBuffer; ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32)); ds.escHeap = 0; ds.prv = dec->prv; ds.dec = dec; diff --git a/python/JSONtoObj.c b/python/JSONtoObj.c index 55000b58..fbb3fb6f 100644 --- a/python/JSONtoObj.c +++ b/python/JSONtoObj.c @@ -59,9 +59,18 @@ static void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) return; } -static JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) +/* +Check that Py_UCS4 has the same size as JSUINT32, else Object_newString will fail. +Based on Linux's check in vbox_vmmdev_types.h. +This should be replaced with + _Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32), "Py_UCS4 and JSUINT32 must have the same size"); +when C11 is made mandatory (CPython 3.11+, PyPy ?). 
+*/ +typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))]; + +static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end) { - return PyUnicode_FromWideChar (start, (end - start)); + return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start)); } static JSOBJ Object_newTrue(void *prv) diff --git a/tests/test_ujson.py b/tests/test_ujson.py index 66d8d0c9..d50ebbf9 100644 --- a/tests/test_ujson.py +++ b/tests/test_ujson.py @@ -1,4 +1,3 @@ -import ctypes import datetime as dt import decimal import io @@ -515,10 +514,6 @@ def test_encode_surrogate_characters(): assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2 -@pytest.mark.xfail( - hasattr(sys, "pypy_version_info") and os.name == "nt", - reason="This feature needs fixing! See #552", -) @pytest.mark.parametrize( "test_input, expected", [ @@ -543,10 +538,6 @@ def test_encode_surrogate_characters(): ], ) def test_decode_surrogate_characters(test_input, expected): - # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t - if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2: - pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t") - assert ujson.loads(test_input) == expected assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected