From b2787099f805f8c2a2790c40e16314847da48b9f Mon Sep 17 00:00:00 2001 From: Adriane Boyd Date: Wed, 20 Jul 2022 15:55:19 +0200 Subject: [PATCH] Backport "Replace `wchar_t` string decoding implementation with a `uint32_t`-based one" (#67) Backport https://github.com/ultrajson/ultrajson/pull/555 --- srsly/tests/ujson/test_ujson.py | 4 -- srsly/ujson/JSONtoObj.c | 13 +++++- srsly/ujson/lib/ultrajson.h | 3 +- srsly/ujson/lib/ultrajsondec.c | 70 ++++++++++++--------------------- 4 files changed, 37 insertions(+), 53 deletions(-) diff --git a/srsly/tests/ujson/test_ujson.py b/srsly/tests/ujson/test_ujson.py index 8bda090..b9a3814 100644 --- a/srsly/tests/ujson/test_ujson.py +++ b/srsly/tests/ujson/test_ujson.py @@ -949,10 +949,6 @@ def test_issue_334(indent): ], ) def test_decode_surrogate_characters(test_input, expected): - # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t - if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2: - pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t") - assert ujson.loads(test_input) == expected assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected diff --git a/srsly/ujson/JSONtoObj.c b/srsly/ujson/JSONtoObj.c index 08dfb42..8563970 100644 --- a/srsly/ujson/JSONtoObj.c +++ b/srsly/ujson/JSONtoObj.c @@ -58,9 +58,18 @@ void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value) return; } -JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end) +/* +Check that Py_UCS4 is the same as JSUINT32, else Object_newString will fail. +Based on Linux's check in vbox_vmmdev_types.h. +This should be replaced with + _Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32)); +when C11 is made mandatory (CPython 3.11+, PyPy ?). +*/ +typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))]; + +static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end) { - return PyUnicode_FromWideChar (start, (end - start)); + return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start)); } JSOBJ Object_newTrue(void *prv) diff --git a/srsly/ujson/lib/ultrajson.h b/srsly/ujson/lib/ultrajson.h index e931db4..a117901 100644 --- a/srsly/ujson/lib/ultrajson.h +++ b/srsly/ujson/lib/ultrajson.h @@ -54,7 +54,6 @@ tree doesn't have cyclic references. #define __ULTRAJSON_H__ #include -#include // Max decimals to encode double floating point numbers with #ifndef JSON_DOUBLE_MAX_DECIMALS @@ -298,7 +297,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char * typedef struct __JSONObjectDecoder { - JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end); + JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end); void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value); void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value); JSOBJ (*newTrue)(void *prv); diff --git a/srsly/ujson/lib/ultrajsondec.c b/srsly/ujson/lib/ultrajsondec.c index 1dee5df..e3461f7 100644 --- a/srsly/ujson/lib/ultrajsondec.c +++ b/srsly/ujson/lib/ultrajsondec.c @@ -41,7 +41,6 @@ Numeric decoder derived from from TCL library #include #include #include -#include #include #include @@ -57,8 +56,8 @@ struct DecoderState { char *start; char *end; - wchar_t *escStart; - wchar_t *escEnd; + JSUINT32 *escStart; + JSUINT32 *escEnd; int escHeap; int lastType; JSUINT32 objDepth; @@ -425,14 +424,12 @@ static const JSUINT8 g_decoderLookup[256] = FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) { int index; - wchar_t *escOffset; - wchar_t *escStart; + JSUINT32 *escOffset; + JSUINT32 *escStart; size_t escLen = (ds->escEnd - ds->escStart); JSUINT8 *inputOffset; JSUTF16 ch = 0; -#if WCHAR_MAX >= 0x10FFFF JSUINT8 *lastHighSurrogate = NULL; -#endif JSUINT8 oct; JSUTF32 ucs; ds->lastType = JT_INVALID; @@ -444,11 +441,11 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) if (ds->escHeap) { - if (newSize > (SIZE_MAX / sizeof(wchar_t))) + if (newSize > (SIZE_MAX / sizeof(JSUINT32))) { return SetError(ds, -1, "Could not reserve memory block"); } - escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t)); + escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32)); if (!escStart) { ds->dec->free(ds->escStart); @@ -458,18 +455,18 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) } else { - wchar_t *oldStart = ds->escStart; - if (newSize > (SIZE_MAX / sizeof(wchar_t))) + JSUINT32 *oldStart = ds->escStart; + if (newSize > (SIZE_MAX / sizeof(JSUINT32))) { return SetError(ds, -1, "Could not reserve memory block"); } - ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t)); + ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32)); if (!ds->escStart) { return SetError(ds, -1, "Could not reserve memory block"); } ds->escHeap = 1; - memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t)); + memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32)); } ds->escEnd = ds->escStart + newSize; @@ -501,14 +498,14 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) inputOffset ++; switch (*inputOffset) { - case '\\': *(escOffset++) = L'\\'; inputOffset++; continue; - case '\"': *(escOffset++) = L'\"'; inputOffset++; continue; - case '/': *(escOffset++) = L'/'; inputOffset++; continue; - case 'b': *(escOffset++) = L'\b'; inputOffset++; continue; - case 'f': *(escOffset++) = L'\f'; inputOffset++; continue; - case 'n': *(escOffset++) = L'\n'; inputOffset++; continue; - case 'r': *(escOffset++) = L'\r'; inputOffset++; continue; - case 't': *(escOffset++) = L'\t'; inputOffset++; continue; + case '\\': *(escOffset++) = '\\'; inputOffset++; continue; + case '\"': *(escOffset++) = '\"'; inputOffset++; continue; + case '/': *(escOffset++) = '/'; inputOffset++; continue; + case 'b': *(escOffset++) = '\b'; inputOffset++; continue; + case 'f': *(escOffset++) = '\f'; inputOffset++; continue; + case 'n': *(escOffset++) = '\n'; inputOffset++; continue; + case 'r': *(escOffset++) = '\r'; inputOffset++; continue; + case 't': *(escOffset++) = '\t'; inputOffset++; continue; case 'u': { @@ -557,7 +554,6 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) inputOffset ++; } -#if WCHAR_MAX >= 0x10FFFF if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset)) { // Low surrogate immediately following a high surrogate @@ -565,16 +561,13 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000; } else -#endif { - *(escOffset++) = (wchar_t) ch; + *(escOffset++) = (JSUINT32) ch; } -#if WCHAR_MAX >= 0x10FFFF if ((ch & 0xfc00) == 0xd800) { lastHighSurrogate = inputOffset; } -#endif break; } @@ -585,7 +578,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) case 1: { - *(escOffset++) = (wchar_t) (*inputOffset++); + *(escOffset++) = (JSUINT32) (*inputOffset++); break; } @@ -599,7 +592,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) } ucs |= (*inputOffset++) & 0x3f; if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'"); - *(escOffset++) = (wchar_t) ucs; + *(escOffset++) = (JSUINT32) ucs; break; } @@ -622,7 +615,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) } if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string"); - *(escOffset++) = (wchar_t) ucs; + *(escOffset++) = (JSUINT32) ucs; break; } @@ -646,20 +639,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'"); -#if WCHAR_MAX == 0xffff - if (ucs >= 0x10000) - { - ucs -= 0x10000; - *(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800; - *(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00; - } - else - { - *(escOffset++) = (wchar_t) ucs; - } -#else - *(escOffset++) = (wchar_t) ucs; -#endif + *(escOffset++) = (JSUINT32) ucs; break; } } @@ -859,14 +839,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf /* FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */ struct DecoderState ds; - wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))]; + JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))]; JSOBJ ret; ds.start = (char *) buffer; ds.end = ds.start + cbBuffer; ds.escStart = escBuffer; - ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t)); + ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32)); ds.escHeap = 0; ds.prv = dec->prv; ds.dec = dec;