Skip to content

Commit

Permalink
Backport "Replace wchar_t string decoding implementation with a `ui…
Browse files Browse the repository at this point in the history
…nt32_t`-based one" (explosion#67)

Backport ultrajson/ultrajson#555
  • Loading branch information
adrianeboyd committed Jul 18, 2023
1 parent a175408 commit 54c69db
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 53 deletions.
4 changes: 0 additions & 4 deletions srsly/tests/ujson/test_ujson.py
Expand Up @@ -949,10 +949,6 @@ def test_issue_334(indent):
],
)
def test_decode_surrogate_characters(test_input, expected):
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")

assert ujson.loads(test_input) == expected
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected

Expand Down
13 changes: 11 additions & 2 deletions srsly/ujson/JSONtoObj.c
Expand Up @@ -58,9 +58,18 @@ void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value)
return;
}

JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end)
/*
Check that Py_UCS4 is the same as JSUINT32, else Object_newString will fail.
Based on Linux's check in vbox_vmmdev_types.h.
This should be replaced with
_Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32));
when C11 is made mandatory (CPython 3.11+, PyPy ?).
*/
typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))];

static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end)
{
return PyUnicode_FromWideChar (start, (end - start));
return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start));
}

JSOBJ Object_newTrue(void *prv)
Expand Down
3 changes: 1 addition & 2 deletions srsly/ujson/lib/ultrajson.h
Expand Up @@ -54,7 +54,6 @@ tree doesn't have cyclic references.
#define __ULTRAJSON_H__

#include <stdio.h>
#include <wchar.h>

// Max decimals to encode double floating point numbers with
#ifndef JSON_DOUBLE_MAX_DECIMALS
Expand Down Expand Up @@ -298,7 +297,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *

typedef struct __JSONObjectDecoder
{
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end);
void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
JSOBJ (*newTrue)(void *prv);
Expand Down
70 changes: 25 additions & 45 deletions srsly/ujson/lib/ultrajsondec.c
Expand Up @@ -41,7 +41,6 @@ Numeric decoder derived from from TCL library
#include <assert.h>
#include <string.h>
#include <limits.h>
#include <wchar.h>
#include <stdlib.h>
#include <errno.h>

Expand All @@ -57,8 +56,8 @@ struct DecoderState
{
char *start;
char *end;
wchar_t *escStart;
wchar_t *escEnd;
JSUINT32 *escStart;
JSUINT32 *escEnd;
int escHeap;
int lastType;
JSUINT32 objDepth;
Expand Down Expand Up @@ -425,14 +424,12 @@ static const JSUINT8 g_decoderLookup[256] =
FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
int index;
wchar_t *escOffset;
wchar_t *escStart;
JSUINT32 *escOffset;
JSUINT32 *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
JSUTF16 ch = 0;
#if WCHAR_MAX >= 0x10FFFF
JSUINT8 *lastHighSurrogate = NULL;
#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
Expand All @@ -444,11 +441,11 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

if (ds->escHeap)
{
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32));
if (!escStart)
{
ds->dec->free(ds->escStart);
Expand All @@ -458,18 +455,18 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}
else
{
wchar_t *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
JSUINT32 *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t));
ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32));
if (!ds->escStart)
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escHeap = 1;
memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32));
}

ds->escEnd = ds->escStart + newSize;
Expand Down Expand Up @@ -501,14 +498,14 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
inputOffset ++;
switch (*inputOffset)
{
case '\\': *(escOffset++) = L'\\'; inputOffset++; continue;
case '\"': *(escOffset++) = L'\"'; inputOffset++; continue;
case '/': *(escOffset++) = L'/'; inputOffset++; continue;
case 'b': *(escOffset++) = L'\b'; inputOffset++; continue;
case 'f': *(escOffset++) = L'\f'; inputOffset++; continue;
case 'n': *(escOffset++) = L'\n'; inputOffset++; continue;
case 'r': *(escOffset++) = L'\r'; inputOffset++; continue;
case 't': *(escOffset++) = L'\t'; inputOffset++; continue;
case '\\': *(escOffset++) = '\\'; inputOffset++; continue;
case '\"': *(escOffset++) = '\"'; inputOffset++; continue;
case '/': *(escOffset++) = '/'; inputOffset++; continue;
case 'b': *(escOffset++) = '\b'; inputOffset++; continue;
case 'f': *(escOffset++) = '\f'; inputOffset++; continue;
case 'n': *(escOffset++) = '\n'; inputOffset++; continue;
case 'r': *(escOffset++) = '\r'; inputOffset++; continue;
case 't': *(escOffset++) = '\t'; inputOffset++; continue;

case 'u':
{
Expand Down Expand Up @@ -557,24 +554,20 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
inputOffset ++;
}

#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
// Low surrogate immediately following a high surrogate
// Overwrite existing high surrogate with combined character
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
#endif
{
*(escOffset++) = (wchar_t) ch;
*(escOffset++) = (JSUINT32) ch;
}
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xd800)
{
lastHighSurrogate = inputOffset;
}
#endif
break;
}

Expand All @@ -585,7 +578,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

case 1:
{
*(escOffset++) = (wchar_t) (*inputOffset++);
*(escOffset++) = (JSUINT32) (*inputOffset++);
break;
}

Expand All @@ -599,7 +592,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}
ucs |= (*inputOffset++) & 0x3f;
if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -622,7 +615,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}

if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -646,20 +639,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'");

#if WCHAR_MAX == 0xffff
if (ucs >= 0x10000)
{
ucs -= 0x10000;
*(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800;
*(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00;
}
else
{
*(escOffset++) = (wchar_t) ucs;
}
#else
*(escOffset++) = (wchar_t) ucs;
#endif
*(escOffset++) = (JSUINT32) ucs;
break;
}
}
Expand Down Expand Up @@ -859,14 +839,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf
/*
FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */
struct DecoderState ds;
wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))];
JSOBJ ret;

ds.start = (char *) buffer;
ds.end = ds.start + cbBuffer;

ds.escStart = escBuffer;
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32));
ds.escHeap = 0;
ds.prv = dec->prv;
ds.dec = dec;
Expand Down

0 comments on commit 54c69db

Please sign in to comment.