Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Replace wchar_t string decoding implementation with a uint32_t-based one #555

Merged
merged 1 commit into from Jun 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions lib/ultrajson.h
Expand Up @@ -54,7 +54,6 @@ tree doesn't have cyclic references.
#define __ULTRAJSON_H__

#include <stdio.h>
#include <wchar.h>

// Max decimals to encode double floating point numbers with
#ifndef JSON_DOUBLE_MAX_DECIMALS
Expand Down Expand Up @@ -318,7 +317,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *

typedef struct __JSONObjectDecoder
{
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end);
void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
JSOBJ (*newTrue)(void *prv);
Expand Down
70 changes: 25 additions & 45 deletions lib/ultrajsondec.c
Expand Up @@ -41,7 +41,6 @@ Numeric decoder derived from TCL library
#include <assert.h>
#include <string.h>
#include <limits.h>
#include <wchar.h>
#include <stdlib.h>
#include <errno.h>
#include <stdint.h>
Expand All @@ -58,8 +57,8 @@ struct DecoderState
{
char *start;
char *end;
wchar_t *escStart;
wchar_t *escEnd;
JSUINT32 *escStart;
JSUINT32 *escEnd;
int escHeap;
int lastType;
JSUINT32 objDepth;
Expand Down Expand Up @@ -361,14 +360,12 @@ static const JSUINT8 g_decoderLookup[256] =
static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
int index;
wchar_t *escOffset;
wchar_t *escStart;
JSUINT32 *escOffset;
JSUINT32 *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
JSUTF16 ch = 0;
#if WCHAR_MAX >= 0x10FFFF
JSUINT8 *lastHighSurrogate = NULL;
#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
Expand All @@ -380,11 +377,11 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds

if (ds->escHeap)
{
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32));
if (!escStart)
{
ds->dec->free(ds->escStart);
Expand All @@ -394,18 +391,18 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
}
else
{
wchar_t *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
JSUINT32 *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t));
ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32));
if (!ds->escStart)
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escHeap = 1;
memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32));
}

ds->escEnd = ds->escStart + newSize;
Expand Down Expand Up @@ -438,14 +435,14 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
inputOffset ++;
switch (*inputOffset)
{
case '\\': *(escOffset++) = L'\\'; inputOffset++; continue;
case '\"': *(escOffset++) = L'\"'; inputOffset++; continue;
case '/': *(escOffset++) = L'/'; inputOffset++; continue;
case 'b': *(escOffset++) = L'\b'; inputOffset++; continue;
case 'f': *(escOffset++) = L'\f'; inputOffset++; continue;
case 'n': *(escOffset++) = L'\n'; inputOffset++; continue;
case 'r': *(escOffset++) = L'\r'; inputOffset++; continue;
case 't': *(escOffset++) = L'\t'; inputOffset++; continue;
case '\\': *(escOffset++) = '\\'; inputOffset++; continue;
case '\"': *(escOffset++) = '\"'; inputOffset++; continue;
case '/': *(escOffset++) = '/'; inputOffset++; continue;
case 'b': *(escOffset++) = '\b'; inputOffset++; continue;
case 'f': *(escOffset++) = '\f'; inputOffset++; continue;
case 'n': *(escOffset++) = '\n'; inputOffset++; continue;
case 'r': *(escOffset++) = '\r'; inputOffset++; continue;
case 't': *(escOffset++) = '\t'; inputOffset++; continue;

case 'u':
{
Expand Down Expand Up @@ -494,24 +491,20 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
inputOffset ++;
}

#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
// Low surrogate immediately following a high surrogate
// Overwrite existing high surrogate with combined character
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
#endif
{
*(escOffset++) = (wchar_t) ch;
*(escOffset++) = (JSUINT32) ch;
}
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xd800)
{
lastHighSurrogate = inputOffset;
}
#endif
break;
}

Expand All @@ -523,7 +516,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds

case 1:
{
*(escOffset++) = (wchar_t) (*inputOffset++);
*(escOffset++) = (JSUINT32) (*inputOffset++);
break;
}

Expand All @@ -537,7 +530,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
}
ucs |= (*inputOffset++) & 0x3f;
if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -560,7 +553,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
}

if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -584,20 +577,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds

if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'");

#if WCHAR_MAX == 0xffff
if (ucs >= 0x10000)
{
ucs -= 0x10000;
*(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800;
*(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00;
}
else
{
*(escOffset++) = (wchar_t) ucs;
}
#else
*(escOffset++) = (wchar_t) ucs;
#endif
*(escOffset++) = (JSUINT32) ucs;
break;
}
}
Expand Down Expand Up @@ -810,14 +790,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf
/*
FIXME: Base the size of escBuffer on that of cbBuffer so that the unicode escaping doesn't run into the wall each time */
struct DecoderState ds;
wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))];
JSOBJ ret;

ds.start = (char *) buffer;
ds.end = ds.start + cbBuffer;

ds.escStart = escBuffer;
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32));
ds.escHeap = 0;
ds.prv = dec->prv;
ds.dec = dec;
Expand Down
13 changes: 11 additions & 2 deletions python/JSONtoObj.c
Expand Up @@ -59,9 +59,18 @@ static void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value)
return;
}

static JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end)
/*
Compile-time check that Py_UCS4 has the same size as JSUINT32, else
Object_newString will fail: it passes its JSUINT32 buffer to CPython as a
Py_UCS4 buffer, which is only valid if both types have the same size.
How it works: the array size below evaluates to 1 when the sizes match and to
-1 when they differ; a negative array size is a compile error.
Based on Linux's check in vbox_vmmdev_types.h.
This should be replaced with
_Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32), "Py_UCS4 must match JSUINT32");
when C11 is made mandatory (CPython 3.11+, PyPy ?).
Note that C11's _Static_assert requires the message argument.
*/
typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))];

static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end)
{
return PyUnicode_FromWideChar (start, (end - start));
return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start));
}

static JSOBJ Object_newTrue(void *prv)
Expand Down
9 changes: 0 additions & 9 deletions tests/test_ujson.py
@@ -1,4 +1,3 @@
import ctypes
import datetime as dt
import decimal
import io
Expand Down Expand Up @@ -515,10 +514,6 @@ def test_encode_surrogate_characters():
assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2


@pytest.mark.xfail(
hasattr(sys, "pypy_version_info") and os.name == "nt",
reason="This feature needs fixing! See #552",
)
@pytest.mark.parametrize(
"test_input, expected",
[
Expand All @@ -543,10 +538,6 @@ def test_encode_surrogate_characters():
],
)
def test_decode_surrogate_characters(test_input, expected):
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")

assert ujson.loads(test_input) == expected
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected

Expand Down