Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Backport "Replace wchar_t string decoding implementation with a uint32_t-based one" #67

Merged
merged 1 commit into from Jul 20, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
7 changes: 1 addition & 6 deletions srsly/tests/ujson/test_ujson.py
@@ -1,5 +1,4 @@
import ctypes
import decimal
import decimal
import json
import math
import sys
Expand Down Expand Up @@ -985,10 +984,6 @@ def test_issue_334(indent):
],
)
def test_decode_surrogate_characters(test_input, expected):
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")

assert ujson.loads(test_input) == expected
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected

Expand Down
13 changes: 11 additions & 2 deletions srsly/ujson/JSONtoObj.c
Expand Up @@ -58,9 +58,18 @@ void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value)
return;
}

JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end)
/*
Check that Py_UCS4 has the same size as JSUINT32, else Object_newString will fail.
Based on Linux's check in vbox_vmmdev_types.h.
This should be replaced with
_Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32));
when C11 is made mandatory (CPython 3.11+, PyPy ?).
*/
typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))];

static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end)
{
return PyUnicode_FromWideChar (start, (end - start));
return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start));
}

JSOBJ Object_newTrue(void *prv)
Expand Down
3 changes: 1 addition & 2 deletions srsly/ujson/lib/ultrajson.h
Expand Up @@ -54,7 +54,6 @@ tree doesn't have cyclic references.
#define __ULTRAJSON_H__

#include <stdio.h>
#include <wchar.h>

// Max decimals to encode double floating point numbers with
#ifndef JSON_DOUBLE_MAX_DECIMALS
Expand Down Expand Up @@ -298,7 +297,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *

typedef struct __JSONObjectDecoder
{
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end);
void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
JSOBJ (*newTrue)(void *prv);
Expand Down
70 changes: 25 additions & 45 deletions srsly/ujson/lib/ultrajsondec.c
Expand Up @@ -41,7 +41,6 @@ Numeric decoder derived from TCL library
#include <assert.h>
#include <string.h>
#include <limits.h>
#include <wchar.h>
#include <stdlib.h>
#include <errno.h>

Expand All @@ -57,8 +56,8 @@ struct DecoderState
{
char *start;
char *end;
wchar_t *escStart;
wchar_t *escEnd;
JSUINT32 *escStart;
JSUINT32 *escEnd;
int escHeap;
int lastType;
JSUINT32 objDepth;
Expand Down Expand Up @@ -425,14 +424,12 @@ static const JSUINT8 g_decoderLookup[256] =
FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
int index;
wchar_t *escOffset;
wchar_t *escStart;
JSUINT32 *escOffset;
JSUINT32 *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
JSUTF16 ch = 0;
#if WCHAR_MAX >= 0x10FFFF
JSUINT8 *lastHighSurrogate = NULL;
#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
Expand All @@ -444,11 +441,11 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

if (ds->escHeap)
{
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32));
if (!escStart)
{
ds->dec->free(ds->escStart);
Expand All @@ -458,18 +455,18 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}
else
{
wchar_t *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
JSUINT32 *oldStart = ds->escStart;
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t));
ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32));
if (!ds->escStart)
{
return SetError(ds, -1, "Could not reserve memory block");
}
ds->escHeap = 1;
memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32));
}

ds->escEnd = ds->escStart + newSize;
Expand Down Expand Up @@ -501,14 +498,14 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
inputOffset ++;
switch (*inputOffset)
{
case '\\': *(escOffset++) = L'\\'; inputOffset++; continue;
case '\"': *(escOffset++) = L'\"'; inputOffset++; continue;
case '/': *(escOffset++) = L'/'; inputOffset++; continue;
case 'b': *(escOffset++) = L'\b'; inputOffset++; continue;
case 'f': *(escOffset++) = L'\f'; inputOffset++; continue;
case 'n': *(escOffset++) = L'\n'; inputOffset++; continue;
case 'r': *(escOffset++) = L'\r'; inputOffset++; continue;
case 't': *(escOffset++) = L'\t'; inputOffset++; continue;
case '\\': *(escOffset++) = '\\'; inputOffset++; continue;
case '\"': *(escOffset++) = '\"'; inputOffset++; continue;
case '/': *(escOffset++) = '/'; inputOffset++; continue;
case 'b': *(escOffset++) = '\b'; inputOffset++; continue;
case 'f': *(escOffset++) = '\f'; inputOffset++; continue;
case 'n': *(escOffset++) = '\n'; inputOffset++; continue;
case 'r': *(escOffset++) = '\r'; inputOffset++; continue;
case 't': *(escOffset++) = '\t'; inputOffset++; continue;

case 'u':
{
Expand Down Expand Up @@ -557,24 +554,20 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
inputOffset ++;
}

#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
// Low surrogate immediately following a high surrogate
// Overwrite existing high surrogate with combined character
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
#endif
{
*(escOffset++) = (wchar_t) ch;
*(escOffset++) = (JSUINT32) ch;
}
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xd800)
{
lastHighSurrogate = inputOffset;
}
#endif
break;
}

Expand All @@ -585,7 +578,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

case 1:
{
*(escOffset++) = (wchar_t) (*inputOffset++);
*(escOffset++) = (JSUINT32) (*inputOffset++);
break;
}

Expand All @@ -599,7 +592,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}
ucs |= (*inputOffset++) & 0x3f;
if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -622,7 +615,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
}

if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string");
*(escOffset++) = (wchar_t) ucs;
*(escOffset++) = (JSUINT32) ucs;
break;
}

Expand All @@ -646,20 +639,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)

if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'");

#if WCHAR_MAX == 0xffff
if (ucs >= 0x10000)
{
ucs -= 0x10000;
*(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800;
*(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00;
}
else
{
*(escOffset++) = (wchar_t) ucs;
}
#else
*(escOffset++) = (wchar_t) ucs;
#endif
*(escOffset++) = (JSUINT32) ucs;
break;
}
}
Expand Down Expand Up @@ -869,14 +849,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf
/*
FIXME: Base the size of escBuffer on that of cbBuffer so that the unicode escaping doesn't run into the wall each time */
struct DecoderState ds;
wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))];
JSOBJ ret;

ds.start = (char *) buffer;
ds.end = ds.start + cbBuffer;

ds.escStart = escBuffer;
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32));
ds.escHeap = 0;
ds.prv = dec->prv;
ds.dec = dec;
Expand Down