Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix handling of surrogates on decoding #550

Merged
merged 1 commit into from Jun 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
46 changes: 20 additions & 26 deletions lib/ultrajsondec.c
Expand Up @@ -357,13 +357,15 @@ static const JSUINT8 g_decoderLookup[256] =

static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
{
JSUTF16 sur[2] = { 0 };
int iSur = 0;
int index;
wchar_t *escOffset;
wchar_t *escStart;
size_t escLen = (ds->escEnd - ds->escStart);
JSUINT8 *inputOffset;
JSUTF16 ch = 0;
#if WCHAR_MAX >= 0x10FFFF
JSUINT8 *lastHighSurrogate = NULL;
#endif
JSUINT8 oct;
JSUTF32 ucs;
ds->lastType = JT_INVALID;
Expand Down Expand Up @@ -464,7 +466,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
case '7':
case '8':
case '9':
sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
break;

case 'a':
Expand All @@ -473,7 +475,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
case 'd':
case 'e':
case 'f':
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
break;

case 'A':
Expand All @@ -482,39 +484,31 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
case 'D':
case 'E':
case 'F':
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
break;
}

inputOffset ++;
}

if (iSur == 0)
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
{
if((sur[iSur] & 0xfc00) == 0xd800)
{
// First of a surrogate pair, continue parsing
iSur ++;
break;
}
(*escOffset++) = (wchar_t) sur[iSur];
iSur = 0;
// Low surrogate immediately following a high surrogate
// Overwrite existing high surrogate with combined character
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
}
else
{
// Decode pair
if ((sur[1] & 0xfc00) != 0xdc00)
{
return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
}
#if WCHAR_MAX == 0xffff
(*escOffset++) = (wchar_t) sur[0];
(*escOffset++) = (wchar_t) sur[1];
#else
(*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
#endif
iSur = 0;
{
*(escOffset++) = (wchar_t) ch;
}
#if WCHAR_MAX >= 0x10FFFF
if ((ch & 0xfc00) == 0xd800)
{
lastHighSurrogate = inputOffset;
}
#endif
break;
}

Expand Down
2 changes: 1 addition & 1 deletion python/JSONtoObj.c
Expand Up @@ -173,7 +173,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
else
if (PyUnicode_Check(arg))
{
sarg = PyUnicode_AsUTF8String(arg);
sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
if (sarg == NULL)
{
//Exception raised above us by codec according to docs
Expand Down
36 changes: 36 additions & 0 deletions tests/test_ujson.py
@@ -1,3 +1,4 @@
import ctypes
import datetime as dt
import decimal
import io
Expand Down Expand Up @@ -512,6 +513,41 @@ def test_encode_surrogate_characters():
assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2


@pytest.mark.parametrize(
    "test_input, expected",
    [
        # Normal cases: an escaped high+low pair combines into one code point
        (r'"\uD83D\uDCA9"', "\U0001F4A9"),
        (r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
        # Unpaired surrogates are passed through unchanged
        (r'"\uD800"', "\uD800"),
        (r'"a\uD800b"', "a\uD800b"),
        (r'"\uDEAD"', "\uDEAD"),
        (r'"a\uDEADb"', "a\uDEADb"),
        # A stray surrogate next to a valid pair must not disturb the pair
        (r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
        (r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
        (r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
        (r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
        # High and low surrogate separated by another character stay apart
        (r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
        # No decoding of actual surrogate characters (rather than escaped ones)
        ('"\uD800"', "\uD800"),
        ('"\uDEAD"', "\uDEAD"),
        ('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
        ('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
    ],
)
def test_decode_surrogate_characters(test_input, expected):
    """Check that ``ujson.loads`` decodes surrogates the same way as stdlib ``json``.

    Each JSON document is decoded three times: by ``ujson`` from ``str``, by
    ``ujson`` from UTF-8 bytes encoded with ``surrogatepass``, and by the
    standard library ``json`` module. All three must produce ``expected``.
    """
    # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
    if test_input == '"\uD83D\uDCA9"':
        # Only query the platform's wchar_t width for the one affected case.
        if ctypes.sizeof(ctypes.c_wchar) == 2:
            pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")

    # Decode from text input.
    decoded_from_str = ujson.loads(test_input)
    assert decoded_from_str == expected

    # Decode from bytes; "surrogatepass" lets lone surrogates survive encoding.
    as_bytes = test_input.encode("utf-8", "surrogatepass")
    decoded_from_bytes = ujson.loads(as_bytes)
    assert decoded_from_bytes == expected

    # Ensure that this matches stdlib's behaviour
    reference = json.loads(test_input)
    assert reference == expected


def test_sort_keys():
data = {"a": 1, "c": 1, "b": 1, "e": 1, "f": 1, "d": 1}
sorted_keys = ujson.dumps(data, sort_keys=True)
Expand Down