diff --git a/srsly/tests/ujson/test_ujson.py b/srsly/tests/ujson/test_ujson.py index ccbf0cd..6c02cad 100644 --- a/srsly/tests/ujson/test_ujson.py +++ b/srsly/tests/ujson/test_ujson.py @@ -1,10 +1,10 @@ -# coding=UTF-8 from __future__ import print_function, unicode_literals import six from six.moves import range, zip import calendar import functools +import ctypes import decimal import json import math @@ -946,3 +946,38 @@ def test_issue_334(indent): path = Path(__file__).with_name("334-reproducer.json") a = ujson.loads(path.read_bytes()) ujson.dumps(a, indent=indent) + + +@pytest.mark.parametrize( + "test_input, expected", + [ + # Normal cases + (r'"\uD83D\uDCA9"', "\U0001F4A9"), + (r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"), + # Unpaired surrogates + (r'"\uD800"', "\uD800"), + (r'"a\uD800b"', "a\uD800b"), + (r'"\uDEAD"', "\uDEAD"), + (r'"a\uDEADb"', "a\uDEADb"), + (r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"), + (r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"), + (r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"), + (r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"), + (r'"\uD83D \uDCA9"', "\uD83D \uDCA9"), + # No decoding of actual surrogate characters (rather than escaped ones) + ('"\uD800"', "\uD800"), + ('"\uDEAD"', "\uDEAD"), + ('"\uD800a\uDEAD"', "\uD800a\uDEAD"), + ('"\uD83D\uDCA9"', "\uD83D\uDCA9"), + ], +) +def test_decode_surrogate_characters(test_input, expected): + # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t + if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2: + pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t") + + assert ujson.loads(test_input) == expected + assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected + + # Ensure that this matches stdlib's behaviour + assert json.loads(test_input) == expected diff --git a/srsly/ujson/JSONtoObj.c b/srsly/ujson/JSONtoObj.c index 79d9f1a..08dfb42 100644 --- a/srsly/ujson/JSONtoObj.c +++ b/srsly/ujson/JSONtoObj.c @@ -161,7 +161,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs) else if (PyUnicode_Check(arg)) { - sarg = PyUnicode_AsUTF8String(arg); + sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass"); if (sarg == NULL) { //Exception raised above us by codec according to docs diff --git a/srsly/ujson/lib/ultrajsondec.c b/srsly/ujson/lib/ultrajsondec.c index 21a732e..1dee5df 100644 --- a/srsly/ujson/lib/ultrajsondec.c +++ b/srsly/ujson/lib/ultrajsondec.c @@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256] = FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) { - JSUTF16 sur[2] = { 0 }; - int iSur = 0; int index; wchar_t *escOffset; wchar_t *escStart; size_t escLen = (ds->escEnd - ds->escStart); JSUINT8 *inputOffset; + JSUTF16 ch = 0; +#if WCHAR_MAX >= 0x10FFFF + JSUINT8 *lastHighSurrogate = NULL; +#endif JSUINT8 oct; JSUTF32 ucs; ds->lastType = JT_INVALID; @@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) case '7': case '8': case '9': - sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0'); + ch = (ch << 4) + (JSUTF16) (*inputOffset - '0'); break; case 'a': @@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) case 'd': case 'e': case 'f': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); + ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a'); break; case 'A': @@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds) case 'D': case 'E': case 'F': - sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); + ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A'); break; } inputOffset ++; } - if (iSur == 0) +#if WCHAR_MAX >= 0x10FFFF + if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset)) { - if((sur[iSur] & 0xfc00) == 0xd800) - { - // First of a surrogate pair, continue parsing - iSur ++; - break; - } - (*escOffset++) = (wchar_t) sur[iSur]; - iSur = 0; + // Low surrogate immediately following a high surrogate + // Overwrite existing high surrogate with combined character + *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000; } else - { - // Decode pair - if ((sur[1] & 0xfc00) != 0xdc00) - { - return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'"); - } -#if WCHAR_MAX == 0xffff - (*escOffset++) = (wchar_t) sur[0]; - (*escOffset++) = (wchar_t) sur[1]; -#else - (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00)); #endif - iSur = 0; + { + *(escOffset++) = (wchar_t) ch; } +#if WCHAR_MAX >= 0x10FFFF + if ((ch & 0xfc00) == 0xd800) + { + lastHighSurrogate = inputOffset; + } +#endif break; }