From bc7bdff051e19188edf121c216b8250110abf39a Mon Sep 17 00:00:00 2001
From: JustAnotherArchivist <JustAnotherArchivist@users.noreply.github.com>
Date: Sat, 18 Jun 2022 22:54:44 +0000
Subject: [PATCH] Replace wchar_t string decoding implementation with a
 uint32_t-based one

This fixes character handling on platforms with 16-bit wchar_t (notably, Windows), which was broken (in different ways) on both CPython and PyPy.

Fixes #552
---
 lib/ultrajson.h     |  3 +-
 lib/ultrajsondec.c  | 70 ++++++++++++++++-----------------------------
 python/JSONtoObj.c  | 13 +++++++--
 tests/test_ujson.py |  9 ------
 4 files changed, 37 insertions(+), 58 deletions(-)

diff --git a/lib/ultrajson.h b/lib/ultrajson.h
index 170214ff..eda221aa 100644
--- a/lib/ultrajson.h
+++ b/lib/ultrajson.h
@@ -54,7 +54,6 @@ tree doesn't have cyclic references.
 #define __ULTRAJSON_H__
 
 #include <stdio.h>
-#include <wchar.h>
 
 // Max decimals to encode double floating point numbers with
 #ifndef JSON_DOUBLE_MAX_DECIMALS
@@ -318,7 +317,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *
 
 typedef struct __JSONObjectDecoder
 {
-  JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
+  JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end);
   void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
   void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
   JSOBJ (*newTrue)(void *prv);
diff --git a/lib/ultrajsondec.c b/lib/ultrajsondec.c
index caf15ccc..47485a6b 100644
--- a/lib/ultrajsondec.c
+++ b/lib/ultrajsondec.c
@@ -41,7 +41,6 @@ Numeric decoder derived from from TCL library
 #include <assert.h>
 #include <string.h>
 #include <limits.h>
-#include <wchar.h>
 #include <stdlib.h>
 #include <errno.h>
 #include <stdint.h>
@@ -58,8 +57,8 @@ struct DecoderState
 {
   char *start;
   char *end;
-  wchar_t *escStart;
-  wchar_t *escEnd;
+  JSUINT32 *escStart;
+  JSUINT32 *escEnd;
   int escHeap;
   int lastType;
   JSUINT32 objDepth;
@@ -361,14 +360,12 @@ static const JSUINT8 g_decoderLookup[256] =
 static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
 {
   int index;
-  wchar_t *escOffset;
-  wchar_t *escStart;
+  JSUINT32 *escOffset;
+  JSUINT32 *escStart;
   size_t escLen = (ds->escEnd - ds->escStart);
   JSUINT8 *inputOffset;
   JSUTF16 ch = 0;
-#if WCHAR_MAX >= 0x10FFFF
   JSUINT8 *lastHighSurrogate = NULL;
-#endif
   JSUINT8 oct;
   JSUTF32 ucs;
   ds->lastType = JT_INVALID;
@@ -380,11 +377,11 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
 
     if (ds->escHeap)
     {
-      if (newSize > (SIZE_MAX / sizeof(wchar_t)))
+      if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
       {
         return SetError(ds, -1, "Could not reserve memory block");
       }
-      escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
+      escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32));
       if (!escStart)
       {
         ds->dec->free(ds->escStart);
@@ -394,18 +391,18 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
     }
     else
     {
-      wchar_t *oldStart = ds->escStart;
-      if (newSize > (SIZE_MAX / sizeof(wchar_t)))
+      JSUINT32 *oldStart = ds->escStart;
+      if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
       {
         return SetError(ds, -1, "Could not reserve memory block");
       }
-      ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t));
+      ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32));
       if (!ds->escStart)
       {
         return SetError(ds, -1, "Could not reserve memory block");
       }
       ds->escHeap = 1;
-      memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
+      memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32));
     }
 
     ds->escEnd = ds->escStart + newSize;
@@ -438,14 +435,14 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
         inputOffset ++;
         switch (*inputOffset)
         {
-          case '\\': *(escOffset++) = L'\\'; inputOffset++; continue;
-          case '\"': *(escOffset++) = L'\"'; inputOffset++; continue;
-          case '/':  *(escOffset++) = L'/';  inputOffset++; continue;
-          case 'b':  *(escOffset++) = L'\b'; inputOffset++; continue;
-          case 'f':  *(escOffset++) = L'\f'; inputOffset++; continue;
-          case 'n':  *(escOffset++) = L'\n'; inputOffset++; continue;
-          case 'r':  *(escOffset++) = L'\r'; inputOffset++; continue;
-          case 't':  *(escOffset++) = L'\t'; inputOffset++; continue;
+          case '\\': *(escOffset++) = '\\'; inputOffset++; continue;
+          case '\"': *(escOffset++) = '\"'; inputOffset++; continue;
+          case '/':  *(escOffset++) = '/';  inputOffset++; continue;
+          case 'b':  *(escOffset++) = '\b'; inputOffset++; continue;
+          case 'f':  *(escOffset++) = '\f'; inputOffset++; continue;
+          case 'n':  *(escOffset++) = '\n'; inputOffset++; continue;
+          case 'r':  *(escOffset++) = '\r'; inputOffset++; continue;
+          case 't':  *(escOffset++) = '\t'; inputOffset++; continue;
 
           case 'u':
           {
@@ -494,7 +491,6 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
               inputOffset ++;
             }
 
-#if WCHAR_MAX >= 0x10FFFF
             if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
             {
               // Low surrogate immediately following a high surrogate
@@ -502,16 +498,13 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
               *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
             }
             else
-#endif
             {
-              *(escOffset++) = (wchar_t) ch;
+              *(escOffset++) = (JSUINT32) ch;
             }
-#if WCHAR_MAX >= 0x10FFFF
             if ((ch & 0xfc00) == 0xd800)
             {
               lastHighSurrogate = inputOffset;
             }
-#endif
             break;
           }
 
@@ -523,7 +516,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
 
       case 1:
       {
-        *(escOffset++) = (wchar_t) (*inputOffset++);
+        *(escOffset++) = (JSUINT32) (*inputOffset++);
         break;
       }
 
@@ -537,7 +530,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
         }
         ucs |= (*inputOffset++) & 0x3f;
         if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'");
-        *(escOffset++) = (wchar_t) ucs;
+        *(escOffset++) = (JSUINT32) ucs;
         break;
       }
 
@@ -560,7 +553,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
         }
 
         if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string");
-        *(escOffset++) = (wchar_t) ucs;
+        *(escOffset++) = (JSUINT32) ucs;
         break;
       }
 
@@ -584,20 +577,7 @@ static FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds
 
         if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'");
 
-#if WCHAR_MAX == 0xffff
-        if (ucs >= 0x10000)
-        {
-          ucs -= 0x10000;
-          *(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800;
-          *(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00;
-        }
-        else
-        {
-          *(escOffset++) = (wchar_t) ucs;
-        }
-#else
-        *(escOffset++) = (wchar_t) ucs;
-#endif
+        *(escOffset++) = (JSUINT32) ucs;
         break;
       }
     }
@@ -810,14 +790,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf
   /*
   FIXME: Base the size of escBuffer of that of cbBuffer so that the unicode escaping doesn't run into the wall each time */
   struct DecoderState ds;
-  wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
+  JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))];
   JSOBJ ret;
 
   ds.start = (char *) buffer;
   ds.end = ds.start + cbBuffer;
 
   ds.escStart = escBuffer;
-  ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
+  ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32));
   ds.escHeap = 0;
   ds.prv = dec->prv;
   ds.dec = dec;
diff --git a/python/JSONtoObj.c b/python/JSONtoObj.c
index 55000b58..fbb3fb6f 100644
--- a/python/JSONtoObj.c
+++ b/python/JSONtoObj.c
@@ -59,9 +59,18 @@ static void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value)
   return;
 }
 
-static JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end)
+/*
+Check that Py_UCS4 is the same as JSUINT32, else Object_newString will fail.
+Based on Linux's check in vbox_vmmdev_types.h.
+This should be replaced with
+  _Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32));
+when C11 is made mandatory (CPython 3.11+, PyPy ?).
+*/
+typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))];
+
+static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end)
 {
-  return PyUnicode_FromWideChar (start, (end - start));
+  return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start));
 }
 
 static JSOBJ Object_newTrue(void *prv)
diff --git a/tests/test_ujson.py b/tests/test_ujson.py
index 66d8d0c9..d50ebbf9 100644
--- a/tests/test_ujson.py
+++ b/tests/test_ujson.py
@@ -1,4 +1,3 @@
-import ctypes
 import datetime as dt
 import decimal
 import io
@@ -515,10 +514,6 @@ def test_encode_surrogate_characters():
     assert ujson.dumps({"\ud800": "\udfff"}, ensure_ascii=False, sort_keys=True) == out2
 
 
-@pytest.mark.xfail(
-    hasattr(sys, "pypy_version_info") and os.name == "nt",
-    reason="This feature needs fixing! See #552",
-)
 @pytest.mark.parametrize(
     "test_input, expected",
     [
@@ -543,10 +538,6 @@ def test_encode_surrogate_characters():
     ],
 )
 def test_decode_surrogate_characters(test_input, expected):
-    # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
-    if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
-        pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
-
     assert ujson.loads(test_input) == expected
     assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected