Backport "Fix handling of surrogates on decoding" (#66)

adrianeboyd · web-flow · commit 991060751999 · 2022-07-20T13:56:25.000+02:00
Backport ultrajson/ultrajson#550
diff --git a/srsly/tests/ujson/test_ujson.py b/srsly/tests/ujson/test_ujson.py
@@ -1,4 +1,5 @@
-﻿import decimal
+﻿import ctypes
+import decimal
 import json
 import math
 import sys
@@ -958,3 +959,38 @@ def test_issue_334(indent):
     path = Path(__file__).with_name("334-reproducer.json")
     a = ujson.loads(path.read_bytes())
     ujson.dumps(a, indent=indent)
+
+
+@pytest.mark.parametrize(
+    "test_input, expected",
+    [
+        # Normal cases
+        (r'"\uD83D\uDCA9"', "\U0001F4A9"),
+        (r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
+        # Unpaired surrogates
+        (r'"\uD800"', "\uD800"),
+        (r'"a\uD800b"', "a\uD800b"),
+        (r'"\uDEAD"', "\uDEAD"),
+        (r'"a\uDEADb"', "a\uDEADb"),
+        (r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
+        (r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
+        (r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
+        (r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
+        (r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
+        # No decoding of actual surrogate characters (rather than escaped ones)
+        ('"\uD800"', "\uD800"),
+        ('"\uDEAD"', "\uDEAD"),
+        ('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
+        ('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
+    ],
+)
+def test_decode_surrogate_characters(test_input, expected):
+    # FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
+    if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
+        pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
+
+    assert ujson.loads(test_input) == expected
+    assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected
+
+    # Ensure that this matches stdlib's behaviour
+    assert json.loads(test_input) == expected
diff --git a/srsly/ujson/JSONtoObj.c b/srsly/ujson/JSONtoObj.c
@@ -161,7 +161,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
   else
   if (PyUnicode_Check(arg))
   {
-    sarg = PyUnicode_AsUTF8String(arg);
+    sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
     if (sarg == NULL)
     {
       //Exception raised above us by codec according to docs
diff --git a/srsly/ujson/lib/ultrajsondec.c b/srsly/ujson/lib/ultrajsondec.c
@@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256] =
 
 FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
 {
-  JSUTF16 sur[2] = { 0 };
-  int iSur = 0;
   int index;
   wchar_t *escOffset;
   wchar_t *escStart;
   size_t escLen = (ds->escEnd - ds->escStart);
   JSUINT8 *inputOffset;
+  JSUTF16 ch = 0;
+#if WCHAR_MAX >= 0x10FFFF
+  JSUINT8 *lastHighSurrogate = NULL;
+#endif
   JSUINT8 oct;
   JSUTF32 ucs;
   ds->lastType = JT_INVALID;
@@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
                 case '7':
                 case '8':
                 case '9':
-                  sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
+                  ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
                   break;
 
                 case 'a':
@@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
                 case 'd':
                 case 'e':
                 case 'f':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
                   break;
 
                 case 'A':
@@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
                 case 'D':
                 case 'E':
                 case 'F':
-                  sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
+                  ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
                   break;
               }
 
               inputOffset ++;
             }
 
-            if (iSur == 0)
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
             {
-              if((sur[iSur] & 0xfc00) == 0xd800)
-              {
-                // First of a surrogate pair, continue parsing
-                iSur ++;
-                break;
-              }
-              (*escOffset++) = (wchar_t) sur[iSur];
-              iSur = 0;
+              // Low surrogate immediately following a high surrogate
+              // Overwrite existing high surrogate with combined character
+              *(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
             }
             else
-            {
-              // Decode pair
-              if ((sur[1] & 0xfc00) != 0xdc00)
-              {
-                return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
-              }
-#if WCHAR_MAX == 0xffff
-              (*escOffset++) = (wchar_t) sur[0];
-              (*escOffset++) = (wchar_t) sur[1];
-#else
-              (*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
 #endif
-              iSur = 0;
+            {
+              *(escOffset++) = (wchar_t) ch;
             }
+#if WCHAR_MAX >= 0x10FFFF
+            if ((ch & 0xfc00) == 0xd800)
+            {
+              lastHighSurrogate = inputOffset;
+            }
+#endif
           break;
         }
 

Original file line number	Diff line number	Diff line change
`@@ -161,7 +161,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)`
`161`	`161`	`else`
`162`	`162`	`if (PyUnicode_Check(arg))`
`163`	`163`	`{`
`164`		`- sarg = PyUnicode_AsUTF8String(arg);`
	`164`	`+ sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");`
`165`	`165`	`if (sarg == NULL)`
`166`	`166`	`{`
`167`	`167`	`//Exception raised above us by codec according to docs`