Skip to content

Commit 9910607

Browse files
authored
Backport "Fix handling of surrogates on decoding" (#66)
Backport ultrajson/ultrajson#550
1 parent 63a8a1c commit 9910607

File tree

3 files changed

+58
-28
lines changed

3 files changed

+58
-28
lines changed

srsly/tests/ujson/test_ujson.py

Lines changed: 37 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
1-
import decimal
1+
import ctypes
2+
import decimal
23
import json
34
import math
45
import sys
@@ -958,3 +959,38 @@ def test_issue_334(indent):
958959
path = Path(__file__).with_name("334-reproducer.json")
959960
a = ujson.loads(path.read_bytes())
960961
ujson.dumps(a, indent=indent)
962+
963+
964+
@pytest.mark.parametrize(
965+
"test_input, expected",
966+
[
967+
# Normal cases
968+
(r'"\uD83D\uDCA9"', "\U0001F4A9"),
969+
(r'"a\uD83D\uDCA9b"', "a\U0001F4A9b"),
970+
# Unpaired surrogates
971+
(r'"\uD800"', "\uD800"),
972+
(r'"a\uD800b"', "a\uD800b"),
973+
(r'"\uDEAD"', "\uDEAD"),
974+
(r'"a\uDEADb"', "a\uDEADb"),
975+
(r'"\uD83D\uD83D\uDCA9"', "\uD83D\U0001F4A9"),
976+
(r'"\uDCA9\uD83D\uDCA9"', "\uDCA9\U0001F4A9"),
977+
(r'"\uD83D\uDCA9\uD83D"', "\U0001F4A9\uD83D"),
978+
(r'"\uD83D\uDCA9\uDCA9"', "\U0001F4A9\uDCA9"),
979+
(r'"\uD83D \uDCA9"', "\uD83D \uDCA9"),
980+
# No decoding of actual surrogate characters (rather than escaped ones)
981+
('"\uD800"', "\uD800"),
982+
('"\uDEAD"', "\uDEAD"),
983+
('"\uD800a\uDEAD"', "\uD800a\uDEAD"),
984+
('"\uD83D\uDCA9"', "\uD83D\uDCA9"),
985+
],
986+
)
987+
def test_decode_surrogate_characters(test_input, expected):
988+
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
989+
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
990+
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
991+
992+
assert ujson.loads(test_input) == expected
993+
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected
994+
995+
# Ensure that this matches stdlib's behaviour
996+
assert json.loads(test_input) == expected

srsly/ujson/JSONtoObj.c

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -161,7 +161,7 @@ PyObject* JSONToObj(PyObject* self, PyObject *args, PyObject *kwargs)
161161
else
162162
if (PyUnicode_Check(arg))
163163
{
164-
sarg = PyUnicode_AsUTF8String(arg);
164+
sarg = PyUnicode_AsEncodedString(arg, NULL, "surrogatepass");
165165
if (sarg == NULL)
166166
{
167167
//Exception raised above us by codec according to docs

srsly/ujson/lib/ultrajsondec.c

Lines changed: 20 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -424,13 +424,15 @@ static const JSUINT8 g_decoderLookup[256] =
424424

425425
FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
426426
{
427-
JSUTF16 sur[2] = { 0 };
428-
int iSur = 0;
429427
int index;
430428
wchar_t *escOffset;
431429
wchar_t *escStart;
432430
size_t escLen = (ds->escEnd - ds->escStart);
433431
JSUINT8 *inputOffset;
432+
JSUTF16 ch = 0;
433+
#if WCHAR_MAX >= 0x10FFFF
434+
JSUINT8 *lastHighSurrogate = NULL;
435+
#endif
434436
JSUINT8 oct;
435437
JSUTF32 ucs;
436438
ds->lastType = JT_INVALID;
@@ -530,7 +532,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
530532
case '7':
531533
case '8':
532534
case '9':
533-
sur[iSur] = (sur[iSur] << 4) + (JSUTF16) (*inputOffset - '0');
535+
ch = (ch << 4) + (JSUTF16) (*inputOffset - '0');
534536
break;
535537

536538
case 'a':
@@ -539,7 +541,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
539541
case 'd':
540542
case 'e':
541543
case 'f':
542-
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
544+
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'a');
543545
break;
544546

545547
case 'A':
@@ -548,39 +550,31 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
548550
case 'D':
549551
case 'E':
550552
case 'F':
551-
sur[iSur] = (sur[iSur] << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
553+
ch = (ch << 4) + 10 + (JSUTF16) (*inputOffset - 'A');
552554
break;
553555
}
554556

555557
inputOffset ++;
556558
}
557559

558-
if (iSur == 0)
560+
#if WCHAR_MAX >= 0x10FFFF
561+
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
559562
{
560-
if((sur[iSur] & 0xfc00) == 0xd800)
561-
{
562-
// First of a surrogate pair, continue parsing
563-
iSur ++;
564-
break;
565-
}
566-
(*escOffset++) = (wchar_t) sur[iSur];
567-
iSur = 0;
563+
// Low surrogate immediately following a high surrogate
564+
// Overwrite existing high surrogate with combined character
565+
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
568566
}
569567
else
570-
{
571-
// Decode pair
572-
if ((sur[1] & 0xfc00) != 0xdc00)
573-
{
574-
return SetError (ds, -1, "Unpaired high surrogate when decoding 'string'");
575-
}
576-
#if WCHAR_MAX == 0xffff
577-
(*escOffset++) = (wchar_t) sur[0];
578-
(*escOffset++) = (wchar_t) sur[1];
579-
#else
580-
(*escOffset++) = (wchar_t) 0x10000 + (((sur[0] - 0xd800) << 10) | (sur[1] - 0xdc00));
581568
#endif
582-
iSur = 0;
569+
{
570+
*(escOffset++) = (wchar_t) ch;
583571
}
572+
#if WCHAR_MAX >= 0x10FFFF
573+
if ((ch & 0xfc00) == 0xd800)
574+
{
575+
lastHighSurrogate = inputOffset;
576+
}
577+
#endif
584578
break;
585579
}
586580

0 commit comments

Comments
 (0)