Skip to content

Commit febb6f2

Browse files
authored
Backport "Replace wchar_t string decoding implementation with a uint32_t-based one" (#67)
Backport ultrajson/ultrajson#555
1 parent 9910607 commit febb6f2

File tree

4 files changed

+38
-55
lines changed

4 files changed

+38
-55
lines changed

srsly/tests/ujson/test_ujson.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import ctypes
2-
import decimal
1+
import decimal
32
import json
43
import math
54
import sys
@@ -985,10 +984,6 @@ def test_issue_334(indent):
985984
],
986985
)
987986
def test_decode_surrogate_characters(test_input, expected):
988-
# FIXME Wrong output (combined char) on platforms with 16-bit wchar_t
989-
if test_input == '"\uD83D\uDCA9"' and ctypes.sizeof(ctypes.c_wchar) == 2:
990-
pytest.skip("Raw surrogate pairs are not supported with 16-bit wchar_t")
991-
992987
assert ujson.loads(test_input) == expected
993988
assert ujson.loads(test_input.encode("utf-8", "surrogatepass")) == expected
994989

srsly/ujson/JSONtoObj.c

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -58,9 +58,18 @@ void Object_arrayAddItem(void *prv, JSOBJ obj, JSOBJ value)
5858
return;
5959
}
6060

61-
JSOBJ Object_newString(void *prv, wchar_t *start, wchar_t *end)
61+
/*
62+
Check that Py_UCS4 has the same size as JSUINT32, else Object_newString will fail.
63+
Based on Linux's check in vbox_vmmdev_types.h.
64+
This should be replaced with
65+
_Static_assert(sizeof(Py_UCS4) == sizeof(JSUINT32));
66+
when C11 is made mandatory (CPython 3.11+, PyPy ?).
67+
*/
68+
typedef char assert_py_ucs4_is_jsuint32[1 - 2*!(sizeof(Py_UCS4) == sizeof(JSUINT32))];
69+
70+
static JSOBJ Object_newString(void *prv, JSUINT32 *start, JSUINT32 *end)
6271
{
63-
return PyUnicode_FromWideChar (start, (end - start));
72+
return PyUnicode_FromKindAndData (PyUnicode_4BYTE_KIND, (Py_UCS4 *) start, (end - start));
6473
}
6574

6675
JSOBJ Object_newTrue(void *prv)

srsly/ujson/lib/ultrajson.h

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,6 @@ tree doesn't have cyclic references.
5454
#define __ULTRAJSON_H__
5555

5656
#include <stdio.h>
57-
#include <wchar.h>
5857

5958
// Max decimals to encode double floating point numbers with
6059
#ifndef JSON_DOUBLE_MAX_DECIMALS
@@ -298,7 +297,7 @@ EXPORTFUNCTION char *JSON_EncodeObject(JSOBJ obj, JSONObjectEncoder *enc, char *
298297

299298
typedef struct __JSONObjectDecoder
300299
{
301-
JSOBJ (*newString)(void *prv, wchar_t *start, wchar_t *end);
300+
JSOBJ (*newString)(void *prv, JSUINT32 *start, JSUINT32 *end);
302301
void (*objectAddKey)(void *prv, JSOBJ obj, JSOBJ name, JSOBJ value);
303302
void (*arrayAddItem)(void *prv, JSOBJ obj, JSOBJ value);
304303
JSOBJ (*newTrue)(void *prv);

srsly/ujson/lib/ultrajsondec.c

Lines changed: 25 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,6 @@ Numeric decoder derived from TCL library
4141
#include <assert.h>
4242
#include <string.h>
4343
#include <limits.h>
44-
#include <wchar.h>
4544
#include <stdlib.h>
4645
#include <errno.h>
4746

@@ -57,8 +56,8 @@ struct DecoderState
5756
{
5857
char *start;
5958
char *end;
60-
wchar_t *escStart;
61-
wchar_t *escEnd;
59+
JSUINT32 *escStart;
60+
JSUINT32 *escEnd;
6261
int escHeap;
6362
int lastType;
6463
JSUINT32 objDepth;
@@ -425,14 +424,12 @@ static const JSUINT8 g_decoderLookup[256] =
425424
FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
426425
{
427426
int index;
428-
wchar_t *escOffset;
429-
wchar_t *escStart;
427+
JSUINT32 *escOffset;
428+
JSUINT32 *escStart;
430429
size_t escLen = (ds->escEnd - ds->escStart);
431430
JSUINT8 *inputOffset;
432431
JSUTF16 ch = 0;
433-
#if WCHAR_MAX >= 0x10FFFF
434432
JSUINT8 *lastHighSurrogate = NULL;
435-
#endif
436433
JSUINT8 oct;
437434
JSUTF32 ucs;
438435
ds->lastType = JT_INVALID;
@@ -444,11 +441,11 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
444441

445442
if (ds->escHeap)
446443
{
447-
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
444+
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
448445
{
449446
return SetError(ds, -1, "Could not reserve memory block");
450447
}
451-
escStart = (wchar_t *)ds->dec->realloc(ds->escStart, newSize * sizeof(wchar_t));
448+
escStart = (JSUINT32 *)ds->dec->realloc(ds->escStart, newSize * sizeof(JSUINT32));
452449
if (!escStart)
453450
{
454451
ds->dec->free(ds->escStart);
@@ -458,18 +455,18 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
458455
}
459456
else
460457
{
461-
wchar_t *oldStart = ds->escStart;
462-
if (newSize > (SIZE_MAX / sizeof(wchar_t)))
458+
JSUINT32 *oldStart = ds->escStart;
459+
if (newSize > (SIZE_MAX / sizeof(JSUINT32)))
463460
{
464461
return SetError(ds, -1, "Could not reserve memory block");
465462
}
466-
ds->escStart = (wchar_t *) ds->dec->malloc(newSize * sizeof(wchar_t));
463+
ds->escStart = (JSUINT32 *) ds->dec->malloc(newSize * sizeof(JSUINT32));
467464
if (!ds->escStart)
468465
{
469466
return SetError(ds, -1, "Could not reserve memory block");
470467
}
471468
ds->escHeap = 1;
472-
memcpy(ds->escStart, oldStart, escLen * sizeof(wchar_t));
469+
memcpy(ds->escStart, oldStart, escLen * sizeof(JSUINT32));
473470
}
474471

475472
ds->escEnd = ds->escStart + newSize;
@@ -501,14 +498,14 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
501498
inputOffset ++;
502499
switch (*inputOffset)
503500
{
504-
case '\\': *(escOffset++) = L'\\'; inputOffset++; continue;
505-
case '\"': *(escOffset++) = L'\"'; inputOffset++; continue;
506-
case '/': *(escOffset++) = L'/'; inputOffset++; continue;
507-
case 'b': *(escOffset++) = L'\b'; inputOffset++; continue;
508-
case 'f': *(escOffset++) = L'\f'; inputOffset++; continue;
509-
case 'n': *(escOffset++) = L'\n'; inputOffset++; continue;
510-
case 'r': *(escOffset++) = L'\r'; inputOffset++; continue;
511-
case 't': *(escOffset++) = L'\t'; inputOffset++; continue;
501+
case '\\': *(escOffset++) = '\\'; inputOffset++; continue;
502+
case '\"': *(escOffset++) = '\"'; inputOffset++; continue;
503+
case '/': *(escOffset++) = '/'; inputOffset++; continue;
504+
case 'b': *(escOffset++) = '\b'; inputOffset++; continue;
505+
case 'f': *(escOffset++) = '\f'; inputOffset++; continue;
506+
case 'n': *(escOffset++) = '\n'; inputOffset++; continue;
507+
case 'r': *(escOffset++) = '\r'; inputOffset++; continue;
508+
case 't': *(escOffset++) = '\t'; inputOffset++; continue;
512509

513510
case 'u':
514511
{
@@ -557,24 +554,20 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
557554
inputOffset ++;
558555
}
559556

560-
#if WCHAR_MAX >= 0x10FFFF
561557
if ((ch & 0xfc00) == 0xdc00 && lastHighSurrogate == inputOffset - 6 * sizeof(*inputOffset))
562558
{
563559
// Low surrogate immediately following a high surrogate
564560
// Overwrite existing high surrogate with combined character
565561
*(escOffset-1) = (((*(escOffset-1) - 0xd800) <<10) | (ch - 0xdc00)) + 0x10000;
566562
}
567563
else
568-
#endif
569564
{
570-
*(escOffset++) = (wchar_t) ch;
565+
*(escOffset++) = (JSUINT32) ch;
571566
}
572-
#if WCHAR_MAX >= 0x10FFFF
573567
if ((ch & 0xfc00) == 0xd800)
574568
{
575569
lastHighSurrogate = inputOffset;
576570
}
577-
#endif
578571
break;
579572
}
580573

@@ -585,7 +578,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
585578

586579
case 1:
587580
{
588-
*(escOffset++) = (wchar_t) (*inputOffset++);
581+
*(escOffset++) = (JSUINT32) (*inputOffset++);
589582
break;
590583
}
591584

@@ -599,7 +592,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
599592
}
600593
ucs |= (*inputOffset++) & 0x3f;
601594
if (ucs < 0x80) return SetError (ds, -1, "Overlong 2 byte UTF-8 sequence detected when decoding 'string'");
602-
*(escOffset++) = (wchar_t) ucs;
595+
*(escOffset++) = (JSUINT32) ucs;
603596
break;
604597
}
605598

@@ -622,7 +615,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
622615
}
623616

624617
if (ucs < 0x800) return SetError (ds, -1, "Overlong 3 byte UTF-8 sequence detected when encoding string");
625-
*(escOffset++) = (wchar_t) ucs;
618+
*(escOffset++) = (JSUINT32) ucs;
626619
break;
627620
}
628621

@@ -646,20 +639,7 @@ FASTCALL_ATTR JSOBJ FASTCALL_MSVC decode_string ( struct DecoderState *ds)
646639

647640
if (ucs < 0x10000) return SetError (ds, -1, "Overlong 4 byte UTF-8 sequence detected when decoding 'string'");
648641

649-
#if WCHAR_MAX == 0xffff
650-
if (ucs >= 0x10000)
651-
{
652-
ucs -= 0x10000;
653-
*(escOffset++) = (wchar_t) (ucs >> 10) + 0xd800;
654-
*(escOffset++) = (wchar_t) (ucs & 0x3ff) + 0xdc00;
655-
}
656-
else
657-
{
658-
*(escOffset++) = (wchar_t) ucs;
659-
}
660-
#else
661-
*(escOffset++) = (wchar_t) ucs;
662-
#endif
642+
*(escOffset++) = (JSUINT32) ucs;
663643
break;
664644
}
665645
}
@@ -869,14 +849,14 @@ JSOBJ JSON_DecodeObject(JSONObjectDecoder *dec, const char *buffer, size_t cbBuf
869849
/*
870850
FIXME: Base the size of escBuffer on that of cbBuffer so that the unicode escaping doesn't run into the wall each time */
871851
struct DecoderState ds;
872-
wchar_t escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t))];
852+
JSUINT32 escBuffer[(JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32))];
873853
JSOBJ ret;
874854

875855
ds.start = (char *) buffer;
876856
ds.end = ds.start + cbBuffer;
877857

878858
ds.escStart = escBuffer;
879-
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(wchar_t));
859+
ds.escEnd = ds.escStart + (JSON_MAX_STACK_BUFFER_SIZE / sizeof(JSUINT32));
880860
ds.escHeap = 0;
881861
ds.prv = dec->prv;
882862
ds.dec = dec;

0 commit comments

Comments
 (0)