Skip to content

Commit 61cda86

Browse files
authored
Merge pull request #742 from byroot/refactor-convert-utf8
Refactor convert_UTF8_to_JSON to split searching and escaping code
2 parents 1023227 + 8fb5ae8 commit 61cda86

File tree

2 files changed

+181
-157
lines changed

2 files changed

+181
-157
lines changed

ext/json/ext/generator/generator.c

+175-157
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,7 @@ static void raise_generator_error(VALUE invalid_object, const char *fmt, ...)
101101
// 0 - single byte char that doesn't need to be escaped.
102102
// (x | 8) - char that needs to be escaped.
103103
static const unsigned char CHAR_LENGTH_MASK = 7;
104+
static const unsigned char ESCAPE_MASK = 8;
104105

105106
static const unsigned char escape_table[256] = {
106107
// ASCII Control Characters
@@ -165,6 +166,84 @@ static const unsigned char script_safe_escape_table[256] = {
165166
4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 9, 9,
166167
};
167168

169+
170+
typedef struct _search_state {
171+
const char *ptr;
172+
const char *end;
173+
const char *cursor;
174+
FBuffer *buffer;
175+
} search_state;
176+
177+
/*
 * Appends the scanned-but-not-yet-copied input bytes, i.e. the range
 * [search->cursor, search->ptr), to the output buffer and moves cursor up
 * to ptr.
 *
 * Nothing is appended when the range is empty. This happens whenever two
 * characters that need escaping are adjacent, when the input ends right
 * after an escaped character, and for empty input. The escape helpers set
 * cursor == ptr after every escape, so in those cases there is nothing to
 * copy and the call into fbuffer_append is skipped entirely.
 */
static inline void search_flush(search_state *search)
{
    if (search->ptr > search->cursor) {
        fbuffer_append(search->buffer, search->cursor, search->ptr - search->cursor);
        search->cursor = search->ptr;
    }
}
182+
183+
/*
 * Scans forward from search->ptr for the next character that must be
 * escaped, according to escape_table (indexed by the character's first byte).
 *
 * Table entries are interpreted as follows:
 *   0            - single byte, passed through: advance by one byte.
 *   n, bit 8 off - passed-through sequence: advance by n bytes.
 *   n | 8        - needs escaping; (n & CHAR_LENGTH_MASK) is its byte length.
 *   11           - three-byte sequence that needs escaping only when its
 *                  second byte is 0x80 and its third byte is 0xA8 or 0xA9;
 *                  otherwise it is skipped like any other three-byte sequence.
 *
 * Returns the byte length of the character to escape, with search->ptr left
 * pointing at it and every byte before it already flushed to the buffer.
 * Returns 0 once the end of the input is reached; the remaining bytes have
 * been flushed by then.
 *
 * The 11 case reads the two bytes after ptr without checking them against
 * search->end, so the input must not end in the middle of such a sequence.
 */
static inline unsigned char search_escape(search_state *search, const unsigned char escape_table[256])
{
    while (search->ptr < search->end) {
        const unsigned char entry = escape_table[(unsigned char)*search->ptr];

        /* Common case: plain single byte, nothing to do. */
        if (!RB_UNLIKELY(entry)) {
            search->ptr++;
            continue;
        }

        /* Multi-byte sequence that is passed through untouched. */
        if (!(entry & ESCAPE_MASK)) {
            search->ptr += entry;
            continue;
        }

        if (RB_UNLIKELY(entry == 11)) {
            const unsigned char *seq = (const unsigned char *)search->ptr;
            /* 0xA8 >> 1 == 0xA9 >> 1 == 0x54, so one comparison covers both. */
            const int needs_escape = seq[1] == 0x80 && (seq[2] >> 1) == 0x54;

            if (!needs_escape) {
                search->ptr += 3;
                continue;
            }
        }

        search_flush(search);
        return entry & CHAR_LENGTH_MASK;
    }

    search_flush(search);
    return 0;
}
210+
211+
/*
 * Writes the JSON escape for the character at search->ptr, then moves both
 * ptr and cursor past it so its raw bytes are never copied to the output.
 *
 * ch_len is the byte length of the character being escaped:
 *   1 - a single byte, written as one of the two-character escapes
 *       \" \\ \/ \b \f \n \r \t, or as \u00XX (lowercase hex) for any
 *       other byte;
 *   3 - a three-byte sequence, written as \u2029 when the lowest bit of its
 *       third byte is set and as \u2028 otherwise.
 * Any other length writes nothing; ptr and cursor still advance by ch_len.
 */
static inline void fast_escape_UTF8_char(search_state *search, unsigned char ch_len) {
    const unsigned char byte = (unsigned char)*search->ptr;

    if (ch_len == 1) {
        static const char hex_digits[] = "0123456789abcdef";
        char hex_escape[6] = { '\\', 'u', '0', '0', 0, 0 };
        const char *escape = hex_escape;
        int escape_len = 2;

        switch (byte) {
            case '"':  escape = "\\\""; break;
            case '\\': escape = "\\\\"; break;
            case '/':  escape = "\\/";  break;
            case '\b': escape = "\\b";  break;
            case '\f': escape = "\\f";  break;
            case '\n': escape = "\\n";  break;
            case '\r': escape = "\\r";  break;
            case '\t': escape = "\\t";  break;
            default:
                /* No short form: fall back to the six-character \u00XX escape. */
                hex_escape[4] = hex_digits[(byte >> 4) & 0xf];
                hex_escape[5] = hex_digits[byte & 0xf];
                escape_len = 6;
                break;
        }

        fbuffer_append(search->buffer, escape, escape_len);
    } else if (ch_len == 3) {
        const char *separator = (search->ptr[2] & 1) ? "\\u2029" : "\\u2028";
        fbuffer_append(search->buffer, separator, 6);
    }

    /* Skip the escaped character and restart the unflushed range after it. */
    search->ptr += ch_len;
    search->cursor = search->ptr;
}
246+
168247
/* Converts in_string to a JSON string (without the wrapping '"'
169248
* characters) in FBuffer out_buffer.
170249
*
@@ -181,182 +260,114 @@ static const unsigned char script_safe_escape_table[256] = {
181260
* Everything else (should be UTF-8) is just passed through and
182261
* appended to the result.
183262
*/
184-
static inline void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
263+
static inline void convert_UTF8_to_JSON(search_state *search, const unsigned char escape_table[256])
185264
{
186-
const char *hexdig = "0123456789abcdef";
187-
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
188-
189-
const char *ptr = RSTRING_PTR(str);
190-
unsigned long len = RSTRING_LEN(str);
191-
192-
unsigned long beg = 0, pos = 0;
193-
194-
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
265+
unsigned char ch_len;
266+
while ((ch_len = search_escape(search, escape_table))) {
267+
fast_escape_UTF8_char(search, ch_len);
268+
}
269+
}
195270

196-
while (pos < len) {
197-
unsigned char ch = ptr[pos];
271+
static inline unsigned char search_ascii_only_escape(search_state *search, const unsigned char escape_table[256])
272+
{
273+
while (search->ptr < search->end) {
274+
unsigned char ch = (unsigned char)*search->ptr;
198275
unsigned char ch_len = escape_table[ch];
199-
/* JSON encoding */
200276

201277
if (RB_UNLIKELY(ch_len)) {
202-
switch (ch_len) {
203-
case 9: {
204-
FLUSH_POS(1);
205-
switch (ch) {
206-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
207-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
208-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
209-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
210-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
211-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
212-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
213-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
214-
default: {
215-
scratch[2] = '0';
216-
scratch[3] = '0';
217-
scratch[4] = hexdig[(ch >> 4) & 0xf];
218-
scratch[5] = hexdig[ch & 0xf];
219-
fbuffer_append(out_buffer, scratch, 6);
220-
break;
221-
}
222-
}
223-
break;
224-
}
225-
case 11: {
226-
unsigned char b2 = ptr[pos + 1];
227-
if (RB_UNLIKELY(b2 == 0x80)) {
228-
unsigned char b3 = ptr[pos + 2];
229-
if (b3 == 0xA8) {
230-
FLUSH_POS(3);
231-
fbuffer_append(out_buffer, "\\u2028", 6);
232-
break;
233-
} else if (b3 == 0xA9) {
234-
FLUSH_POS(3);
235-
fbuffer_append(out_buffer, "\\u2029", 6);
236-
break;
237-
}
238-
}
239-
ch_len = 3;
240-
// fallthrough
241-
}
242-
default:
243-
pos += ch_len;
244-
break;
245-
}
278+
search_flush(search);
279+
return ch_len & CHAR_LENGTH_MASK;
246280
} else {
247-
pos++;
281+
search->ptr++;
248282
}
249283
}
250-
#undef FLUSH_POS
251-
252-
if (beg < len) {
253-
fbuffer_append(out_buffer, &ptr[beg], len - beg);
254-
}
255-
256-
RB_GC_GUARD(str);
284+
search_flush(search);
285+
return 0;
257286
}
258287

259-
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE str, const unsigned char escape_table[256])
260-
{
261-
const char *hexdig = "0123456789abcdef";
262-
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
263-
264-
const char *ptr = RSTRING_PTR(str);
265-
unsigned long len = RSTRING_LEN(str);
266-
267-
unsigned long beg = 0, pos = 0;
268-
269-
#define FLUSH_POS(bytes) if (pos > beg) { fbuffer_append(out_buffer, &ptr[beg], pos - beg); } pos += bytes; beg = pos;
270-
271-
while (pos < len) {
272-
unsigned char ch = ptr[pos];
273-
unsigned char ch_len = escape_table[ch];
274-
275-
if (RB_UNLIKELY(ch_len)) {
276-
switch (ch_len) {
277-
case 9: {
278-
FLUSH_POS(1);
279-
switch (ch) {
280-
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
281-
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
282-
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
283-
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
284-
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
285-
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
286-
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
287-
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
288-
default: {
289-
scratch[2] = '0';
290-
scratch[3] = '0';
291-
scratch[4] = hexdig[(ch >> 4) & 0xf];
292-
scratch[5] = hexdig[ch & 0xf];
293-
fbuffer_append(out_buffer, scratch, 6);
294-
break;
295-
}
296-
}
288+
static inline void full_escape_UTF8_char(search_state *search, unsigned char ch_len) {
289+
const unsigned char ch = (unsigned char)*search->ptr;
290+
switch (ch_len) {
291+
case 1: {
292+
switch (ch) {
293+
case '"': fbuffer_append(search->buffer, "\\\"", 2); break;
294+
case '\\': fbuffer_append(search->buffer, "\\\\", 2); break;
295+
case '/': fbuffer_append(search->buffer, "\\/", 2); break;
296+
case '\b': fbuffer_append(search->buffer, "\\b", 2); break;
297+
case '\f': fbuffer_append(search->buffer, "\\f", 2); break;
298+
case '\n': fbuffer_append(search->buffer, "\\n", 2); break;
299+
case '\r': fbuffer_append(search->buffer, "\\r", 2); break;
300+
case '\t': fbuffer_append(search->buffer, "\\t", 2); break;
301+
default: {
302+
const char *hexdig = "0123456789abcdef";
303+
char scratch[6] = { '\\', 'u', '0', '0', 0, 0 };
304+
scratch[4] = hexdig[(ch >> 4) & 0xf];
305+
scratch[5] = hexdig[ch & 0xf];
306+
fbuffer_append(search->buffer, scratch, 6);
297307
break;
298308
}
299-
default: {
300-
uint32_t wchar = 0;
301-
ch_len = ch_len & CHAR_LENGTH_MASK;
302-
303-
switch(ch_len) {
304-
case 2:
305-
wchar = ptr[pos] & 0x1F;
306-
break;
307-
case 3:
308-
wchar = ptr[pos] & 0x0F;
309-
break;
310-
case 4:
311-
wchar = ptr[pos] & 0x07;
312-
break;
313-
}
309+
}
310+
break;
311+
}
312+
default: {
313+
const char *hexdig = "0123456789abcdef";
314+
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
314315

315-
for (short i = 1; i < ch_len; i++) {
316-
wchar = (wchar << 6) | (ptr[pos+i] & 0x3F);
317-
}
316+
uint32_t wchar = 0;
318317

319-
FLUSH_POS(ch_len);
318+
switch(ch_len) {
319+
case 2:
320+
wchar = ch & 0x1F;
321+
break;
322+
case 3:
323+
wchar = ch & 0x0F;
324+
break;
325+
case 4:
326+
wchar = ch & 0x07;
327+
break;
328+
}
320329

321-
if (wchar <= 0xFFFF) {
322-
scratch[2] = hexdig[wchar >> 12];
323-
scratch[3] = hexdig[(wchar >> 8) & 0xf];
324-
scratch[4] = hexdig[(wchar >> 4) & 0xf];
325-
scratch[5] = hexdig[wchar & 0xf];
326-
fbuffer_append(out_buffer, scratch, 6);
327-
} else {
328-
uint16_t hi, lo;
329-
wchar -= 0x10000;
330-
hi = 0xD800 + (uint16_t)(wchar >> 10);
331-
lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
332-
333-
scratch[2] = hexdig[hi >> 12];
334-
scratch[3] = hexdig[(hi >> 8) & 0xf];
335-
scratch[4] = hexdig[(hi >> 4) & 0xf];
336-
scratch[5] = hexdig[hi & 0xf];
337-
338-
scratch[8] = hexdig[lo >> 12];
339-
scratch[9] = hexdig[(lo >> 8) & 0xf];
340-
scratch[10] = hexdig[(lo >> 4) & 0xf];
341-
scratch[11] = hexdig[lo & 0xf];
342-
343-
fbuffer_append(out_buffer, scratch, 12);
344-
}
330+
for (short i = 1; i < ch_len; i++) {
331+
wchar = (wchar << 6) | (search->ptr[i] & 0x3F);
332+
}
345333

346-
break;
347-
}
334+
if (wchar <= 0xFFFF) {
335+
scratch[2] = hexdig[wchar >> 12];
336+
scratch[3] = hexdig[(wchar >> 8) & 0xf];
337+
scratch[4] = hexdig[(wchar >> 4) & 0xf];
338+
scratch[5] = hexdig[wchar & 0xf];
339+
fbuffer_append(search->buffer, scratch, 6);
340+
} else {
341+
uint16_t hi, lo;
342+
wchar -= 0x10000;
343+
hi = 0xD800 + (uint16_t)(wchar >> 10);
344+
lo = 0xDC00 + (uint16_t)(wchar & 0x3FF);
345+
346+
scratch[2] = hexdig[hi >> 12];
347+
scratch[3] = hexdig[(hi >> 8) & 0xf];
348+
scratch[4] = hexdig[(hi >> 4) & 0xf];
349+
scratch[5] = hexdig[hi & 0xf];
350+
351+
scratch[8] = hexdig[lo >> 12];
352+
scratch[9] = hexdig[(lo >> 8) & 0xf];
353+
scratch[10] = hexdig[(lo >> 4) & 0xf];
354+
scratch[11] = hexdig[lo & 0xf];
355+
356+
fbuffer_append(search->buffer, scratch, 12);
348357
}
349-
} else {
350-
pos++;
358+
359+
break;
351360
}
352361
}
353-
#undef FLUSH_POS
362+
search->cursor = (search->ptr += ch_len);
363+
}
354364

355-
if (beg < len) {
356-
fbuffer_append(out_buffer, &ptr[beg], len - beg);
365+
static void convert_UTF8_to_ASCII_only_JSON(search_state *search, const unsigned char escape_table[256])
366+
{
367+
unsigned char ch_len;
368+
while ((ch_len = search_ascii_only_escape(search, escape_table))) {
369+
full_escape_UTF8_char(search, ch_len);
357370
}
358-
359-
RB_GC_GUARD(str);
360371
}
361372

362373
/*
@@ -911,13 +922,20 @@ static void generate_json_string(FBuffer *buffer, struct generate_json_data *dat
911922

912923
fbuffer_append_char(buffer, '"');
913924

925+
long len;
926+
search_state search;
927+
search.buffer = buffer;
928+
RSTRING_GETMEM(obj, search.ptr, len);
929+
search.cursor = search.ptr;
930+
search.end = search.ptr + len;
931+
914932
switch(rb_enc_str_coderange(obj)) {
915933
case ENC_CODERANGE_7BIT:
916934
case ENC_CODERANGE_VALID:
917935
if (RB_UNLIKELY(state->ascii_only)) {
918-
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
936+
convert_UTF8_to_ASCII_only_JSON(&search, state->script_safe ? script_safe_escape_table : ascii_only_escape_table);
919937
} else {
920-
convert_UTF8_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
938+
convert_UTF8_to_JSON(&search, state->script_safe ? script_safe_escape_table : escape_table);
921939
}
922940
break;
923941
default:

test/json/json_generator_test.rb

+6
Original file line numberDiff line numberDiff line change
@@ -665,6 +665,12 @@ def test_nonutf8_encoding
665665
assert_equal("\"5\u{b0}\"", "5\xb0".dup.force_encoding(Encoding::ISO_8859_1).to_json)
666666
end
667667

668+
# Multi-byte UTF-8 characters must be emitted unescaped, while the double
# quote and the control characters U+0000 and U+001F are still escaped.
def test_utf8_multibyte
  [
    ['["foßbar"]', ["foßbar"]],
    ['"n€ßt€ð2"', "n€ßt€ð2"],
    ['"\"\u0000\u001f"', "\"\u0000\u001f"],
  ].each do |expected, input|
    assert_equal(expected, JSON.generate(input))
  end
end
673+
668674
def test_fragment
669675
fragment = JSON::Fragment.new(" 42")
670676
assert_equal '{"number": 42}', JSON.generate({ number: fragment })

0 commit comments

Comments
 (0)