-
Notifications
You must be signed in to change notification settings - Fork 342
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add a fast path for ASCII strings #620
Merged
Merged
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,14 +25,13 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend; | |
* Everything else (should be UTF-8) is just passed through and | ||
* appended to the result. | ||
*/ | ||
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe) | ||
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe) | ||
{ | ||
const char *hexdig = "0123456789abcdef"; | ||
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; | ||
|
||
const char *in_utf8_str = RSTRING_PTR(in_string); | ||
unsigned long in_utf8_len = RSTRING_LEN(in_string); | ||
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string); | ||
|
||
unsigned long beg = 0, pos; | ||
|
||
|
@@ -42,46 +41,196 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ | |
bool should_escape; | ||
|
||
/* UTF-8 decoding */ | ||
if (in_is_ascii_only) { | ||
ch = in_utf8_str[pos]; | ||
ch_len = 1; | ||
} else { | ||
short i; | ||
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ | ||
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ | ||
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ | ||
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ | ||
else | ||
rb_raise(rb_path2class("JSON::GeneratorError"), | ||
"source sequence is illegal/malformed utf-8"); | ||
if ((pos+ch_len) > in_utf8_len) | ||
rb_raise(rb_path2class("JSON::GeneratorError"), | ||
"partial character in source, but hit end"); | ||
for (i = 1; i < ch_len; i++) { | ||
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */ | ||
rb_raise(rb_path2class("JSON::GeneratorError"), | ||
"source sequence is illegal/malformed utf-8"); | ||
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); | ||
short i; | ||
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ | ||
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ | ||
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ | ||
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ | ||
else { | ||
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); | ||
} | ||
|
||
for (i = 1; i < ch_len; i++) { | ||
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); | ||
} | ||
|
||
/* JSON policy */ | ||
should_escape = | ||
(ch < 0x20) || | ||
(ch == '"') || | ||
(ch == '\\') || | ||
(out_script_safe && (ch == '/')) || | ||
(out_script_safe && (ch == 0x2028)) || | ||
(out_script_safe && (ch == 0x2029)); | ||
|
||
/* JSON encoding */ | ||
if (should_escape) { | ||
if (pos > beg) { | ||
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg); | ||
} | ||
|
||
beg = pos + ch_len; | ||
switch (ch) { | ||
case '"': fbuffer_append(out_buffer, "\\\"", 2); break; | ||
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; | ||
case '/': fbuffer_append(out_buffer, "\\/", 2); break; | ||
case '\b': fbuffer_append(out_buffer, "\\b", 2); break; | ||
case '\f': fbuffer_append(out_buffer, "\\f", 2); break; | ||
case '\n': fbuffer_append(out_buffer, "\\n", 2); break; | ||
case '\r': fbuffer_append(out_buffer, "\\r", 2); break; | ||
case '\t': fbuffer_append(out_buffer, "\\t", 2); break; | ||
default: | ||
if (ch <= 0xFFFF) { | ||
scratch[2] = hexdig[ch >> 12]; | ||
scratch[3] = hexdig[(ch >> 8) & 0xf]; | ||
scratch[4] = hexdig[(ch >> 4) & 0xf]; | ||
scratch[5] = hexdig[ch & 0xf]; | ||
fbuffer_append(out_buffer, scratch, 6); | ||
} else { | ||
uint16_t hi, lo; | ||
ch -= 0x10000; | ||
hi = 0xD800 + (uint16_t)(ch >> 10); | ||
lo = 0xDC00 + (uint16_t)(ch & 0x3FF); | ||
|
||
scratch[2] = hexdig[hi >> 12]; | ||
scratch[3] = hexdig[(hi >> 8) & 0xf]; | ||
scratch[4] = hexdig[(hi >> 4) & 0xf]; | ||
scratch[5] = hexdig[hi & 0xf]; | ||
|
||
scratch[8] = hexdig[lo >> 12]; | ||
scratch[9] = hexdig[(lo >> 8) & 0xf]; | ||
scratch[10] = hexdig[(lo >> 4) & 0xf]; | ||
scratch[11] = hexdig[lo & 0xf]; | ||
|
||
fbuffer_append(out_buffer, scratch, 12); | ||
} | ||
} | ||
} | ||
|
||
pos += ch_len; | ||
} | ||
|
||
if (beg < in_utf8_len) { | ||
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg); | ||
} | ||
|
||
RB_GC_GUARD(in_string); | ||
} | ||
|
||
static const bool escape_table[256] = { | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' */ | ||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ | ||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 | ||
}; | ||
|
||
static const bool script_safe_escape_table[256] = { | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* '"' and '/' */ | ||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0, /* '\\' */ | ||
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, | ||
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1 | ||
}; | ||
|
||
static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256]) | ||
{ | ||
const char *hexdig = "0123456789abcdef"; | ||
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; | ||
|
||
const char *ptr = RSTRING_PTR(str); | ||
unsigned long len = RSTRING_LEN(str); | ||
|
||
unsigned long beg = 0, pos; | ||
|
||
for (pos = 0; pos < len;) { | ||
unsigned char ch = ptr[pos]; | ||
/* JSON encoding */ | ||
if (escape_table[ch]) { | ||
if (pos > beg) { | ||
fbuffer_append(out_buffer, &ptr[beg], pos - beg); | ||
} | ||
|
||
beg = pos + 1; | ||
switch (ch) { | ||
case '"': fbuffer_append(out_buffer, "\\\"", 2); break; | ||
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break; | ||
case '/': fbuffer_append(out_buffer, "\\/", 2); break; | ||
case '\b': fbuffer_append(out_buffer, "\\b", 2); break; | ||
case '\f': fbuffer_append(out_buffer, "\\f", 2); break; | ||
case '\n': fbuffer_append(out_buffer, "\\n", 2); break; | ||
case '\r': fbuffer_append(out_buffer, "\\r", 2); break; | ||
case '\t': fbuffer_append(out_buffer, "\\t", 2); break; | ||
default: | ||
scratch[2] = hexdig[ch >> 12]; | ||
scratch[3] = hexdig[(ch >> 8) & 0xf]; | ||
scratch[4] = hexdig[(ch >> 4) & 0xf]; | ||
scratch[5] = hexdig[ch & 0xf]; | ||
fbuffer_append(out_buffer, scratch, 6); | ||
} | ||
if (ch > 0x10FFFF) | ||
rb_raise(rb_path2class("JSON::GeneratorError"), | ||
"source sequence is illegal/malformed utf-8"); | ||
} | ||
|
||
pos++; | ||
} | ||
|
||
if (beg < len) { | ||
fbuffer_append(out_buffer, &ptr[beg], len - beg); | ||
} | ||
|
||
RB_GC_GUARD(str); | ||
} | ||
|
||
static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe) | ||
{ | ||
const char *hexdig = "0123456789abcdef"; | ||
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' }; | ||
|
||
const char *in_utf8_str = RSTRING_PTR(in_string); | ||
unsigned long in_utf8_len = RSTRING_LEN(in_string); | ||
|
||
unsigned long beg = 0, pos; | ||
|
||
for (pos = 0; pos < in_utf8_len;) { | ||
uint32_t ch; | ||
short ch_len; | ||
bool should_escape; | ||
|
||
/* UTF-8 decoding */ | ||
short i; | ||
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */ | ||
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */ | ||
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */ | ||
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */ | ||
else { | ||
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); | ||
} | ||
|
||
for (i = 1; i < ch_len; i++) { | ||
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F); | ||
} | ||
|
||
/* JSON policy */ | ||
should_escape = | ||
(ch < 0x20) || | ||
(ch == '"') || | ||
(ch == '\\') || | ||
(out_ascii_only && (ch > 0x7F)) || | ||
(ch > 0x7F) || | ||
(out_script_safe && (ch == '/')) || | ||
(out_script_safe && (ch == 0x2028)) || | ||
(out_script_safe && (ch == 0x2029)); | ||
|
||
/* JSON encoding */ | ||
if (should_escape) { | ||
if (pos > beg) | ||
if (pos > beg) { | ||
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg); | ||
} | ||
|
||
beg = pos + ch_len; | ||
switch (ch) { | ||
case '"': fbuffer_append(out_buffer, "\\\"", 2); break; | ||
|
@@ -122,8 +271,11 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ | |
|
||
pos += ch_len; | ||
} | ||
if (beg < in_utf8_len) | ||
|
||
if (beg < in_utf8_len) { | ||
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg); | ||
} | ||
|
||
RB_GC_GUARD(in_string); | ||
} | ||
|
||
|
@@ -570,11 +722,27 @@ static int enc_utf8_compatible_p(int enc_idx) | |
|
||
static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj) | ||
{ | ||
fbuffer_append_char(buffer, '"'); | ||
if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) { | ||
obj = rb_str_export_to_enc(obj, rb_utf8_encoding()); | ||
} | ||
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe); | ||
|
||
fbuffer_append_char(buffer, '"'); | ||
|
||
switch(rb_enc_str_coderange(obj)) { | ||
case ENC_CODERANGE_7BIT: | ||
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table); | ||
break; | ||
case ENC_CODERANGE_VALID: | ||
if (RB_UNLIKELY(state->ascii_only)) { | ||
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe); | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. After reflection, the split that might have the most impact is |
||
} else { | ||
convert_UTF8_to_JSON(buffer, obj, state->script_safe); | ||
} | ||
break; | ||
default: | ||
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8"); | ||
break; | ||
} | ||
fbuffer_append_char(buffer, '"'); | ||
} | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for elaborating on this change in your blog post ❤️
I was wondering if we could entirely eliminate this `switch` at the expense of static memory by storing escaped values instead of booleans in the lookup table. Just as an example, in Ruby:
It may not result in a visible improvement in realistic benchmarks, since there usually aren't that many characters to escape, but it could be visible on a micro-benchmark specifically crafted to measure escaping performance, at no cost (except static memory) for the main flow.
I could be missing something that makes it hard to store non-boolean values in `escape_table`, but I would love to at least learn about this as well. Thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It could be worth trying, but there are a couple of things that make this not an obvious win.
Escaped values in this case aren't scalar. You can either store them as a table of `char *`:
If you do this:
- `char *` is `8B` on 64-bit platforms, whereas `char` is just `1B`, so we went from `256B` to `2kiB`, which is not nothing.
- `char *` doesn't store the length, so presumably to append these you'd need to call `strlen`. They are short strings, so it's probably not very slow, but given we're just trying to eliminate a case, it may actually be slower.
- `char *` isn't stored contiguously, so that's another memory region to keep in the CPU cache.
Alternatively, you could store the escaped strings inline, e.g.:
But this would make the escape table even bigger.
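Maybe you could only do this for strings that are known to be pure-ASCII, so the table only has `128` entries, keeping the memory usage down, but then you can't really re-use it for UTF-8.
Note that all of these are just "static assumptions"; performance isn't always easy to predict, so maybe your suggestion would actually help.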
entries, keeping the memory usage down, but then you can't really re-use it for UTF-8.Note that all of these are just "static assumption", performance isn't always easy to predict, so maybe your suggestion would actually help.