Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add a fast path for ASCII strings #620

Merged
merged 2 commits into from
Oct 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 198 additions & 30 deletions ext/json/ext/generator/generator.c
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,13 @@ static ID i_to_s, i_to_json, i_new, i_pack, i_unpack, i_create_id, i_extend;
* Everything else (should be UTF-8) is just passed through and
* appended to the result.
*/
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *in_utf8_str = RSTRING_PTR(in_string);
unsigned long in_utf8_len = RSTRING_LEN(in_string);
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);

unsigned long beg = 0, pos;

Expand All @@ -42,46 +41,196 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_
bool should_escape;

/* UTF-8 decoding */
if (in_is_ascii_only) {
ch = in_utf8_str[pos];
ch_len = 1;
} else {
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
if ((pos+ch_len) > in_utf8_len)
rb_raise(rb_path2class("JSON::GeneratorError"),
"partial character in source, but hit end");
for (i = 1; i < ch_len; i++) {
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else {
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}

for (i = 1; i < ch_len; i++) {
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}

/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(out_script_safe && (ch == '/')) ||
(out_script_safe && (ch == 0x2028)) ||
(out_script_safe && (ch == 0x2029));

/* JSON encoding */
if (should_escape) {
if (pos > beg) {
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
}

beg = pos + ch_len;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
if (ch <= 0xFFFF) {
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
} else {
uint16_t hi, lo;
ch -= 0x10000;
hi = 0xD800 + (uint16_t)(ch >> 10);
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);

scratch[2] = hexdig[hi >> 12];
scratch[3] = hexdig[(hi >> 8) & 0xf];
scratch[4] = hexdig[(hi >> 4) & 0xf];
scratch[5] = hexdig[hi & 0xf];

scratch[8] = hexdig[lo >> 12];
scratch[9] = hexdig[(lo >> 8) & 0xf];
scratch[10] = hexdig[(lo >> 4) & 0xf];
scratch[11] = hexdig[lo & 0xf];

fbuffer_append(out_buffer, scratch, 12);
}
}
}

pos += ch_len;
}

if (beg < in_utf8_len) {
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
}

RB_GC_GUARD(in_string);
}

/*
 * Byte-indexed escape lookup used when script-safe mode is off.
 * A non-zero entry marks a byte that must be emitted as an escape sequence
 * instead of being copied through: control bytes 0x00-0x1F, '"' (0x22),
 * '\\' (0x5C) and every byte with the high bit set (0x80-0xFF).
 * One row per 16 byte values; the leading comment is the row's first index.
 */
static const bool escape_table[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x20 */ 0,0,1,0,0,0,0,0, 0,0,0,0,0,0,0,0, /* '"' */
    /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0, /* '\\' */
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xA0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xB0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xC0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xE0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xF0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1
};

/*
 * Byte-indexed escape lookup used when script-safe mode is on.
 * Flags the same bytes as the default policy -- control bytes 0x00-0x1F,
 * '"' (0x22), '\\' (0x5C) and 0x80-0xFF -- and additionally '/' (0x2F).
 * One row per 16 byte values; the leading comment is the row's first index.
 */
static const bool script_safe_escape_table[256] = {
    /* 0x00 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x10 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x20 */ 0,0,1,0,0,0,0,0, 0,0,0,0,0,0,0,1, /* '"' and '/' */
    /* 0x30 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x40 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x50 */ 0,0,0,0,0,0,0,0, 0,0,0,0,1,0,0,0, /* '\\' */
    /* 0x60 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x70 */ 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
    /* 0x80 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0x90 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xA0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xB0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xC0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xD0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xE0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,
    /* 0xF0 */ 1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1
};

static void convert_ASCII_to_JSON(FBuffer *out_buffer, VALUE str, const bool escape_table[256])
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *ptr = RSTRING_PTR(str);
unsigned long len = RSTRING_LEN(str);

unsigned long beg = 0, pos;

for (pos = 0; pos < len;) {
unsigned char ch = ptr[pos];
/* JSON encoding */
if (escape_table[ch]) {
if (pos > beg) {
fbuffer_append(out_buffer, &ptr[beg], pos - beg);
}

beg = pos + 1;
switch (ch) {

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for elaborating on this change in your blog post ❤️

I was wondering if we could entirely eliminate this switch at an expense of static memory by storing escaped values instead of booleans in the lookup table. Just as an example, in ruby:

JSON_ESCAPE_TABLE['"'] = "\\\""
JSON_ESCAPE_TABLE['\\'] = "\\\\"
JSON_ESCAPE_TABLE['\b'] = "\\b"
# ...
# default: case can be pre-calculated as well for other characters

It may not result in a visible improvement in realistic benchmarks due to having not that many chars to escape but could be visible on a micro-benchmark specifically crafted to measure escaping performance at no cost (except static memory) for the main flow.

I could be missing something that makes it hard to store non-boolean values in escape_table but would love to at least learn about this as well. Thanks!

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It could be worth trying, but there are a couple of things that make this not an obvious win.

Escaped values in this case aren't scalar. You can either store them as a table of char *:

static const char *escape_table[256] = {
  0, 0, 0, "\\n", // ....
};

If you do this:

  • char * is 8B on a 64-bit platform, whereas char is just 1B, so we'd go from 256B to 2KiB, which is not nothing.
  • char * doesn't store the length, so presumably to append these you'd need to call strlen. They are short strings, so that's probably not very slow, but given we're just trying to eliminate a case, it may actually be slower.
  • The strings behind a char * aren't stored contiguously with the table, so that's another memory region to keep in the CPU cache.

Alternatively, you could store the escaped strings inline, e.g.:

struct escaped_char {
  unsigned char len;
  char bytes[8];
};

static const struct escaped_char escape_table[256] = {
  {0, 0}, {0, 0}, {2, "\\n"}, // ....
};

But this would make the escape table even bigger.

Maybe you could only do this for strings that are known to be pure-ASCII so the table only has 128 entries, keeping the memory usage down, but then you can't really re-use it for UTF-8.

Note that all of these are just "static assumptions"; performance isn't always easy to predict, so maybe your suggestion would actually help.

case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
default:
scratch[2] = hexdig[ch >> 12];
scratch[3] = hexdig[(ch >> 8) & 0xf];
scratch[4] = hexdig[(ch >> 4) & 0xf];
scratch[5] = hexdig[ch & 0xf];
fbuffer_append(out_buffer, scratch, 6);
}
if (ch > 0x10FFFF)
rb_raise(rb_path2class("JSON::GeneratorError"),
"source sequence is illegal/malformed utf-8");
}

pos++;
}

if (beg < len) {
fbuffer_append(out_buffer, &ptr[beg], len - beg);
}

RB_GC_GUARD(str);
}

static void convert_UTF8_to_ASCII_only_JSON(FBuffer *out_buffer, VALUE in_string, bool out_script_safe)
{
const char *hexdig = "0123456789abcdef";
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };

const char *in_utf8_str = RSTRING_PTR(in_string);
unsigned long in_utf8_len = RSTRING_LEN(in_string);

unsigned long beg = 0, pos;

for (pos = 0; pos < in_utf8_len;) {
uint32_t ch;
short ch_len;
bool should_escape;

/* UTF-8 decoding */
short i;
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
else {
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
}

for (i = 1; i < ch_len; i++) {
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
}

/* JSON policy */
should_escape =
(ch < 0x20) ||
(ch == '"') ||
(ch == '\\') ||
(out_ascii_only && (ch > 0x7F)) ||
(ch > 0x7F) ||
(out_script_safe && (ch == '/')) ||
(out_script_safe && (ch == 0x2028)) ||
(out_script_safe && (ch == 0x2029));

/* JSON encoding */
if (should_escape) {
if (pos > beg)
if (pos > beg) {
fbuffer_append(out_buffer, &in_utf8_str[beg], pos - beg);
}

beg = pos + ch_len;
switch (ch) {
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
Expand Down Expand Up @@ -122,8 +271,11 @@ static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_

pos += ch_len;
}
if (beg < in_utf8_len)

if (beg < in_utf8_len) {
fbuffer_append(out_buffer, &in_utf8_str[beg], in_utf8_len - beg);
}

RB_GC_GUARD(in_string);
}

Expand Down Expand Up @@ -570,11 +722,27 @@ static int enc_utf8_compatible_p(int enc_idx)

static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_State *state, VALUE obj)
{
fbuffer_append_char(buffer, '"');
if (!enc_utf8_compatible_p(RB_ENCODING_GET(obj))) {
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
}
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);

fbuffer_append_char(buffer, '"');

switch(rb_enc_str_coderange(obj)) {
case ENC_CODERANGE_7BIT:
convert_ASCII_to_JSON(buffer, obj, state->script_safe ? script_safe_escape_table : escape_table);
break;
case ENC_CODERANGE_VALID:
if (RB_UNLIKELY(state->ascii_only)) {
convert_UTF8_to_ASCII_only_JSON(buffer, obj, state->script_safe);
Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

After reflection, the split that might have the most impact is script_safe, as it would also speed up the ASCII path.

} else {
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
}
break;
default:
rb_raise(rb_path2class("JSON::GeneratorError"), "source sequence is illegal/malformed utf-8");
break;
}
fbuffer_append_char(buffer, '"');
}

Expand Down
1 change: 0 additions & 1 deletion ext/json/ext/generator/generator.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ typedef unsigned char _Bool;
#endif
#endif

static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
static char *fstrndup(const char *ptr, unsigned long len);

/* ruby api and some helpers */
Expand Down
Loading