diff --git a/src/prism.c b/src/prism.c index a68577b4dce..69f896cbb6b 100644 --- a/src/prism.c +++ b/src/prism.c @@ -5475,46 +5475,40 @@ pm_super_node_create(pm_parser_t *parser, const pm_token_t *keyword, pm_argument * Read through the contents of a string and check if it consists solely of US ASCII code points. */ static bool -ascii_only_p( const pm_string_t *contents) { - const size_t length = contents->length; +pm_ascii_only_p(const pm_string_t *contents) { + const size_t length = pm_string_length(contents); + const uint8_t *source = pm_string_source(contents); - for (size_t i = 0; i < length; i++) { - if (contents->source[i] & 0x80) { - return false; - } + for (size_t index = 0; index < length; index++) { + if (source[index] & 0x80) return false; } return true; } /** - * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated encoding is ASCII-compatible and - * the Symbol consists only of US-ASCII code points. Otherwise, the encoding may be explicitly set with an - * escape sequence. + * Ruby "downgrades" the encoding of Symbols to US-ASCII if the associated + * encoding is ASCII-compatible and the Symbol consists only of US-ASCII code + * points. Otherwise, the encoding may be explicitly set with an escape + * sequence. */ static inline pm_node_flags_t parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { - // Don't set any flags on the Symbol if it hasn't been populated yet. - if (contents->source == NULL) { - return 0; - } - - // Ruby stipulates that all source files must use an ASCII-compatible encoding. Thus, all symbols appearing in - // source are eligible for "downgrading" to US-ASCII. - if (ascii_only_p(contents)) { - return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING; - } else { - // A Symbol may optionally have its encoding explicitly set. - // - // NB: an explicitly set encoding is ignored by Ruby if the Symbol consists of only US ASCII code points. 
- if (parser->explicit_encoding != NULL) { - if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { - return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING; - } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { - return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING; - } + if (parser->explicit_encoding != NULL) { + // A Symbol may optionally have its encoding explicitly set. This will + // happen if an escape sequence results in a non-ASCII code point. + if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + return PM_SYMBOL_FLAGS_FORCED_UTF8_ENCODING; + } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + return PM_SYMBOL_FLAGS_FORCED_BINARY_ENCODING; } + } else if (pm_ascii_only_p(contents)) { + // Ruby stipulates that all source files must use an ASCII-compatible + // encoding. Thus, all symbols appearing in source are eligible for + // "downgrading" to US-ASCII. + return PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING; } + return 0; } @@ -5523,13 +5517,13 @@ parse_symbol_encoding(const pm_parser_t *parser, const pm_string_t *contents) { * string. */ static pm_symbol_node_t * -pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped) { +pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing, const pm_string_t *unescaped, pm_node_flags_t flags) { pm_symbol_node_t *node = PM_ALLOC_NODE(parser, pm_symbol_node_t); *node = (pm_symbol_node_t) { { .type = PM_SYMBOL_NODE, - .flags = PM_NODE_FLAG_STATIC_LITERAL, + .flags = PM_NODE_FLAG_STATIC_LITERAL | flags, .location = { .start = (opening->type == PM_TOKEN_NOT_PROVIDED ? value->start : opening->start), .end = (closing->type == PM_TOKEN_NOT_PROVIDED ? 
value->end : closing->end) @@ -5541,8 +5535,6 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, .unescaped = *unescaped }; - pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, unescaped)); - return node; } @@ -5551,7 +5543,7 @@ pm_symbol_node_create_unescaped(pm_parser_t *parser, const pm_token_t *opening, */ static inline pm_symbol_node_t * pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { - return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY); + return pm_symbol_node_create_unescaped(parser, opening, value, closing, &PM_STRING_EMPTY, 0); } /** @@ -5559,7 +5551,7 @@ pm_symbol_node_create(pm_parser_t *parser, const pm_token_t *opening, const pm_t */ static pm_symbol_node_t * pm_symbol_node_create_current_string(pm_parser_t *parser, const pm_token_t *opening, const pm_token_t *value, const pm_token_t *closing) { - pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string); + pm_symbol_node_t *node = pm_symbol_node_create_unescaped(parser, opening, value, closing, &parser->current_string, parse_symbol_encoding(parser, &parser->current_string)); parser->current_string = PM_STRING_EMPTY; return node; } @@ -5581,7 +5573,8 @@ pm_symbol_node_label_create(pm_parser_t *parser, const pm_token_t *token) { assert((label.end - label.start) >= 0); pm_string_shared_init(&node->unescaped, label.start, label.end); - pm_node_flag_set((pm_node_t *)node, parse_symbol_encoding(parser, &node->unescaped)); + pm_node_flag_set((pm_node_t *) node, parse_symbol_encoding(parser, &node->unescaped)); + break; } case PM_TOKEN_MISSING: { @@ -12644,12 +12637,15 @@ PM_STATIC_ASSERT(__LINE__, ((int) PM_STRING_FLAGS_FORCED_UTF8_ENCODING) == ((int static inline pm_node_flags_t parse_unescaped_encoding(const pm_parser_t *parser) { if (parser->explicit_encoding != NULL) { - // If the there's 
an explicit encoding and it's using a UTF-8 escape sequence, then mark the string as UTF-8. if (parser->explicit_encoding == PM_ENCODING_UTF_8_ENTRY) { + // If there's an explicit encoding and it's using a UTF-8 escape + // sequence, then mark the string as UTF-8. return PM_STRING_FLAGS_FORCED_UTF8_ENCODING; - // If there's a non-UTF-8 escape sequence being used, then the string uses the source encoding, unless the source - // is marked as US-ASCII. In that case the string is forced as ASCII-8BIT in order to keep the string valid. } else if (parser->encoding == PM_ENCODING_US_ASCII_ENTRY) { + // If there's a non-UTF-8 escape sequence being used, then the + // string uses the source encoding, unless the source is marked as + // US-ASCII. In that case the string is forced as ASCII-8BIT in + // order to keep the string valid. return PM_STRING_FLAGS_FORCED_BINARY_ENCODING; } } @@ -12801,7 +12797,8 @@ parse_operator_symbol(pm_parser_t *parser, const pm_token_t *opening, pm_lex_sta parser_lex(parser); pm_string_shared_init(&symbol->unescaped, parser->previous.start, end); - pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped)); + pm_node_flag_set((pm_node_t *) symbol, PM_SYMBOL_FLAGS_FORCED_US_ASCII_ENCODING); + return (pm_node_t *) symbol; } @@ -12840,7 +12837,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); - pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped)); + pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped)); return (pm_node_t *) symbol; } @@ -12927,7 +12924,6 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s } else { content = (pm_token_t) { .type = PM_TOKEN_STRING_CONTENT, .start = 
parser->previous.end, .end = parser->previous.end }; pm_string_shared_init(&unescaped, content.start, content.end); - } if (next_state != PM_LEX_STATE_NONE) { @@ -12940,10 +12936,7 @@ parse_symbol(pm_parser_t *parser, pm_lex_mode_t *lex_mode, pm_lex_state_t next_s expect1(parser, PM_TOKEN_STRING_END, PM_ERR_SYMBOL_TERM_DYNAMIC); } - pm_symbol_node_t *symbol_node = pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); - pm_node_flag_set((pm_node_t *)symbol_node, parse_symbol_encoding(parser, &symbol_node->unescaped)); - - return (pm_node_t *) symbol_node; + return (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped)); } /** @@ -12968,6 +12961,8 @@ parse_undef_argument(pm_parser_t *parser) { pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); + pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped)); + return (pm_node_t *) symbol; } case PM_TOKEN_SYMBOL_BEGIN: { @@ -13007,7 +13002,8 @@ parse_alias_argument(pm_parser_t *parser, bool first) { pm_symbol_node_t *symbol = pm_symbol_node_create(parser, &opening, &parser->previous, &closing); pm_string_shared_init(&symbol->unescaped, parser->previous.start, parser->previous.end); - pm_node_flag_set((pm_node_t *)symbol, parse_symbol_encoding(parser, &symbol->unescaped)); + pm_node_flag_set((pm_node_t *) symbol, parse_symbol_encoding(parser, &symbol->unescaped)); + return (pm_node_t *) symbol; } case PM_TOKEN_SYMBOL_BEGIN: { @@ -14066,7 +14062,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) { expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM); node = (pm_node_t *) pm_interpolated_string_node_create(parser, &opening, &parts, &parser->previous); } else if (accept1(parser, PM_TOKEN_LABEL_END) && 
!state_is_arg_labeled) { - node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped)); } else if (match1(parser, PM_TOKEN_EOF)) { pm_parser_err_token(parser, &opening, PM_ERR_STRING_LITERAL_TERM); node = (pm_node_t *) pm_string_node_create_unescaped(parser, &opening, &content, &parser->current, &unescaped); @@ -14088,7 +14084,7 @@ parse_strings(pm_parser_t *parser, pm_node_t *current) { pm_node_flag_set(node, parse_unescaped_encoding(parser)); expect1(parser, PM_TOKEN_STRING_END, PM_ERR_STRING_LITERAL_TERM); } else if (accept1(parser, PM_TOKEN_LABEL_END)) { - node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped); + node = (pm_node_t *) pm_symbol_node_create_unescaped(parser, &opening, &content, &parser->previous, &unescaped, parse_symbol_encoding(parser, &unescaped)); } else { // If we get here, then we have interpolation so we'll need // to create a string or symbol node with interpolation. 
diff --git a/test/prism/snapshots/undef.txt b/test/prism/snapshots/undef.txt index fb62b9acb52..3430ef50599 100644 --- a/test/prism/snapshots/undef.txt +++ b/test/prism/snapshots/undef.txt @@ -6,7 +6,7 @@ ├── @ UndefNode (location: (1,0)-(1,7)) │ ├── names: (length: 1) │ │ └── @ SymbolNode (location: (1,6)-(1,7)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: ∅ │ │ ├── value_loc: (1,6)-(1,7) = "a" │ │ ├── closing_loc: ∅ @@ -15,13 +15,13 @@ ├── @ UndefNode (location: (3,0)-(3,10)) │ ├── names: (length: 2) │ │ ├── @ SymbolNode (location: (3,6)-(3,7)) - │ │ │ ├── flags: ∅ + │ │ │ ├── flags: forced_us_ascii_encoding │ │ │ ├── opening_loc: ∅ │ │ │ ├── value_loc: (3,6)-(3,7) = "a" │ │ │ ├── closing_loc: ∅ │ │ │ └── unescaped: "a" │ │ └── @ SymbolNode (location: (3,9)-(3,10)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: ∅ │ │ ├── value_loc: (3,9)-(3,10) = "b" │ │ ├── closing_loc: ∅ @@ -30,7 +30,7 @@ ├── @ UndefNode (location: (5,0)-(5,8)) │ ├── names: (length: 1) │ │ └── @ SymbolNode (location: (5,6)-(5,8)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: ∅ │ │ ├── value_loc: (5,6)-(5,8) = "if" │ │ ├── closing_loc: ∅ @@ -108,7 +108,7 @@ └── @ UndefNode (location: (17,0)-(17,14)) ├── names: (length: 1) │ └── @ SymbolNode (location: (17,6)-(17,14)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: ∅ │ ├── value_loc: (17,6)-(17,14) = "Constant" │ ├── closing_loc: ∅ diff --git a/test/prism/snapshots/unparser/corpus/semantic/undef.txt b/test/prism/snapshots/unparser/corpus/semantic/undef.txt index 1578260a64f..ecb073148d7 100644 --- a/test/prism/snapshots/unparser/corpus/semantic/undef.txt +++ b/test/prism/snapshots/unparser/corpus/semantic/undef.txt @@ -6,7 +6,7 @@ ├── @ UndefNode (location: (1,0)-(1,9)) │ ├── names: (length: 1) │ │ └── @ SymbolNode (location: (1,6)-(1,9)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── 
opening_loc: ∅ │ │ ├── value_loc: (1,6)-(1,9) = "foo" │ │ ├── closing_loc: ∅ @@ -15,13 +15,13 @@ └── @ UndefNode (location: (2,0)-(2,14)) ├── names: (length: 2) │ ├── @ SymbolNode (location: (2,6)-(2,9)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: ∅ │ │ ├── value_loc: (2,6)-(2,9) = "foo" │ │ ├── closing_loc: ∅ │ │ └── unescaped: "foo" │ └── @ SymbolNode (location: (2,11)-(2,14)) - │ ├── flags: ∅ + │ ├── flags: forced_us_ascii_encoding │ ├── opening_loc: ∅ │ ├── value_loc: (2,11)-(2,14) = "bar" │ ├── closing_loc: ∅ diff --git a/test/prism/snapshots/whitequark/undef.txt b/test/prism/snapshots/whitequark/undef.txt index 0f735cc7631..9889a5a5257 100644 --- a/test/prism/snapshots/whitequark/undef.txt +++ b/test/prism/snapshots/whitequark/undef.txt @@ -6,7 +6,7 @@ └── @ UndefNode (location: (1,0)-(1,27)) ├── names: (length: 3) │ ├── @ SymbolNode (location: (1,6)-(1,9)) - │ │ ├── flags: ∅ + │ │ ├── flags: forced_us_ascii_encoding │ │ ├── opening_loc: ∅ │ │ ├── value_loc: (1,6)-(1,9) = "foo" │ │ ├── closing_loc: ∅