diff --git a/include/yarp/util/yp_newline_list.h b/include/yarp/util/yp_newline_list.h index 095acd5168e..b7c8c1f3aac 100644 --- a/include/yarp/util/yp_newline_list.h +++ b/include/yarp/util/yp_newline_list.h @@ -47,6 +47,9 @@ bool yp_newline_list_init(yp_newline_list_t *list, const char *start, size_t cap // the offsets succeeds (if one was necessary), otherwise returns false. bool yp_newline_list_append(yp_newline_list_t *list, const char *cursor); +// Conditionally append a new offset to the newline list: if the byte at cursor is a newline, append it via yp_newline_list_append and return that call's result; otherwise leave the list unchanged and return true. +bool yp_newline_list_check_append(yp_newline_list_t *list, const char *cursor); + // Returns the line and column of the given offset. If the offset is not in the // list, the line and column of the closest offset less than the given offset // are returned. diff --git a/src/util/yp_newline_list.c b/src/util/yp_newline_list.c index 8b24f82a024..ad9d99d0ab8 100644 --- a/src/util/yp_newline_list.c +++ b/src/util/yp_newline_list.c @@ -38,6 +38,15 @@ yp_newline_list_append(yp_newline_list_t *list, const char *cursor) { return true; } +// Conditionally append a new offset to the newline list: if the byte at cursor is a newline, append it via yp_newline_list_append and return that call's result; otherwise leave the list unchanged and return true. +bool +yp_newline_list_check_append(yp_newline_list_t *list, const char *cursor) { + if (*cursor != '\n') { + return true; + } + return yp_newline_list_append(list, cursor); +} + // Returns the line and column of the given offset, assuming we don't have any // information about the previous index that we found. 
static yp_line_column_t diff --git a/src/yarp.c b/src/yarp.c index 1f30672d519..8f93611d6bd 100644 --- a/src/yarp.c +++ b/src/yarp.c @@ -6183,9 +6183,7 @@ parser_lex(yp_parser_t *parser) { parser->current.end++; } - if (*parser->current.end == '\n') { - yp_newline_list_append(&parser->newline_list, parser->current.end); - } + yp_newline_list_check_append(&parser->newline_list, parser->current.end); parser->current.end++; LEX(YP_TOKEN_STRING_BEGIN); @@ -6215,6 +6213,7 @@ parser_lex(yp_parser_t *parser) { if (parser->current.end < parser->end) { lex_mode_push_regexp(parser, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end)); + yp_newline_list_check_append(&parser->newline_list, parser->current.end); parser->current.end++; } @@ -6225,6 +6224,7 @@ parser_lex(yp_parser_t *parser) { if (parser->current.end < parser->end) { lex_mode_push_string(parser, false, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end)); + yp_newline_list_check_append(&parser->newline_list, parser->current.end); parser->current.end++; } @@ -6235,6 +6235,7 @@ parser_lex(yp_parser_t *parser) { if (parser->current.end < parser->end) { lex_mode_push_string(parser, true, false, lex_mode_incrementor(*parser->current.end), lex_mode_terminator(*parser->current.end)); + yp_newline_list_check_append(&parser->newline_list, parser->current.end); parser->current.end++; } @@ -6462,9 +6463,7 @@ parser_lex(yp_parser_t *parser) { // If the result is an escaped newline, then we need to // track that newline. 
- if (breakpoint[difference - 1] == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); continue; @@ -6526,7 +6525,13 @@ parser_lex(yp_parser_t *parser) { // If we've hit a newline, then we need to track that in the // list of newlines. if (*breakpoint == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint); + // For the special case of a newline-terminated regular expression, we will pass + // through this branch twice -- once with YP_TOKEN_REGEXP_BEGIN and then again + // with YP_TOKEN_STRING_CONTENT. Let's avoid tracking the newline twice, by + // tracking it only in the REGEXP_BEGIN case. + if (!(lex_mode->as.regexp.terminator == '\n' && parser->current.type != YP_TOKEN_REGEXP_BEGIN)) { + yp_newline_list_append(&parser->newline_list, breakpoint); + } if (lex_mode->as.regexp.terminator != '\n') { // If the terminator is not a newline, then we can set @@ -6571,9 +6576,7 @@ parser_lex(yp_parser_t *parser) { // If the result is an escaped newline, then we need to // track that newline. 
- if (breakpoint[difference - 1] == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); continue; @@ -6664,9 +6667,7 @@ parser_lex(yp_parser_t *parser) { parser->current.end = breakpoint + 2; yp_newline_list_append(&parser->newline_list, breakpoint + 1); } else { - if (*parser->current.end == '\n') { - yp_newline_list_append(&parser->newline_list, parser->current.end); - } + yp_newline_list_check_append(&parser->newline_list, parser->current.end); parser->current.end = breakpoint + 1; } @@ -6716,9 +6717,7 @@ parser_lex(yp_parser_t *parser) { // If the result is an escaped newline, then we need to // track that newline. - if (breakpoint[difference - 1] == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); break; @@ -6889,9 +6888,7 @@ parser_lex(yp_parser_t *parser) { yp_unescape_type_t unescape_type = (quote == YP_HEREDOC_QUOTE_SINGLE) ? 
YP_UNESCAPE_MINIMAL : YP_UNESCAPE_ALL; size_t difference = yp_unescape_calculate_difference(breakpoint, parser->end, unescape_type, false, &parser->error_list); - if (breakpoint[difference - 1] == '\n') { - yp_newline_list_append(&parser->newline_list, breakpoint + difference - 1); - } + yp_newline_list_check_append(&parser->newline_list, breakpoint + difference - 1); breakpoint = yp_strpbrk(parser, breakpoint + difference, breakpoints, parser->end - (breakpoint + difference)); } diff --git a/test/fixtures/newline_terminated.txt b/test/fixtures/newline_terminated.txt new file mode 100644 index 00000000000..3faf45ab4ad --- /dev/null +++ b/test/fixtures/newline_terminated.txt @@ -0,0 +1,13 @@ +# note that %i, %I, %w, and %W do not support newline termination in CRuby + +% +foo + +%q +foo + +%Q +foo + +%r +foo diff --git a/test/parse_test.rb b/test/parse_test.rb index ed0567d0194..c0f3ecf551e 100644 --- a/test/parse_test.rb +++ b/test/parse_test.rb @@ -131,6 +131,8 @@ def test_parse_takes_file_path end Dir["*.txt", base: base].each do |relative| + next if relative == "newline_terminated.txt" + # We test every snippet (separated by \n\n) in isolation # to ensure the parser does not try to read bytes further than the end of each snippet define_method "test_individual_snippets_#{relative}" do diff --git a/test/snapshots/newline_terminated.txt b/test/snapshots/newline_terminated.txt new file mode 100644 index 00000000000..946eb44a369 --- /dev/null +++ b/test/snapshots/newline_terminated.txt @@ -0,0 +1,15 @@ +ProgramNode(76...106)( + [], + StatementsNode(76...106)( + [StringNode(76...82)((76...78), (78...81), (81...82), "foo"), + StringNode(83...90)((83...86), (86...89), (89...90), "foo"), + StringNode(91...98)((91...94), (94...97), (97...98), "foo"), + RegularExpressionNode(99...106)( + (99...102), + (102...105), + (105...106), + "foo", + 0 + )] + ) +)