Remove string concat in favor of a flat list

Right now, when you have a lot of string concats, the result is difficult to work with because of the depth of the tree: you end up descending very far for every string literal that is part of the concat. There are already times when we use an interpolated string node to group together two string segments that are part of the same string (like when they are interrupted by the contents of a heredoc). This commit takes the same approach and replaces string concats with interpolated string nodes. Now that they're a flat list, they should be much easier to work with. There's still some missing information here that would be useful to consumers: whether or not there is _actually_ any interpolation contained in the list. We could remedy this with another node type named something like "string list", or we could add a flag to the interpolated string node indicating that there is interpolation. Either way, I want to solve that in a follow-up commit, since this commit is valuable on its own.
ruby · Nov 21, 2023 · 1e7ae3a · 1e7ae3a
1 parent 060bcc8
commit 1e7ae3a
Show file tree

Hide file tree

Showing 11 changed files with 293 additions and 311 deletions.
diff --git a/config.yml b/config.yml
@@ -2370,17 +2370,6 @@ nodes:
 
           foo; bar; baz
           ^^^^^^^^^^^^^
-  - name: StringConcatNode
-    fields:
-      - name: left
-        type: node
-      - name: right
-        type: node
-    comment: |
-      Represents the use of compile-time string concatenation.
-
-          "foo" "bar"
-          ^^^^^^^^^^^
   - name: StringNode
     fields:
       - name: flags

diff --git a/src/prism.c b/src/prism.c
@@ -5127,28 +5127,6 @@ pm_statements_node_body_append(pm_statements_node_t *node, pm_node_t *statement)
     statement->flags |= PM_NODE_FLAG_NEWLINE;
 }
 
-/**
- * Allocate a new StringConcatNode node.
- */
-static pm_string_concat_node_t *
-pm_string_concat_node_create(pm_parser_t *parser, pm_node_t *left, pm_node_t *right) {
-    pm_string_concat_node_t *node = PM_ALLOC_NODE(parser, pm_string_concat_node_t);
-
-    *node = (pm_string_concat_node_t) {
-        {
-            .type = PM_STRING_CONCAT_NODE,
-            .location = {
-                .start = left->location.start,
-                .end = right->location.end
-            }
-        },
-        .left = left,
-        .right = right
-    };
-
-    return node;
-}
-
 /**
  * Allocate a new StringNode node with the current string on the parser.
  */
@@ -13470,9 +13448,10 @@ parse_strings_empty_content(const uint8_t *location) {
  * Parse a set of strings that could be concatenated together.
  */
 static inline pm_node_t *
-parse_strings(pm_parser_t *parser) {
+parse_strings(pm_parser_t *parser, pm_node_t *current) {
     assert(parser->current.type == PM_TOKEN_STRING_BEGIN);
-    pm_node_t *result = NULL;
+
+    bool concating = false;
     bool state_is_arg_labeled = lex_state_p(parser, PM_LEX_STATE_ARG | PM_LEX_STATE_LABELED);
 
     while (match1(parser, PM_TOKEN_STRING_BEGIN)) {
@@ -13608,7 +13587,7 @@ parse_strings(pm_parser_t *parser) {
             }
         }
 
-        if (result == NULL) {
+        if (current == NULL) {
             // If the node we just parsed is a symbol node, then we can't
             // concatenate it with anything else, so we can now return that
             // node.
@@ -13618,7 +13597,7 @@ parse_strings(pm_parser_t *parser) {
 
             // If we don't already have a node, then it's fine and we can just
             // set the result to be the node we just parsed.
-            result = node;
+            current = node;
         } else {
             // Otherwise we need to check the type of the node we just parsed.
             // If it cannot be concatenated with the previous node, then we'll
@@ -13627,13 +13606,22 @@ parse_strings(pm_parser_t *parser) {
                 pm_parser_err_node(parser, node, PM_ERR_STRING_CONCATENATION);
             }
 
-            // Either way we will create a concat node to hold the strings
-            // together.
-            result = (pm_node_t *) pm_string_concat_node_create(parser, result, node);
+            // If we haven't already created our container for concatenation,
+            // we'll do that now.
+            if (!concating) {
+                concating = true;
+                pm_token_t bounds = not_provided(parser);
+
+                pm_interpolated_string_node_t *container = pm_interpolated_string_node_create(parser, &bounds, NULL, &bounds);
+                pm_interpolated_string_node_append(container, current);
+                current = (pm_node_t *) container;
+            }
+
+            pm_interpolated_string_node_append((pm_interpolated_string_node_t *) current, node);
         }
     }
 
-    return result;
+    return current;
 }
 
 /**
@@ -13894,8 +13882,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
             // Characters can be followed by strings in which case they are
             // automatically concatenated.
             if (match1(parser, PM_TOKEN_STRING_BEGIN)) {
-                pm_node_t *concat = parse_strings(parser);
-                return (pm_node_t *) pm_string_concat_node_create(parser, node, concat);
+                return parse_strings(parser, node);
             }
 
             return node;
@@ -14169,8 +14156,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
             }
 
             if (match1(parser, PM_TOKEN_STRING_BEGIN)) {
-                pm_node_t *concat = parse_strings(parser);
-                return (pm_node_t *) pm_string_concat_node_create(parser, node, concat);
+                return parse_strings(parser, node);
             }
 
             return node;
@@ -15773,7 +15759,7 @@ parse_expression_prefix(pm_parser_t *parser, pm_binding_power_t binding_power) {
             return (pm_node_t *) node;
         }
         case PM_TOKEN_STRING_BEGIN:
-            return parse_strings(parser);
+            return parse_strings(parser, NULL);
         case PM_TOKEN_SYMBOL_BEGIN: {
             pm_lex_mode_t lex_mode = *parser->lex_modes.current;
             parser_lex(parser);

diff --git a/test/prism/location_test.rb b/test/prism/location_test.rb
@@ -495,6 +495,7 @@ def test_InterpolatedRegularExpressionNode
     def test_InterpolatedStringNode
       assert_location(InterpolatedStringNode, "\"foo \#@bar baz\"")
       assert_location(InterpolatedStringNode, "<<~A\nhello \#{1} world\nA", 0...4)
+      assert_location(InterpolatedStringNode, '"foo" "bar"')
     end
 
     def test_InterpolatedSymbolNode
@@ -789,10 +790,6 @@ def test_StatementsNode
       assert_location(StatementsNode, "\"\#{foo}\"", 3...6) { |node| node.parts.first.statements }
     end
 
-    def test_StringConcatNode
-      assert_location(StringConcatNode, '"foo" "bar"')
-    end
-
     def test_StringNode
       assert_location(StringNode, '"foo"')
       assert_location(StringNode, '%q[foo]')

diff --git a/test/prism/snapshots/dos_endings.txt b/test/prism/snapshots/dos_endings.txt
diff --git a/test/prism/snapshots/seattlerb/parse_line_evstr_after_break.txt b/test/prism/snapshots/seattlerb/parse_line_evstr_after_break.txt
diff --git a/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt b/test/prism/snapshots/seattlerb/str_lit_concat_bad_encodings.txt
diff --git a/test/prism/snapshots/strings.txt b/test/prism/snapshots/strings.txt
diff --git a/test/prism/snapshots/unparser/corpus/literal/literal.txt b/test/prism/snapshots/unparser/corpus/literal/literal.txt