Faster AST Construction #21398

Closed · wants to merge 6 commits
7 changes: 5 additions & 2 deletions lib/compiler/aro_translate_c/ast.zig
@@ -786,7 +786,8 @@ pub fn render(gpa: Allocator, nodes: []const Node) !std.zig.Ast {

try ctx.tokens.append(gpa, .{
.tag = .eof,
.start = @as(u32, @intCast(ctx.buf.items.len)),
.start = @intCast(ctx.buf.items.len),
.end = @intCast(ctx.buf.items.len),
});

return std.zig.Ast{
@@ -814,10 +815,12 @@ const Context = struct {
fn addTokenFmt(c: *Context, tag: TokenTag, comptime format: []const u8, args: anytype) Allocator.Error!TokenIndex {
const start_index = c.buf.items.len;
try c.buf.writer().print(format ++ " ", args);
const end_index = c.buf.items.len - 1;

try c.tokens.append(c.gpa, .{
.tag = tag,
.start = @as(u32, @intCast(start_index)),
.start = @intCast(start_index),
.end = @intCast(end_index),
});

return @as(u32, @intCast(c.tokens.len - 1));
95 changes: 49 additions & 46 deletions lib/std/zig/Ast.zig
@@ -19,10 +19,7 @@ errors: []const Error,
pub const TokenIndex = u32;
pub const ByteOffset = u32;

pub const TokenList = std.MultiArrayList(struct {
tag: Token.Tag,
start: ByteOffset,
});
pub const TokenList = std.MultiArrayList(Token);
Member:

Why has this changed? Storing only the start offset of the token is a very intentional design choice.

@RetroDev256 (Contributor Author), Sep 13, 2024:

I found that the tokenSlice function in Ast.zig could avoid running the tokenizer if I stored the end index. Would you like me to revert that change?

That change alone is responsible for the higher memory usage, but it also cut a few percent off the wall time.
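
For a concrete sense of the memory cost under discussion, here is a rough sketch (illustrative, not part of the PR) of the per-token payload with and without the end offset, assuming the token tag packs into one byte and offsets stay `u32`:

```zig
const std = @import("std");

// Start-only layout (status quo): the end of a token is recovered on demand
// by re-tokenizing from `start` inside tokenSlice.
const StartOnly = struct {
    tag: u8, // stands in for Token.Tag
    start: u32, // ByteOffset
}; // ~5 bytes of payload per token across the MultiArrayList field arrays

// Start-and-end layout (this PR): tokenSlice becomes two array lookups,
// at the cost of roughly 4 extra bytes per token.
const StartAndEnd = struct {
    tag: u8,
    start: u32,
    end: u32,
}; // ~9 bytes of payload per token

test "rough payload for one million tokens" {
    // MultiArrayList stores each field in its own array, so the per-token
    // cost is the sum of the field sizes, not the padded struct size.
    const n: usize = 1_000_000;
    const start_only = n * (@sizeOf(u8) + @sizeOf(u32)); // ~5 MB
    const with_end = n * (@sizeOf(u8) + 2 * @sizeOf(u32)); // ~9 MB
    try std.testing.expect(with_end - start_only == n * @sizeOf(u32));
}
```

The extra column costs 4 bytes per token of resident memory; what it buys is the simpler two-lookup tokenSlice shown later in this diff.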

Member:

Yes, please. This tradeoff was made intentionally to save memory at the potential expense of a small amount of performance. I say "potential" because tokenization is fast enough that it may be quicker to re-tokenize the one token than to take the extra cache misses from storing this data in memory. See Andrew's talk on DOD for details.

If the performance impact is more significant than expected, perhaps we can revisit this. If you'd like to quantify the performance difference this gives on ast-check, that would be appreciated.
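
One low-tech way to put a number on it, sketched below (an illustrative harness, not part of the PR or of `zig ast-check`; the file path is a placeholder), is to time `Ast.parse` on a large file such as Sema.zig and run the same program against both branches:

```zig
const std = @import("std");

pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    // Placeholder path; point it at a large Zig file such as Sema.zig.
    const raw = try std.fs.cwd().readFileAlloc(gpa, "Sema.zig", std.math.maxInt(u32));
    defer gpa.free(raw);

    // Ast.parse expects a null-terminated slice.
    const source = try gpa.dupeZ(u8, raw);
    defer gpa.free(source);

    var timer = try std.time.Timer.start();
    var tree = try std.zig.Ast.parse(gpa, source, .zig);
    defer tree.deinit(gpa);
    const elapsed_ns = timer.read();

    std.debug.print("{d} tokens, {d} nodes, parsed in {d} ns\n", .{
        tree.tokens.len, tree.nodes.len, elapsed_ns,
    });
}
```

Building this once against master and once against this branch, and averaging several runs of each, gives roughly the comparison asked for above.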

@RetroDev256 (Contributor Author), Sep 13, 2024:

Here is the benchmark on Sema.zig with the changes to Ast.zig:
[benchmark screenshot]
Here is the benchmark on Sema.zig without the changes to Ast.zig:
[benchmark screenshot]

The results honestly make me wonder if anything else I did was just pure noise.

@RetroDev256 (Contributor Author), Sep 13, 2024:

[benchmark screenshot]
The difference between these two runs is purely the `pub const TokenList = std.MultiArrayList(Token);` change (alongside the tokenSlice change), without any of the other changes in this PR.

@RetroDev256 (Contributor Author):

Based on these findings, I feel I should close this PR if we want to keep the memory savings.

pub const NodeList = std.MultiArrayList(Node);

pub const Location = struct {
@@ -60,20 +57,28 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
var tokens = Ast.TokenList{};
defer tokens.deinit(gpa);

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(gpa, estimated_token_count);

var tokenizer = std.zig.Tokenizer.init(source);
while (true) {
const token = tokenizer.next();
try tokens.append(gpa, .{
.tag = token.tag,
.start = @intCast(token.loc.start),
});
if (token.tag == .eof) break;
var tokenizer: std.zig.Tokenizer = .init(source);
outer: while (true) {
const remaining = source.len - tokenizer.index;
// As of 2024-09-13 there is a ~6.2:1 ratio of bytes to tokens
const estimated_tokens = (remaining / 6) + 1;
// guarantee `estimated_tokens` number of free slots - so we can use
// appendAssumeCapacity for each insert, speeding things up.
try tokens.ensureUnusedCapacity(gpa, estimated_tokens);
const actual_unused = tokens.capacity - tokens.len;
for (0..actual_unused) |_| {
const token = tokenizer.next();
tokens.appendAssumeCapacity(token);
if (token.tag == .eof) {
@branchHint(.unlikely);
break :outer;
}
}
}

// trim the size of `tokens` - it will not be modified further
tokens.shrinkAndFree(gpa, tokens.len);

var parser: Parse = .{
.source = source,
.gpa = gpa,
@@ -90,16 +95,26 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
defer parser.extra_data.deinit(gpa);
defer parser.scratch.deinit(gpa);

// Empirically, Zig source code has a 2:1 ratio of tokens to AST nodes.
// Make sure at least 1 so we can use appendAssumeCapacity on the root node below.
const estimated_node_count = (tokens.len + 2) / 2;
try parser.nodes.ensureTotalCapacity(gpa, estimated_node_count);
// Typically Zig source code has around a 2:1 ratio of tokens to AST nodes.
// In practice, it could be up to a 1:1 ratio of tokens to AST nodes.
// We use this information to ensure we can always use appendAssumeCapacity,
// at the cost of increased memory usage during parsing.

// Ensure we have the upper bound number of nodes, and add one for the "root" node
try parser.nodes.ensureTotalCapacity(gpa, tokens.len + 1);

switch (mode) {
.zig => try parser.parseRoot(),
.zon => try parser.parseZon(),
}

// trim size - these lists aren't modified further, we want to reduce max rss
// this also cleans up the `ensureTotalCapacity` call we made earlier, so our
// memory gobbling doesn't affect the rest of the pipeline
parser.nodes.shrinkAndFree(gpa, parser.nodes.len);
parser.extra_data.shrinkAndFree(gpa, parser.extra_data.items.len);
parser.errors.shrinkAndFree(gpa, parser.errors.items.len);

// TODO experiment with compacting the MultiArrayList slices here
return Ast{
.source = source,
@@ -129,11 +144,9 @@ pub fn renderToArrayList(tree: Ast, buffer: *std.ArrayList(u8), fixups: Fixups)

/// Returns an extra offset for column and byte offset of errors that
/// should point after the token in the error message.
pub fn errorOffset(tree: Ast, parse_error: Error) u32 {
return if (parse_error.token_is_prev)
@as(u32, @intCast(tree.tokenSlice(parse_error.token).len))
else
0;
pub inline fn errorOffset(tree: Ast, parse_error: Error) u32 {
const token_length: u32 = @intCast(tree.tokenSlice(parse_error.token).len);
return token_length * @intFromBool(parse_error.token_is_prev);
}

pub fn tokenLocation(self: Ast, start_offset: ByteOffset, token_index: TokenIndex) Location {
@@ -176,22 +189,8 @@ pub fn tokenLocation(self: Ast, start_offset: ByteOffset, token_index: TokenInde

pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
const token_starts = tree.tokens.items(.start);
const token_tags = tree.tokens.items(.tag);
const token_tag = token_tags[token_index];

// Many tokens can be determined entirely by their tag.
if (token_tag.lexeme()) |lexeme| {
return lexeme;
}

// For some tokens, re-tokenization is needed to find the end.
var tokenizer: std.zig.Tokenizer = .{
.buffer = tree.source,
.index = token_starts[token_index],
};
const token = tokenizer.next();
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
const token_ends = tree.tokens.items(.end);
return tree.source[token_starts[token_index]..token_ends[token_index]];
}

pub fn extraData(tree: Ast, index: usize, comptime T: type) T {
@@ -790,7 +789,7 @@ pub fn lastToken(tree: Ast, node: Node.Index) TokenIndex {
var n = node;
var end_offset: TokenIndex = 0;
while (true) switch (tags[n]) {
.root => return @as(TokenIndex, @intCast(tree.tokens.len - 1)),
.root => return @intCast(tree.tokens.len - 1),

.@"usingnamespace",
.bool_not,
@@ -1325,10 +1324,11 @@ pub fn tokensOnSameLine(tree: Ast, token1: TokenIndex, token2: TokenIndex) bool

pub fn getNodeSource(tree: Ast, node: Node.Index) []const u8 {
const token_starts = tree.tokens.items(.start);
const token_ends = tree.tokens.items(.end);
const first_token = tree.firstToken(node);
const last_token = tree.lastToken(node);
const start = token_starts[first_token];
const end = token_starts[last_token] + tree.tokenSlice(last_token).len;
const end = token_ends[last_token];
return tree.source[start..end];
}

@@ -3632,6 +3632,7 @@ pub fn tokenToSpan(tree: *const Ast, token: Ast.TokenIndex) Span {

pub fn tokensToSpan(tree: *const Ast, start: Ast.TokenIndex, end: Ast.TokenIndex, main: Ast.TokenIndex) Span {
const token_starts = tree.tokens.items(.start);
const token_ends = tree.tokens.items(.end);
var start_tok = start;
var end_tok = end;

@@ -3645,9 +3646,11 @@ pub fn tokensToSpan(tree: *const Ast, start: Ast.TokenIndex, end: Ast.TokenIndex
start_tok = main;
end_tok = main;
}
const start_off = token_starts[start_tok];
const end_off = token_starts[end_tok] + @as(u32, @intCast(tree.tokenSlice(end_tok).len));
return Span{ .start = start_off, .end = end_off, .main = token_starts[main] };
return Span{
.start = token_starts[start_tok],
.end = token_ends[end_tok],
.main = token_starts[main],
};
}

const std = @import("../std.zig");