Faster AST Construction #21398

Closed · wants to merge 6 commits
7 changes: 5 additions & 2 deletions lib/compiler/aro_translate_c/ast.zig
@@ -786,7 +786,8 @@ pub fn render(gpa: Allocator, nodes: []const Node) !std.zig.Ast {

try ctx.tokens.append(gpa, .{
.tag = .eof,
.start = @as(u32, @intCast(ctx.buf.items.len)),
.start = @intCast(ctx.buf.items.len),
.end = @intCast(ctx.buf.items.len),
});

return std.zig.Ast{
@@ -814,10 +815,12 @@ const Context = struct {
fn addTokenFmt(c: *Context, tag: TokenTag, comptime format: []const u8, args: anytype) Allocator.Error!TokenIndex {
const start_index = c.buf.items.len;
try c.buf.writer().print(format ++ " ", args);
const end_index = c.buf.items.len - 1;

try c.tokens.append(c.gpa, .{
.tag = tag,
.start = @as(u32, @intCast(start_index)),
.start = @intCast(start_index),
.end = @intCast(end_index),
});

return @as(u32, @intCast(c.tokens.len - 1));
95 changes: 49 additions & 46 deletions lib/std/zig/Ast.zig
@@ -19,10 +19,7 @@ errors: []const Error,
pub const TokenIndex = u32;
pub const ByteOffset = u32;

pub const TokenList = std.MultiArrayList(struct {
tag: Token.Tag,
start: ByteOffset,
});
pub const TokenList = std.MultiArrayList(Token);
Member:

Why has this changed? Storing only the start offset of the token is a very intentional design choice.

@RetroDev256 (Contributor Author), Sep 13, 2024:

I found that the tokenSlice function in Ast.zig could avoid running the tokenizer if I stored the end index. Would you like me to revert that change?

That change alone is responsible for the higher memory usage, but it also cut a few percent off the wall time.
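
For a concrete sense of the memory cost under discussion, here is a rough sketch (illustrative, not part of the PR) of the per-token payload with and without the end offset, assuming the token tag packs into one byte and offsets stay `u32`:

```zig
const std = @import("std");

// Start-only layout (status quo): the end of a token is recovered on demand
// by re-tokenizing from `start` inside tokenSlice.
const StartOnly = struct {
    tag: u8, // stands in for Token.Tag
    start: u32, // ByteOffset
}; // ~5 bytes of payload per token across the MultiArrayList field arrays

// Start-and-end layout (this PR): tokenSlice becomes two array lookups,
// at the cost of roughly 4 extra bytes per token.
const StartAndEnd = struct {
    tag: u8,
    start: u32,
    end: u32,
}; // ~9 bytes of payload per token

test "rough payload for one million tokens" {
    // MultiArrayList stores each field in its own array, so the per-token
    // cost is the sum of the field sizes, not the padded struct size.
    const n: usize = 1_000_000;
    const start_only = n * (@sizeOf(u8) + @sizeOf(u32)); // ~5 MB
    const with_end = n * (@sizeOf(u8) + 2 * @sizeOf(u32)); // ~9 MB
    try std.testing.expect(with_end - start_only == n * @sizeOf(u32));
}
```

The extra column costs 4 bytes per token of resident memory; what it buys is the simpler two-lookup tokenSlice shown later in this diff.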

Member:

Yes, please. This tradeoff was made intentionally to save memory at the potential expense of a small amount of performance. I say "potential" because tokenization is fast enough that it may be quicker to re-tokenize the one token than to take the extra cache misses from storing this data in memory. See Andrew's talk on DOD for details.

If the performance impact is more significant than expected, perhaps we can revisit this. If you'd like to quantify the performance difference this gives on ast-check, that would be appreciated.
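
One low-tech way to put a number on it, sketched below (an illustrative harness, not part of the PR or of `zig ast-check`; the file path is a placeholder), is to time `Ast.parse` on a large file such as Sema.zig and run the same program against both branches:

```zig
const std = @import("std");

pub fn main() !void {
    var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
    defer _ = gpa_state.deinit();
    const gpa = gpa_state.allocator();

    // Placeholder path; point it at a large Zig file such as Sema.zig.
    const raw = try std.fs.cwd().readFileAlloc(gpa, "Sema.zig", std.math.maxInt(u32));
    defer gpa.free(raw);

    // Ast.parse expects a null-terminated slice.
    const source = try gpa.dupeZ(u8, raw);
    defer gpa.free(source);

    var timer = try std.time.Timer.start();
    var tree = try std.zig.Ast.parse(gpa, source, .zig);
    defer tree.deinit(gpa);
    const elapsed_ns = timer.read();

    std.debug.print("{d} tokens, {d} nodes, parsed in {d} ns\n", .{
        tree.tokens.len, tree.nodes.len, elapsed_ns,
    });
}
```

Building this once against master and once against this branch, and averaging several runs of each, gives roughly the comparison asked for above.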

@RetroDev256 (Contributor Author), Sep 13, 2024:

Here is the benchmark on Sema.zig with the changes to Ast.zig:
[benchmark screenshot]
Here is the benchmark on Sema.zig without the changes to Ast.zig:
[benchmark screenshot]

The results honestly make me wonder if anything else I did was just pure noise.

@RetroDev256 (Contributor Author), Sep 13, 2024:

[benchmark screenshot]
The difference between these two runs is purely the `pub const TokenList = std.MultiArrayList(Token);` change (alongside the tokenSlice change), without any of the other changes in this PR.

@RetroDev256 (Contributor Author):

Based on these findings, I feel I should close this PR if we want to keep the memory savings.

pub const NodeList = std.MultiArrayList(Node);

pub const Location = struct {
@@ -60,20 +57,28 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
var tokens = Ast.TokenList{};
defer tokens.deinit(gpa);

// Empirically, the zig std lib has an 8:1 ratio of source bytes to token count.
const estimated_token_count = source.len / 8;
try tokens.ensureTotalCapacity(gpa, estimated_token_count);

var tokenizer = std.zig.Tokenizer.init(source);
while (true) {
const token = tokenizer.next();
try tokens.append(gpa, .{
.tag = token.tag,
.start = @intCast(token.loc.start),
});
if (token.tag == .eof) break;
var tokenizer: std.zig.Tokenizer = .init(source);
outer: while (true) {
const remaining = source.len - tokenizer.index;
// As of 2024-09-13 there is a ~6.2:1 ratio of bytes to tokens
const estimated_tokens = (remaining / 6) + 1;
// guarantee `estimated_tokens` number of free slots - so we can use
// appendAssumeCapacity for each insert, speeding things up.
try tokens.ensureUnusedCapacity(gpa, estimated_tokens);
const actual_unused = tokens.capacity - tokens.len;
for (0..actual_unused) |_| {
const token = tokenizer.next();
tokens.appendAssumeCapacity(token);
if (token.tag == .eof) {
@branchHint(.unlikely);
break :outer;
}
}
}

// trim the size of `tokens` - it will not be modified further
tokens.shrinkAndFree(gpa, tokens.len);

var parser: Parse = .{
.source = source,
.gpa = gpa,
@@ -90,16 +95,26 @@ pub fn parse(gpa: Allocator, source: [:0]const u8, mode: Mode) Allocator.Error!A
defer parser.extra_data.deinit(gpa);
defer parser.scratch.deinit(gpa);

// Empirically, Zig source code has a 2:1 ratio of tokens to AST nodes.
// Make sure at least 1 so we can use appendAssumeCapacity on the root node below.
const estimated_node_count = (tokens.len + 2) / 2;
try parser.nodes.ensureTotalCapacity(gpa, estimated_node_count);
// Typically Zig source code has around a 2:1 ratio of tokens to AST nodes.
// In practice, it could be up to a 1:1 ratio of tokens to AST nodes.
// We use this information to ensure we can always use appendAssumeCapacity,
// at the cost of increased memory usage during parsing.

// Ensure we have the upper bound number of nodes, and add one for the "root" node
try parser.nodes.ensureTotalCapacity(gpa, tokens.len + 1);

switch (mode) {
.zig => try parser.parseRoot(),
.zon => try parser.parseZon(),
}

// trim size - these lists aren't modified further, we want to reduce max rss
// this also cleans up the `ensureTotalCapacity` call we made earlier, so our
// memory gobbling doesn't affect the rest of the pipeline
parser.nodes.shrinkAndFree(gpa, parser.nodes.len);
parser.extra_data.shrinkAndFree(gpa, parser.extra_data.items.len);
parser.errors.shrinkAndFree(gpa, parser.errors.items.len);

// TODO experiment with compacting the MultiArrayList slices here
return Ast{
.source = source,
@@ -129,11 +144,9 @@ pub fn renderToArrayList(tree: Ast, buffer: *std.ArrayList(u8), fixups: Fixups)

/// Returns an extra offset for column and byte offset of errors that
/// should point after the token in the error message.
pub fn errorOffset(tree: Ast, parse_error: Error) u32 {
return if (parse_error.token_is_prev)
@as(u32, @intCast(tree.tokenSlice(parse_error.token).len))
else
0;
pub inline fn errorOffset(tree: Ast, parse_error: Error) u32 {
const token_length: u32 = @intCast(tree.tokenSlice(parse_error.token).len);
return token_length * @intFromBool(parse_error.token_is_prev);
}

pub fn tokenLocation(self: Ast, start_offset: ByteOffset, token_index: TokenIndex) Location {
@@ -176,22 +189,8 @@ pub fn tokenLocation(self: Ast, start_offset: ByteOffset, token_index: TokenInde

pub fn tokenSlice(tree: Ast, token_index: TokenIndex) []const u8 {
const token_starts = tree.tokens.items(.start);
const token_tags = tree.tokens.items(.tag);
const token_tag = token_tags[token_index];

// Many tokens can be determined entirely by their tag.
if (token_tag.lexeme()) |lexeme| {
return lexeme;
}

// For some tokens, re-tokenization is needed to find the end.
var tokenizer: std.zig.Tokenizer = .{
.buffer = tree.source,
.index = token_starts[token_index],
};
const token = tokenizer.next();
assert(token.tag == token_tag);
return tree.source[token.loc.start..token.loc.end];
const token_ends = tree.tokens.items(.end);
return tree.source[token_starts[token_index]..token_ends[token_index]];
}

pub fn extraData(tree: Ast, index: usize, comptime T: type) T {
@@ -790,7 +789,7 @@ pub fn lastToken(tree: Ast, node: Node.Index) TokenIndex {
var n = node;
var end_offset: TokenIndex = 0;
while (true) switch (tags[n]) {
.root => return @as(TokenIndex, @intCast(tree.tokens.len - 1)),
.root => return @intCast(tree.tokens.len - 1),

.@"usingnamespace",
.bool_not,
@@ -1325,10 +1324,11 @@ pub fn tokensOnSameLine(tree: Ast, token1: TokenIndex, token2: TokenIndex) bool

pub fn getNodeSource(tree: Ast, node: Node.Index) []const u8 {
const token_starts = tree.tokens.items(.start);
const token_ends = tree.tokens.items(.end);
const first_token = tree.firstToken(node);
const last_token = tree.lastToken(node);
const start = token_starts[first_token];
const end = token_starts[last_token] + tree.tokenSlice(last_token).len;
const end = token_ends[last_token];
return tree.source[start..end];
}

@@ -3632,6 +3632,7 @@ pub fn tokenToSpan(tree: *const Ast, token: Ast.TokenIndex) Span {

pub fn tokensToSpan(tree: *const Ast, start: Ast.TokenIndex, end: Ast.TokenIndex, main: Ast.TokenIndex) Span {
const token_starts = tree.tokens.items(.start);
const token_ends = tree.tokens.items(.end);
var start_tok = start;
var end_tok = end;

@@ -3645,9 +3646,11 @@ pub fn tokensToSpan(tree: *const Ast, start: Ast.TokenIndex, end: Ast.TokenIndex
start_tok = main;
end_tok = main;
}
const start_off = token_starts[start_tok];
const end_off = token_starts[end_tok] + @as(u32, @intCast(tree.tokenSlice(end_tok).len));
return Span{ .start = start_off, .end = end_off, .main = token_starts[main] };
return Span{
.start = token_starts[start_tok],
.end = token_ends[end_tok],
.main = token_starts[main],
};
}

const std = @import("../std.zig");