From 0bc416cd8efba64d5a246d2c1f368cbd2e951f00 Mon Sep 17 00:00:00 2001
From: Andrew Kelley
Date: Tue, 3 Oct 2023 23:27:17 -0700
Subject: [PATCH] rework package manager

Organize everything around a Fetch task which does a bunch of stuff in a
worker thread without touching any shared state, and then queues up Fetch
tasks for its dependencies.

This isn't theoretically optimal package fetching performance, because CPU
cores don't necessarily map 1:1 with I/O tasks, and each fetch task contains
a mixture of computation and I/O. However, it is expected to significantly
outperform the master branch, which fetches everything recursively with only
one thread.

The logic is now a lot more linear and easy to follow. Everything that is
embarrassingly parallel is done on the thread pool, and then after
everything is fetched, the worker threads are joined and the main thread
does the finishing touches of stitching together the dependencies.zig
import files. There is only one tiny little critical section, and it does
not even have any error handling in it.

This also lays the groundwork for #14281, because in system mode all this
fetching logic will be skipped, but the "finishing touches" mentioned above
still need to be done. With this branch, that logic is separated out and no
longer recursively tangled with fetching stuff.

Additionally, this branch:
* Implements inclusion directives in `build.zig.zon` for deciding which
  files belong to the package (#14311).
* Adds basic documentation for `build.zig.zon` files.
* Adds support for fetching dependencies with the `file://` protocol
  scheme (#17364).
* Adds a workaround for a Linux/btrfs file system bug (#17282).

This commit is a work-in-progress. Still todo:
1. Hook up the CLI to the new system.
2. Restore the module table creation logic after all the fetching is done.
3. Fix compilation errors, get the tests passing, and regression test
   against real world projects.
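The shape of that fan-out, reduced to a minimal sketch (hypothetical names
throughout; the real Fetch task also carries a manifest, an error bundle,
and hash state, and queues further Fetch tasks for its own dependencies):

    const std = @import("std");

    const Task = struct { url: []const u8 };

    var done_mutex: std.Thread.Mutex = .{};
    var done: std.StringHashMapUnmanaged(void) = .{};

    fn fetchWorker(gpa: std.mem.Allocator, wg: *std.Thread.WaitGroup, task: Task) void {
        defer wg.finish();
        {
            // The lone critical section: claim the task so redundant jobs
            // are skipped. No error handling while the mutex is held.
            done_mutex.lock();
            defer done_mutex.unlock();
            const gop = done.getOrPut(gpa, task.url) catch return;
            if (gop.found_existing) return;
        }
        // ...fetch, unpack, hash, then queue tasks for dependencies...
        std.debug.print("fetched {s}\n", .{task.url});
    }

    pub fn main() !void {
        var gpa_state = std.heap.GeneralPurposeAllocator(.{}){};
        defer _ = gpa_state.deinit();
        const gpa = gpa_state.allocator();
        defer done.deinit(gpa);

        var pool: std.Thread.Pool = undefined;
        try pool.init(.{ .allocator = gpa });
        defer pool.deinit();

        var wg: std.Thread.WaitGroup = .{};
        for ([_]Task{ .{ .url = "a" }, .{ .url = "b" }, .{ .url = "a" } }) |task| {
            wg.start();
            pool.spawn(fetchWorker, .{ gpa, &wg, task }) catch {
                wg.finish(); // keep the WaitGroup balanced on spawn failure
            };
        }
        wg.wait();
        // Only after the join does the main thread stitch together the
        // dependencies.zig import files.
    }

Keeping error handling out of the locked region is what keeps the critical
section trivial: each task records failures in its own error bundle, and
the main thread inspects them after the join.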
--- CMakeLists.txt | 2 +- src/Package.zig | 1104 +---------------------------------------- src/Package/Fetch.zig | 1012 +++++++++++++++++++++++++++++++++++++ src/Package/hash.zig | 153 ------ src/main.zig | 2 +- 5 files changed, 1017 insertions(+), 1256 deletions(-) create mode 100644 src/Package/Fetch.zig delete mode 100644 src/Package/hash.zig diff --git a/CMakeLists.txt b/CMakeLists.txt index 28c2dcc8e8a3..e0dbe7cc193e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -528,7 +528,7 @@ set(ZIG_STAGE2_SOURCES "${CMAKE_SOURCE_DIR}/src/Liveness.zig" "${CMAKE_SOURCE_DIR}/src/Module.zig" "${CMAKE_SOURCE_DIR}/src/Package.zig" - "${CMAKE_SOURCE_DIR}/src/Package/hash.zig" + "${CMAKE_SOURCE_DIR}/src/Package/Fetch.zig" "${CMAKE_SOURCE_DIR}/src/RangeSet.zig" "${CMAKE_SOURCE_DIR}/src/Sema.zig" "${CMAKE_SOURCE_DIR}/src/TypedValue.zig" diff --git a/src/Package.zig b/src/Package.zig index 14052e3de4be..e5fa24e18d7d 100644 --- a/src/Package.zig +++ b/src/Package.zig @@ -15,9 +15,9 @@ const Compilation = @import("Compilation.zig"); const Module = @import("Module.zig"); const Cache = std.Build.Cache; const build_options = @import("build_options"); -const git = @import("git.zig"); -const computePackageHash = @import("Package/hash.zig").compute; +const Fetch = @import("Package/Fetch.zig"); +pub const build_zig_basename = "build.zig"; pub const Manifest = @import("Manifest.zig"); pub const Table = std.StringHashMapUnmanaged(*Package); @@ -213,223 +213,6 @@ pub fn getName(target: *const Package, gpa: Allocator, mod: Module) ![]const u8 return buf.toOwnedSlice(); } -pub const build_zig_basename = "build.zig"; - -/// Fetches a package and all of its dependencies recursively. Writes the -/// corresponding datastructures for the build runner into `dependencies_source`. -pub fn fetchAndAddDependencies( - pkg: *Package, - deps_pkg: *Package, - arena: Allocator, - thread_pool: *ThreadPool, - http_client: *std.http.Client, - directory: Compilation.Directory, - global_cache_directory: Compilation.Directory, - local_cache_directory: Compilation.Directory, - dependencies_source: *std.ArrayList(u8), - error_bundle: *std.zig.ErrorBundle.Wip, - all_modules: *AllModules, - root_prog_node: *std.Progress.Node, - /// null for the root package - this_hash: ?[]const u8, -) !void { - const max_bytes = 10 * 1024 * 1024; - const gpa = thread_pool.allocator; - const build_zig_zon_bytes = directory.handle.readFileAllocOptions( - arena, - Manifest.basename, - max_bytes, - null, - 1, - 0, - ) catch |err| switch (err) { - error.FileNotFound => { - // Handle the same as no dependencies. 
- if (this_hash) |hash| { - try dependencies_source.writer().print( - \\ pub const {} = struct {{ - \\ pub const build_root = "{}"; - \\ pub const build_zig = @import("{}"); - \\ pub const deps: []const struct {{ []const u8, []const u8 }} = &.{{}}; - \\ }}; - \\ - , .{ - std.zig.fmtId(hash), - std.zig.fmtEscapes(pkg.root_src_directory.path.?), - std.zig.fmtEscapes(hash), - }); - } else { - try dependencies_source.writer().writeAll( - \\pub const packages = struct {}; - \\pub const root_deps: []const struct { []const u8, []const u8 } = &.{}; - \\ - ); - } - return; - }, - else => |e| return e, - }; - - var ast = try std.zig.Ast.parse(gpa, build_zig_zon_bytes, .zon); - defer ast.deinit(gpa); - - if (ast.errors.len > 0) { - const file_path = try directory.join(arena, &.{Manifest.basename}); - try main.putAstErrorsIntoBundle(gpa, ast, file_path, error_bundle); - return error.PackageFetchFailed; - } - - var manifest = try Manifest.parse(gpa, ast); - defer manifest.deinit(gpa); - - if (manifest.errors.len > 0) { - const file_path = try directory.join(arena, &.{Manifest.basename}); - for (manifest.errors) |msg| { - const str = try error_bundle.addString(msg.msg); - try Report.addErrorMessage(&ast, file_path, error_bundle, 0, str, msg.tok, msg.off); - } - return error.PackageFetchFailed; - } - - const report: Report = .{ - .ast = &ast, - .directory = directory, - .error_bundle = error_bundle, - }; - - for (manifest.dependencies.values()) |dep| { - // If the hash is invalid, let errors happen later - // We only want to add these for progress reporting - const hash = dep.hash orelse continue; - if (hash.len != hex_multihash_len) continue; - const gop = try all_modules.getOrPut(gpa, hash[0..hex_multihash_len].*); - if (!gop.found_existing) gop.value_ptr.* = null; - } - - root_prog_node.setEstimatedTotalItems(all_modules.count()); - - if (this_hash == null) { - try dependencies_source.writer().writeAll("pub const packages = struct {\n"); - } - - for (manifest.dependencies.keys(), manifest.dependencies.values()) |name, *dep| { - var fetch_location = try FetchLocation.init(gpa, dep.*, directory, report); - defer fetch_location.deinit(gpa); - - // Directories do not provide a hash in build.zig.zon. - // Hash the path to the module rather than its contents. 
- const sub_mod, const found_existing = if (fetch_location == .directory) - try getDirectoryModule(gpa, fetch_location, directory, all_modules, dep, report) - else - try getCachedPackage( - gpa, - global_cache_directory, - dep.*, - all_modules, - root_prog_node, - ) orelse .{ - try fetchAndUnpack( - fetch_location, - thread_pool, - http_client, - directory, - global_cache_directory, - dep.*, - report, - all_modules, - root_prog_node, - name, - ), - false, - }; - - assert(dep.hash != null); - - switch (sub_mod) { - .zig_pkg => |sub_pkg| { - if (!found_existing) { - try sub_pkg.fetchAndAddDependencies( - deps_pkg, - arena, - thread_pool, - http_client, - sub_pkg.root_src_directory, - global_cache_directory, - local_cache_directory, - dependencies_source, - error_bundle, - all_modules, - root_prog_node, - dep.hash.?, - ); - } - - try pkg.add(gpa, name, sub_pkg); - if (deps_pkg.table.get(dep.hash.?)) |other_sub| { - // This should be the same package (and hence module) since it's the same hash - // TODO: dedup multiple versions of the same package - assert(other_sub == sub_pkg); - } else { - try deps_pkg.add(gpa, dep.hash.?, sub_pkg); - } - }, - .non_zig_pkg => |sub_pkg| { - if (!found_existing) { - try dependencies_source.writer().print( - \\ pub const {} = struct {{ - \\ pub const build_root = "{}"; - \\ pub const deps: []const struct {{ []const u8, []const u8 }} = &.{{}}; - \\ }}; - \\ - , .{ - std.zig.fmtId(dep.hash.?), - std.zig.fmtEscapes(sub_pkg.root_src_directory.path.?), - }); - } - }, - } - } - - if (this_hash) |hash| { - try dependencies_source.writer().print( - \\ pub const {} = struct {{ - \\ pub const build_root = "{}"; - \\ pub const build_zig = @import("{}"); - \\ pub const deps: []const struct {{ []const u8, []const u8 }} = &.{{ - \\ - , .{ - std.zig.fmtId(hash), - std.zig.fmtEscapes(pkg.root_src_directory.path.?), - std.zig.fmtEscapes(hash), - }); - for (manifest.dependencies.keys(), manifest.dependencies.values()) |name, dep| { - try dependencies_source.writer().print( - " .{{ \"{}\", \"{}\" }},\n", - .{ std.zig.fmtEscapes(name), std.zig.fmtEscapes(dep.hash.?) }, - ); - } - try dependencies_source.writer().writeAll( - \\ }; - \\ }; - \\ - ); - } else { - try dependencies_source.writer().writeAll( - \\}; - \\ - \\pub const root_deps: []const struct { []const u8, []const u8 } = &.{ - \\ - ); - for (manifest.dependencies.keys(), manifest.dependencies.values()) |name, dep| { - try dependencies_source.writer().print( - " .{{ \"{}\", \"{}\" }},\n", - .{ std.zig.fmtEscapes(name), std.zig.fmtEscapes(dep.hash.?) 
}, - ); - } - try dependencies_source.writer().writeAll("};\n"); - } -} - pub fn createFilePkg( gpa: Allocator, cache_directory: Compilation.Directory, @@ -450,484 +233,11 @@ pub fn createFilePkg( const hex_digest = hh.final(); const o_dir_sub_path = "o" ++ fs.path.sep_str ++ hex_digest; - try renameTmpIntoCache(cache_directory.handle, tmp_dir_sub_path, o_dir_sub_path); + try Fetch.renameTmpIntoCache(cache_directory.handle, tmp_dir_sub_path, o_dir_sub_path); return createWithDir(gpa, cache_directory, o_dir_sub_path, basename); } -pub const Report = struct { - ast: ?*const std.zig.Ast, - directory: Compilation.Directory, - error_bundle: *std.zig.ErrorBundle.Wip, - - fn fail( - report: Report, - tok: std.zig.Ast.TokenIndex, - comptime fmt_string: []const u8, - fmt_args: anytype, - ) error{ PackageFetchFailed, OutOfMemory } { - const msg = try report.error_bundle.printString(fmt_string, fmt_args); - return failMsg(report, tok, msg); - } - - fn failMsg( - report: Report, - tok: std.zig.Ast.TokenIndex, - msg: u32, - ) error{ PackageFetchFailed, OutOfMemory } { - const gpa = report.error_bundle.gpa; - - const file_path = try report.directory.join(gpa, &.{Manifest.basename}); - defer gpa.free(file_path); - - const eb = report.error_bundle; - - if (report.ast) |ast| { - try addErrorMessage(ast, file_path, eb, 0, msg, tok, 0); - } else { - try eb.addRootErrorMessage(.{ - .msg = msg, - .src_loc = .none, - .notes_len = 0, - }); - } - - return error.PackageFetchFailed; - } - - fn addErrorWithNotes( - report: Report, - notes_len: u32, - msg: Manifest.ErrorMessage, - ) error{OutOfMemory}!void { - const eb = report.error_bundle; - const msg_str = try eb.addString(msg.msg); - if (report.ast) |ast| { - const gpa = eb.gpa; - const file_path = try report.directory.join(gpa, &.{Manifest.basename}); - defer gpa.free(file_path); - return addErrorMessage(ast, file_path, eb, notes_len, msg_str, msg.tok, msg.off); - } else { - return eb.addRootErrorMessage(.{ - .msg = msg_str, - .src_loc = .none, - .notes_len = notes_len, - }); - } - } - - fn addErrorMessage( - ast: *const std.zig.Ast, - file_path: []const u8, - eb: *std.zig.ErrorBundle.Wip, - notes_len: u32, - msg_str: u32, - msg_tok: std.zig.Ast.TokenIndex, - msg_off: u32, - ) error{OutOfMemory}!void { - const token_starts = ast.tokens.items(.start); - const start_loc = ast.tokenLocation(0, msg_tok); - - try eb.addRootErrorMessage(.{ - .msg = msg_str, - .src_loc = try eb.addSourceLocation(.{ - .src_path = try eb.addString(file_path), - .span_start = token_starts[msg_tok], - .span_end = @as(u32, @intCast(token_starts[msg_tok] + ast.tokenSlice(msg_tok).len)), - .span_main = token_starts[msg_tok] + msg_off, - .line = @intCast(start_loc.line), - .column = @as(u32, @intCast(start_loc.column)), - .source_line = try eb.addString(ast.source[start_loc.line_start..start_loc.line_end]), - }), - .notes_len = notes_len, - }); - } -}; - -pub const FetchLocation = union(enum) { - /// The relative path to a file or directory. - /// This may be a file that requires unpacking (such as a .tar.gz), - /// or the path to the root directory of a package. 
- file: []const u8, - directory: []const u8, - http_request: std.Uri, - git_request: std.Uri, - - pub fn init( - gpa: Allocator, - dep: Manifest.Dependency, - root_dir: Compilation.Directory, - report: Report, - ) !FetchLocation { - switch (dep.location) { - .url => |url| { - const uri = std.Uri.parse(url) catch |err| switch (err) { - error.UnexpectedCharacter => return report.fail(dep.location_tok, "failed to parse dependency location as URI", .{}), - else => return err, - }; - return initUri(uri, dep.location_tok, report); - }, - .path => |path| { - if (fs.path.isAbsolute(path)) { - return report.fail(dep.location_tok, "absolute paths are not allowed. Use a relative path instead", .{}); - } - - const is_dir = isDirectory(root_dir, path) catch |err| switch (err) { - error.FileNotFound => return report.fail(dep.location_tok, "file not found: {s}", .{path}), - else => return err, - }; - - return if (is_dir) - .{ .directory = try gpa.dupe(u8, path) } - else - .{ .file = try gpa.dupe(u8, path) }; - }, - } - } - - pub fn initUri(uri: std.Uri, location_tok: std.zig.Ast.TokenIndex, report: Report) !FetchLocation { - if (ascii.eqlIgnoreCase(uri.scheme, "file")) { - return report.fail(location_tok, "'file' scheme is not allowed for URLs. Use '.path' instead", .{}); - } else if (ascii.eqlIgnoreCase(uri.scheme, "http") or ascii.eqlIgnoreCase(uri.scheme, "https")) { - return .{ .http_request = uri }; - } else if (ascii.eqlIgnoreCase(uri.scheme, "git+http") or ascii.eqlIgnoreCase(uri.scheme, "git+https")) { - return .{ .git_request = uri }; - } else { - return report.fail(location_tok, "unsupported URL scheme: {s}", .{uri.scheme}); - } - } - - pub fn deinit(f: *FetchLocation, gpa: Allocator) void { - switch (f.*) { - .file, .directory => |path| gpa.free(path), - .http_request, .git_request => {}, - } - f.* = undefined; - } - - pub fn fetch( - f: FetchLocation, - gpa: Allocator, - root_dir: Compilation.Directory, - http_client: *std.http.Client, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, - ) !ReadableResource { - switch (f) { - .file => |file| { - const owned_path = try gpa.dupe(u8, file); - errdefer gpa.free(owned_path); - return .{ - .path = owned_path, - .resource = .{ .file = try root_dir.handle.openFile(file, .{}) }, - }; - }, - .http_request => |uri| { - var h = std.http.Headers{ .allocator = gpa }; - defer h.deinit(); - - var req = try http_client.request(.GET, uri, h, .{}); - errdefer req.deinit(); - - try req.start(.{}); - try req.wait(); - - if (req.response.status != .ok) { - return report.fail(dep_location_tok, "expected response status '200 OK' got '{} {s}'", .{ - @intFromEnum(req.response.status), - req.response.status.phrase() orelse "", - }); - } - - return .{ - .path = try gpa.dupe(u8, uri.path), - .resource = .{ .http_request = req }, - }; - }, - .git_request => |uri| { - var transport_uri = uri; - transport_uri.scheme = uri.scheme["git+".len..]; - var redirect_uri: []u8 = undefined; - var session: git.Session = .{ .transport = http_client, .uri = transport_uri }; - session.discoverCapabilities(gpa, &redirect_uri) catch |e| switch (e) { - error.Redirected => { - defer gpa.free(redirect_uri); - return report.fail(dep_location_tok, "repository moved to {s}", .{redirect_uri}); - }, - else => |other| return other, - }; - - const want_oid = want_oid: { - const want_ref = uri.fragment orelse "HEAD"; - if (git.parseOid(want_ref)) |oid| break :want_oid oid else |_| {} - - const want_ref_head = try std.fmt.allocPrint(gpa, "refs/heads/{s}", .{want_ref}); - defer 
gpa.free(want_ref_head); - const want_ref_tag = try std.fmt.allocPrint(gpa, "refs/tags/{s}", .{want_ref}); - defer gpa.free(want_ref_tag); - - var ref_iterator = try session.listRefs(gpa, .{ - .ref_prefixes = &.{ want_ref, want_ref_head, want_ref_tag }, - .include_peeled = true, - }); - defer ref_iterator.deinit(); - while (try ref_iterator.next()) |ref| { - if (mem.eql(u8, ref.name, want_ref) or - mem.eql(u8, ref.name, want_ref_head) or - mem.eql(u8, ref.name, want_ref_tag)) - { - break :want_oid ref.peeled orelse ref.oid; - } - } - return report.fail(dep_location_tok, "ref not found: {s}", .{want_ref}); - }; - if (uri.fragment == null) { - const notes_len = 1; - try report.addErrorWithNotes(notes_len, .{ - .tok = dep_location_tok, - .off = 0, - .msg = "url field is missing an explicit ref", - }); - const eb = report.error_bundle; - const notes_start = try eb.reserveNotes(notes_len); - eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{ - .msg = try eb.printString("try .url = \"{+/}#{}\",", .{ uri, std.fmt.fmtSliceHexLower(&want_oid) }), - })); - return error.PackageFetchFailed; - } - - var want_oid_buf: [git.fmt_oid_length]u8 = undefined; - _ = std.fmt.bufPrint(&want_oid_buf, "{}", .{std.fmt.fmtSliceHexLower(&want_oid)}) catch unreachable; - var fetch_stream = try session.fetch(gpa, &.{&want_oid_buf}); - errdefer fetch_stream.deinit(); - - return .{ - .path = try gpa.dupe(u8, &want_oid_buf), - .resource = .{ .git_fetch_stream = fetch_stream }, - }; - }, - .directory => unreachable, // Directories do not require fetching - } - } -}; - -pub const ReadableResource = struct { - path: []const u8, - resource: union(enum) { - file: fs.File, - http_request: std.http.Client.Request, - git_fetch_stream: git.Session.FetchStream, - dir: fs.IterableDir, - }, - - /// Unpack the package into the global cache directory. - /// If `ps` does not require unpacking (for example, if it is a directory), then no caching is performed. - /// In either case, the hash is computed and returned along with the path to the package. 
- pub fn unpack( - rr: *ReadableResource, - allocator: Allocator, - thread_pool: *ThreadPool, - global_cache_directory: Compilation.Directory, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, - pkg_prog_node: *std.Progress.Node, - ) !PackageLocation { - switch (rr.resource) { - inline .file, .http_request, .git_fetch_stream, .dir => |*r, tag| { - const s = fs.path.sep_str; - const rand_int = std.crypto.random.int(u64); - const tmp_dir_sub_path = "tmp" ++ s ++ Manifest.hex64(rand_int); - - const actual_hash = h: { - var tmp_directory: Compilation.Directory = d: { - const path = try global_cache_directory.join(allocator, &.{tmp_dir_sub_path}); - errdefer allocator.free(path); - - const iterable_dir = try global_cache_directory.handle.makeOpenPathIterable(tmp_dir_sub_path, .{}); - errdefer iterable_dir.close(); - - break :d .{ - .path = path, - .handle = iterable_dir.dir, - }; - }; - defer tmp_directory.closeAndFree(allocator); - - if (tag != .dir) { - const opt_content_length = try rr.getSize(); - - var prog_reader: ProgressReader(@TypeOf(r.reader())) = .{ - .child_reader = r.reader(), - .prog_node = pkg_prog_node, - .unit = if (opt_content_length) |content_length| unit: { - const kib = content_length / 1024; - const mib = kib / 1024; - if (mib > 0) { - pkg_prog_node.setEstimatedTotalItems(@intCast(mib)); - pkg_prog_node.setUnit("MiB"); - break :unit .mib; - } else { - pkg_prog_node.setEstimatedTotalItems(@intCast(@max(1, kib))); - pkg_prog_node.setUnit("KiB"); - break :unit .kib; - } - } else .any, - }; - - switch (try rr.getFileType(dep_location_tok, report)) { - .tar => try unpackTarball(allocator, prog_reader.reader(), tmp_directory.handle, dep_location_tok, report), - .@"tar.gz" => try unpackTarballCompressed(allocator, prog_reader, tmp_directory.handle, dep_location_tok, report, std.compress.gzip), - .@"tar.xz" => try unpackTarballCompressed(allocator, prog_reader, tmp_directory.handle, dep_location_tok, report, std.compress.xz), - .git_pack => try unpackGitPack(allocator, &prog_reader, git.parseOid(rr.path) catch unreachable, tmp_directory.handle, dep_location_tok, report), - } - } else { - // Recursive directory copy. - var it = try r.walk(allocator); - defer it.deinit(); - while (try it.next()) |entry| { - switch (entry.kind) { - .directory => try tmp_directory.handle.makePath(entry.path), - .file => try r.dir.copyFile( - entry.path, - tmp_directory.handle, - entry.path, - .{}, - ), - .sym_link => { - var buf: [fs.MAX_PATH_BYTES]u8 = undefined; - const link_name = try r.dir.readLink(entry.path, &buf); - // TODO: if this would create a symlink to outside - // the destination directory, fail with an error instead. 
- try tmp_directory.handle.symLink(link_name, entry.path, .{}); - }, - else => return error.IllegalFileTypeInPackage, - } - } - } - - break :h try computePackageHash(thread_pool, .{ .dir = tmp_directory.handle }); - }; - - const pkg_dir_sub_path = "p" ++ s ++ Manifest.hexDigest(actual_hash); - const unpacked_path = try global_cache_directory.join(allocator, &.{pkg_dir_sub_path}); - defer allocator.free(unpacked_path); - - const relative_unpacked_path = try fs.path.relative(allocator, global_cache_directory.path.?, unpacked_path); - errdefer allocator.free(relative_unpacked_path); - try renameTmpIntoCache(global_cache_directory.handle, tmp_dir_sub_path, relative_unpacked_path); - - return .{ - .hash = actual_hash, - .relative_unpacked_path = relative_unpacked_path, - }; - }, - } - } - - const FileType = enum { - tar, - @"tar.gz", - @"tar.xz", - git_pack, - }; - - pub fn getSize(rr: ReadableResource) !?u64 { - switch (rr.resource) { - .file => |f| return (try f.metadata()).size(), - // TODO: Handle case of chunked content-length - .http_request => |req| return req.response.content_length, - .git_fetch_stream => |stream| return stream.request.response.content_length, - .dir => unreachable, - } - } - - pub fn getFileType( - rr: ReadableResource, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, - ) !FileType { - switch (rr.resource) { - .file => { - return fileTypeFromPath(rr.path) orelse - return report.fail(dep_location_tok, "unknown file type", .{}); - }, - .http_request => |req| { - const content_type = req.response.headers.getFirstValue("Content-Type") orelse - return report.fail(dep_location_tok, "missing 'Content-Type' header", .{}); - - // If the response has a different content type than the URI indicates, override - // the previously assumed file type. 
- if (ascii.eqlIgnoreCase(content_type, "application/x-tar")) return .tar; - - return if (ascii.eqlIgnoreCase(content_type, "application/gzip") or - ascii.eqlIgnoreCase(content_type, "application/x-gzip") or - ascii.eqlIgnoreCase(content_type, "application/tar+gzip")) - .@"tar.gz" - else if (ascii.eqlIgnoreCase(content_type, "application/x-xz")) - .@"tar.xz" - else if (ascii.eqlIgnoreCase(content_type, "application/octet-stream")) ty: { - // support gitlab tarball urls such as https://gitlab.com///-/archive//-.tar.gz - // whose content-disposition header is: 'attachment; filename="-.tar.gz"' - const content_disposition = req.response.headers.getFirstValue("Content-Disposition") orelse - return report.fail(dep_location_tok, "missing 'Content-Disposition' header for Content-Type=application/octet-stream", .{}); - break :ty getAttachmentType(content_disposition) orelse - return report.fail(dep_location_tok, "unsupported 'Content-Disposition' header value: '{s}' for Content-Type=application/octet-stream", .{content_disposition}); - } else return report.fail(dep_location_tok, "unrecognized value for 'Content-Type' header: {s}", .{content_type}); - }, - .git_fetch_stream => return .git_pack, - .dir => unreachable, - } - } - - fn fileTypeFromPath(file_path: []const u8) ?FileType { - if (ascii.endsWithIgnoreCase(file_path, ".tar")) return .tar; - if (ascii.endsWithIgnoreCase(file_path, ".tar.gz")) return .@"tar.gz"; - if (ascii.endsWithIgnoreCase(file_path, ".tar.xz")) return .@"tar.xz"; - return null; - } - - fn getAttachmentType(content_disposition: []const u8) ?FileType { - const disposition_type_end = ascii.indexOfIgnoreCase(content_disposition, "attachment;") orelse return null; - - var value_start = ascii.indexOfIgnoreCasePos(content_disposition, disposition_type_end + 1, "filename") orelse return null; - value_start += "filename".len; - if (content_disposition[value_start] == '*') { - value_start += 1; - } - if (content_disposition[value_start] != '=') return null; - value_start += 1; - - var value_end = mem.indexOfPos(u8, content_disposition, value_start, ";") orelse content_disposition.len; - if (content_disposition[value_end - 1] == '\"') { - value_end -= 1; - } - return fileTypeFromPath(content_disposition[value_start..value_end]); - } - - pub fn deinit(rr: *ReadableResource, gpa: Allocator) void { - gpa.free(rr.path); - switch (rr.resource) { - .file => |file| file.close(), - .http_request => |*req| req.deinit(), - .git_fetch_stream => |*stream| stream.deinit(), - .dir => |*dir| dir.close(), - } - rr.* = undefined; - } -}; - -pub const PackageLocation = struct { - /// For packages that require unpacking, this is the hash of the package contents. - /// For directories, this is the hash of the absolute file path. - hash: [Manifest.Hash.digest_length]u8, - relative_unpacked_path: []const u8, - - pub fn deinit(pl: *PackageLocation, allocator: Allocator) void { - allocator.free(pl.relative_unpacked_path); - pl.* = undefined; - } -}; - const hex_multihash_len = 2 * Manifest.multihash_len; const MultiHashHexDigest = [hex_multihash_len]u8; @@ -939,411 +249,3 @@ const DependencyModule = union(enum) { /// If the value is `null`, the package is a known dependency, but has not yet /// been fetched. 
pub const AllModules = std.AutoHashMapUnmanaged(MultiHashHexDigest, ?DependencyModule); - -fn ProgressReader(comptime ReaderType: type) type { - return struct { - child_reader: ReaderType, - bytes_read: u64 = 0, - prog_node: *std.Progress.Node, - unit: enum { - kib, - mib, - any, - }, - - pub const Error = ReaderType.Error; - pub const Reader = std.io.Reader(*@This(), Error, read); - - pub fn read(self: *@This(), buf: []u8) Error!usize { - const amt = try self.child_reader.read(buf); - self.bytes_read += amt; - const kib = self.bytes_read / 1024; - const mib = kib / 1024; - switch (self.unit) { - .kib => self.prog_node.setCompletedItems(@intCast(kib)), - .mib => self.prog_node.setCompletedItems(@intCast(mib)), - .any => { - if (mib > 0) { - self.prog_node.setUnit("MiB"); - self.prog_node.setCompletedItems(@intCast(mib)); - } else { - self.prog_node.setUnit("KiB"); - self.prog_node.setCompletedItems(@intCast(kib)); - } - }, - } - self.prog_node.activate(); - return amt; - } - - pub fn reader(self: *@This()) Reader { - return .{ .context = self }; - } - }; -} - -/// Get a cached package if it exists. -/// Returns `null` if the package has not been cached -/// If the package exists in the cache, returns a pointer to the package and a -/// boolean indicating whether this package has already been seen in the build -/// (i.e. whether or not its transitive dependencies have been fetched). -fn getCachedPackage( - gpa: Allocator, - global_cache_directory: Compilation.Directory, - dep: Manifest.Dependency, - all_modules: *AllModules, - root_prog_node: *std.Progress.Node, -) !?struct { DependencyModule, bool } { - const s = fs.path.sep_str; - // Check if the expected_hash is already present in the global package - // cache, and thereby avoid both fetching and unpacking. - if (dep.hash) |h| { - const hex_digest = h[0..hex_multihash_len]; - const pkg_dir_sub_path = "p" ++ s ++ hex_digest; - - var pkg_dir = global_cache_directory.handle.openDir(pkg_dir_sub_path, .{}) catch |err| switch (err) { - error.FileNotFound => return null, - else => |e| return e, - }; - errdefer pkg_dir.close(); - - // The compiler has a rule that a file must not be included in multiple modules, - // so we must detect if a module has been created for this package and reuse it. - const gop = try all_modules.getOrPut(gpa, hex_digest.*); - if (gop.found_existing) { - if (gop.value_ptr.*) |mod| { - return .{ mod, true }; - } - } - - root_prog_node.completeOne(); - - const is_zig_mod = if (pkg_dir.access(build_zig_basename, .{})) |_| true else |_| false; - const basename = if (is_zig_mod) build_zig_basename else ""; - const pkg = try createWithDir(gpa, global_cache_directory, pkg_dir_sub_path, basename); - - const module: DependencyModule = if (is_zig_mod) - .{ .zig_pkg = pkg } - else - .{ .non_zig_pkg = pkg }; - - try all_modules.put(gpa, hex_digest.*, module); - return .{ module, false }; - } - - return null; -} - -fn getDirectoryModule( - gpa: Allocator, - fetch_location: FetchLocation, - directory: Compilation.Directory, - all_modules: *AllModules, - dep: *Manifest.Dependency, - report: Report, -) !struct { DependencyModule, bool } { - assert(fetch_location == .directory); - - if (dep.hash != null) { - return report.fail(dep.hash_tok, "hash not allowed for directory package", .{}); - } - - const hash = try computePathHash(gpa, directory, fetch_location.directory); - const hex_digest = Manifest.hexDigest(hash); - dep.hash = try gpa.dupe(u8, &hex_digest); - - // There is no fixed location to check for directory modules. 
- // Instead, check whether it is already listed in all_modules. - if (all_modules.get(hex_digest)) |mod| return .{ mod.?, true }; - - var pkg_dir = directory.handle.openDir(fetch_location.directory, .{}) catch |err| switch (err) { - error.FileNotFound => return report.fail(dep.location_tok, "file not found: {s}", .{fetch_location.directory}), - else => |e| return e, - }; - defer pkg_dir.close(); - - const is_zig_mod = if (pkg_dir.access(build_zig_basename, .{})) |_| true else |_| false; - const basename = if (is_zig_mod) build_zig_basename else ""; - - const pkg = try createWithDir(gpa, directory, fetch_location.directory, basename); - const module: DependencyModule = if (is_zig_mod) - .{ .zig_pkg = pkg } - else - .{ .non_zig_pkg = pkg }; - - try all_modules.put(gpa, hex_digest, module); - return .{ module, false }; -} - -fn fetchAndUnpack( - fetch_location: FetchLocation, - thread_pool: *ThreadPool, - http_client: *std.http.Client, - directory: Compilation.Directory, - global_cache_directory: Compilation.Directory, - dep: Manifest.Dependency, - report: Report, - all_modules: *AllModules, - root_prog_node: *std.Progress.Node, - /// This does not have to be any form of canonical or fully-qualified name: it - /// is only intended to be human-readable for progress reporting. - name_for_prog: []const u8, -) !DependencyModule { - assert(fetch_location != .directory); - - const gpa = http_client.allocator; - - var pkg_prog_node = root_prog_node.start(name_for_prog, 0); - defer pkg_prog_node.end(); - pkg_prog_node.activate(); - - var readable_resource = try fetch_location.fetch(gpa, directory, http_client, dep.location_tok, report); - defer readable_resource.deinit(gpa); - - var package_location = try readable_resource.unpack( - gpa, - thread_pool, - global_cache_directory, - dep.location_tok, - report, - &pkg_prog_node, - ); - defer package_location.deinit(gpa); - - const actual_hex = Manifest.hexDigest(package_location.hash); - if (dep.hash) |h| { - if (!mem.eql(u8, h, &actual_hex)) { - return report.fail(dep.hash_tok, "hash mismatch: expected: {s}, found: {s}", .{ - h, actual_hex, - }); - } - } else { - const notes_len = 1; - try report.addErrorWithNotes(notes_len, .{ - .tok = dep.location_tok, - .off = 0, - .msg = "dependency is missing hash field", - }); - const eb = report.error_bundle; - const notes_start = try eb.reserveNotes(notes_len); - eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{ - .msg = try eb.printString("expected .hash = \"{s}\",", .{&actual_hex}), - })); - return error.PackageFetchFailed; - } - - const build_zig_path = try fs.path.join(gpa, &.{ package_location.relative_unpacked_path, build_zig_basename }); - defer gpa.free(build_zig_path); - - const is_zig_mod = if (global_cache_directory.handle.access(build_zig_path, .{})) |_| true else |_| false; - const basename = if (is_zig_mod) build_zig_basename else ""; - const pkg = try createWithDir(gpa, global_cache_directory, package_location.relative_unpacked_path, basename); - const module: DependencyModule = if (is_zig_mod) - .{ .zig_pkg = pkg } - else - .{ .non_zig_pkg = pkg }; - - try all_modules.put(gpa, actual_hex, module); - return module; -} - -fn unpackTarballCompressed( - gpa: Allocator, - reader: anytype, - out_dir: fs.Dir, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, - comptime Compression: type, -) !void { - var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, reader); - - var decompress = try Compression.decompress(gpa, br.reader()); - defer 
decompress.deinit(); - - return unpackTarball(gpa, decompress.reader(), out_dir, dep_location_tok, report); -} - -fn unpackTarball( - gpa: Allocator, - reader: anytype, - out_dir: fs.Dir, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, -) !void { - var diagnostics: std.tar.Options.Diagnostics = .{ .allocator = gpa }; - defer diagnostics.deinit(); - - try std.tar.pipeToFileSystem(out_dir, reader, .{ - .diagnostics = &diagnostics, - .strip_components = 1, - // TODO: we would like to set this to executable_bit_only, but two - // things need to happen before that: - // 1. the tar implementation needs to support it - // 2. the hashing algorithm here needs to support detecting the is_executable - // bit on Windows from the ACLs (see the isExecutable function). - .mode_mode = .ignore, - }); - - if (diagnostics.errors.items.len > 0) { - const notes_len: u32 = @intCast(diagnostics.errors.items.len); - try report.addErrorWithNotes(notes_len, .{ - .tok = dep_location_tok, - .off = 0, - .msg = "unable to unpack tarball", - }); - const eb = report.error_bundle; - const notes_start = try eb.reserveNotes(notes_len); - for (diagnostics.errors.items, notes_start..) |item, note_i| { - switch (item) { - .unable_to_create_sym_link => |info| { - eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ - .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{ - info.file_name, info.link_name, @errorName(info.code), - }), - })); - }, - .unsupported_file_type => |info| { - eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ - .msg = try eb.printString("file '{s}' has unsupported type '{c}'", .{ - info.file_name, @intFromEnum(info.file_type), - }), - })); - }, - } - } - return error.InvalidTarball; - } -} - -fn unpackGitPack( - gpa: Allocator, - reader: anytype, - want_oid: git.Oid, - out_dir: fs.Dir, - dep_location_tok: std.zig.Ast.TokenIndex, - report: Report, -) !void { - // The .git directory is used to store the packfile and associated index, but - // we do not attempt to replicate the exact structure of a real .git - // directory, since that isn't relevant for fetching a package. 
- { - var pack_dir = try out_dir.makeOpenPath(".git", .{}); - defer pack_dir.close(); - var pack_file = try pack_dir.createFile("pkg.pack", .{ .read = true }); - defer pack_file.close(); - var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init(); - try fifo.pump(reader.reader(), pack_file.writer()); - try pack_file.sync(); - - var index_file = try pack_dir.createFile("pkg.idx", .{ .read = true }); - defer index_file.close(); - { - var index_prog_node = reader.prog_node.start("Index pack", 0); - defer index_prog_node.end(); - index_prog_node.activate(); - var index_buffered_writer = std.io.bufferedWriter(index_file.writer()); - try git.indexPack(gpa, pack_file, index_buffered_writer.writer()); - try index_buffered_writer.flush(); - try index_file.sync(); - } - - { - var checkout_prog_node = reader.prog_node.start("Checkout", 0); - defer checkout_prog_node.end(); - checkout_prog_node.activate(); - var repository = try git.Repository.init(gpa, pack_file, index_file); - defer repository.deinit(); - var diagnostics: git.Diagnostics = .{ .allocator = gpa }; - defer diagnostics.deinit(); - try repository.checkout(out_dir, want_oid, &diagnostics); - - if (diagnostics.errors.items.len > 0) { - const notes_len: u32 = @intCast(diagnostics.errors.items.len); - try report.addErrorWithNotes(notes_len, .{ - .tok = dep_location_tok, - .off = 0, - .msg = "unable to unpack packfile", - }); - const eb = report.error_bundle; - const notes_start = try eb.reserveNotes(notes_len); - for (diagnostics.errors.items, notes_start..) |item, note_i| { - switch (item) { - .unable_to_create_sym_link => |info| { - eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{ - .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{ - info.file_name, info.link_name, @errorName(info.code), - }), - })); - }, - } - } - return error.InvalidGitPack; - } - } - } - - try out_dir.deleteTree(".git"); -} - -/// Compute the hash of a file path. -fn computePathHash(gpa: Allocator, dir: Compilation.Directory, path: []const u8) ![Manifest.Hash.digest_length]u8 { - const resolved_path = try std.fs.path.resolve(gpa, &.{ dir.path.?, path }); - defer gpa.free(resolved_path); - var hasher = Manifest.Hash.init(.{}); - hasher.update(resolved_path); - return hasher.finalResult(); -} - -fn isDirectory(root_dir: Compilation.Directory, path: []const u8) !bool { - var dir = root_dir.handle.openDir(path, .{}) catch |err| switch (err) { - error.NotDir => return false, - else => return err, - }; - defer dir.close(); - return true; -} - -fn renameTmpIntoCache( - cache_dir: fs.Dir, - tmp_dir_sub_path: []const u8, - dest_dir_sub_path: []const u8, -) !void { - assert(dest_dir_sub_path[1] == fs.path.sep); - var handled_missing_dir = false; - while (true) { - cache_dir.rename(tmp_dir_sub_path, dest_dir_sub_path) catch |err| switch (err) { - error.FileNotFound => { - if (handled_missing_dir) return err; - cache_dir.makeDir(dest_dir_sub_path[0..1]) catch |mkd_err| switch (mkd_err) { - error.PathAlreadyExists => handled_missing_dir = true, - else => |e| return e, - }; - continue; - }, - error.PathAlreadyExists, error.AccessDenied => { - // Package has been already downloaded and may already be in use on the system. 
- cache_dir.deleteTree(tmp_dir_sub_path) catch |del_err| { - std.log.warn("unable to delete temp directory: {s}", .{@errorName(del_err)}); - }; - }, - else => |e| return e, - }; - break; - } -} - -test "getAttachmentType" { - try std.testing.expectEqual(@as(?ReadableResource.FileType, .@"tar.gz"), ReadableResource.getAttachmentType("attaChment; FILENAME=\"stuff.tar.gz\"; size=42")); - try std.testing.expectEqual(@as(?ReadableResource.FileType, .@"tar.gz"), ReadableResource.getAttachmentType("attachment; filename*=\"stuff.tar.gz\"")); - try std.testing.expectEqual(@as(?ReadableResource.FileType, .@"tar.xz"), ReadableResource.getAttachmentType("ATTACHMENT; filename=\"stuff.tar.xz\"")); - try std.testing.expectEqual(@as(?ReadableResource.FileType, .@"tar.xz"), ReadableResource.getAttachmentType("attachment; FileName=\"stuff.tar.xz\"")); - try std.testing.expectEqual(@as(?ReadableResource.FileType, .@"tar.gz"), ReadableResource.getAttachmentType("attachment; FileName*=UTF-8\'\'xyz%2Fstuff.tar.gz")); - - try std.testing.expect(ReadableResource.getAttachmentType("attachment FileName=\"stuff.tar.gz\"") == null); - try std.testing.expect(ReadableResource.getAttachmentType("attachment; FileName=\"stuff.tar\"") == null); - try std.testing.expect(ReadableResource.getAttachmentType("attachment; FileName\"stuff.gz\"") == null); - try std.testing.expect(ReadableResource.getAttachmentType("attachment; size=42") == null); - try std.testing.expect(ReadableResource.getAttachmentType("inline; size=42") == null); - try std.testing.expect(ReadableResource.getAttachmentType("FileName=\"stuff.tar.gz\"; attachment;") == null); - try std.testing.expect(ReadableResource.getAttachmentType("FileName=\"stuff.tar.gz\";") == null); -} diff --git a/src/Package/Fetch.zig b/src/Package/Fetch.zig new file mode 100644 index 000000000000..b3b4667e4093 --- /dev/null +++ b/src/Package/Fetch.zig @@ -0,0 +1,1012 @@ +//! Represents one independent job whose responsibility is to: +//! +//! 1. Check the global zig package cache to see if the hash already exists. +//! If so, load, parse, and validate the build.zig.zon file therein, and +//! goto step 8. Likewise if the location is a relative path, treat this +//! the same as a cache hit. Otherwise, proceed. +//! 2. Fetch and unpack a URL into a temporary directory. +//! 3. Load, parse, and validate the build.zig.zon file therein. It is allowed +//! for the file to be missing, in which case this fetched package is considered +//! to be a "naked" package. +//! 4. Apply inclusion rules of the build.zig.zon to the temporary directory by +//! deleting excluded files. If any errors occurred for files that were +//! ultimately excluded, those errors should be ignored, such as failure to +//! create symlinks that weren't supposed to be included anyway. +//! 5. Compute the package hash based on the remaining files in the temporary +//! directory. +//! 6. Rename the temporary directory into the global zig package cache +//! directory. If the hash already exists, delete the temporary directory and +//! leave the zig package cache directory untouched as it may be in use by the +//! system. This is done even if the hash is invalid, in case the package with +//! the different hash is used in the future. +//! 7. Validate the computed hash against the expected hash. If invalid, +//! this job is done. +//! 8. Spawn a new fetch job for each dependency in the manifest file. Use +//! a mutex and a hash map so that redundant jobs do not get queued up. +//! +//! 
All of this must be done while referring only to the state inside this struct +//! because this work will be done in a dedicated thread. + +/// Try to avoid this as much as possible, since the arena will have less contention. +gpa: Allocator, +arena: std.heap.ArenaAllocator, +location: Location, +location_tok: std.zig.Ast.TokenIndex, +hash_tok: std.zig.Ast.TokenIndex, +global_cache: Cache.Directory, +parent_package_root: Path, +parent_manifest_ast: ?*const std.zig.Ast, +prog_node: *std.Progress.Node, +http_client: *std.http.Client, +thread_pool: *ThreadPool, +job_queue: *JobQueue, +wait_group: *WaitGroup, +/// Whether to apply the Linux/btrfs file system workaround in `run` (see issue #17095). +work_around_btrfs_bug: bool, + +// Above this are fields provided as inputs to `run`. +// Below this are fields populated by `run`. + +/// This will either be relative to `global_cache`, or to the build root of +/// the root package. +package_root: Path, +error_bundle: std.zig.ErrorBundle.Wip, +manifest: ?Manifest, +manifest_ast: ?*std.zig.Ast, +actual_hash: Digest, +/// Fetch logic notices whether a package has a build.zig file and sets this flag. +has_build_zig: bool, +/// Indicates whether the task aborted due to an out-of-memory condition. +oom_flag: bool, + +pub const JobQueue = struct { + mutex: std.Thread.Mutex = .{}, +}; + +pub const Digest = [Manifest.Hash.digest_length]u8; +pub const MultiHashHexDigest = [hex_multihash_len]u8; + +pub const Path = struct { + root_dir: Cache.Directory, + /// The path, relative to the root dir, that this `Path` represents. + /// Empty string means the root_dir is the path. + sub_path: []const u8 = "", +}; + +pub const Location = union(enum) { + remote: Remote, + relative_path: []const u8, + + pub const Remote = struct { + url: []const u8, + /// If this is null it means the user omitted the hash field from a dependency. + /// It will be an error but the logic should still fetch and print the discovered hash. + hash: ?[hex_multihash_len]u8, + }; +}; + +pub const RunError = error{ + OutOfMemory, + /// This error code is intended to be handled by inspecting the + /// `error_bundle` field. + FetchFailed, +}; + +pub fn run(f: *Fetch) RunError!void { + const eb = &f.error_bundle; + const arena = f.arena.allocator(); + + // Check the global zig package cache to see if the hash already exists. If + // so, load, parse, and validate the build.zig.zon file therein, and skip + // ahead to queuing up jobs for dependencies. Likewise if the location is a + // relative path, treat this the same as a cache hit. Otherwise, proceed. + + const remote = switch (f.location) { + .relative_path => |sub_path| { + if (fs.path.isAbsolute(sub_path)) return f.fail( + f.location_tok, + try eb.addString("expected path relative to build root; found absolute path"), + ); + if (f.hash_tok != 0) return f.fail( + f.hash_tok, + try eb.addString("path-based dependencies are not hashed"), + ); + f.package_root = try f.parent_package_root.join(arena, sub_path); + try loadManifest(f, f.package_root); + // Package hashes are used as unique identifiers for packages, so + // we still need one for relative paths.
+ const hash = h: { + var hasher = Manifest.Hash.init(.{}); + // This hash is a tuple of: + // * whether it is relative to the global cache directory or to the root package + // * the relative file path from there to the build root of the package + hasher.update(if (f.package_root.root_dir.handle == f.global_cache.handle) + &package_hash_prefix_cached + else + &package_hash_prefix_project); + hasher.update(f.package_root.sub_path); + break :h hasher.finalResult(); + }; + return queueJobsForDeps(f, hash); + }, + .remote => |remote| remote, + }; + const s = fs.path.sep_str; + if (remote.hash) |expected_hash| { + const pkg_sub_path = "p" ++ s ++ expected_hash; + if (f.global_cache.handle.access(pkg_sub_path, .{})) |_| { + f.package_root = .{ + .root_dir = f.global_cache, + .sub_path = pkg_sub_path, + }; + try loadManifest(f, f.package_root); + return queueJobsForDeps(f, expected_hash); + } else |err| switch (err) { + error.FileNotFound => {}, + else => |e| { + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to open global package cache directory '{s}': {s}", .{ + try f.global_cache.join(arena, &.{pkg_sub_path}), @errorName(e), + }), + .src_loc = .none, + .notes_len = 0, + }); + return error.FetchFailed; + }, + } + } + + // Fetch and unpack the remote into a temporary directory. + + const uri = std.Uri.parse(remote.url) catch |err| return f.fail( + f.location_tok, + "invalid URI: {s}", + .{@errorName(err)}, + ); + const rand_int = std.crypto.random.int(u64); + const tmp_dir_sub_path = "tmp" ++ s ++ Manifest.hex64(rand_int); + + var tmp_directory: Cache.Directory = .{ + .path = try f.global_cache.join(arena, &.{tmp_dir_sub_path}), + .handle = (try f.global_cache.handle.makeOpenPathIterable(tmp_dir_sub_path, .{})).dir, + }; + defer tmp_directory.handle.close(); + + var resource = try f.initResource(uri); + defer resource.deinit(); // releases more than memory + + try f.unpackResource(&resource, uri.path, tmp_directory); + + // Load, parse, and validate the unpacked build.zig.zon file. It is allowed + // for the file to be missing, in which case this fetched package is + // considered to be a "naked" package. + try loadManifest(f, .{ .root_dir = tmp_directory }); + + // Apply the manifest's inclusion rules to the temporary directory by + // deleting excluded files. If any error occurred for files that were + // ultimately excluded, those errors should be ignored, such as failure to + // create symlinks that weren't supposed to be included anyway. + + // Empty directories have already been omitted by `unpackResource`. + + const filter: Filter = .{ + .include_paths = if (f.manifest) |m| m.paths else .{}, + }; + + // Compute the package hash based on the remaining files in the temporary + // directory. + + if (builtin.os.tag == .linux and f.work_around_btrfs_bug) { + // https://github.com/ziglang/zig/issues/17095 + tmp_directory.handle.close(); + const iterable_dir = f.global_cache.handle.makeOpenPathIterable(tmp_dir_sub_path, .{}) catch + @panic("btrfs workaround failed"); + tmp_directory.handle = iterable_dir.dir; + } + + f.actual_hash = try computeHash(f, .{ .dir = tmp_directory.handle }, filter); + + // Rename the temporary directory into the global zig package cache + // directory. If the hash already exists, delete the temporary directory + // and leave the zig package cache directory untouched as it may be in use + // by the system. This is done even if the hash is invalid, in case the + // package with the different hash is used in the future.
+ + const dest_pkg_sub_path = "p" ++ s ++ Manifest.hexDigest(f.actual_hash); + try renameTmpIntoCache(f.global_cache.handle, tmp_dir_sub_path, dest_pkg_sub_path); + + // Validate the computed hash against the expected hash. If invalid, this + // job is done. + + const actual_hex = Manifest.hexDigest(f.actual_hash); + if (remote.hash) |declared_hash| { + if (!std.mem.eql(u8, declared_hash, &actual_hex)) { + return f.fail(f.hash_tok, "hash mismatch: manifest declares {s} but the fetched package has {s}", .{ + declared_hash, actual_hex, + }); + } + } else { + const notes_len = 1; + try f.addErrorWithNotes(notes_len, f.location_tok, "dependency is missing hash field"); + const notes_start = try eb.reserveNotes(notes_len); + eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{ + .msg = try eb.printString("expected .hash = \"{s}\",", .{&actual_hex}), + })); + return error.FetchFailed; + } + + // Spawn a new fetch job for each dependency in the manifest file. Use + // a mutex and a hash map so that redundant jobs do not get queued up. + return queueJobsForDeps(f, f.actual_hash); +} + +/// This function populates `f.manifest` or leaves it `null`. +fn loadManifest(f: *Fetch, pkg_root: Path) RunError!void { + const eb = &f.error_bundle; + const arena = f.arena.allocator(); + const manifest_bytes = pkg_root.readFileAllocOptions( + arena, + Manifest.basename, + Manifest.max_bytes, + null, + 1, + 0, + ) catch |err| switch (err) { + error.FileNotFound => return, + else => |e| { + const file_path = try pkg_root.join(arena, .{Manifest.basename}); + try eb.addRootErrorMessage(.{ + .msg = try eb.printString("unable to load package manifest '{s}': {s}", .{ + file_path, @errorName(e), + }), + .src_loc = .none, + .notes_len = 0, + }); + return error.FetchFailed; + }, + }; + + const ast = try arena.create(std.zig.Ast); + ast.* = try std.zig.Ast.parse(arena, manifest_bytes, .zon); + f.manifest_ast = ast; + + if (ast.errors.len > 0) { + const file_path = try pkg_root.join(arena, .{Manifest.basename}); + try main.putAstErrorsIntoBundle(arena, ast.*, file_path, eb); + return error.FetchFailed; + } + + f.manifest = try Manifest.parse(arena, ast.*); + + if (f.manifest.?.errors.len > 0) { + const file_path = try pkg_root.join(arena, .{Manifest.basename}); + const token_starts = ast.tokens.items(.start); + + for (f.manifest.?.errors) |msg| { + const start_loc = ast.tokenLocation(0, msg.tok); + + try eb.addRootErrorMessage(.{ + .msg = try eb.addString(msg.msg), + .src_loc = try eb.addSourceLocation(.{ + .src_path = try eb.addString(file_path), + .span_start = token_starts[msg.tok], + .span_end = @intCast(token_starts[msg.tok] + ast.tokenSlice(msg.tok).len), + .span_main = token_starts[msg.tok] + msg.off, + .line = @intCast(start_loc.line), + .column = @intCast(start_loc.column), + .source_line = try eb.addString(ast.source[start_loc.line_start..start_loc.line_end]), + }), + .notes_len = 0, + }); + } + return error.FetchFailed; + } +} + +fn queueJobsForDeps(f: *Fetch, hash: Digest) RunError!void { + // If the package does not have a build.zig.zon file then there are no dependencies. + const manifest = f.manifest orelse return; + + const new_fetches = nf: { + // Grab the new tasks into a temporary buffer so we can unlock that mutex + // as fast as possible. + // This over-allocates by the number of fetches that get skipped by the + // `continue` in the loop below.
+ const new_fetches = try f.arena.allocator().alloc(Fetch, manifest.dependencies.count()); + var new_fetch_index: usize = 0; + + f.job_queue.lock(); + defer f.job_queue.unlock(); + + // It is impossible for there to be a collision here. Consider all three cases: + // * Correct hash is provided by manifest. + // - Redundant jobs are skipped in the loop below. + // * Incorrect hash is provided by manifest. + // - Hash mismatch error emitted; `queueJobsForDeps` is not called. + // * Hash is not provided by manifest. + // - Hash missing error emitted; `queueJobsForDeps` is not called. + try f.job_queue.finish(hash, f, new_fetches.len); + + for (manifest.dependencies.values()) |dep| { + const location: Location = switch (dep.location) { + .url => |url| .{ .remote = .{ + .url = url, + .hash = if (dep.hash) |h| h[0..hex_multihash_len].* else null, + } }, + .path => |path| .{ .relative_path = path }, + }; + const new_fetch = &new_fetches[new_fetch_index]; + const already_done = f.job_queue.add(location, new_fetch); + if (already_done) continue; + new_fetch_index += 1; + + new_fetch.* = .{ + .gpa = f.gpa, + .arena = std.heap.ArenaAllocator.init(f.gpa), + .location = location, + .location_tok = dep.location_tok, + .hash_tok = dep.hash_tok, + .global_cache = f.global_cache, + .parent_package_root = f.package_root, + .parent_manifest_ast = f.manifest_ast.?, + .prog_node = f.prog_node, + .http_client = f.http_client, + .thread_pool = f.thread_pool, + .job_queue = f.job_queue, + .wait_group = f.wait_group, + .work_around_btrfs_bug = f.work_around_btrfs_bug, + + .package_root = undefined, + .error_bundle = .{}, + .manifest = null, + .manifest_ast = null, + .actual_hash = undefined, + .has_build_zig = false, + .oom_flag = false, + }; + } + + break :nf new_fetches[0..new_fetch_index]; + }; + + // Now it's time to give tasks to the thread pool. + for (new_fetches) |*new_fetch| { + f.wait_group.start(); + f.thread_pool.spawn(workerRun, .{new_fetch}) catch |err| switch (err) { + error.OutOfMemory => { + new_fetch.oom_flag = true; + f.wait_group.finish(); + continue; + }, + }; + } +} + +fn workerRun(f: *Fetch) void { + defer f.wait_group.finish(); + run(f) catch |err| switch (err) { + error.OutOfMemory => f.oom_flag = true, + error.FetchFailed => {}, // See `error_bundle`.
+ }; +} + +fn fail(f: *Fetch, msg_tok: std.zig.Ast.TokenIndex, msg_str: u32) RunError!void { + const ast = f.parent_manifest_ast; + const token_starts = ast.tokens.items(.start); + const start_loc = ast.tokenLocation(0, msg_tok); + const eb = &f.error_bundle; + const file_path = try f.parent_package_root.join(f.arena, Manifest.basename); + const msg_off = 0; + + try eb.addRootErrorMessage(.{ + .msg = msg_str, + .src_loc = try eb.addSourceLocation(.{ + .src_path = try eb.addString(file_path), + .span_start = token_starts[msg_tok], + .span_end = @intCast(token_starts[msg_tok] + ast.tokenSlice(msg_tok).len), + .span_main = token_starts[msg_tok] + msg_off, + .line = @intCast(start_loc.line), + .column = @intCast(start_loc.column), + .source_line = try eb.addString(ast.source[start_loc.line_start..start_loc.line_end]), + }), + .notes_len = 0, + }); + + return error.FetchFailed; +} + +const Resource = union(enum) { + file: fs.File, + http_request: std.http.Client.Request, + git_fetch_stream: git.Session.FetchStream, + dir: fs.IterableDir, +}; + +const FileType = enum { + tar, + @"tar.gz", + @"tar.xz", + git_pack, + + fn fromPath(file_path: []const u8) ?FileType { + if (ascii.endsWithIgnoreCase(file_path, ".tar")) return .tar; + if (ascii.endsWithIgnoreCase(file_path, ".tar.gz")) return .@"tar.gz"; + if (ascii.endsWithIgnoreCase(file_path, ".tar.xz")) return .@"tar.xz"; + return null; + } + + /// Parameter is a content-disposition header value. + fn fromContentDisposition(cd_header: []const u8) ?FileType { + const attach_end = ascii.indexOfIgnoreCase(cd_header, "attachment;") orelse + return null; + + var value_start = ascii.indexOfIgnoreCasePos(cd_header, attach_end + 1, "filename") orelse + return null; + value_start += "filename".len; + if (cd_header[value_start] == '*') { + value_start += 1; + } + if (cd_header[value_start] != '=') return null; + value_start += 1; + + var value_end = std.mem.indexOfPos(u8, cd_header, value_start, ";") orelse cd_header.len; + if (cd_header[value_end - 1] == '\"') { + value_end -= 1; + } + return fromPath(cd_header[value_start..value_end]); + } + + test fromContentDisposition { + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attaChment; FILENAME=\"stuff.tar.gz\"; size=42")); + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attachment; filename*=\"stuff.tar.gz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.xz"), fromContentDisposition("ATTACHMENT; filename=\"stuff.tar.xz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.xz"), fromContentDisposition("attachment; FileName=\"stuff.tar.xz\"")); + try std.testing.expectEqual(@as(?FileType, .@"tar.gz"), fromContentDisposition("attachment; FileName*=UTF-8\'\'xyz%2Fstuff.tar.gz")); + + try std.testing.expect(fromContentDisposition("attachment FileName=\"stuff.tar.gz\"") == null); + try std.testing.expect(fromContentDisposition("attachment; FileName=\"stuff.tar\"") == null); + try std.testing.expect(fromContentDisposition("attachment; FileName\"stuff.gz\"") == null); + try std.testing.expect(fromContentDisposition("attachment; size=42") == null); + try std.testing.expect(fromContentDisposition("inline; size=42") == null); + try std.testing.expect(fromContentDisposition("FileName=\"stuff.tar.gz\"; attachment;") == null); + try std.testing.expect(fromContentDisposition("FileName=\"stuff.tar.gz\";") == null); + } +}; + +fn initResource(f: *Fetch, uri: std.Uri) RunError!Resource { + const gpa = f.gpa; + const arena = 
+    const eb = &f.error_bundle;
+
+    if (ascii.eqlIgnoreCase(uri.scheme, "file")) return .{
+        .file = try f.parent_package_root.openFile(uri.path, .{}),
+    };
+
+    if (ascii.eqlIgnoreCase(uri.scheme, "http") or
+        ascii.eqlIgnoreCase(uri.scheme, "https"))
+    {
+        var h = std.http.Headers{ .allocator = gpa };
+        defer h.deinit();
+
+        var req = try f.http_client.request(.GET, uri, h, .{});
+        errdefer req.deinit(); // releases more than memory
+
+        try req.start(.{});
+        try req.wait();
+
+        if (req.response.status != .ok) {
+            return f.fail(f.location_tok, "expected response status '200 OK', got '{d} {s}'", .{
+                @intFromEnum(req.response.status), req.response.status.phrase() orelse "",
+            });
+        }
+
+        return .{ .http_request = req };
+    }
+
+    if (ascii.eqlIgnoreCase(uri.scheme, "git+http") or
+        ascii.eqlIgnoreCase(uri.scheme, "git+https"))
+    {
+        var transport_uri = uri;
+        transport_uri.scheme = uri.scheme["git+".len..];
+        var redirect_uri: []u8 = undefined;
+        var session: git.Session = .{ .transport = f.http_client, .uri = transport_uri };
+        session.discoverCapabilities(gpa, &redirect_uri) catch |e| switch (e) {
+            error.Redirected => {
+                defer gpa.free(redirect_uri);
+                return f.fail(f.location_tok, "repository moved to {s}", .{redirect_uri});
+            },
+            else => |other| return other,
+        };
+
+        const want_oid = want_oid: {
+            const want_ref = uri.fragment orelse "HEAD";
+            if (git.parseOid(want_ref)) |oid| break :want_oid oid else |_| {}
+
+            const want_ref_head = try std.fmt.allocPrint(arena, "refs/heads/{s}", .{want_ref});
+            const want_ref_tag = try std.fmt.allocPrint(arena, "refs/tags/{s}", .{want_ref});
+
+            var ref_iterator = try session.listRefs(gpa, .{
+                .ref_prefixes = &.{ want_ref, want_ref_head, want_ref_tag },
+                .include_peeled = true,
+            });
+            defer ref_iterator.deinit();
+            while (try ref_iterator.next()) |ref| {
+                if (std.mem.eql(u8, ref.name, want_ref) or
+                    std.mem.eql(u8, ref.name, want_ref_head) or
+                    std.mem.eql(u8, ref.name, want_ref_tag))
+                {
+                    break :want_oid ref.peeled orelse ref.oid;
+                }
+            }
+            return f.fail(f.location_tok, "ref not found: {s}", .{want_ref});
+        };
+        if (uri.fragment == null) {
+            const notes_len = 1;
+            try f.addErrorWithNotes(notes_len, f.location_tok, "url field is missing an explicit ref");
+            const notes_start = try eb.reserveNotes(notes_len);
+            eb.extra.items[notes_start] = @intFromEnum(try eb.addErrorMessage(.{
+                .msg = try eb.printString("try .url = \"{+/}#{}\",", .{
+                    uri, std.fmt.fmtSliceHexLower(&want_oid),
+                }),
+            }));
+            return error.FetchFailed;
+        }
+
+        var want_oid_buf: [git.fmt_oid_length]u8 = undefined;
+        _ = std.fmt.bufPrint(&want_oid_buf, "{}", .{
+            std.fmt.fmtSliceHexLower(&want_oid),
+        }) catch unreachable;
+        var fetch_stream = try session.fetch(gpa, &.{&want_oid_buf});
+        errdefer fetch_stream.deinit();
+
+        return .{ .git_fetch_stream = fetch_stream };
+    }
+
+    return f.fail(f.location_tok, "unsupported URL scheme: {s}", .{uri.scheme});
+}
+
+fn unpackResource(
+    f: *Fetch,
+    resource: *Resource,
+    uri_path: []const u8,
+    tmp_directory: Cache.Directory,
+) RunError!void {
+    const file_type = switch (resource.*) {
+        .file => FileType.fromPath(uri_path) orelse
+            return f.fail(f.location_tok, "unknown file type: '{s}'", .{uri_path}),
+
+        .http_request => |req| ft: {
+            // The Content-Type header takes precedence.
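+            // The URL itself may carry no useful extension (e.g. a tarball
+            // served from an API endpoint), so the response headers are
+            // consulted before falling back to the URI path.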
+            const content_type = req.response.headers.getFirstValue("Content-Type") orelse
+                return f.fail(f.location_tok, "missing 'Content-Type' header", .{});
+
+            if (ascii.eqlIgnoreCase(content_type, "application/x-tar"))
+                break :ft .tar;
+
+            if (ascii.eqlIgnoreCase(content_type, "application/gzip") or
+                ascii.eqlIgnoreCase(content_type, "application/x-gzip") or
+                ascii.eqlIgnoreCase(content_type, "application/tar+gzip"))
+            {
+                break :ft .@"tar.gz";
+            }
+
+            if (ascii.eqlIgnoreCase(content_type, "application/x-xz"))
+                break :ft .@"tar.xz";
+
+            if (!ascii.eqlIgnoreCase(content_type, "application/octet-stream")) {
+                return f.fail(f.location_tok, "unrecognized 'Content-Type' header: '{s}'", .{
+                    content_type,
+                });
+            }
+
+            // Next, fall back to the filename given in a
+            // 'Content-Disposition: attachment' header.
+            if (req.response.headers.getFirstValue("Content-Disposition")) |cd_header| {
+                break :ft FileType.fromContentDisposition(cd_header) orelse
+                    return f.fail(
+                        f.location_tok,
+                        "unsupported Content-Disposition header value: '{s}' for Content-Type=application/octet-stream",
+                        .{cd_header},
+                    );
+            }
+
+            // Finally, the path from the URI is used.
+            break :ft FileType.fromPath(uri_path) orelse
+                return f.fail(f.location_tok, "unknown file type: '{s}'", .{uri_path});
+        },
+        .git_fetch_stream => .git_pack,
+        .dir => |dir| {
+            try f.recursiveDirectoryCopy(dir, tmp_directory.handle);
+            return;
+        },
+    };
+
+    switch (file_type) {
+        .tar => try unpackTarball(f, tmp_directory.handle, resource.reader()),
+        .@"tar.gz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.gzip),
+        .@"tar.xz" => try unpackTarballCompressed(f, tmp_directory.handle, resource, std.compress.xz),
+        .git_pack => try unpackGitPack(f, tmp_directory.handle, resource),
+    }
+}
+
+fn unpackTarballCompressed(
+    f: *Fetch,
+    out_dir: fs.Dir,
+    resource: *Resource,
+    comptime Compression: type,
+) RunError!void {
+    const gpa = f.gpa;
+    const reader = resource.reader();
+    var br = std.io.bufferedReaderSize(std.crypto.tls.max_ciphertext_record_len, reader);
+
+    var decompress = try Compression.decompress(gpa, br.reader());
+    defer decompress.deinit();
+
+    return unpackTarball(f, out_dir, decompress.reader());
+}
+
+fn unpackTarball(f: *Fetch, out_dir: fs.Dir, reader: anytype) RunError!void {
+    const eb = &f.error_bundle;
+
+    var diagnostics: std.tar.Options.Diagnostics = .{ .allocator = f.gpa };
+    defer diagnostics.deinit();
+
+    try std.tar.pipeToFileSystem(out_dir, reader, .{
+        .diagnostics = &diagnostics,
+        .strip_components = 1,
+        // TODO: we would like to set this to executable_bit_only, but two
+        // things need to happen before that:
+        // 1. the tar implementation needs to support it
+        // 2. the hashing algorithm here needs to support detecting the is_executable
+        //    bit on Windows from the ACLs (see the isExecutable function).
+        .mode_mode = .ignore,
+        .filter = .{ .exclude_empty_directories = true },
+    });
+
+    if (diagnostics.errors.items.len > 0) {
+        const notes_len: u32 = @intCast(diagnostics.errors.items.len);
+        try f.addErrorWithNotes(notes_len, f.location_tok, "unable to unpack tarball");
+        const notes_start = try eb.reserveNotes(notes_len);
+        for (diagnostics.errors.items, notes_start..) |item, note_i| {
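+            // Every diagnostic becomes a note attached to the root
+            // "unable to unpack tarball" error, so the full list of problems
+            // is reported in a single error bundle.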
+            switch (item) {
+                .unable_to_create_sym_link => |info| {
+                    eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{
+                        .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{
+                            info.file_name, info.link_name, @errorName(info.code),
+                        }),
+                    }));
+                },
+                .unsupported_file_type => |info| {
+                    eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{
+                        .msg = try eb.printString("file '{s}' has unsupported type '{c}'", .{
+                            info.file_name, @intFromEnum(info.file_type),
+                        }),
+                    }));
+                },
+            }
+        }
+        return error.FetchFailed;
+    }
+}
+
+// TODO: `unpackResource` does not yet thread `want_oid` through to this
+// function; it is resolved in `initResource` and needs to be carried
+// alongside the fetch stream.
+fn unpackGitPack(
+    f: *Fetch,
+    out_dir: fs.Dir,
+    resource: *Resource,
+    want_oid: git.Oid,
+) !void {
+    const eb = &f.error_bundle;
+    const gpa = f.gpa;
+    const reader = resource.reader();
+    // The .git directory is used to store the packfile and associated index, but
+    // we do not attempt to replicate the exact structure of a real .git
+    // directory, since that isn't relevant for fetching a package.
+    {
+        var pack_dir = try out_dir.makeOpenPath(".git", .{});
+        defer pack_dir.close();
+        var pack_file = try pack_dir.createFile("pkg.pack", .{ .read = true });
+        defer pack_file.close();
+        var fifo = std.fifo.LinearFifo(u8, .{ .Static = 4096 }).init();
+        try fifo.pump(reader, pack_file.writer());
+        try pack_file.sync();
+
+        var index_file = try pack_dir.createFile("pkg.idx", .{ .read = true });
+        defer index_file.close();
+        {
+            var index_prog_node = f.prog_node.start("Index pack", 0);
+            defer index_prog_node.end();
+            index_prog_node.activate();
+            var index_buffered_writer = std.io.bufferedWriter(index_file.writer());
+            try git.indexPack(gpa, pack_file, index_buffered_writer.writer());
+            try index_buffered_writer.flush();
+            try index_file.sync();
+        }
+
+        {
+            var checkout_prog_node = f.prog_node.start("Checkout", 0);
+            defer checkout_prog_node.end();
+            checkout_prog_node.activate();
+            var repository = try git.Repository.init(gpa, pack_file, index_file);
+            defer repository.deinit();
+            var diagnostics: git.Diagnostics = .{ .allocator = gpa };
+            defer diagnostics.deinit();
+            try repository.checkout(out_dir, want_oid, &diagnostics);
+
+            if (diagnostics.errors.items.len > 0) {
+                const notes_len: u32 = @intCast(diagnostics.errors.items.len);
+                try f.addErrorWithNotes(notes_len, f.location_tok, "unable to unpack packfile");
+                const notes_start = try eb.reserveNotes(notes_len);
+                for (diagnostics.errors.items, notes_start..) |item, note_i| {
+                    switch (item) {
+                        .unable_to_create_sym_link => |info| {
+                            eb.extra.items[note_i] = @intFromEnum(try eb.addErrorMessage(.{
+                                .msg = try eb.printString("unable to create symlink from '{s}' to '{s}': {s}", .{
+                                    info.file_name, info.link_name, @errorName(info.code),
+                                }),
+                            }));
+                        },
+                    }
+                }
+                return error.FetchFailed;
+            }
+        }
+    }
+
+    try out_dir.deleteTree(".git");
+}
+
+fn recursiveDirectoryCopy(f: *Fetch, dir: fs.IterableDir, tmp_dir: fs.Dir) RunError!void {
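+    // Files are copied first and parent directories are created only on
+    // demand (the `error.FileNotFound` fallback below), so directories that
+    // contain no files never appear in the destination.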
+    var it = try dir.walk(f.gpa);
+    defer it.deinit();
+    while (try it.next()) |entry| {
+        switch (entry.kind) {
+            .directory => {}, // omit empty directories
+            .file => {
+                dir.dir.copyFile(
+                    entry.path,
+                    tmp_dir,
+                    entry.path,
+                    .{},
+                ) catch |err| switch (err) {
+                    error.FileNotFound => {
+                        if (fs.path.dirname(entry.path)) |dirname| try tmp_dir.makePath(dirname);
+                        try dir.dir.copyFile(entry.path, tmp_dir, entry.path, .{});
+                    },
+                    else => |e| return e,
+                };
+            },
+            .sym_link => {
+                var buf: [fs.MAX_PATH_BYTES]u8 = undefined;
+                const link_name = try dir.dir.readLink(entry.path, &buf);
+                // TODO: if this would create a symlink to outside
+                // the destination directory, fail with an error instead.
+                try tmp_dir.symLink(link_name, entry.path, .{});
+            },
+            else => return error.IllegalFileTypeInPackage,
+        }
+    }
+}
+
+pub fn renameTmpIntoCache(
+    cache_dir: fs.Dir,
+    tmp_dir_sub_path: []const u8,
+    dest_dir_sub_path: []const u8,
+) !void {
+    // The destination is expected to be a one-byte directory name followed
+    // by a path separator and the remaining path, e.g. "p/<hex-digest>".
+    assert(dest_dir_sub_path[1] == fs.path.sep);
+    var handled_missing_dir = false;
+    while (true) {
+        cache_dir.rename(tmp_dir_sub_path, dest_dir_sub_path) catch |err| switch (err) {
+            error.FileNotFound => {
+                if (handled_missing_dir) return err;
+                cache_dir.makeDir(dest_dir_sub_path[0..1]) catch |mkd_err| switch (mkd_err) {
+                    error.PathAlreadyExists => handled_missing_dir = true,
+                    else => |e| return e,
+                };
+                continue;
+            },
+            error.PathAlreadyExists, error.AccessDenied => {
+                // The package has already been downloaded and may already be
+                // in use on the system.
+                cache_dir.deleteTree(tmp_dir_sub_path) catch {
+                    // Garbage files left over in zig-cache/tmp/ are, as they
+                    // say on Star Trek, "operating within normal parameters".
+                };
+            },
+            else => |e| return e,
+        };
+        break;
+    }
+}
+
+/// Assumes that files not included in the package have already been filtered
+/// prior to calling this function. This ensures that files not protected by
+/// the hash are not present on the file system. Empty directories are *not
+/// hashed* and must not be present on the file system when calling this
+/// function.
+fn computeHash(f: *Fetch, pkg_dir: fs.IterableDir, filter: Filter) RunError!Digest {
+    // All the path name strings need to be in memory for sorting.
+    const arena = f.arena.allocator();
+    const gpa = f.gpa;
+
+    // Collect all files, recursively, then sort.
+    var all_files = std.ArrayList(*HashedFile).init(gpa);
+    defer all_files.deinit();
+
+    var walker = try pkg_dir.walk(gpa);
+    defer walker.deinit();
+
+    {
+        // The final hash will be a hash of each file hashed independently. This
+        // allows hashing in parallel.
+        var wait_group: WaitGroup = .{};
+        // `computeHash` is called from a worker thread, so there must not be
+        // any waiting without also working, or a deadlock could occur.
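+        // `waitAndWork` makes this thread execute other thread pool tasks
+        // while it waits for the hashing jobs spawned below to complete.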
+        defer wait_group.waitAndWork();
+
+        while (try walker.next()) |entry| {
+            _ = filter; // TODO: apply filter rules here
+
+            const kind: HashedFile.Kind = switch (entry.kind) {
+                .directory => continue,
+                .file => .file,
+                .sym_link => .sym_link,
+                else => return error.IllegalFileTypeInPackage,
+            };
+
+            if (std.mem.eql(u8, entry.path, build_zig_basename))
+                f.has_build_zig = true;
+
+            const hashed_file = try arena.create(HashedFile);
+            const fs_path = try arena.dupe(u8, entry.path);
+            hashed_file.* = .{
+                .fs_path = fs_path,
+                .normalized_path = try normalizePath(arena, fs_path),
+                .kind = kind,
+                .hash = undefined, // to be populated by the worker
+                .failure = undefined, // to be populated by the worker
+            };
+            wait_group.start();
+            f.thread_pool.spawn(workerHashFile, .{ pkg_dir.dir, hashed_file, &wait_group }) catch |err| {
+                // If the task cannot be spawned, the wait group must be
+                // rebalanced before returning, or the `waitAndWork` in the
+                // defer above would wait forever.
+                wait_group.finish();
+                return err;
+            };
+
+            try all_files.append(hashed_file);
+        }
+    }
+
+    std.mem.sortUnstable(*HashedFile, all_files.items, {}, HashedFile.lessThan);
+
+    var hasher = Manifest.Hash.init(.{});
+    var any_failures = false;
+    const eb = &f.error_bundle;
+    for (all_files.items) |hashed_file| {
+        hashed_file.failure catch |err| {
+            any_failures = true;
+            try eb.addRootErrorMessage(.{
+                .msg = try eb.printString("unable to hash: {s}", .{@errorName(err)}),
+                .src_loc = try eb.addSourceLocation(.{
+                    .src_path = try eb.addString(hashed_file.fs_path),
+                    .span_start = 0,
+                    .span_end = 0,
+                    .span_main = 0,
+                }),
+                .notes_len = 0,
+            });
+        };
+        hasher.update(&hashed_file.hash);
+    }
+    if (any_failures) return error.FetchFailed;
+    return hasher.finalResult();
+}
+
+fn workerHashFile(dir: fs.Dir, hashed_file: *HashedFile, wg: *WaitGroup) void {
+    defer wg.finish();
+    hashed_file.failure = hashFileFallible(dir, hashed_file);
+}
+
+fn hashFileFallible(dir: fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void {
+    var buf: [8000]u8 = undefined;
+    var hasher = Manifest.Hash.init(.{});
+    hasher.update(hashed_file.normalized_path);
+    switch (hashed_file.kind) {
+        .file => {
+            var file = try dir.openFile(hashed_file.fs_path, .{});
+            defer file.close();
+            hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) });
+            while (true) {
+                const bytes_read = try file.read(&buf);
+                if (bytes_read == 0) break;
+                hasher.update(buf[0..bytes_read]);
+            }
+        },
+        .sym_link => {
+            const link_name = try dir.readLink(hashed_file.fs_path, &buf);
+            hasher.update(link_name);
+        },
+    }
+    hasher.final(&hashed_file.hash);
+}
+
+fn isExecutable(file: fs.File) !bool {
+    if (builtin.os.tag == .windows) {
+        // TODO check the ACL on Windows.
+        // Until this is implemented, this could be a false negative on
+        // Windows, which is why we do not yet set executable_bit_only above
+        // when unpacking the tarball.
+        return false;
+    } else {
+        const stat = try file.stat();
+        return (stat.mode & std.os.S.IXUSR) != 0;
+    }
+}
+
+const HashedFile = struct {
+    fs_path: []const u8,
+    normalized_path: []const u8,
+    hash: Digest,
+    failure: Error!void,
+    kind: Kind,
+
+    const Error =
+        fs.File.OpenError ||
+        fs.File.ReadError ||
+        fs.File.StatError ||
+        fs.Dir.ReadLinkError;
+
+    const Kind = enum { file, sym_link };
+
+    fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool {
+        _ = context;
+        return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path);
+    }
+};
+
+/// Normalize a file system path so that it is the same on all operating
+/// systems. This converts backslashes into forward slashes.
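+/// For example, "foo\bar\baz.zig" on Windows becomes "foo/bar/baz.zig". On
+/// POSIX systems, where '/' is already the separator, the path is returned
+/// unchanged.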
+fn normalizePath(arena: Allocator, fs_path: []const u8) ![]const u8 {
+    const canonical_sep = '/';
+
+    if (fs.path.sep == canonical_sep)
+        return fs_path;
+
+    const normalized = try arena.dupe(u8, fs_path);
+    for (normalized) |*byte| {
+        switch (byte.*) {
+            fs.path.sep => byte.* = canonical_sep,
+            else => continue,
+        }
+    }
+    return normalized;
+}
+
+pub const Filter = struct {
+    include_paths: std.StringArrayHashMapUnmanaged(void) = .{},
+
+    /// `sub_path` is relative to the tarball root. For example, with
+    /// `include_paths` containing "src", `includePath("src/main.zig")`
+    /// returns true, while `includePath("docs/readme.md")` returns false.
+    pub fn includePath(self: Filter, sub_path: []const u8) bool {
+        if (self.include_paths.count() == 0) return true;
+        if (self.include_paths.contains("")) return true;
+        if (self.include_paths.contains(sub_path)) return true;
+
+        // Check if any included paths are parent directories of sub_path.
+        var dirname = sub_path;
+        while (std.fs.path.dirname(dirname)) |next_dirname| {
+            if (self.include_paths.contains(next_dirname)) return true;
+            dirname = next_dirname;
+        }
+
+        return false;
+    }
+};
+
+const build_zig_basename = @import("../Package.zig").build_zig_basename;
+const hex_multihash_len = 2 * Manifest.multihash_len;
+
+// These are random bytes.
+const package_hash_prefix_cached: [8]u8 = .{ 0x53, 0x7e, 0xfa, 0x94, 0x65, 0xe9, 0xf8, 0x73 };
+const package_hash_prefix_project: [8]u8 = .{ 0xe1, 0x25, 0xee, 0xfa, 0xa6, 0x17, 0x38, 0xcc };
+
+const builtin = @import("builtin");
+const std = @import("std");
+const fs = std.fs;
+const assert = std.debug.assert;
+const ascii = std.ascii;
+const Allocator = std.mem.Allocator;
+const Cache = std.Build.Cache;
+const ThreadPool = std.Thread.Pool;
+const WaitGroup = std.Thread.WaitGroup;
+const Manifest = @import("../Manifest.zig");
+const Fetch = @This();
+const main = @import("../main.zig");
+const git = @import("../git.zig");
diff --git a/src/Package/hash.zig b/src/Package/hash.zig
deleted file mode 100644
index b14ec70244a3..000000000000
--- a/src/Package/hash.zig
+++ /dev/null
@@ -1,153 +0,0 @@
-const builtin = @import("builtin");
-const std = @import("std");
-const fs = std.fs;
-const ThreadPool = std.Thread.Pool;
-const WaitGroup = std.Thread.WaitGroup;
-const Allocator = std.mem.Allocator;
-
-const Hash = @import("../Manifest.zig").Hash;
-
-pub fn compute(thread_pool: *ThreadPool, pkg_dir: fs.IterableDir) ![Hash.digest_length]u8 {
-    const gpa = thread_pool.allocator;
-
-    // We'll use an arena allocator for the path name strings since they all
-    // need to be in memory for sorting.
-    var arena_instance = std.heap.ArenaAllocator.init(gpa);
-    defer arena_instance.deinit();
-    const arena = arena_instance.allocator();
-
-    // TODO: delete files not included in the package prior to computing the package hash.
-    // for example, if the ini file has directives to include/not include certain files,
-    // apply those rules directly to the filesystem right here. This ensures that files
-    // not protected by the hash are not present on the file system.
-
-    // Collect all files, recursively, then sort.
-    var all_files = std.ArrayList(*HashedFile).init(gpa);
-    defer all_files.deinit();
-
-    var walker = try pkg_dir.walk(gpa);
-    defer walker.deinit();
-
-    {
-        // The final hash will be a hash of each file hashed independently. This
-        // allows hashing in parallel.
- var wait_group: WaitGroup = .{}; - defer wait_group.wait(); - - while (try walker.next()) |entry| { - const kind: HashedFile.Kind = switch (entry.kind) { - .directory => continue, - .file => .file, - .sym_link => .sym_link, - else => return error.IllegalFileTypeInPackage, - }; - const hashed_file = try arena.create(HashedFile); - const fs_path = try arena.dupe(u8, entry.path); - hashed_file.* = .{ - .fs_path = fs_path, - .normalized_path = try normalizePath(arena, fs_path), - .kind = kind, - .hash = undefined, // to be populated by the worker - .failure = undefined, // to be populated by the worker - }; - wait_group.start(); - try thread_pool.spawn(workerHashFile, .{ pkg_dir.dir, hashed_file, &wait_group }); - - try all_files.append(hashed_file); - } - } - - std.mem.sortUnstable(*HashedFile, all_files.items, {}, HashedFile.lessThan); - - var hasher = Hash.init(.{}); - var any_failures = false; - for (all_files.items) |hashed_file| { - hashed_file.failure catch |err| { - any_failures = true; - std.log.err("unable to hash '{s}': {s}", .{ hashed_file.fs_path, @errorName(err) }); - }; - hasher.update(&hashed_file.hash); - } - if (any_failures) return error.PackageHashUnavailable; - return hasher.finalResult(); -} - -const HashedFile = struct { - fs_path: []const u8, - normalized_path: []const u8, - hash: [Hash.digest_length]u8, - failure: Error!void, - kind: Kind, - - const Error = - fs.File.OpenError || - fs.File.ReadError || - fs.File.StatError || - fs.Dir.ReadLinkError; - - const Kind = enum { file, sym_link }; - - fn lessThan(context: void, lhs: *const HashedFile, rhs: *const HashedFile) bool { - _ = context; - return std.mem.lessThan(u8, lhs.normalized_path, rhs.normalized_path); - } -}; - -/// Make a file system path identical independently of operating system path inconsistencies. -/// This converts backslashes into forward slashes. -fn normalizePath(arena: Allocator, fs_path: []const u8) ![]const u8 { - const canonical_sep = '/'; - - if (fs.path.sep == canonical_sep) - return fs_path; - - const normalized = try arena.dupe(u8, fs_path); - for (normalized) |*byte| { - switch (byte.*) { - fs.path.sep => byte.* = canonical_sep, - else => continue, - } - } - return normalized; -} - -fn workerHashFile(dir: fs.Dir, hashed_file: *HashedFile, wg: *WaitGroup) void { - defer wg.finish(); - hashed_file.failure = hashFileFallible(dir, hashed_file); -} - -fn hashFileFallible(dir: fs.Dir, hashed_file: *HashedFile) HashedFile.Error!void { - var buf: [8000]u8 = undefined; - var hasher = Hash.init(.{}); - hasher.update(hashed_file.normalized_path); - switch (hashed_file.kind) { - .file => { - var file = try dir.openFile(hashed_file.fs_path, .{}); - defer file.close(); - hasher.update(&.{ 0, @intFromBool(try isExecutable(file)) }); - while (true) { - const bytes_read = try file.read(&buf); - if (bytes_read == 0) break; - hasher.update(buf[0..bytes_read]); - } - }, - .sym_link => { - const link_name = try dir.readLink(hashed_file.fs_path, &buf); - hasher.update(link_name); - }, - } - hasher.final(&hashed_file.hash); -} - -fn isExecutable(file: fs.File) !bool { - if (builtin.os.tag == .windows) { - // TODO check the ACL on Windows. - // Until this is implemented, this could be a false negative on - // Windows, which is why we do not yet set executable_bit_only above - // when unpacking the tarball. 
- return false; - } else { - const stat = try file.stat(); - return (stat.mode & std.os.S.IXUSR) != 0; - } -} diff --git a/src/main.zig b/src/main.zig index 30911b57ba6b..71f200331866 100644 --- a/src/main.zig +++ b/src/main.zig @@ -4704,7 +4704,7 @@ pub fn cmdBuild(gpa: Allocator, arena: Allocator, args: []const []const u8) !voi defer if (cleanup_build_dir) |*dir| dir.close(); const cwd_path = try process.getCwdAlloc(arena); - const build_zig_basename = if (build_file) |bf| fs.path.basename(bf) else "build.zig"; + const build_zig_basename = if (build_file) |bf| fs.path.basename(bf) else Package.build_zig_basename; const build_directory: Compilation.Directory = blk: { if (build_file) |bf| { if (fs.path.dirname(bf)) |dirname| {