-
Notifications
You must be signed in to change notification settings - Fork 273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix(vcs): overflow error when repo contains large number of files #1165
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,10 +8,11 @@ | |
|
||
import * as execa from "execa" | ||
import { join, resolve } from "path" | ||
import { flatten, uniq } from "lodash" | ||
import { flatten } from "lodash" | ||
import { ensureDir, pathExists, stat, createReadStream } from "fs-extra" | ||
import { PassThrough } from "stream" | ||
import * as hasha from "hasha" | ||
import split2 = require("split2") | ||
|
||
import { VcsHandler, RemoteSourceParams, VcsFile, GetFilesParams } from "./vcs" | ||
import { ConfigurationError, RuntimeError } from "../exceptions" | ||
|
@@ -54,7 +55,7 @@ export class GitHandler extends VcsHandler { | |
private gitCli(log: LogEntry, cwd: string): GitCli { | ||
return async (...args: string[]) => { | ||
log.silly(`Calling git with args '${args.join(" ")}`) | ||
const { stdout } = await execa("git", args, { cwd }) | ||
const { stdout } = await execa("git", args, { cwd, maxBuffer: 10 * 1024 * 1024 }) | ||
return stdout.split("\n").filter(line => line.length > 0) | ||
} | ||
} | ||
|
@@ -72,62 +73,97 @@ export class GitHandler extends VcsHandler { | |
} | ||
} | ||
|
||
/** | ||
* Returns a list of files, along with file hashes, under the given path, taking into account the configured | ||
* .ignore files, and the specified include/exclude filters. | ||
*/ | ||
async getFiles({ log, path, include, exclude }: GetFilesParams): Promise<VcsFile[]> { | ||
const git = this.gitCli(log, path) | ||
|
||
let lines: string[] = [] | ||
let ignored: string[] = [] | ||
|
||
/** | ||
* TODO: Replace the relative path handling in this function with a generic convertible path object | ||
* once that's been implemented. | ||
*/ | ||
const gitRoot = (await git("rev-parse", "--show-toplevel"))[0] | ||
|
||
try { | ||
/** | ||
* We need to exclude .garden to avoid errors when path is the project root. This happens e.g. for modules | ||
* whose config is colocated with the project config, and that don't specify include paths/patterns. | ||
*/ | ||
// FIXME: We should use `garden.gardenDirPath` instead of ".garden" since the gardenDirPath | ||
// property is configurable. | ||
lines = await git("ls-files", "-s", "--others", "--exclude=.garden", path) | ||
|
||
// List ignored files for each ignore file. We need to run ls-files twice to get both tracked and untracked files. | ||
const commands = flatten(this.ignoreFiles.map(f => { | ||
const lsIgnoredFiles = ["ls-files", "--ignored", `--exclude-per-directory=${f}`] | ||
const lsUntrackedIgnoredFiles = [...lsIgnoredFiles, "--others"] | ||
|
||
return [lsIgnoredFiles, lsUntrackedIgnoredFiles] | ||
})) | ||
|
||
ignored = uniq(flatten(await Bluebird.map(commands, async (cmd) => git(...cmd, path)))) | ||
} catch (err) { | ||
// if we get 128 we're not in a repo root, so we get no files | ||
if (err.exitCode !== 128) { | ||
throw err | ||
// List modified files, so that we can ensure we have the right hash for them later | ||
const modified = new Set((await this.getModifiedFiles(git, path)) | ||
// The output here is relative to the git root, and not the directory `path` | ||
.map(modifiedRelPath => resolve(gitRoot, modifiedRelPath)), | ||
) | ||
|
||
// List tracked but ignored files (we currently exclude those as well, so we need to query that specially) | ||
const trackedButIgnored = new Set(flatten( | ||
await Promise.all(this.ignoreFiles.map(f => | ||
git("ls-files", "--ignored", "--exclude-per-directory", f), | ||
)), | ||
)) | ||
|
||
// We run ls-files for each ignoreFile and do a manual set-intersection (by counting elements in an object) | ||
// in order to optimize the flow. | ||
const paths: { [path: string]: number } = {} | ||
const files: VcsFile[] = [] | ||
|
||
// This function is called for each line output from the ls-files commands that we run, and populates the | ||
// `files` array. | ||
const handleLine = (data: Buffer) => { | ||
const line = data.toString().trim() | ||
if (!line) { | ||
return | ||
} | ||
} | ||
|
||
const files = await Bluebird.map(lines, async (line) => { | ||
let filePath: string | ||
let hash = "" | ||
|
||
const split = line.trim().split("\t") | ||
|
||
if (split.length === 1) { | ||
// File is untracked | ||
return { path: split[0] } | ||
filePath = split[0] | ||
} else { | ||
return { path: split[1], hash: split[0].split(" ")[1] } | ||
filePath = split[1] | ||
hash = split[0].split(" ")[1] | ||
} | ||
}) | ||
|
||
const modifiedArr = ((await this.getModifiedFiles(git, path)) || []) | ||
.map(modifiedRelPath => resolve(gitRoot, modifiedRelPath)) | ||
const modified = new Set(modifiedArr) | ||
// Ignore files that are tracked but still specified in ignore files | ||
if (trackedButIgnored.has(filePath)) { | ||
return | ||
} | ||
|
||
const resolvedPath = resolve(path, filePath) | ||
|
||
// Add the path to `paths` or increment the counter to indicate how many of the ls-files outputs | ||
// contain the path. | ||
if (paths[resolvedPath]) { | ||
paths[resolvedPath] += 1 | ||
} else { | ||
paths[resolvedPath] = 1 | ||
} | ||
|
||
// We push to the output array when all ls-files commands "agree" that it should be included, | ||
// and it passes through the include/exclude filters. | ||
if (paths[resolvedPath] === this.ignoreFiles.length && matchPath(filePath, include, exclude)) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you help me understand this: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. NVM, I get it. The control flow is a little hard to follow but I'm not sure if there's a way to make it simpler without sacrificing perf. |
||
files.push({ path: resolvedPath, hash }) | ||
} | ||
} | ||
|
||
// We run ls-files for each ignore file and collect each return result line with `handleLine` | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you add to the comment that this is the function that actually triggers the population of the |
||
await Bluebird.map(this.ignoreFiles, async (f) => { | ||
const args = ["ls-files", "-s", "--others", "--exclude", this.gardenDirPath, "--exclude-per-directory", f, path] | ||
const proc = execa("git", args, { cwd: path }) | ||
|
||
// Split the command output by line | ||
const splitStream = split2() | ||
splitStream.on("data", handleLine) | ||
proc.stdout!.pipe(splitStream) | ||
|
||
const filtered = files | ||
.filter(f => matchPath(f.path, include, exclude)) | ||
.filter(f => !ignored.includes(f.path)) | ||
try { | ||
await proc | ||
} catch (err) { | ||
// if we get 128 we're not in a repo root, so we just get no files. Otherwise we throw. | ||
if (err.exitCode !== 128) { | ||
throw err | ||
} | ||
} | ||
}) | ||
|
||
return Bluebird.map(filtered, async (f) => { | ||
// Make sure we have a fresh hash for each file | ||
return Bluebird.map(files, async (f) => { | ||
const resolvedPath = resolve(path, f.path) | ||
if (!f.hash || modified.has(resolvedPath)) { | ||
// If we can't compute the hash, i.e. the file is gone, we filter it out below | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It might be useful to document the sequencing above the function since the control flow is a little hard to follow.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Regardless, we should at least add a comment saying that this function returns files filtered against ignore files, includes and excludes, since the name is pretty generic.