Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use git partial clone and worktree to reduce network/file io #5412

Merged
merged 13 commits into from
Dec 13, 2024
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions pkg/app/piped/deploysource/deploysource.go
Original file line number Diff line number Diff line change
Expand Up @@ -211,6 +211,13 @@ func (p *provider) copy(lw io.Writer) (*DeploySource, error) {
return nil, err
}

// To avoid the git operation on copied source.
// TODO: do not copy the .git directory to improve the performance.
if err := os.RemoveAll(filepath.Join(dest, ".git")); err != nil {
fmt.Fprintf(lw, "Unable to remove the .git directory from the copied deploy source (%v)\n", err)
return nil, err
}

return &DeploySource{
RepoDir: dest,
AppDir: filepath.Join(dest, p.appGitPath.Path),
Expand Down
4 changes: 3 additions & 1 deletion pkg/app/piped/driftdetector/cloudrun/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -221,7 +221,7 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application,
return d.reporter.ReportApplicationSyncState(ctx, app.Id, state)
}

func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.ServiceManifest, error) {
func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.ServiceManifest, error) {
var (
manifestCache = provider.ServiceManifestCache{
AppID: app.Id,
Expand Down Expand Up @@ -263,6 +263,8 @@ func (d *detector) loadHeadServiceManifest(app *model.Application, repo git.Repo
if err != nil {
return provider.ServiceManifest{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err)
}
defer repo.Clean()
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Warashi
it seems that the dir is removed after executing the loadHeadServiceManifest
So we might not call repo.Clean(). WDYT?

		// We have to copy repository into another directory because
		// decrypting the sealed secrets or attaching files might change the git repository.
		if attachmentUsed || encryptionUsed {
			dir, err := os.MkdirTemp("", "detector-git-processing")
			if err != nil {
				return nil, fmt.Errorf("failed to prepare a temporary directory for git repository (%w)", err)
			}
			defer os.RemoveAll(dir)

			repo, err = repo.Copy(filepath.Join(dir, "repo"))
			if err != nil {
				return nil, fmt.Errorf("failed to copy the cloned git repository (%w)", err)
			}
			defer repo.Clean()

			repoDir = repo.GetPath()
			appDir = filepath.Join(repoDir, app.GitPath.Path)
		}

https://github.com/pipe-cd/pipecd/blob/c5216c91bffc906f57b30133f3d0e557f1b1888f/pkg/app/piped/driftdetector/kubernetes/detector.go#L270C1-L281C22

Copy link
Contributor Author

@Warashi Warashi Dec 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@ffjlabo

I think we should call repo.Clean() here because the 'repo.Clean()' not only does the directory deletion.
IMO, we should call some methods like Clean() or Close() everywhere because the caller doesn't know what happens with such methods. The caller only knows the cleanup methods exist.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@Warashi

I think we should call repo.Clean() here because the 'repo.Clean()' not only does the directory deletion.

I got it. The git worktree needs additional operation other than deleting the directory to keep the consistency of the worktree as you said.

IMO, we should call some methods like Clean() or Close() everywhere because the caller doesn't know what happens with such methods. The caller only knows the cleanup methods exist.

Also I agree with it.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A little off topic, but the detector reuses the one repo. loadHeadServiceManifest also uses it as an arg. So I feel a little bit confused about cleaning at this point.

But I noticed that this section is used temporarily and different from the actual repo passed as arg. 🙏


repoDir := repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/app/piped/driftdetector/ecs/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -343,7 +343,7 @@ func ignoreParameters(liveManifests provider.ECSManifests, headManifests provide
return live, head
}

func (d *detector) loadConfigs(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.ECSManifests, error) {
func (d *detector) loadConfigs(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.ECSManifests, error) {
var (
manifestCache = provider.ECSManifestsCache{
AppID: app.Id,
Expand Down Expand Up @@ -387,6 +387,8 @@ func (d *detector) loadConfigs(app *model.Application, repo git.Repo, headCommit
if err != nil {
return provider.ECSManifests{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err)
}
defer repo.Clean()

repoDir := repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/app/piped/driftdetector/kubernetes/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application,
return d.reporter.ReportApplicationSyncState(ctx, app.Id, state)
}

func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application, repo git.Repo, headCommit git.Commit, watchingResourceKinds []provider.APIVersionKind) ([]provider.Manifest, error) {
func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application, repo git.Worktree, headCommit git.Commit, watchingResourceKinds []provider.APIVersionKind) ([]provider.Manifest, error) {
var (
manifestCache = provider.AppManifestsCache{
AppID: app.Id,
Expand Down Expand Up @@ -278,6 +278,8 @@ func (d *detector) loadHeadManifests(ctx context.Context, app *model.Application
if err != nil {
return nil, fmt.Errorf("failed to copy the cloned git repository (%w)", err)
}
defer repo.Clean()

repoDir = repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/app/piped/driftdetector/lambda/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ func ignoreAndSortParameters(headSpec provider.FunctionManifestSpec) provider.Fu
return cloneSpec
}

func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Repo, headCommit git.Commit) (provider.FunctionManifest, error) {
func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Worktree, headCommit git.Commit) (provider.FunctionManifest, error) {
var (
manifestCache = provider.FunctionManifestCache{
AppID: app.Id,
Expand Down Expand Up @@ -312,6 +312,8 @@ func (d *detector) loadHeadFunctionManifest(app *model.Application, repo git.Rep
if err != nil {
return provider.FunctionManifest{}, fmt.Errorf("failed to copy the cloned git repository (%w)", err)
}
defer repo.Clean()

repoDir := repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
}
Expand Down
4 changes: 3 additions & 1 deletion pkg/app/piped/driftdetector/terraform/detector.go
Original file line number Diff line number Diff line change
Expand Up @@ -169,7 +169,7 @@ func (d *detector) check(ctx context.Context) {
}
}

func (d *detector) checkApplication(ctx context.Context, app *model.Application, repo git.Repo, headCommit git.Commit) error {
func (d *detector) checkApplication(ctx context.Context, app *model.Application, repo git.Worktree, headCommit git.Commit) error {
var (
repoDir = repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
Expand Down Expand Up @@ -206,6 +206,8 @@ func (d *detector) checkApplication(ctx context.Context, app *model.Application,
if err != nil {
return fmt.Errorf("failed to copy the cloned git repository (%w)", err)
}
defer repo.Clean()

repoDir = repo.GetPath()
appDir = filepath.Join(repoDir, app.GitPath.Path)
}
Expand Down
4 changes: 2 additions & 2 deletions pkg/app/piped/eventwatcher/eventwatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,7 @@ func (w *watcher) execute(ctx context.Context, repo git.Repo, repoID string, eve
if err != nil {
return fmt.Errorf("failed to create a new temporary directory: %w", err)
}
tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo"))
tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo"))
if err != nil {
return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err)
}
Expand Down Expand Up @@ -495,7 +495,7 @@ func (w *watcher) updateValues(ctx context.Context, repo git.Repo, repoID string
if err != nil {
return fmt.Errorf("failed to create a new temporary directory: %w", err)
}
tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo"))
tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo"))
if err != nil {
return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err)
}
Expand Down
7 changes: 7 additions & 0 deletions pkg/app/pipedv1/deploysource/deploysource.go
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,13 @@ func (p *provider) copy(lw io.Writer) (*DeploySource, error) {
return nil, err
}

// To avoid the git operation on copied source.
// TODO: do not copy the .git directory to improve the performance.
if err := os.RemoveAll(filepath.Join(dest, ".git")); err != nil {
fmt.Fprintf(lw, "Unable to remove the .git directory from the copied deploy source (%v)\n", err)
return nil, err
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How about using a tar trick to avoid .git copy and remove. I think tar is available everywhere and fast enough compared to cp

tar -cpP --exclude=.git -C "$source_dir" . | tar -xpP -C "$target_dir"

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks. I'll try.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@khanhtc1202

We use the alpine as the base image for piped, so I checked the tar command in the piped image.
First, the tar command exists. But, the options differ from the GNU/BSD tar.
For example, short flags -p and -P do not exist.
IMO, it's not good to depend on this tar behavior because users may run the piped on a non-alpine-based OS.
WDYT?

Note: I checked tar --help in the piped container image. I paste the result below.

$ docker run --rm -it --entrypoint sh  ghcr.io/pipe-cd/piped:v0.50.0-23-g8f23da4@sha256:88ec60ebf2b7e3131ff38b1af4a043117ef688bc636ddd95afd6bd368293651b
/ $ tar --help
BusyBox v1.36.1 (2024-06-10 07:11:47 UTC) multi-call binary.

Usage: tar c|x|t [-ZzJjahmvokO] [-f TARFILE] [-C DIR] [-T FILE] [-X FILE] [LONGOPT]... [FILE]...

Create, extract, or list files from a tar file

        c       Create
        x       Extract
        t       List
        -f FILE Name of TARFILE ('-' for stdin/out)
        -C DIR  Change to DIR before operation
        -v      Verbose
        -O      Extract to stdout
        -m      Don't restore mtime
        -o      Don't restore user:group
        -k      Don't replace existing files
        -Z      (De)compress using compress
        -z      (De)compress using gzip
        -J      (De)compress using xz
        -j      (De)compress using bzip2
        --lzma  (De)compress using lzma
        -a      (De)compress based on extension
        -h      Follow symlinks
        -T FILE File with names to include
        -X FILE File with glob patterns to exclude
        --exclude PATTERN       Glob pattern to exclude
        --overwrite             Replace existing files
        --strip-components NUM  NUM of leading components to strip
        --no-recursion          Don't descend in directories
        --numeric-owner         Use numeric user:group
        --no-same-permissions   Don't restore access permissions

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the version without -P, the more universal solution

tar --exclude='.git' -cf - source_dir | tar -xf - -C destination_dir

@Warashi

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OKay, thank you for providing this. I'll use this script.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I tried some tar options and finally reached this commit.
I tested it with busybox's tar (the same as alpine's) and GNU tar.
c5216c9


return &DeploySource{
RepoDir: dest,
AppDir: filepath.Join(dest, p.appGitPath.Path),
Expand Down
4 changes: 2 additions & 2 deletions pkg/app/pipedv1/eventwatcher/eventwatcher.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ func (w *watcher) execute(ctx context.Context, repo git.Repo, repoID string, eve
if err != nil {
return fmt.Errorf("failed to create a new temporary directory: %w", err)
}
tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo"))
tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo"))
if err != nil {
return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err)
}
Expand Down Expand Up @@ -478,7 +478,7 @@ func (w *watcher) updateValues(ctx context.Context, repo git.Repo, repoID string
if err != nil {
return fmt.Errorf("failed to create a new temporary directory: %w", err)
}
tmpRepo, err := repo.Copy(filepath.Join(tmpDir, "tmp-repo"))
tmpRepo, err := repo.CopyToModify(filepath.Join(tmpDir, "tmp-repo"))
if err != nil {
return fmt.Errorf("failed to copy the repository to the temporary directory: %w", err)
}
Expand Down
9 changes: 5 additions & 4 deletions pkg/git/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,7 @@ func (c *client) Clone(ctx context.Context, repoID, remote, branch, destination
return nil, err
}
out, err := retryCommand(3, time.Second, logger, func() ([]byte, error) {
args := []string{"clone", "--mirror", remote, repoCachePath}
args := []string{"clone", "--mirror", "--filter=blob:none", remote, repoCachePath}
args = append(authArgs, args...)
return runGitCommand(ctx, c.gitPath, "", c.envsForRepo(remote), args...)
})
Expand Down Expand Up @@ -214,11 +214,12 @@ func (c *client) Clone(ctx context.Context, repoID, remote, branch, destination
}
}

args := []string{"clone"}
// git worktree add [-f] [--detach] [--checkout] [--lock [--reason <string>]]
// [--orphan] [(-b | -B) <new-branch>] <path> [<commit-ish>]
args := []string{"-C", repoCachePath, "worktree", "add", "--detach", destination}
if branch != "" {
args = append(args, "-b", branch)
args = append(args, branch)
}
args = append(args, repoCachePath, destination)

logger.Info("cloning a repo from cached one in local",
zap.String("src", repoCachePath),
Expand Down
19 changes: 17 additions & 2 deletions pkg/git/gittest/git.mock.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

112 changes: 103 additions & 9 deletions pkg/git/repo.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,8 @@ var (
type Repo interface {
GetPath() string
GetClonedBranch() string
Copy(dest string) (Repo, error)
Copy(dest string) (Worktree, error)
CopyToModify(dest string) (Repo, error)
Comment on lines +38 to +39
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

[IMO] How about using the Worktree and Copy?

	Worktree(dest string) (Worktree, error)
	Copy(dest string) (Repo, error)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It sounds clear, but I want to leave here as-is.
I want to clear when to use the CopyToModify method with its naming.
Almost all of implementation, we don't need modification of git commits/histories, so we should not use CopyToModify, and we can know from its naming.
If we rename them as Worktree and Copy, some implementation may call Copy method mistakenly even if it's enough to call the Worktree method.

WDYT?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Almost all of implementation, we don't need modification of git commits/histories, so we should not use CopyToModify, and we can know from its naming.

I got your point! I agree with it.


ListCommits(ctx context.Context, visionRange string) ([]Commit, error)
GetLatestCommit(ctx context.Context) (Commit, error)
Expand All @@ -52,6 +53,17 @@ type Repo interface {
CommitChanges(ctx context.Context, branch, message string, newBranch bool, changes map[string][]byte, trailers map[string]string) error
}

// Worktree provides functions to get and handle git worktree.
// It is a separate checkout of the repository.
// It is used to make changes to the repository without affecting the main repository.
// Worktree always does checkout with the detached HEAD, so it doesn't affect the main repository.
type Worktree interface {
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added this to restrict operation at worktree.

GetPath() string
Clean() error
Copy(dest string) (Worktree, error)
Checkout(ctx context.Context, commitish string) error
}

type repo struct {
dir string
gitPath string
Expand All @@ -60,6 +72,55 @@ type repo struct {
gitEnvs []string
}

// worktree is a git worktree.
// It is a separate checkout of the repository.
type worktree struct {
base *repo
worktreePath string
}

func (r *worktree) runGitCommand(ctx context.Context, args ...string) ([]byte, error) {
cmd := exec.CommandContext(ctx, r.base.gitPath, args...)
cmd.Dir = r.worktreePath
cmd.Env = append(os.Environ(), r.base.gitEnvs...)
return cmd.CombinedOutput()
}

func (r *worktree) Copy(dest string) (Worktree, error) {
// garbage collecting worktrees
if _, err := r.runGitCommand(context.Background(), "worktree", "prune"); err != nil {
// ignore the error
}

if out, err := r.runGitCommand(context.Background(), "worktree", "add", "--detach", dest); err != nil {
return nil, formatCommandError(err, out)
}

return &worktree{
base: r.base,
worktreePath: dest,
}, nil
}

func (r *worktree) GetPath() string {
return r.worktreePath
}

func (r *worktree) Clean() error {
if out, err := r.base.runGitCommand(context.Background(), "worktree", "remove", r.worktreePath); err != nil {
return formatCommandError(err, out)
}
return nil
}

func (r *worktree) Checkout(ctx context.Context, commitish string) error {
out, err := r.runGitCommand(ctx, "checkout", "--detach", commitish)
if err != nil {
return formatCommandError(err, out)
}
return nil
}

// NewRepo creates a new Repo instance.
func NewRepo(dir, gitPath, remote, clonedBranch string, gitEnvs []string) *repo {
return &repo{
Expand All @@ -81,22 +142,55 @@ func (r *repo) GetClonedBranch() string {
return r.clonedBranch
}

// Copy does copying the repository to the given destination.
// Copy does copying the repository to the given destination using git worktree.
// The repository is cloned to the given destination with the detached HEAD.
// NOTE: the given “dest” must be a path that doesn’t exist yet.
// If you don't, it copies the repo root itself to the given dest as a subdirectory.
func (r *repo) Copy(dest string) (Repo, error) {
cmd := exec.Command("cp", "-rf", r.dir, dest)
out, err := cmd.CombinedOutput()
if err != nil {
// If you don't, you will get an error.
func (r *repo) Copy(dest string) (Worktree, error) {
// garbage collecting worktrees
if _, err := r.runGitCommand(context.Background(), "worktree", "prune"); err != nil {
// ignore the error
}

if out, err := r.runGitCommand(context.Background(), "worktree", "add", "--detach", dest); err != nil {
return nil, formatCommandError(err, out)
}

return &repo{
return &worktree{
base: r,
worktreePath: dest,
}, nil
}

// CopyToModify does cloning the repository to the given destination.
// The repository is cloned to the given destination with the .
// NOTE: the given “dest” must be a path that doesn’t exist yet.
// If you don't, you will get an error.
func (r *repo) CopyToModify(dest string) (Repo, error) {
cmd := exec.Command(r.gitPath, "clone", r.dir, dest)
if out, err := cmd.CombinedOutput(); err != nil {
return nil, formatCommandError(err, out)
}

cloned := &repo{
dir: dest,
gitPath: r.gitPath,
remote: r.remote,
clonedBranch: r.clonedBranch,
}, nil
gitEnvs: r.gitEnvs,
}

// because we did a local cloning so set the remote url of origin
if err := cloned.setRemote(context.Background(), r.remote); err != nil {
return nil, err
}

// fetch the latest changes which doesn't exist in the local repository
if out, err := cloned.runGitCommand(context.Background(), "fetch"); err != nil {
return nil, formatCommandError(err, out)
}

return cloned, nil
}

// ListCommits returns a list of commits in a given revision range.
Expand Down
Loading
Loading