Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(performance): reduce the number of files scanned after merge conflict #946

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
### Added

- The command `ggshield secret scan pre-commit` has a new flag `--merge-skip-unchanged`. It is off by default, if activated,
fnareoh marked this conversation as resolved.
Show resolved Hide resolved
in the case of merge commit, it skips the scan of files that were not modified by merge. This is done for efficiency
but is less secure.
47 changes: 44 additions & 3 deletions ggshield/cmd/secret/scan/precommit.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import os
import subprocess
from pathlib import Path
from typing import Any, List

Expand All @@ -10,18 +12,49 @@
from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.cmd.utils.hooks import check_user_requested_skip
from ggshield.core.scan import Commit, ScanContext, ScanMode
from ggshield.utils.git_shell import check_git_dir
from ggshield.utils.git_shell import check_git_dir, git
from ggshield.verticals.secret import SecretScanCollection, SecretScanner
from ggshield.verticals.secret.output import SecretTextOutputHandler


def check_is_merge_without_conflict() -> bool:
"""Check if the reflog action is a merge without conflict"""
return os.getenv("GIT_REFLOG_ACTION", "").split(" ")[0] == "merge"


def get_merge_branch_from_reflog() -> str:
"""Get the branch that was merged from the reflog"""
return os.getenv("GIT_REFLOG_ACTION", "").split(" ")[-1]


def check_is_merge_with_conflict(cwd: Path) -> bool:
"""Check if MERGE_HEAD exists (meaning we are in a merge with conflict)"""
try:
git(["rev-parse", "--verify", "-q", "MERGE_HEAD"], cwd=cwd)
# MERGE_HEAD exists
return True
except subprocess.CalledProcessError:
# MERGE_HEAD does not exist
return False


@click.command()
@click.option(
"--merge-skip-unchanged",
"-m",
is_flag=True,
help="When scanning a merge commit, skip files that were not modified by the merge"
" (assumes the merged commits are secret free).",
)
@click.argument("precommit_args", nargs=-1, type=click.UNPROCESSED)
@add_secret_scan_common_options()
@click.pass_context
@exception_wrapper
def precommit_cmd(
ctx: click.Context, precommit_args: List[str], **kwargs: Any
ctx: click.Context,
merge_skip_unchanged: bool,
precommit_args: List[str],
**kwargs: Any,
) -> int: # pragma: no cover
"""
Scan as a pre-commit hook all changes that have been staged in a git repository.
Expand All @@ -47,7 +80,15 @@ def precommit_cmd(
target_path=Path.cwd(),
)

commit = Commit.from_staged(ctx_obj.exclusion_regexes)
# Get the commit object
if merge_skip_unchanged and check_is_merge_with_conflict(Path.cwd()):
commit = Commit.from_merge(ctx_obj.exclusion_regexes)
elif merge_skip_unchanged and check_is_merge_without_conflict():
merge_branch = get_merge_branch_from_reflog()
commit = Commit.from_merge(ctx_obj.exclusion_regexes, merge_branch)
else:
commit = Commit.from_staged(ctx_obj.exclusion_regexes)

scanner = SecretScanner(
client=ctx_obj.client,
cache=ctx_obj.cache,
Expand Down
36 changes: 36 additions & 0 deletions ggshield/core/scan/commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@
PATCH_PREFIX,
STAGED_PREFIX,
CommitScannable,
get_diff_files,
get_file_sha_in_ref,
get_file_sha_stage,
parse_patch,
)
from .scannable import Scannable
Expand Down Expand Up @@ -73,6 +76,39 @@ def parser(commit: "Commit") -> Iterable[Scannable]:

return Commit(sha, parser, info)

@staticmethod
def from_merge(
exclusion_regexes: Optional[Set[Pattern[str]]] = None,
merge_branch: str = "MERGE_HEAD",
cwd: Optional[Path] = None,
) -> "Commit":
fnareoh marked this conversation as resolved.
Show resolved Hide resolved

diff_files = get_diff_files(cwd=cwd)

shas_in_merge_branch = dict(
get_file_sha_in_ref(merge_branch, diff_files, cwd=cwd)
)
shas_in_head = dict(get_file_sha_in_ref("HEAD", diff_files, cwd=cwd))

files_to_scan = set()
for path, sha in get_file_sha_stage(diff_files, cwd=cwd):
# The file is either new or different from both HEAD and MERGE_HEAD
if sha not in {shas_in_head.get(path), shas_in_merge_branch.get(path)}:
files_to_scan.add(path)

def parser_merge(commit: "Commit") -> Iterable[Scannable]:
patch = git(
["diff", "--staged", *PATCH_COMMON_ARGS, *files_to_scan], cwd=cwd
)
yield from parse_patch(
STAGED_PREFIX,
DIFF_EMPTY_COMMIT_INFO_BLOCK + patch,
exclusion_regexes,
)

info = CommitInformation.from_staged(cwd)
return Commit(sha=None, patch_parser=parser_merge, info=info)

@staticmethod
def from_staged(
exclusion_regexes: Optional[Set[Pattern[str]]] = None,
Expand Down
38 changes: 36 additions & 2 deletions ggshield/core/scan/commit_utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Iterable, List, Optional, Pattern, Set
from typing import Iterable, List, Optional, Pattern, Set, Tuple

from ggshield.utils.files import is_path_excluded
from ggshield.utils.git_shell import Filemode
from ggshield.utils.git_shell import Filemode, git

from .scannable import Scannable

Expand Down Expand Up @@ -235,3 +235,37 @@ def parse_patch(
else:
msg = f"Could not parse patch: {exc}"
raise PatchParseError(msg)


def get_file_sha_in_ref(
ref: str,
files: List[str],
cwd: Optional[Path] = None,
) -> Iterable[Tuple[str, str]]:
"""
Helper function to get the shas of files in the git reference.
"""
output = git(["ls-tree", "-z", ref] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, _, sha, path = line.split()
yield (path, sha)


def get_file_sha_stage(
files: List[str], cwd: Optional[Path] = None
) -> Iterable[Tuple[str, str]]:
"""
Helper function to get the shas currently staged of files.
"""
output = git(["ls-files", "--stage", "-z"] + files, cwd=cwd)
for line in output.split("\0")[:-1]:
_, sha, _, path = line.split()
yield (path, sha)


def get_diff_files(cwd: Optional[Path] = None) -> List[str]:
"""
Helper function to get the files modified and staged.
"""
output = git(["diff", "--name-only", "--staged", "-z"], cwd=cwd)
return output.split("\0")[:-1] # remove the trailing \0
78 changes: 78 additions & 0 deletions tests/functional/secret/test_merge_commit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from pathlib import Path
from subprocess import CalledProcessError

import pytest

from tests.functional.utils_create_merge_repo import (
SecretLocation,
generate_repo_with_merge_commit,
)


@pytest.mark.parametrize(
"with_conflict",
[
True,
False,
],
)
@pytest.mark.parametrize(
"secret_location",
[
SecretLocation.MASTER_BRANCH,
SecretLocation.FEATURE_BRANCH,
SecretLocation.NO_SECRET,
],
)
@pytest.mark.parametrize(
"merge_skip_unchanged",
[
True,
False,
],
)
def test_merge_commit_no_conflict(
capsys,
tmp_path: Path,
with_conflict: bool,
secret_location: SecretLocation,
merge_skip_unchanged: bool,
) -> None:

if (
secret_location == SecretLocation.MASTER_BRANCH
and with_conflict
and not merge_skip_unchanged
):
with pytest.raises(CalledProcessError):
generate_repo_with_merge_commit(
tmp_path,
with_conflict=with_conflict,
secret_location=secret_location,
merge_skip_unchanged=merge_skip_unchanged,
)

# AND the error message contains the Gitlab Token
captured = capsys.readouterr()
assert "GitLab Token" in captured.err
else:
generate_repo_with_merge_commit(
tmp_path,
with_conflict=with_conflict,
secret_location=secret_location,
merge_skip_unchanged=merge_skip_unchanged,
)


def test_merge_commit_with_conflict_and_secret_in_conflict(
tmp_path: Path,
) -> None:

with pytest.raises(CalledProcessError) as exc:
generate_repo_with_merge_commit(
tmp_path, with_conflict=True, secret_location=SecretLocation.CONFLICT_FILE
)

# AND the error message contains the Gitlab Token
stderr = exc.value.stderr.decode()
assert "GitLab Token" in stderr
Loading
Loading