Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add progress flag to git scans #421

Merged
merged 3 commits into from
Jan 5, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions docs/source/features.rst
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,28 @@ To use ``docker``:
When used this way, `tartufo` will clone the repository to a temporary
directory, scan the local clone, and then delete it.

Displaying Scan Progress
******************************************

When running any Git history scan, you can show scan progress by using
the ``--progress`` or ``-p`` flag.

.. code-block:: sh

$ tartufo scan-local-repo /path/to/my/repo --progress

.. code-block:: text

➜ Scanning master (1 of 59)[17942] [#-----------------------------------] 4% 00:01:26

Legend:
master = current branch being scanned
1 of 59 = position of the current branch (counting from 1) and total number of branches to scan
17942 = number of commits in current branch to process
4% = percentage of commits on current branch completed
00:01:26 = estimated time to complete current branch


Accessing Repositories via SSH from Docker
******************************************

Expand All @@ -76,12 +98,6 @@ When using Docker Desktop for Mac, use ``/run/host-services/ssh-auth.sock`` as
both source and target, then point the environment variable ``SSH_AUTH_SOCK`` to
this same location:

.. code-block:: sh

$ docker run --rm -v "/path/to/my/repo:/git" \
-v /run/host-services/ssh-auth.sock:/run/host-services/ssh-auth.sock \
-e SSH_AUTH_SOCK="/run/host-services/ssh-auth.sock" godaddy/tartufo

Scanning a Folder
+++++++++++++++++++++++++++

Expand Down
10 changes: 10 additions & 0 deletions tartufo/commands/scan_local_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,14 @@
show_default=True,
help="Controls whether the contents of git submodules are scanned",
)
@click.option(
"-p",
"--progress",
is_flag=True,
default=False,
show_default=True,
help="Controls whether to display a progress bar",
)
@click.pass_obj
@click.pass_context
def main(
Expand All @@ -37,13 +45,15 @@ def main(
max_depth: int,
branch: Optional[str],
include_submodules: bool,
progress: bool,
) -> GitRepoScanner:
"""Scan a repository already cloned to your local system."""
git_options = types.GitOptions(
since_commit=since_commit,
max_depth=max_depth,
branch=branch,
include_submodules=include_submodules,
progress=progress,
)
scanner = None
try:
Expand Down
10 changes: 10 additions & 0 deletions tartufo/commands/scan_remote_repo.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,14 @@
show_default=True,
help="Controls whether the contents of git submodules are scanned",
)
@click.option(
"-p",
"--progress",
is_flag=True,
default=False,
show_default=True,
help="Controls whether to display a progress bar",
)
@click.argument("git-url")
@click.pass_obj
@click.pass_context
Expand All @@ -52,13 +60,15 @@ def main(
branch: Optional[str],
work_dir: Optional[str],
include_submodules: bool,
progress: bool,
) -> GitRepoScanner:
"""Automatically clone and scan a remote git repository."""
git_options = types.GitOptions(
since_commit=since_commit,
max_depth=max_depth,
branch=None,
include_submodules=include_submodules,
progress=progress,
)
repo_path: Optional[Path] = None
if work_dir:
Expand Down
1 change: 1 addition & 0 deletions tartufo/commands/update_signatures.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ def scan_local_repo(
max_depth=max_depth,
branch=branch,
include_submodules=include_submodules,
progress=False,
)

with contextlib.redirect_stdout(stdout):
Expand Down
111 changes: 66 additions & 45 deletions tartufo/scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
Any,
Dict,
Generator,
Iterable,
List,
MutableMapping,
Optional,
Expand Down Expand Up @@ -740,12 +741,12 @@ def _iter_diff_index(
file_path = (
delta.new_file.path if delta.new_file.path else delta.old_file.path
)
if delta.status == pygit2.GIT_DELTA_DELETED:
self.logger.debug("Skipping as the file was a git delete operation")
continue
if delta.is_binary:
self.logger.debug("Binary file skipped: %s", file_path)
continue
if delta.status == pygit2.GIT_DELTA_DELETED:
self.logger.debug("Skipping as the file is deleted")
continue
printable_diff: str = patch.text
if not self.global_options.scan_filenames:
# The `printable_diff` contains diff header,
Expand Down Expand Up @@ -835,6 +836,53 @@ def load_repo(self, repo_path: str) -> pygit2.Repository:
except git.GitError as exc:
raise types.GitLocalException(str(exc)) from exc

def _get_chunks(
self, commits: Iterable, already_searched: Set[bytes], branch_name: str
) -> Generator[types.Chunk, None, None]:
diff_hash: bytes
curr_commit: pygit2.Commit = None
prev_commit: pygit2.Commit = None
for curr_commit in commits:
try:
prev_commit = curr_commit.parents[0]
except (IndexError, KeyError, TypeError):
# IndexError: current commit has no parents
# KeyError: current commit has parents which are not local
# If a commit doesn't have a parent skip diff generation since it is the first commit
self.logger.debug(
"Skipping commit %s because it has no parents",
curr_commit.hex,
)
continue
diff_hash = hashlib.md5(
(str(prev_commit) + str(curr_commit)).encode("utf-8")
).digest()
if diff_hash in already_searched:
continue
diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit)
already_searched.add(diff_hash)
diff.find_similar()
for blob, file_path in self._iter_diff_index(diff):
yield types.Chunk(
blob,
file_path,
util.extract_commit_metadata(curr_commit, branch_name),
True,
)

# Finally, yield the first commit to the branch
if curr_commit:
tree: pygit2.Tree = self._repo.revparse_single(curr_commit.hex).tree
tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True)
iter_diff = self._iter_diff_index(tree_diff)
for blob, file_path in iter_diff:
yield types.Chunk(
blob,
file_path,
util.extract_commit_metadata(curr_commit, branch_name),
True,
)

@property
def chunks(self) -> Generator[types.Chunk, None, None]:
"""Yield individual diffs from the repository's history.
Expand Down Expand Up @@ -871,7 +919,8 @@ def chunks(self) -> Generator[types.Chunk, None, None]:
"Branches to be scanned: %s",
", ".join([str(branch) for branch in branches]),
)

branch_cnt = 0
branch_len = len(branches)
for branch_name in branches:
self.logger.info("Scanning branch: %s", branch_name)
if branch_name == "HEAD":
Expand All @@ -882,53 +931,25 @@ def chunks(self) -> Generator[types.Chunk, None, None]:
commits = self._repo.walk(
branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL
)

except AttributeError:
self.logger.debug(
"Skipping branch %s because it cannot be resolved.", branch_name
)
continue
diff_hash: bytes
curr_commit: pygit2.Commit = None
prev_commit: pygit2.Commit = None
for curr_commit in commits:
try:
prev_commit = curr_commit.parents[0]
except (IndexError, KeyError, TypeError):
# IndexError: current commit has no parents
# KeyError: current commit has parents which are not local
# If a commit doesn't have a parent skip diff generation since it is the first commit
self.logger.debug(
"Skipping commit %s because it has no parents", curr_commit.hex
)
continue
diff_hash = hashlib.md5(
(str(prev_commit) + str(curr_commit)).encode("utf-8")
).digest()
if diff_hash in already_searched:
continue
diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit)
already_searched.add(diff_hash)
diff.find_similar()
for blob, file_path in self._iter_diff_index(diff):
yield types.Chunk(
blob,
file_path,
util.extract_commit_metadata(curr_commit, branch_name),
True,
)

# Finally, yield the first commit to the branch
if curr_commit:
tree: pygit2.Tree = self._repo.revparse_single(curr_commit.hex).tree
tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True)
iter_diff = self._iter_diff_index(tree_diff)
for blob, file_path in iter_diff:
yield types.Chunk(
blob,
file_path,
util.extract_commit_metadata(curr_commit, branch_name),
True,
)
show_progress = self.git_options.progress
if show_progress:
branch_cnt = branch_cnt + 1
lcommits = list(commits)
commit_len = len(lcommits)
with click.progressbar(
lcommits,
label=f"➜ Scanning {branch_name} ({branch_cnt} of {branch_len})[{commit_len}]",
) as pcommits:
yield from self._get_chunks(pcommits, already_searched, branch_name)
else:
yield from self._get_chunks(commits, already_searched, branch_name)


class GitPreCommitScanner(GitScanner):
Expand Down
9 changes: 8 additions & 1 deletion tartufo/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,18 @@ class GitOptions:
:param include_submodules: Whether to also scan submodules of the repository
:param progress: Whether to display a progress bar while scanning
"""

__slots__ = ("since_commit", "max_depth", "branch", "include_submodules")
__slots__ = (
"since_commit",
"max_depth",
"branch",
"include_submodules",
"progress",
)
since_commit: Optional[str]
max_depth: int
branch: Optional[str]
include_submodules: bool
progress: bool


@dataclass
Expand Down
45 changes: 45 additions & 0 deletions tests/test_git_repo_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,49 @@ def test_single_branch_is_loaded_if_specified(self):
mock_branch_foo.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL
)

def test_runs_scans_with_progressbar_enabled(self):
    """With ``progress`` enabled, both branches are walked and diffs are iterated."""
    foo_branch = mock.MagicMock()
    bar_branch = mock.MagicMock()
    repo = self.mock_repo.return_value
    repo.listall_branches.return_value = ["foo", "bar"]
    repo.branches = {"foo": foo_branch, "bar": bar_branch}
    self.git_options.progress = True
    test_scanner = scanner.GitRepoScanner(
        self.global_options, self.git_options, "."
    )

    # A three-commit linear history; the oldest commit has no parents.
    root_commit = mock.MagicMock(parents=None)
    middle_commit = mock.MagicMock(parents=[root_commit])
    tip_commit = mock.MagicMock(parents=[middle_commit])
    repo.walk.return_value = [tip_commit, middle_commit, root_commit]

    self.mock_iter_diff.return_value = []
    # Exhaust the generator so that every branch is actually scanned.
    list(test_scanner.chunks)

    expected_walks = [
        mock.call(branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL)
        for branch in (foo_branch, bar_branch)
    ]
    repo.walk.assert_has_calls(expected_walks)

    self.mock_iter_diff.assert_called()

def test_all_branches_are_scanned_for_commits(self):
mock_branch_foo = mock.MagicMock()
mock_branch_bar = mock.MagicMock()
Expand Down Expand Up @@ -249,6 +292,8 @@ def test_all_branches_are_scanned_for_commits(self):
)
)

self.mock_iter_diff.assert_called()

def test_all_commits_are_scanned_for_files(self):
self.mock_repo.return_value.branches = {"foo": mock.MagicMock()}
test_scanner = scanner.GitRepoScanner(
Expand Down