From 6b82697c29269c34d43b3e57ed07b49a1c9e2f4d Mon Sep 17 00:00:00 2001 From: dclayton Date: Wed, 4 Jan 2023 10:52:26 -0700 Subject: [PATCH 1/2] feat: add progress flag to git scans --- docs/source/features.rst | 28 ++++-- tartufo/commands/scan_local_repo.py | 10 +++ tartufo/commands/scan_remote_repo.py | 10 +++ tartufo/commands/update_signatures.py | 1 + tartufo/scanner.py | 118 +++++++++++++++----------- tartufo/types.py | 9 +- tests/test_git_repo_scanner.py | 45 ++++++++++ 7 files changed, 166 insertions(+), 55 deletions(-) diff --git a/docs/source/features.rst b/docs/source/features.rst index cdae273d..209c777a 100644 --- a/docs/source/features.rst +++ b/docs/source/features.rst @@ -54,6 +54,28 @@ To use ``docker``: When used this way, `tartufo` will clone the repository to a temporary directory, scan the local clone, and then delete it. +Displaying Scan Progress +****************************************** + +When running any Git history scan, you can show scan progress by using +the ``--progress`` or ``-p`` flag. + +.. code-block:: sh + + $ tartufo scan-local-repo /path/to/my/repo --progress + +.. code-block:: text + + ➜ Scanning master (1 of 59)[17942] [#-----------------------------------] 4% 00:01:26 + + Legend: + master = current branch being scanned + 1 of 59 = number of branches completed (plus current branch) and total number of branches + 17942 = number of commits in current branch to process + 4% = percentage of commits on current branch completed + 00:01:26 = estimated time to complete current branch + + Accessing Repositories via SSH from Docker ****************************************** @@ -76,12 +98,6 @@ When using Docker Desktop for Mac, use ``/run/host-services/ssh-auth.sock`` as both source and target, then point the environment variable ``SSH_AUTH_SOCK`` to this same location: -.. code-block:: sh - - $ docker run --rm -v "/path/to/my/repo:/git" \ - -v /run/host-services/ssh-auth.sock:/run/host-services/ssh-auth.sock \ - -e SSH_AUTH_SOCK="/run/host-services/ssh-auth.sock" godaddy/tartufo - Scanning a Folder +++++++++++++++++++++++++++ diff --git a/tartufo/commands/scan_local_repo.py b/tartufo/commands/scan_local_repo.py index 0af35b8d..f6a9f3b5 100644 --- a/tartufo/commands/scan_local_repo.py +++ b/tartufo/commands/scan_local_repo.py @@ -27,6 +27,14 @@ show_default=True, help="Controls whether the contents of git submodules are scanned", ) +@click.option( + "-p", + "--progress", + is_flag=True, + default=False, + show_default=True, + help="Controls whether to display a progress bar", +) @click.pass_obj @click.pass_context def main( @@ -37,6 +45,7 @@ def main( max_depth: int, branch: Optional[str], include_submodules: bool, + progress: bool, ) -> GitRepoScanner: """Scan a repository already cloned to your local system.""" git_options = types.GitOptions( @@ -44,6 +53,7 @@ def main( max_depth=max_depth, branch=branch, include_submodules=include_submodules, + progress=progress, ) scanner = None try: diff --git a/tartufo/commands/scan_remote_repo.py b/tartufo/commands/scan_remote_repo.py index 220ce8cf..fd85b8cf 100644 --- a/tartufo/commands/scan_remote_repo.py +++ b/tartufo/commands/scan_remote_repo.py @@ -40,6 +40,14 @@ show_default=True, help="Controls whether the contents of git submodules are scanned", ) +@click.option( + "-p", + "--progress", + is_flag=True, + default=False, + show_default=True, + help="Controls whether to display a progress bar", +) @click.argument("git-url") @click.pass_obj @click.pass_context @@ -52,6 +60,7 @@ def main( branch: Optional[str], work_dir: Optional[str], include_submodules: bool, + progress: bool, ) -> GitRepoScanner: """Automatically clone and scan a remote git repository.""" git_options = types.GitOptions( @@ -59,6 +68,7 @@ def main( max_depth=max_depth, branch=None, include_submodules=include_submodules, + progress=progress, ) repo_path: Optional[Path] = None if work_dir: diff --git a/tartufo/commands/update_signatures.py b/tartufo/commands/update_signatures.py index 9b37a2c3..b34d3b72 100644 --- a/tartufo/commands/update_signatures.py +++ b/tartufo/commands/update_signatures.py @@ -62,6 +62,7 @@ def scan_local_repo( max_depth=max_depth, branch=branch, include_submodules=include_submodules, + progress=False, ) with contextlib.redirect_stdout(stdout): diff --git a/tartufo/scanner.py b/tartufo/scanner.py index 776c064f..c5e5fab9 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -16,6 +16,7 @@ Any, Dict, Generator, + Iterable, List, MutableMapping, Optional, @@ -740,12 +741,12 @@ def _iter_diff_index( file_path = ( delta.new_file.path if delta.new_file.path else delta.old_file.path ) + if delta.status == pygit2.GIT_DELTA_DELETED: + self.logger.debug("Skipping as the file was a git delete operation") + continue if delta.is_binary: self.logger.debug("Binary file skipped: %s", file_path) continue - if delta.status == pygit2.GIT_DELTA_DELETED: - self.logger.debug("Skipping as the file is deleted") - continue printable_diff: str = patch.text if not self.global_options.scan_filenames: # The `printable_diff` contains diff header, @@ -835,6 +836,53 @@ def load_repo(self, repo_path: str) -> pygit2.Repository: except git.GitError as exc: raise types.GitLocalException(str(exc)) from exc + def _get_chunks( + self, commits: Iterable, already_searched: Set[bytes], branch_name: str + ) -> Generator[types.Chunk, None, None]: + diff_hash: bytes + curr_commit: pygit2.Commit = None + prev_commit: pygit2.Commit = None + for curr_commit in commits: + try: + prev_commit = curr_commit.parents[0] + except (IndexError, KeyError, TypeError): + # IndexError: current commit has no parents + # KeyError: current commit has parents which are not local + # If a commit doesn't have a parent skip diff generation since it is the first commit + self.logger.debug( + "Skipping commit %s because it has no parents", + curr_commit.hex, + ) + continue + diff_hash = hashlib.md5( + (str(prev_commit) + str(curr_commit)).encode("utf-8") + ).digest() + if diff_hash in already_searched: + continue + diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit) + already_searched.add(diff_hash) + diff.find_similar() + for blob, file_path in self._iter_diff_index(diff): + yield types.Chunk( + blob, + file_path, + util.extract_commit_metadata(curr_commit, branch_name), + True, + ) + + # Finally, yield the first commit to the branch + if curr_commit: + tree: pygit2.Tree = self._repo.revparse_single(curr_commit.hex).tree + tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True) + iter_diff = self._iter_diff_index(tree_diff) + for blob, file_path in iter_diff: + yield types.Chunk( + blob, + file_path, + util.extract_commit_metadata(curr_commit, branch_name), + True, + ) + @property def chunks(self) -> Generator[types.Chunk, None, None]: """Yield individual diffs from the repository's history. @@ -871,7 +919,8 @@ def chunks(self) -> Generator[types.Chunk, None, None]: "Branches to be scanned: %s", ", ".join([str(branch) for branch in branches]), ) - + branch_cnt = 0 + branch_len = len(branches) for branch_name in branches: self.logger.info("Scanning branch: %s", branch_name) if branch_name == "HEAD": @@ -879,56 +928,29 @@ def chunks(self) -> Generator[types.Chunk, None, None]: else: branch = self._repo.branches.get(branch_name) try: - commits = self._repo.walk( - branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL + commits = list( + self._repo.walk( + branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL + ) ) + except AttributeError: self.logger.debug( "Skipping branch %s because it cannot be resolved.", branch_name ) continue - diff_hash: bytes - curr_commit: pygit2.Commit = None - prev_commit: pygit2.Commit = None - for curr_commit in commits: - try: - prev_commit = curr_commit.parents[0] - except (IndexError, KeyError, TypeError): - # IndexError: current commit has no parents - # KeyError: current commit has parents which are not local - # If a commit doesn't have a parent skip diff generation since it is the first commit - self.logger.debug( - "Skipping commit %s because it has no parents", curr_commit.hex - ) - continue - diff_hash = hashlib.md5( - (str(prev_commit) + str(curr_commit)).encode("utf-8") - ).digest() - if diff_hash in already_searched: - continue - diff: pygit2.Diff = self._repo.diff(prev_commit, curr_commit) - already_searched.add(diff_hash) - diff.find_similar() - for blob, file_path in self._iter_diff_index(diff): - yield types.Chunk( - blob, - file_path, - util.extract_commit_metadata(curr_commit, branch_name), - True, - ) - - # Finally, yield the first commit to the branch - if curr_commit: - tree: pygit2.Tree = self._repo.revparse_single(curr_commit.hex).tree - tree_diff: pygit2.Diff = tree.diff_to_tree(swap=True) - iter_diff = self._iter_diff_index(tree_diff) - for blob, file_path in iter_diff: - yield types.Chunk( - blob, - file_path, - util.extract_commit_metadata(curr_commit, branch_name), - True, - ) + branch_cnt = branch_cnt + 1 + commit_len = len(commits) + + show_progress = self.git_options.progress + if show_progress: + with click.progressbar( + commits, + label=f"➜ Scanning {branch_name} ({branch_cnt} of {branch_len})[{commit_len}]", + ) as pcommits: + yield from self._get_chunks(pcommits, already_searched, branch_name) + else: + yield from self._get_chunks(commits, already_searched, branch_name) class GitPreCommitScanner(GitScanner): diff --git a/tartufo/types.py b/tartufo/types.py index ffa7ee7f..7b648014 100644 --- a/tartufo/types.py +++ b/tartufo/types.py @@ -149,11 +149,18 @@ class GitOptions: :param include_submodules: Whether to also scan submodules of the repository """ - __slots__ = ("since_commit", "max_depth", "branch", "include_submodules") + __slots__ = ( + "since_commit", + "max_depth", + "branch", + "include_submodules", + "progress", + ) since_commit: Optional[str] max_depth: int branch: Optional[str] include_submodules: bool + progress: bool @dataclass diff --git a/tests/test_git_repo_scanner.py b/tests/test_git_repo_scanner.py index 019052f8..743e7416 100644 --- a/tests/test_git_repo_scanner.py +++ b/tests/test_git_repo_scanner.py @@ -209,6 +209,49 @@ def test_single_branch_is_loaded_if_specified(self): mock_branch_foo.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL ) + def test_runs_scans_with_progressbar_enabled(self): + mock_branch_foo = mock.MagicMock() + mock_branch_bar = mock.MagicMock() + self.mock_repo.return_value.listall_branches.return_value = ["foo", "bar"] + self.mock_repo.return_value.branches = { + "foo": mock_branch_foo, + "bar": mock_branch_bar, + } + self.git_options.progress = True + test_scanner = scanner.GitRepoScanner( + self.global_options, self.git_options, "." + ) + + mock_commit_1 = mock.MagicMock() + mock_commit_1.parents = None + mock_commit_2 = mock.MagicMock() + mock_commit_2.parents = [mock_commit_1] + mock_commit_3 = mock.MagicMock() + mock_commit_3.parents = [mock_commit_2] + + self.mock_repo.return_value.walk.return_value = [ + mock_commit_3, + mock_commit_2, + mock_commit_1, + ] + + self.mock_iter_diff.return_value = [] + for _ in test_scanner.chunks: + pass + + self.mock_repo.return_value.walk.assert_has_calls( + ( + mock.call( + mock_branch_foo.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL + ), + mock.call( + mock_branch_bar.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL + ), + ) + ) + + self.mock_iter_diff.assert_called() + def test_all_branches_are_scanned_for_commits(self): mock_branch_foo = mock.MagicMock() mock_branch_bar = mock.MagicMock() @@ -249,6 +292,8 @@ def test_all_branches_are_scanned_for_commits(self): ) ) + self.mock_iter_diff.assert_called() + def test_all_commits_are_scanned_for_files(self): self.mock_repo.return_value.branches = {"foo": mock.MagicMock()} test_scanner = scanner.GitRepoScanner( From 89d409f66bbc5f7a8141d9a021c598b06d612858 Mon Sep 17 00:00:00 2001 From: dclayton Date: Wed, 4 Jan 2023 14:25:09 -0700 Subject: [PATCH 2/2] convert walk to list only for progress --- tartufo/scanner.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tartufo/scanner.py b/tartufo/scanner.py index c5e5fab9..f29a269b 100755 --- a/tartufo/scanner.py +++ b/tartufo/scanner.py @@ -928,10 +928,8 @@ def chunks(self) -> Generator[types.Chunk, None, None]: else: branch = self._repo.branches.get(branch_name) try: - commits = list( - self._repo.walk( - branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL - ) + commits = self._repo.walk( + branch.resolve().target, pygit2.GIT_SORT_TOPOLOGICAL ) except AttributeError: @@ -939,13 +937,14 @@ def chunks(self) -> Generator[types.Chunk, None, None]: "Skipping branch %s because it cannot be resolved.", branch_name ) continue - branch_cnt = branch_cnt + 1 - commit_len = len(commits) show_progress = self.git_options.progress if show_progress: + branch_cnt = branch_cnt + 1 + lcommits = list(commits) + commit_len = len(lcommits) with click.progressbar( - commits, + lcommits, label=f"➜ Scanning {branch_name} ({branch_cnt} of {branch_len})[{commit_len}]", ) as pcommits: yield from self._get_chunks(pcommits, already_searched, branch_name)