Skip to content

Commit

Permalink
Merge pull request #947 from GitGuardian/agateau/refactor-get_files_f…
Browse files Browse the repository at this point in the history
…rom_path

agateau/refactor get files from path
  • Loading branch information
agateau-gg committed Aug 12, 2024
2 parents 152e086 + b0229a2 commit 40a904b
Show file tree
Hide file tree
Showing 16 changed files with 169 additions and 224 deletions.
2 changes: 0 additions & 2 deletions ggshield/cmd/iac/scan/all.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,8 +73,6 @@ def iac_scan_all(
paths = get_iac_files_from_path(
path=directory,
exclusion_regexes=ctx_obj.exclusion_regexes,
# bypass verbose here: we want to display only IaC files
verbose=False,
# If the repository is a git repository, ignore untracked files
ignore_git=False,
ignore_git_staged=(scan_mode == ScanMode.PRE_PUSH_ALL),
Expand Down
1 change: 0 additions & 1 deletion ggshield/cmd/sca/scan/sca_scan_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ def get_sca_scan_all_filepaths(
all_filepaths = get_all_files_from_sca_paths(
path=directory,
exclusion_regexes=exclusion_regexes,
verbose=verbose,
# If the repository is a git repository, ignore untracked files
ignore_git=True,
)
Expand Down
13 changes: 8 additions & 5 deletions ggshield/cmd/secret/scan/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,12 @@
add_secret_scan_common_options,
create_output_handler,
)
from ggshield.cmd.secret.scan.ui_utils import print_file_list
from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.core.errors import UnexpectedError
from ggshield.core.scan import ScanContext, ScanMode
from ggshield.core.scan.file import get_files_from_paths
from ggshield.core.scan.file import create_files_from_paths
from ggshield.core.text_utils import display_heading
from ggshield.utils.archive import safe_unpack
from ggshield.utils.click import RealPath
from ggshield.utils.files import ListFilesMode
Expand All @@ -34,6 +36,7 @@ def archive_cmd(
"""
with tempfile.TemporaryDirectory(suffix="ggshield") as temp_dir:
temp_path = Path(temp_dir)
display_heading("Unpacking archive")
try:
safe_unpack(path, extract_dir=temp_path)
except Exception as exn:
Expand All @@ -42,14 +45,14 @@ def archive_cmd(
ctx_obj = ContextObj.get(ctx)
config = ctx_obj.config
verbose = config.user_config.verbose
files = get_files_from_paths(
files, binary_paths = create_files_from_paths(
paths=[temp_path],
exclusion_regexes=ctx_obj.exclusion_regexes,
yes=True,
display_scanned_files=verbose,
display_binary_files=verbose,
list_files_mode=ListFilesMode.ALL,
)
if verbose:
print_file_list(files, binary_paths)
display_heading("Starting scan")

with ctx_obj.ui.create_scanner_ui(len(files), verbose=verbose) as ui:
scan_context = ScanContext(
Expand Down
41 changes: 30 additions & 11 deletions ggshield/cmd/secret/scan/path.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,13 @@
add_secret_scan_common_options,
create_output_handler,
)
from ggshield.cmd.secret.scan.ui_utils import print_file_list
from ggshield.cmd.utils.common_decorators import exception_wrapper
from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.cmd.utils.files import check_directory_not_ignored
from ggshield.core.scan import ScanContext, ScanMode
from ggshield.core.scan.file import get_files_from_paths
from ggshield.core.scan import ScanContext, ScanMode, Scannable
from ggshield.core.scan.file import create_files_from_paths
from ggshield.core.text_utils import display_heading
from ggshield.utils.click import RealPath
from ggshield.utils.files import ListFilesMode
from ggshield.verticals.secret import SecretScanCollection, SecretScanner
Expand Down Expand Up @@ -46,20 +48,27 @@ def path_cmd(
for path in paths:
check_directory_not_ignored(path, ctx_obj.exclusion_regexes)

files = get_files_from_paths(
if not recursive:
if path := next((x for x in paths if x.is_dir()), None):
raise click.UsageError(
f"{click.format_filename(path)} is a directory."
" Use --recursive to scan directories."
)

files, binary_paths = create_files_from_paths(
paths=paths,
exclusion_regexes=ctx_obj.exclusion_regexes,
yes=yes,
display_scanned_files=verbose,
display_binary_files=verbose,
list_files_mode=(
ListFilesMode.FILES_ONLY
if not recursive
else (
ListFilesMode.ALL_BUT_GITIGNORED if use_gitignore else ListFilesMode.ALL
)
ListFilesMode.ALL_BUT_GITIGNORED if use_gitignore else ListFilesMode.ALL
),
)
if verbose:
print_file_list(files, binary_paths)
if not yes:
confirm_scan(files)

if verbose:
display_heading("Starting scan")
target = paths[0] if len(paths) == 1 else Path.cwd()
target_path = target if target.is_dir() else target.parent
with ctx_obj.ui.create_scanner_ui(len(files), verbose=verbose) as scanner_ui:
Expand All @@ -82,3 +91,13 @@ def path_cmd(
)

return output_handler.process_scan(scan)


def confirm_scan(files: List[Scannable]) -> None:
    """
    Ask the user for confirmation before scanning more than one file.

    Nothing is asked when `files` contains zero or one entry. Otherwise the
    question is printed on stderr (`err=True`) and, because of `abort=True`,
    a negative answer aborts the command.
    """
    nb_files = len(files)
    if nb_files <= 1:
        # A single file (or nothing at all) does not need a confirmation
        return

    question = f"{nb_files} files will be scanned. Do you want to continue?"
    click.confirm(question, abort=True, err=True)
24 changes: 12 additions & 12 deletions ggshield/cmd/secret/scan/pypi.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,20 @@
import sys
import tempfile
from pathlib import Path
from typing import Any, List, Pattern, Set
from typing import Any, List, Pattern, Set, Tuple

import click

from ggshield.cmd.secret.scan.secret_scan_common_options import (
add_secret_scan_common_options,
create_output_handler,
)
from ggshield.cmd.secret.scan.ui_utils import print_file_list
from ggshield.cmd.utils.context_obj import ContextObj
from ggshield.core.errors import UnexpectedError
from ggshield.core.scan import ScanContext, ScanMode, Scannable
from ggshield.core.scan.file import get_files_from_paths
from ggshield.core.scan.file import create_files_from_paths
from ggshield.core.text_utils import display_heading
from ggshield.utils.archive import safe_unpack
from ggshield.utils.files import ListFilesMode
from ggshield.verticals.secret import SecretScanCollection, SecretScanner
Expand All @@ -34,15 +36,14 @@ def save_package_to_tmp(temp_dir: Path, package_name: str) -> None:
]

try:
click.echo("Downloading pip package... ", nl=False, err=True)
display_heading("Downloading package")
subprocess.run(
command,
check=True,
stdout=sys.stderr,
stderr=sys.stderr,
timeout=PYPI_DOWNLOAD_TIMEOUT,
)
click.echo("OK", err=True)

except subprocess.CalledProcessError:
raise UnexpectedError(f'Failed to download "{package_name}"')
Expand All @@ -55,23 +56,20 @@ def get_files_from_package(
archive_dir: Path,
package_name: str,
exclusion_regexes: Set[Pattern[str]],
verbose: bool,
) -> List[Scannable]:
) -> Tuple[List[Scannable], List[Path]]:
archive: Path = next(archive_dir.iterdir())

display_heading("Unpacking package")
try:
safe_unpack(archive, extract_dir=archive_dir)
except Exception as exn:
raise UnexpectedError(f'Failed to unpack package "{package_name}": {exn}.')

exclusion_regexes.add(re.compile(re.escape(archive.name)))

return get_files_from_paths(
return create_files_from_paths(
paths=[archive_dir],
exclusion_regexes=exclusion_regexes,
yes=True,
display_scanned_files=verbose,
display_binary_files=verbose,
list_files_mode=ListFilesMode.ALL,
)

Expand Down Expand Up @@ -106,12 +104,14 @@ def pypi_cmd(
temp_path = Path(temp_dir)
save_package_to_tmp(temp_dir=temp_path, package_name=package_name)

files = get_files_from_package(
files, binary_paths = get_files_from_package(
archive_dir=temp_path,
package_name=package_name,
exclusion_regexes=ctx_obj.exclusion_regexes,
verbose=config.user_config.verbose,
)
if verbose:
print_file_list(files, binary_paths)
display_heading("Starting scan")

with ctx_obj.ui.create_scanner_ui(len(files), verbose=verbose) as ui:
scan_context = ScanContext(
Expand Down
15 changes: 15 additions & 0 deletions ggshield/cmd/secret/scan/ui_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from pathlib import Path
from typing import List

from ggshield.core.scan import Scannable
from ggshield.core.text_utils import display_heading, display_info


def print_file_list(files: List[Scannable], binary_paths: List[Path]) -> None:
    """
    Display the ignored binary files, if there are any, followed by the files
    which are going to be scanned.

    Each section is a heading followed by one "- <path>" line per entry. The
    "Files to scan" heading is always displayed, even when `files` is empty.
    """

    def show_section(title, entries):
        # One heading, then one bullet line per entry
        display_heading(title)
        for entry in entries:
            display_info(f"- {entry}")

    if binary_paths:
        show_section("Ignored binary files", binary_paths)
    show_section("Files to scan", (scannable.path for scannable in files))
4 changes: 2 additions & 2 deletions ggshield/core/scan/__init__.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
from .commit import Commit
from .file import File, get_files_from_paths
from .file import File, create_files_from_paths
from .scan_context import ScanContext
from .scan_mode import ScanMode
from .scannable import DecodeError, Scannable, StringScannable


__all__ = [
"get_files_from_paths",
"create_files_from_paths",
"Commit",
"DecodeError",
"File",
Expand Down
86 changes: 19 additions & 67 deletions ggshield/core/scan/file.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,7 @@
from pathlib import Path
from typing import Iterable, Iterator, List, Pattern, Set, Union
from typing import List, Pattern, Set, Tuple, Union

import click

from ggshield.utils.files import (
ListFilesMode,
UnexpectedDirectoryError,
get_filepaths,
is_path_binary,
url_for_path,
)
from ggshield.utils.files import ListFilesMode, is_path_binary, list_files, url_for_path

from .scannable import Scannable

Expand Down Expand Up @@ -54,68 +46,28 @@ def _read_content(self) -> None:
)


def get_files_from_paths(
def create_files_from_paths(
paths: List[Path],
exclusion_regexes: Set[Pattern[str]],
yes: bool,
display_scanned_files: bool,
display_binary_files: bool,
list_files_mode: ListFilesMode = ListFilesMode.GIT_COMMITTED_OR_STAGED,
) -> List[Scannable]:
) -> Tuple[List[Scannable], List[Path]]:
"""
Create a scan object from files content.
:param paths: List of file/dir paths from the command
:param yes: Skip confirmation option
:param display_scanned_files: In some parts of the code (e.g. SCA), we might want
to display a processed list instead and set this to False
:param display_binary_files: Display all ignored binary files
:param ignore_git: Ignore that the folder is a git repository
Create File instances for the files found in `paths` and return them, together with
the list of binary files found in `paths`, which are skipped.
"""
try:
filepaths = get_filepaths(
paths,
exclusion_regexes,
list_files_mode=list_files_mode,
)
except UnexpectedDirectoryError as error:
raise click.UsageError(
f"{click.format_filename(error.path)} is a directory."
" Use --recursive to scan directories."
)

files = list(generate_files_from_paths(filepaths, display_binary_files))

if display_scanned_files:
for f in files:
click.echo(f"- {click.format_filename(f.filename)}", err=True)

size = len(files)
if size > 1 and not yes:
click.confirm(
f"{size} files will be scanned. Do you want to continue?",
abort=True,
err=True,
)

return files


def generate_files_from_paths(
paths: Iterable[Path],
display_binary_files: bool,
) -> Iterator[Scannable]:
"""Loop on filepaths and return an iterator on scannable files."""
for path in paths:
if path.is_dir() or not path.exists():
continue

filepaths = list_files(
paths,
exclusion_regexes,
list_files_mode=list_files_mode,
)

files: List[Scannable] = []
binary_paths: List[Path] = []
for path in filepaths:
if is_path_binary(path):
if display_binary_files:
click.echo(
f"ignoring binary file: {path}",
err=True,
)
binary_paths.append(path)
continue

yield File(path)
files.append(File(path))

return (files, binary_paths)
Loading

0 comments on commit 40a904b

Please sign in to comment.