diff --git a/ggshield/core/scan/commit_utils.py b/ggshield/core/scan/commit_utils.py index 4f46d60e47..39568ad0f0 100644 --- a/ggshield/core/scan/commit_utils.py +++ b/ggshield/core/scan/commit_utils.py @@ -39,6 +39,10 @@ OLD_NAME_RX = re.compile(r"^--- a/(.*?)\t?$", flags=re.MULTILINE) NEW_NAME_RX = re.compile(r"^\+\+\+ b/(.*?)\t?$", flags=re.MULTILINE) +MULTI_PARENT_HUNK_HEADER_RX = re.compile( + r"^(?P@@+) (?P-\d+(?:,\d+)?) .* (?P\+\d+(?:,\d+)?) @@+$" +) + class PatchParseError(Exception): """ @@ -292,6 +296,9 @@ def parse_patch( file_info = next(x for x in header.files if x.path == path) + if content.startswith("@@@"): + content = convert_multi_parent_diff(content) + yield CommitScannable(sha, file_info.path, content, filemode=file_info.mode) except Exception as exc: if sha: @@ -301,6 +308,57 @@ def parse_patch( raise PatchParseError(msg) +def convert_multi_parent_diff(content: str) -> str: + """ + ggshield output handlers currently do not work with multi-parent diffs, so convert + them into single-parent diffs + """ + lines = content.splitlines() + + # Process header + hunk_header, parent_count = process_multi_parent_hunk_header(lines.pop(0)) + out_lines = [hunk_header] + + # Process content + for line in lines: + columns = line[:parent_count] + text = line[parent_count:] + if columns.startswith("-"): + # Removed from first parent, keep it + text = f"-{text}" + elif columns.startswith("+"): + # Added by first parent, keep it + text = f"+{text}" + elif "+" in columns: + # Added by another parent, keep it but consider it unchanged + text = f" {text}" + elif "-" in columns: + # Removed from another parent, skip it + continue + else: + # Unchanged + text = f" {text}" + out_lines.append(text) + + return "\n".join(out_lines) + + +def process_multi_parent_hunk_header(header: str) -> Tuple[str, int]: + match = MULTI_PARENT_HUNK_HEADER_RX.match(header) + if not match: + raise PatchParseError( # pragma: no cover + f"Failed to parse multi-parent hunk header '{header}'" + ) + + from_ = match.group("from") + to = match.group("to") + + # Parent count is the number of '@' at the beginning of the header, minus 1 + parent_count = len(match.group("at")) - 1 + + return f"@@ {from_} {to} @@", parent_count + + def get_file_sha_in_ref( ref: str, files: List[str], diff --git a/tests/unit/cassettes/test_scan_merge_commit.yaml b/tests/unit/cassettes/test_scan_merge_commit.yaml new file mode 100644 index 0000000000..9649b0965f --- /dev/null +++ b/tests/unit/cassettes/test_scan_merge_commit.yaml @@ -0,0 +1,161 @@ +interactions: + - request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + User-Agent: + - pygitguardian/1.16.0 (Linux;py3.10.12) + method: GET + uri: https://api.gitguardian.com/v1/metadata + response: + body: + string: + '{"version":"v2.97.2","preferences":{"marketplaces__aws_product_url":"http://aws.amazon.com/marketplace/pp/prodview-mrmulzykamba6","on_premise__restrict_signup":true,"on_premise__is_email_server_configured":true,"on_premise__default_sso_config_api_id":null,"onboarding__segmentation_v1_enabled":true,"general__maximum_payload_size":26214400,"general__mutual_tls_mode":"disabled","general__signup_enabled":true},"secret_scan_preferences":{"maximum_documents_per_scan":20,"maximum_document_size":1048576},"remediation_messages":{"pre_commit":"> + How to remediate\n\n Since the secret was detected before the commit was + made:\n 1. replace the secret with its reference (e.g. environment variable).\n 2. + commit again.\n\n> [To apply with caution] If you want to bypass ggshield + (false positive or other reason), run:\n - if you use the pre-commit framework:\n\n SKIP=ggshield + git commit -m \" How to remediate\n\n Since + the secret was detected before the push BUT after the commit, you need to:\n 1. + rewrite the git history making sure to replace the secret with its reference + (e.g. environment variable).\n 2. push again.\n\n To prevent having to rewrite + git history in the future, setup ggshield as a pre-commit hook:\n https://docs.gitguardian.com/ggshield-docs/integrations/git-hooks/pre-commit\n\n> + [To apply with caution] If you want to bypass ggshield (false positive or + other reason), run:\n - if you use the pre-commit framework:\n\n SKIP=ggshield-push + git push","pre_receive":"> How to remediate\n\n A pre-receive hook set server + side prevented you from pushing secrets.\n\n Since the secret was detected + during the push BUT after the commit, you need to:\n 1. rewrite the git history + making sure to replace the secret with its reference (e.g. environment variable).\n 2. + push again.\n\n To prevent having to rewrite git history in the future, setup + ggshield as a pre-commit hook:\n https://docs.gitguardian.com/ggshield-docs/integrations/git-hooks/pre-commit\n\n> + [To apply with caution] If you want to bypass ggshield (false positive or + other reason), run:\n\n git push -o breakglass"}}' + headers: + access-control-expose-headers: + - X-App-Version + allow: + - GET, HEAD, OPTIONS + content-length: + - '2151' + content-type: + - application/json + cross-origin-opener-policy: + - same-origin + date: + - Mon, 23 Sep 2024 09:40:00 GMT + referrer-policy: + - strict-origin-when-cross-origin + server: + - istio-envoy + strict-transport-security: + - max-age=31536000; includeSubDomains + transfer-encoding: + - chunked + vary: + - Accept-Encoding,Cookie + x-app-version: + - v2.97.2 + x-content-type-options: + - nosniff + - nosniff + x-envoy-upstream-service-time: + - '34' + x-frame-options: + - DENY + - SAMEORIGIN + x-sca-engine-version: + - 2.0.0 + x-sca-last-vuln-fetch: + - '2024-09-23T09:06:36.704467+00:00' + x-secrets-engine-version: + - 2.120.0 + x-xss-protection: + - 1; mode=block + status: + code: 200 + message: OK + - request: + body: '[{"filename": "commit://patch/f", "document": "@@ -1,1 +1,2 @@\n-baz\n+username=owly\n+password=368ac3edf9e850d1c0ff9d6c526496f8237ddf91"}]' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '139' + Content-Type: + - application/json + GGShield-Command-Id: + - c3344cdf-dcee-4fe4-8d64-3fc69f95d2d5 + GGShield-Command-Path: + - external + GGShield-OS-Name: + - ubuntu + GGShield-OS-Version: + - '22.04' + GGShield-Python-Version: + - 3.10.12 + GGShield-Version: + - 1.31.0 + User-Agent: + - pygitguardian/1.16.0 (Linux;py3.10.12) + mode: + - path + method: POST + uri: https://api.gitguardian.com/v1/multiscan?ignore_known_secrets=True + response: + body: + string: + '[{"policy_break_count":1,"policies":["File extensions","Filenames","Secrets + detection"],"policy_breaks":[{"type":"Username Password","policy":"Secrets + detection","matches":[{"type":"username","match":"owly","index_start":31,"index_end":34,"line_start":3,"line_end":3},{"type":"password","match":"368ac3edf9e850d1c0ff9d6c526496f8237ddf91","index_start":46,"index_end":85,"line_start":4,"line_end":4}],"incident_url":"","known_secret":false,"validity":"no_checker"}]}]' + headers: + access-control-expose-headers: + - X-App-Version + allow: + - POST, OPTIONS + content-length: + - '466' + content-type: + - application/json + cross-origin-opener-policy: + - same-origin + date: + - Mon, 23 Sep 2024 09:40:01 GMT + referrer-policy: + - strict-origin-when-cross-origin + server: + - istio-envoy + strict-transport-security: + - max-age=31536000; includeSubDomains + vary: + - Cookie + x-app-version: + - v2.97.2 + x-content-type-options: + - nosniff + - nosniff + x-envoy-upstream-service-time: + - '87' + x-frame-options: + - DENY + - SAMEORIGIN + x-sca-engine-version: + - 2.0.0 + x-sca-last-vuln-fetch: + - '2024-09-23T09:06:36.704467+00:00' + x-secrets-engine-version: + - 2.120.0 + x-xss-protection: + - 1; mode=block + status: + code: 200 + message: OK +version: 1 diff --git a/tests/unit/core/scan/test_commit.py b/tests/unit/core/scan/test_commit.py index 1fea8407d5..ca24aa6119 100644 --- a/tests/unit/core/scan/test_commit.py +++ b/tests/unit/core/scan/test_commit.py @@ -404,14 +404,16 @@ def test_from_sha_gets_right_content_for_conflicts(tmp_path): assert len(files) == 1 content = files[0].content - # Content contains the old line, - assert "--Hello" in content - # the new line from main, - assert "- Hello from main" in content - # the new line from b1, - assert " -Hello from b1" in content - # and the result of the conflict resolution - assert "++Solve conflict" in content + # Content has been turned into a single-parent diff + assert ( + content + == """ +@@ -1,2 +1,1 @@ +-Hello +-Hello from main ++Solve conflict +""".strip() + ) def test_from_sha_files_matches_content(tmp_path): diff --git a/tests/unit/core/scan/test_commit_utils.py b/tests/unit/core/scan/test_commit_utils.py index 0fb3c51540..b998995dbe 100644 --- a/tests/unit/core/scan/test_commit_utils.py +++ b/tests/unit/core/scan/test_commit_utils.py @@ -3,7 +3,7 @@ import pytest -from ggshield.core.scan.commit_utils import PatchFileInfo +from ggshield.core.scan.commit_utils import PatchFileInfo, convert_multi_parent_diff from ggshield.utils.git_shell import Filemode @@ -46,3 +46,66 @@ def test_patch_file_info_from_string( mode=mode, ) assert PatchFileInfo.from_string(line) == expected_info + + +@pytest.mark.parametrize( + ("diff", "expected"), + [ + ( + """ +@@@ -1,1 -1,1 +1,2 @@@ +- baz + -bar +++hello +++world +""", + """ +@@ -1,1 +1,2 @@ +-baz ++hello ++world +""", + ), + ( + """ +@@@ -1,8 -1,7 +1,8 @@@ + Some longer content. + +--With more text. +++ +++% This is the result of the merge. + +# This text comes from the main branch. +- # It spawns... +- # 3 lines. ++ > This is some text from the commit branch. + -> It spawns 2 lines. + + To get interesting indices. +""", + """ +@@ -1,8 +1,8 @@ + Some longer content. + +-With more text. ++ ++% This is the result of the merge. + # This text comes from the main branch. +-# It spawns... +-# 3 lines. ++> This is some text from the commit branch. + + To get interesting indices. +""", # noqa:W293 + ), + ], +) +def test_convert_multi_parent_diff(diff: str, expected: str): + """ + GIVEN a multi parent diff + WHEN convert_multi_parent_diff() is called on it + THEN it returns the expected result + """ + diff = diff.strip() + expected = expected.strip() + result = convert_multi_parent_diff(diff) + assert result == expected diff --git a/tests/unit/verticals/secret/test_secret_scanner.py b/tests/unit/verticals/secret/test_secret_scanner.py index 9e3e1ea00d..abec938e13 100644 --- a/tests/unit/verticals/secret/test_secret_scanner.py +++ b/tests/unit/verticals/secret/test_secret_scanner.py @@ -22,6 +22,7 @@ _MULTIPLE_SECRETS_PATCH, _NO_SECRET_PATCH, _ONE_LINE_AND_MULTILINE_PATCH, + _SIMPLE_SECRET_TOKEN, GG_TEST_TOKEN, UNCHECKED_SECRET_PATCH, my_vcr, @@ -192,3 +193,47 @@ def test_handle_scan_quota_limit_reached(): detail = Detail(detail="Quota limit reached.", status_code=403) with pytest.raises(QuotaLimitReachedError): handle_scan_chunk_error(detail, Mock()) + + +def test_scan_merge_commit(client, cache): + """ + GIVEN a merge commit in which a secret was inserted + WHEN it is scanned + THEN the secret is found + """ + commit = Commit.from_patch( + f""" +commit ca68e177596982fa38f181aa9944340a359748d2 +Merge: 2c023f8 502d03c +Author: Aurelien Gateau +Date: Thu Sep 5 18:39:58 2024 +0200 + + Merge branch 'feature' +\0::100644 100644 100644 7601807 5716ca5 8c27e55 MM\0f\0\0diff --cc f +index 7601807,5716ca5..8c27e55 +--- a/f ++++ b/f +@@@ -1,1 -1,1 +1,2 @@@ +- baz + -bar +++username=owly +++password={_SIMPLE_SECRET_TOKEN} +""" + ) + + with my_vcr.use_cassette("test_scan_merge_commit"): + scanner = SecretScanner( + client=client, + cache=cache, + scan_context=ScanContext( + scan_mode=ScanMode.PATH, + command_path="external", + ), + ) + results = scanner.scan(commit.get_files(), scanner_ui=Mock()) + scan = results.results[0].scan + assert len(scan.policy_breaks) == 1 + + matches = {m.match_type: m.match for m in scan.policy_breaks[0].matches} + assert matches["username"] == "owly" + assert matches["password"] == _SIMPLE_SECRET_TOKEN