Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix output handling of merge commits #965

Merged
merged 1 commit into from
Sep 24, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions ggshield/core/scan/commit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@
OLD_NAME_RX = re.compile(r"^--- a/(.*?)\t?$", flags=re.MULTILINE)
NEW_NAME_RX = re.compile(r"^\+\+\+ b/(.*?)\t?$", flags=re.MULTILINE)

MULTI_PARENT_HUNK_HEADER_RX = re.compile(
r"^(?P<at>@@+) (?P<from>-\d+(?:,\d+)?) .* (?P<to>\+\d+(?:,\d+)?) @@+$"
)


class PatchParseError(Exception):
"""
Expand Down Expand Up @@ -292,6 +296,9 @@ def parse_patch(

file_info = next(x for x in header.files if x.path == path)

if content.startswith("@@@"):
content = convert_multi_parent_diff(content)

yield CommitScannable(sha, file_info.path, content, filemode=file_info.mode)
except Exception as exc:
if sha:
Expand All @@ -301,6 +308,57 @@ def parse_patch(
raise PatchParseError(msg)


def convert_multi_parent_diff(content: str) -> str:
"""
ggshield output handlers currently do not work with multi-parent diffs, so convert
them into single-parent diffs
"""
lines = content.splitlines()

# Process header
hunk_header, parent_count = process_multi_parent_hunk_header(lines.pop(0))
out_lines = [hunk_header]

# Process content
for line in lines:
columns = line[:parent_count]
text = line[parent_count:]
if columns.startswith("-"):
# Removed from first parent, keep it
text = f"-{text}"
elif columns.startswith("+"):
# Added by first parent, keep it
text = f"+{text}"
elif "+" in columns:
# Added by another parent, keep it but consider it unchanged
text = f" {text}"
elif "-" in columns:
# Removed from another parent, skip it
continue
else:
# Unchanged
text = f" {text}"
out_lines.append(text)

return "\n".join(out_lines)


def process_multi_parent_hunk_header(header: str) -> Tuple[str, int]:
match = MULTI_PARENT_HUNK_HEADER_RX.match(header)
if not match:
raise PatchParseError( # pragma: no cover
f"Failed to parse multi-parent hunk header '{header}'"
)

from_ = match.group("from")
to = match.group("to")

# Parent count is the number of '@' at the beginning of the header, minus 1
parent_count = len(match.group("at")) - 1

return f"@@ {from_} {to} @@", parent_count


def get_file_sha_in_ref(
ref: str,
files: List[str],
Expand Down
161 changes: 161 additions & 0 deletions tests/unit/cassettes/test_scan_merge_commit.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
interactions:
- request:
body: null
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
User-Agent:
- pygitguardian/1.16.0 (Linux;py3.10.12)
method: GET
uri: https://api.gitguardian.com/v1/metadata
response:
body:
string:
'{"version":"v2.97.2","preferences":{"marketplaces__aws_product_url":"http://aws.amazon.com/marketplace/pp/prodview-mrmulzykamba6","on_premise__restrict_signup":true,"on_premise__is_email_server_configured":true,"on_premise__default_sso_config_api_id":null,"onboarding__segmentation_v1_enabled":true,"general__maximum_payload_size":26214400,"general__mutual_tls_mode":"disabled","general__signup_enabled":true},"secret_scan_preferences":{"maximum_documents_per_scan":20,"maximum_document_size":1048576},"remediation_messages":{"pre_commit":">
How to remediate\n\n Since the secret was detected before the commit was
made:\n 1. replace the secret with its reference (e.g. environment variable).\n 2.
commit again.\n\n> [To apply with caution] If you want to bypass ggshield
(false positive or other reason), run:\n - if you use the pre-commit framework:\n\n SKIP=ggshield
git commit -m \"<your message\"","pre_push":"> How to remediate\n\n Since
the secret was detected before the push BUT after the commit, you need to:\n 1.
rewrite the git history making sure to replace the secret with its reference
(e.g. environment variable).\n 2. push again.\n\n To prevent having to rewrite
git history in the future, setup ggshield as a pre-commit hook:\n https://docs.gitguardian.com/ggshield-docs/integrations/git-hooks/pre-commit\n\n>
[To apply with caution] If you want to bypass ggshield (false positive or
other reason), run:\n - if you use the pre-commit framework:\n\n SKIP=ggshield-push
git push","pre_receive":"> How to remediate\n\n A pre-receive hook set server
side prevented you from pushing secrets.\n\n Since the secret was detected
during the push BUT after the commit, you need to:\n 1. rewrite the git history
making sure to replace the secret with its reference (e.g. environment variable).\n 2.
push again.\n\n To prevent having to rewrite git history in the future, setup
ggshield as a pre-commit hook:\n https://docs.gitguardian.com/ggshield-docs/integrations/git-hooks/pre-commit\n\n>
[To apply with caution] If you want to bypass ggshield (false positive or
other reason), run:\n\n git push -o breakglass"}}'
headers:
access-control-expose-headers:
- X-App-Version
allow:
- GET, HEAD, OPTIONS
content-length:
- '2151'
content-type:
- application/json
cross-origin-opener-policy:
- same-origin
date:
- Mon, 23 Sep 2024 09:40:00 GMT
referrer-policy:
- strict-origin-when-cross-origin
server:
- istio-envoy
strict-transport-security:
- max-age=31536000; includeSubDomains
transfer-encoding:
- chunked
vary:
- Accept-Encoding,Cookie
x-app-version:
- v2.97.2
x-content-type-options:
- nosniff
- nosniff
x-envoy-upstream-service-time:
- '34'
x-frame-options:
- DENY
- SAMEORIGIN
x-sca-engine-version:
- 2.0.0
x-sca-last-vuln-fetch:
- '2024-09-23T09:06:36.704467+00:00'
x-secrets-engine-version:
- 2.120.0
x-xss-protection:
- 1; mode=block
status:
code: 200
message: OK
- request:
body: '[{"filename": "commit://patch/f", "document": "@@ -1,1 +1,2 @@\n-baz\n+username=owly\n+password=368ac3edf9e850d1c0ff9d6c526496f8237ddf91"}]'
headers:
Accept:
- '*/*'
Accept-Encoding:
- gzip, deflate
Connection:
- keep-alive
Content-Length:
- '139'
Content-Type:
- application/json
GGShield-Command-Id:
- c3344cdf-dcee-4fe4-8d64-3fc69f95d2d5
GGShield-Command-Path:
- external
GGShield-OS-Name:
- ubuntu
GGShield-OS-Version:
- '22.04'
GGShield-Python-Version:
- 3.10.12
GGShield-Version:
- 1.31.0
User-Agent:
- pygitguardian/1.16.0 (Linux;py3.10.12)
mode:
- path
method: POST
uri: https://api.gitguardian.com/v1/multiscan?ignore_known_secrets=True
response:
body:
string:
'[{"policy_break_count":1,"policies":["File extensions","Filenames","Secrets
detection"],"policy_breaks":[{"type":"Username Password","policy":"Secrets
detection","matches":[{"type":"username","match":"owly","index_start":31,"index_end":34,"line_start":3,"line_end":3},{"type":"password","match":"368ac3edf9e850d1c0ff9d6c526496f8237ddf91","index_start":46,"index_end":85,"line_start":4,"line_end":4}],"incident_url":"","known_secret":false,"validity":"no_checker"}]}]'
headers:
access-control-expose-headers:
- X-App-Version
allow:
- POST, OPTIONS
content-length:
- '466'
content-type:
- application/json
cross-origin-opener-policy:
- same-origin
date:
- Mon, 23 Sep 2024 09:40:01 GMT
referrer-policy:
- strict-origin-when-cross-origin
server:
- istio-envoy
strict-transport-security:
- max-age=31536000; includeSubDomains
vary:
- Cookie
x-app-version:
- v2.97.2
x-content-type-options:
- nosniff
- nosniff
x-envoy-upstream-service-time:
- '87'
x-frame-options:
- DENY
- SAMEORIGIN
x-sca-engine-version:
- 2.0.0
x-sca-last-vuln-fetch:
- '2024-09-23T09:06:36.704467+00:00'
x-secrets-engine-version:
- 2.120.0
x-xss-protection:
- 1; mode=block
status:
code: 200
message: OK
version: 1
18 changes: 10 additions & 8 deletions tests/unit/core/scan/test_commit.py
Original file line number Diff line number Diff line change
Expand Up @@ -404,14 +404,16 @@ def test_from_sha_gets_right_content_for_conflicts(tmp_path):
assert len(files) == 1
content = files[0].content

# Content contains the old line,
assert "--Hello" in content
# the new line from main,
assert "- Hello from main" in content
# the new line from b1,
assert " -Hello from b1" in content
# and the result of the conflict resolution
assert "++Solve conflict" in content
# Content has been turned into a single-parent diff
assert (
content
== """
@@ -1,2 +1,1 @@
-Hello
-Hello from main
+Solve conflict
""".strip()
)


def test_from_sha_files_matches_content(tmp_path):
Expand Down
65 changes: 64 additions & 1 deletion tests/unit/core/scan/test_commit_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

import pytest

from ggshield.core.scan.commit_utils import PatchFileInfo
from ggshield.core.scan.commit_utils import PatchFileInfo, convert_multi_parent_diff
from ggshield.utils.git_shell import Filemode


Expand Down Expand Up @@ -46,3 +46,66 @@ def test_patch_file_info_from_string(
mode=mode,
)
assert PatchFileInfo.from_string(line) == expected_info


@pytest.mark.parametrize(
("diff", "expected"),
[
(
"""
@@@ -1,1 -1,1 +1,2 @@@
- baz
-bar
++hello
++world
""",
"""
@@ -1,1 +1,2 @@
-baz
+hello
+world
""",
),
(
"""
@@@ -1,8 -1,7 +1,8 @@@
Some longer content.

--With more text.
++
++% This is the result of the merge.
+# This text comes from the main branch.
- # It spawns...
- # 3 lines.
+ > This is some text from the commit branch.
-> It spawns 2 lines.

To get interesting indices.
""",
"""
@@ -1,8 +1,8 @@
Some longer content.

-With more text.
+
+% This is the result of the merge.
# This text comes from the main branch.
-# It spawns...
-# 3 lines.
+> This is some text from the commit branch.

To get interesting indices.
""", # noqa:W293
),
],
)
def test_convert_multi_parent_diff(diff: str, expected: str):
"""
GIVEN a multi parent diff
WHEN convert_multi_parent_diff() is called on it
THEN it returns the expected result
"""
diff = diff.strip()
expected = expected.strip()
result = convert_multi_parent_diff(diff)
assert result == expected
45 changes: 45 additions & 0 deletions tests/unit/verticals/secret/test_secret_scanner.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
_MULTIPLE_SECRETS_PATCH,
_NO_SECRET_PATCH,
_ONE_LINE_AND_MULTILINE_PATCH,
_SIMPLE_SECRET_TOKEN,
GG_TEST_TOKEN,
UNCHECKED_SECRET_PATCH,
my_vcr,
Expand Down Expand Up @@ -192,3 +193,47 @@ def test_handle_scan_quota_limit_reached():
detail = Detail(detail="Quota limit reached.", status_code=403)
with pytest.raises(QuotaLimitReachedError):
handle_scan_chunk_error(detail, Mock())


def test_scan_merge_commit(client, cache):
"""
GIVEN a merge commit in which a secret was inserted
WHEN it is scanned
THEN the secret is found
"""
commit = Commit.from_patch(
f"""
commit ca68e177596982fa38f181aa9944340a359748d2
Merge: 2c023f8 502d03c
Author: Aurelien Gateau <aurelien.gateau@gitguardian.com>
Date: Thu Sep 5 18:39:58 2024 +0200

Merge branch 'feature'
\0::100644 100644 100644 7601807 5716ca5 8c27e55 MM\0f\0\0diff --cc f
index 7601807,5716ca5..8c27e55
--- a/f
+++ b/f
@@@ -1,1 -1,1 +1,2 @@@
- baz
-bar
++username=owly
++password={_SIMPLE_SECRET_TOKEN}
"""
)

with my_vcr.use_cassette("test_scan_merge_commit"):
scanner = SecretScanner(
client=client,
cache=cache,
scan_context=ScanContext(
scan_mode=ScanMode.PATH,
command_path="external",
),
)
results = scanner.scan(commit.get_files(), scanner_ui=Mock())
scan = results.results[0].scan
assert len(scan.policy_breaks) == 1

matches = {m.match_type: m.match for m in scan.policy_breaks[0].matches}
assert matches["username"] == "owly"
assert matches["password"] == _SIMPLE_SECRET_TOKEN
Loading