Skip to content

Commit

Permalink
Fewer false positives for hex strings
Browse files Browse the repository at this point in the history
  • Loading branch information
Aaron Loo committed Jun 22, 2018
1 parent f8d6212 commit cc2d041
Show file tree
Hide file tree
Showing 4 changed files with 55 additions and 2 deletions.
27 changes: 27 additions & 0 deletions detect_secrets/plugins/high_entropy_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,33 @@ class HexHighEntropyString(HighEntropyStringsPlugin):
def __init__(self, limit, *args):
    """
    Set up the plugin with the hexadecimal character set.

    :param limit: forwarded unchanged to the parent constructor as its
        second argument (presumably the entropy threshold; confirm
        against HighEntropyStringsPlugin).
    :param args: accepted but ignored; nothing here is forwarded.
    """
    hex_charset = string.hexdigits
    super(HexHighEntropyString, self).__init__(hex_charset, limit)

def calculate_shannon_entropy(self, data):
    """
    Return the shannon entropy of `data`, discounted when it is all digits.

    In our investigations, we have found that when the input is all digits,
    the number of false positives we get greatly exceeds realistic true
    positive scenarios.

    Therefore, this tries to capture this heuristic mathematically.

    We do this by noting that the maximum shannon entropy for this charset
    is ~3.32 (e.g. "0123456789", with every digit different), and we want
    to lower that below the standard limit, 3. However, at the same time,
    we also want to accommodate the fact that longer strings have a higher
    chance of being a true positive, which means "01234567890123456789"
    should be closer to the maximum entropy than the shorter version.

    :param data: the candidate string to score.
    :returns: the parent class' entropy for `data`, reduced by
        1.2 / log2(len(data)) when `data` parses as an integer and is
        longer than one character; otherwise the parent's value unchanged.
    """
    entropy = super(HexHighEntropyString, self).calculate_shannon_entropy(data)

    # log2(1) == 0, so the discount below would divide by zero for a
    # single-character input (e.g. "5"). There is nothing meaningful to
    # discount for such short strings, so return the entropy untouched.
    # An empty string is not numeric either, so it takes the same path.
    if len(data) <= 1:
        return entropy

    try:
        # Probe only: we just want to know whether the string is a number.
        int(data)
    except ValueError:
        # Contains at least one non-decimal character: no discount.
        return entropy

    # This multiplier was determined through trial and error, with the
    # intent of keeping it simple, yet achieving our goals.
    return entropy - 1.2 / math.log(len(data), 2)


class Base64HighEntropyString(HighEntropyStringsPlugin):
"""HighEntropyStringsPlugin for base64 encoded strings"""
Expand Down
2 changes: 1 addition & 1 deletion test_data/sample.diff
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ index 8f56ba1..796dbb3 100644

for subdir, dirs, files in os.walk(rootdir):
- if exclude_regex and regex.search(subdir[len(rootdir)+1:]):
+ if exclude_regex and regex.search(subdir[len("0123456789") + 1:]):
+ if exclude_regex and regex.search(subdir[len("012345678a") + 1:]):
continue

for file in files:
Expand Down
2 changes: 1 addition & 1 deletion tests/core/baseline_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def test_single_non_tracked_git_file_should_work(self):
'detect_secrets.core.baseline.os.path.isfile',
return_value=True,
), mock_open(
'Super hidden value "01234567890"',
'Super hidden value "0123456789a"',
'detect_secrets.core.secrets_collection.codecs.open',
):
results = self.get_results('will_be_mocked')
Expand Down
26 changes: 26 additions & 0 deletions tests/plugins/high_entropy_strings_test.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
from __future__ import absolute_import
from __future__ import unicode_literals

import string

import pytest

from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString
from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString
from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin
from testing.mocks import mock_file_object


Expand Down Expand Up @@ -182,3 +185,26 @@ def setup(self):
'aaaaaa',
'2b00042f7481c7b056c4b410d28f33cf',
)

def test_discounts_when_all_numbers(self):
    """
    Check the all-digits entropy discount of the scanner under test.

    `self.logic` is compared against a plain HighEntropyStringsPlugin
    built with the same hex charset and a limit of 3.
    """
    baseline = HighEntropyStringsPlugin(string.hexdigits, 3)

    ten_digits = '0123456789'
    twenty_digits = '01234567890123456789'
    mixed = '12345a'

    discounted_short = self.logic.calculate_shannon_entropy(ten_digits)

    # This makes sure discounting works.
    assert discounted_short < baseline.calculate_shannon_entropy(ten_digits)

    # This is the goal.
    assert discounted_short < 3

    # This makes sure it is length dependent.
    discounted_long = self.logic.calculate_shannon_entropy(twenty_digits)
    assert discounted_short < discounted_long

    # This makes sure it only occurs with numbers.
    expected_mixed = baseline.calculate_shannon_entropy(mixed)
    assert self.logic.calculate_shannon_entropy(mixed) == expected_mixed

0 comments on commit cc2d041

Please sign in to comment.