From cc2d0411121840dec3b5d34915ad66d8122a148f Mon Sep 17 00:00:00 2001 From: Aaron Loo Date: Fri, 22 Jun 2018 07:43:10 -0700 Subject: [PATCH] less false positives for hex strings --- .../plugins/high_entropy_strings.py | 27 +++++++++++++++++++ test_data/sample.diff | 2 +- tests/core/baseline_test.py | 2 +- tests/plugins/high_entropy_strings_test.py | 26 ++++++++++++++++++ 4 files changed, 55 insertions(+), 2 deletions(-) diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index b6bfb404d..41aebba57 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -184,6 +184,33 @@ class HexHighEntropyString(HighEntropyStringsPlugin): def __init__(self, limit, *args): super(HexHighEntropyString, self).__init__(string.hexdigits, limit) + def calculate_shannon_entropy(self, data): + """ + In our investigations, we have found that when the input is all digits, + the number of false positives we get greatly exceeds realistic true + positive scenarios. + + Therefore, this tries to capture this heuristic mathemetically. + + We do this by noting that the maximum shannon entropy for this charset + is ~3.32 (e.g. "0123456789", with every digit different), and we want + to lower that below the standard limit, 3. However, at the same time, + we also want to accommodate the fact that longer strings have a higher + chance of being a true positive, which means "01234567890123456789" + should be closer to the maximum entropy than the shorter version. + """ + entropy = super(HexHighEntropyString, self).calculate_shannon_entropy(data) + try: + int(data) + + # This multiplier was determined through trial and error, with the + # intent of keeping it simple, yet achieving our goals. + entropy -= 1.2 / math.log(len(data), 2) + except ValueError: + pass + + return entropy + class Base64HighEntropyString(HighEntropyStringsPlugin): """HighEntropyStringsPlugin for base64 encoded strings""" diff --git a/test_data/sample.diff b/test_data/sample.diff index d54568de8..2ae198d34 100644 --- a/test_data/sample.diff +++ b/test_data/sample.diff @@ -7,7 +7,7 @@ index 8f56ba1..796dbb3 100644 for subdir, dirs, files in os.walk(rootdir): - if exclude_regex and regex.search(subdir[len(rootdir)+1:]): -+ if exclude_regex and regex.search(subdir[len("0123456789") + 1:]): ++ if exclude_regex and regex.search(subdir[len("012345678a") + 1:]): continue for file in files: diff --git a/tests/core/baseline_test.py b/tests/core/baseline_test.py index 375c5b6fc..855374e59 100644 --- a/tests/core/baseline_test.py +++ b/tests/core/baseline_test.py @@ -79,7 +79,7 @@ def test_single_non_tracked_git_file_should_work(self): 'detect_secrets.core.baseline.os.path.isfile', return_value=True, ), mock_open( - 'Super hidden value "01234567890"', + 'Super hidden value "0123456789a"', 'detect_secrets.core.secrets_collection.codecs.open', ): results = self.get_results('will_be_mocked') diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 6d3a7e1b7..14adbe5f1 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -1,10 +1,13 @@ from __future__ import absolute_import from __future__ import unicode_literals +import string + import pytest from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString +from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin from testing.mocks import mock_file_object @@ -182,3 +185,26 @@ def setup(self): 'aaaaaa', '2b00042f7481c7b056c4b410d28f33cf', ) + + def test_discounts_when_all_numbers(self): + original_scanner = HighEntropyStringsPlugin( + string.hexdigits, + 3, + ) + + # This makes sure discounting works. + assert self.logic.calculate_shannon_entropy('0123456789') < \ + original_scanner.calculate_shannon_entropy('0123456789') + + # This is the goal. + assert self.logic.calculate_shannon_entropy('0123456789') < 3 + + # This makes sure it is length dependent. + assert self.logic.calculate_shannon_entropy('0123456789') < \ + self.logic.calculate_shannon_entropy( + '01234567890123456789', + ) + + # This makes sure it only occurs with numbers. + assert self.logic.calculate_shannon_entropy('12345a') == \ + original_scanner.calculate_shannon_entropy('12345a')