From cc2d0411121840dec3b5d34915ad66d8122a148f Mon Sep 17 00:00:00 2001
From: Aaron Loo <aaronloo@yelp.com>
Date: Fri, 22 Jun 2018 07:43:10 -0700
Subject: [PATCH] less false positives for hex strings

---
 .../plugins/high_entropy_strings.py           | 27 +++++++++++++++++++
 test_data/sample.diff                         |  2 +-
 tests/core/baseline_test.py                   |  2 +-
 tests/plugins/high_entropy_strings_test.py    | 26 ++++++++++++++++++
 4 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py
index b6bfb404d..41aebba57 100644
--- a/detect_secrets/plugins/high_entropy_strings.py
+++ b/detect_secrets/plugins/high_entropy_strings.py
@@ -184,6 +184,33 @@ class HexHighEntropyString(HighEntropyStringsPlugin):
     def __init__(self, limit, *args):
         super(HexHighEntropyString, self).__init__(string.hexdigits, limit)
 
+    def calculate_shannon_entropy(self, data):
+        """
+        In our investigations, we have found that when the input is all digits,
+        the number of false positives we get greatly exceeds realistic true
+        positive scenarios.
+
+        Therefore, this tries to capture this heuristic mathemetically.
+
+        We do this by noting that the maximum shannon entropy for this charset
+        is ~3.32 (e.g. "0123456789", with every digit different), and we want
+        to lower that below the standard limit, 3. However, at the same time,
+        we also want to accommodate the fact that longer strings have a higher
+        chance of being a true positive, which means "01234567890123456789"
+        should be closer to the maximum entropy than the shorter version.
+        """
+        entropy = super(HexHighEntropyString, self).calculate_shannon_entropy(data)
+        try:
+            int(data)
+
+            # This multiplier was determined through trial and error, with the
+            # intent of keeping it simple, yet achieving our goals.
+            entropy -= 1.2 / math.log(len(data), 2)
+        except ValueError:
+            pass
+
+        return entropy
+
 
 class Base64HighEntropyString(HighEntropyStringsPlugin):
     """HighEntropyStringsPlugin for base64 encoded strings"""
diff --git a/test_data/sample.diff b/test_data/sample.diff
index d54568de8..2ae198d34 100644
--- a/test_data/sample.diff
+++ b/test_data/sample.diff
@@ -7,7 +7,7 @@ index 8f56ba1..796dbb3 100644
 
      for subdir, dirs, files in os.walk(rootdir):
 -        if exclude_regex and regex.search(subdir[len(rootdir)+1:]):
-+        if exclude_regex and regex.search(subdir[len("0123456789") + 1:]):
++        if exclude_regex and regex.search(subdir[len("012345678a") + 1:]):
              continue
 
          for file in files:
diff --git a/tests/core/baseline_test.py b/tests/core/baseline_test.py
index 375c5b6fc..855374e59 100644
--- a/tests/core/baseline_test.py
+++ b/tests/core/baseline_test.py
@@ -79,7 +79,7 @@ def test_single_non_tracked_git_file_should_work(self):
                 'detect_secrets.core.baseline.os.path.isfile',
                 return_value=True,
         ), mock_open(
-                'Super hidden value "01234567890"',
+                'Super hidden value "0123456789a"',
                 'detect_secrets.core.secrets_collection.codecs.open',
         ):
             results = self.get_results('will_be_mocked')
diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py
index 6d3a7e1b7..14adbe5f1 100644
--- a/tests/plugins/high_entropy_strings_test.py
+++ b/tests/plugins/high_entropy_strings_test.py
@@ -1,10 +1,13 @@
 from __future__ import absolute_import
 from __future__ import unicode_literals
 
+import string
+
 import pytest
 
 from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString
 from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString
+from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin
 from testing.mocks import mock_file_object
 
 
@@ -182,3 +185,26 @@ def setup(self):
             'aaaaaa',
             '2b00042f7481c7b056c4b410d28f33cf',
         )
+
+    def test_discounts_when_all_numbers(self):
+        original_scanner = HighEntropyStringsPlugin(
+            string.hexdigits,
+            3,
+        )
+
+        # This makes sure discounting works.
+        assert self.logic.calculate_shannon_entropy('0123456789') < \
+            original_scanner.calculate_shannon_entropy('0123456789')
+
+        # This is the goal.
+        assert self.logic.calculate_shannon_entropy('0123456789') < 3
+
+        # This makes sure it is length dependent.
+        assert self.logic.calculate_shannon_entropy('0123456789') < \
+            self.logic.calculate_shannon_entropy(
+                '01234567890123456789',
+            )
+
+        # This makes sure it only occurs with numbers.
+        assert self.logic.calculate_shannon_entropy('12345a') == \
+            original_scanner.calculate_shannon_entropy('12345a')