diff --git a/detect_secrets/core/potential_secret.py b/detect_secrets/core/potential_secret.py index d25637418..e7e8253f5 100644 --- a/detect_secrets/core/potential_secret.py +++ b/detect_secrets/core/potential_secret.py @@ -47,10 +47,18 @@ def __init__( self.type = typ self.filename = filename self.lineno = lineno - self.secret_hash = self.hash_secret(secret) + self.set_secret(secret) self.is_secret = is_secret self.is_verified = False + # If two PotentialSecrets have the same values for these fields, + # they are considered equal. Note that line numbers aren't included + # in this, because line numbers are subject to change. + self.fields_to_compare = ['filename', 'secret_hash', 'type'] + + def set_secret(self, secret): + self.secret_hash = self.hash_secret(secret) + # NOTE: Originally, we never wanted to keep the secret value in memory, # after finding it in the codebase. However, to support verifiable # secrets (and avoid the pain of re-scanning again), we need to @@ -61,11 +69,6 @@ def __init__( # in the repository. self.secret_value = secret - # If two PotentialSecrets have the same values for these fields, - # they are considered equal. Note that line numbers aren't included - # in this, because line numbers are subject to change. - self.fields_to_compare = ['filename', 'secret_hash', 'type'] - @staticmethod def hash_secret(secret): """This offers a way to coherently test this class, diff --git a/detect_secrets/plugins/common/initialize.py b/detect_secrets/plugins/common/initialize.py index 44b516cdd..45998179f 100644 --- a/detect_secrets/plugins/common/initialize.py +++ b/detect_secrets/plugins/common/initialize.py @@ -158,7 +158,7 @@ def from_plugin_classname( klass = globals()[plugin_classname] # Make sure the instance is a BasePlugin type, before creating it. - if not issubclass(klass, BasePlugin): + if not issubclass(klass, BasePlugin): # pragma: no cover raise TypeError try: diff --git a/detect_secrets/plugins/common/yaml_file_parser.py b/detect_secrets/plugins/common/yaml_file_parser.py index 2e6902064..69339afb3 100644 --- a/detect_secrets/plugins/common/yaml_file_parser.py +++ b/detect_secrets/plugins/common/yaml_file_parser.py @@ -16,7 +16,7 @@ class YamlFileParser(object): Therefore, we take a different approach: intercept the parsing of the yaml file to identify string values. This assumes: - 1. Secrets are strings + 1. Secrets are strings or binaries 2. Secrets are not keys Then, we calculate the entropy of those string values. @@ -75,7 +75,10 @@ def _tag_dict_values(self, map_node): """ new_values = [] for key, value in map_node.value: - if not value.tag.endswith(':str'): + if not ( + value.tag.endswith(':str') or + value.tag.endswith(':binary') + ): new_values.append((key, value)) continue @@ -83,14 +86,19 @@ def _tag_dict_values(self, map_node): tag=map_node.tag, value=[ self._create_key_value_pair_for_mapping_node_value( - '__value__', - value.value, - 'tag:yaml.org,2002:str', + key='__value__', + value=value.value, + tag=value.tag, + ), + self._create_key_value_pair_for_mapping_node_value( + key='__line__', + value=str(value.__line__), + tag='tag:yaml.org,2002:int', ), self._create_key_value_pair_for_mapping_node_value( - '__line__', - str(value.__line__), - 'tag:yaml.org,2002:int', + key='__is_binary__', + value=str(value.tag.endswith(':binary')), + tag='tag:yaml.org,2002:bool', ), ], ) diff --git a/detect_secrets/plugins/high_entropy_strings.py b/detect_secrets/plugins/high_entropy_strings.py index e5d692906..7262db607 100644 --- a/detect_secrets/plugins/high_entropy_strings.py +++ b/detect_secrets/plugins/high_entropy_strings.py @@ -4,11 +4,13 @@ from backports import configparser except ImportError: # pragma: no cover import configparser +import base64 import math import os import re import string from abc import ABCMeta +from abc import abstractmethod from contextlib import contextmanager import yaml @@ -206,14 +208,23 @@ def _analyze_yaml_file(self, file, filename): try: if '__line__' in item and item['__line__'] not in ignored_lines: - potential_secrets.update( - self.analyze_string( - item['__value__'], - item['__line__'], - filename, - ), + # An isinstance check doesn't work in py2 + # so we need the __is_binary__ field. + string_to_scan = self.decode_binary(item['__value__']) \ + if item['__is_binary__'] \ + else item['__value__'] + + secrets = self.analyze_string( + string_to_scan, + item['__line__'], + filename, ) + if item['__is_binary__']: + secrets = self._encode_yaml_binary_secrets(secrets) + + potential_secrets.update(secrets) + if '__line__' in item: continue @@ -226,6 +237,39 @@ def _analyze_yaml_file(self, file, filename): return potential_secrets + def _encode_yaml_binary_secrets(self, secrets): + new_secrets = {} + """The secrets dict format is + `{PotentialSecret: PotentialSecret}`, where both key and + value are the same object. Therefore, we can just mutate + the potential secret once. + """ + for potential_secret in secrets.keys(): + secret_in_yaml_format = yaml.dump( + self.encode_to_binary(potential_secret.secret_value), + ).replace( + '!!binary ', + '', + ) + + potential_secret.set_secret(secret_in_yaml_format) + + new_secrets[potential_secret] = potential_secret + + return new_secrets + + @abstractmethod + def decode_binary(self, bytes_object): # pragma: no cover + """Converts the bytes to a string which can be checked for + high entropy.""" + pass + + @abstractmethod + def encode_to_binary(self, string): # pragma: no cover + """Converts a string (usually a high-entropy secret) to + binary. Usually the inverse of decode_binary.""" + pass + class HexHighEntropyString(HighEntropyStringsPlugin): """HighEntropyStringsPlugin for hex encoded strings""" @@ -278,6 +322,12 @@ def calculate_shannon_entropy(self, data): return entropy + def decode_binary(self, bytes_object): + return bytes_object.decode('utf-8') + + def encode_to_binary(self, string): + return string.encode('utf-8') + class Base64HighEntropyString(HighEntropyStringsPlugin): """HighEntropyStringsPlugin for base64 encoded strings""" @@ -299,3 +349,9 @@ def __dict__(self): }) return output + + def decode_binary(self, bytes_object): + return base64.b64encode(bytes_object).decode('utf-8') + + def encode_to_binary(self, string): + return base64.b64decode(string) diff --git a/test_data/config.yaml b/test_data/config.yaml index 26ab72ce3..7881662fb 100644 --- a/test_data/config.yaml +++ b/test_data/config.yaml @@ -11,3 +11,5 @@ list_of_keys: - 234567890a test_agent::allowlisted_api_key: 'ToCynx5Se4e2PtoZxEhW7lUJcOX15c54' # pragma: allowlist secret + +high_entropy_binary_secret: !!binary MjNjcnh1IDJieXJpdXYyeXJpaTJidnl1MnI4OXkyb3UwMg== diff --git a/test_data/config2.yaml b/test_data/config2.yaml new file mode 100644 index 000000000..00d97250a --- /dev/null +++ b/test_data/config2.yaml @@ -0,0 +1,2 @@ +# This is yaml.dump('2b00042f7481c7b056c4b410d28f33cf'.encode('utf-8')) +high_entropy_hex_binary_secret: !!binary MmIwMDA0MmY3NDgxYzdiMDU2YzRiNDEwZDI4ZjMzY2Y= diff --git a/tests/plugins/common/yaml_file_parser_test.py b/tests/plugins/common/yaml_file_parser_test.py index f514c5602..b9ff9512d 100644 --- a/tests/plugins/common/yaml_file_parser_test.py +++ b/tests/plugins/common/yaml_file_parser_test.py @@ -1,6 +1,9 @@ from __future__ import absolute_import from __future__ import unicode_literals +import mock +import pytest + from detect_secrets.plugins.common.yaml_file_parser import YamlFileParser from testing.mocks import mock_file_object @@ -19,3 +22,26 @@ def test_get_ignored_lines(self): ignored_lines = YamlFileParser(f).get_ignored_lines() assert ignored_lines == {2, 3} + + @pytest.mark.parametrize( + ['yaml_value', 'expected_value', 'expected_is_binary'], + [ + ('string_value', 'string_value', False), + ('!!binary YWJjZGVm', b'abcdef', True), + ], + ) + def test_possible_secret_format( + self, + yaml_value, + expected_value, + expected_is_binary, + ): + content = 'key: {yaml_value}'.format(yaml_value=yaml_value) + f = mock_file_object(content) + + result = YamlFileParser(f).json() + assert result['key'] == { + '__value__': expected_value, + '__is_binary__': expected_is_binary, + '__line__': mock.ANY, + } diff --git a/tests/plugins/high_entropy_strings_test.py b/tests/plugins/high_entropy_strings_test.py index 9c9b72c7e..97dffc4ea 100644 --- a/tests/plugins/high_entropy_strings_test.py +++ b/tests/plugins/high_entropy_strings_test.py @@ -2,13 +2,11 @@ from __future__ import unicode_literals import codecs -import string import pytest from detect_secrets.plugins.high_entropy_strings import Base64HighEntropyString from detect_secrets.plugins.high_entropy_strings import HexHighEntropyString -from detect_secrets.plugins.high_entropy_strings import HighEntropyStringsPlugin from testing.mocks import mock_file_object @@ -213,12 +211,13 @@ def test_yaml_file(self): with open('test_data/config.yaml') as f: secrets = plugin.analyze(f, 'test_data/config.yaml') - assert len(secrets.values()) == 2 + assert len(secrets.values()) == 3 for secret in secrets.values(): location = str(secret).splitlines()[1] assert location in ( 'Location: test_data/config.yaml:3', 'Location: test_data/config.yaml:6', + 'Location: test_data/config.yaml:15', ) def test_env_file(self): @@ -234,6 +233,21 @@ def test_env_file(self): ) +class HexHighEntropyStringsWithStandardEntropy(HexHighEntropyString): + """Copies the HexHighEntropyString class, but keeps the standard + Shannon entropy calculation. + """ + + def __init__(self, *args, **kwargs): + super(HexHighEntropyStringsWithStandardEntropy, self).__init__( + *args, + **kwargs + ) + + def calculate_shannon_entropy(self, data): + return super(HexHighEntropyString, self).calculate_shannon_entropy(data) + + class TestHexHighEntropyStrings(HighEntropyStringsTest): def setup(self): @@ -247,10 +261,21 @@ def setup(self): secret_string='2b00042f7481c7b056c4b410d28f33cf', ) + def test_yaml_file(self): + plugin = HexHighEntropyString(3) + with open('test_data/config2.yaml') as f: + secrets = plugin.analyze(f, 'test_data/config2.yaml') + + assert len(secrets.values()) == 1 + for secret in secrets.values(): + location = str(secret).splitlines()[1] + assert location in ( + 'Location: test_data/config2.yaml:2', + ) + def test_discounts_when_all_numbers(self): - original_scanner = HighEntropyStringsPlugin( - charset=string.hexdigits, - limit=3, + original_scanner = HexHighEntropyStringsWithStandardEntropy( + hex_limit=3, exclude_lines_regex=None, )