diff --git a/pkg/decoders/utf16.go b/pkg/decoders/utf16.go index c3bc8fbbcd1b..d5599a520ac0 100644 --- a/pkg/decoders/utf16.go +++ b/pkg/decoders/utf16.go @@ -3,7 +3,6 @@ package decoders import ( "bytes" "encoding/binary" - "fmt" "unicode/utf8" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" @@ -17,6 +16,9 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk { } if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil { + if len(utf16Data) == 0 { + return nil + } chunk.Data = utf16Data return chunk } @@ -26,43 +28,19 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk { // utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice. func utf16ToUTF8(b []byte) ([]byte, error) { - endianness, err := guessUTF16Endianness(b) - if err != nil { - return nil, err - } - - buf := &bytes.Buffer{} - for i := 0; i < len(b); i += 2 { - r := rune(endianness.Uint16(b[i:])) - if utf8.ValidRune(r) { - buf.WriteRune(r) + var bufBE, bufLE bytes.Buffer + for i := 0; i < len(b)-1; i += 2 { + if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) { + if isValidByte(byte(r)) { + bufBE.WriteRune(r) + } } - } - - return buf.Bytes(), nil -} - -func guessUTF16Endianness(b []byte) (binary.ByteOrder, error) { - if len(b) < 2 || len(b)%2 != 0 { - return nil, fmt.Errorf("input length must be even and at least 2 bytes long") - } - - var evenNullBytes, oddNullBytes int - - for i := 0; i < len(b); i += 2 { - if b[i] == 0 { - oddNullBytes++ - } - if b[i+1] == 0 { - evenNullBytes++ + if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) { + if isValidByte(byte(r)) { + bufLE.WriteRune(r) + } } } - if evenNullBytes > oddNullBytes { - return binary.LittleEndian, nil - } - if oddNullBytes > evenNullBytes { - return binary.BigEndian, nil - } - return nil, fmt.Errorf("could not determine endianness") + return append(bufLE.Bytes(), bufBE.Bytes()...), nil } diff --git a/pkg/decoders/utf16_test.dll b/pkg/decoders/utf16_test.dll new file mode 100644 index 000000000000..d5ffcb1110dd Binary files /dev/null and b/pkg/decoders/utf16_test.dll differ diff --git a/pkg/decoders/utf16_test.go b/pkg/decoders/utf16_test.go index 92c13dd8d824..9a05e0113076 100644 --- a/pkg/decoders/utf16_test.go +++ b/pkg/decoders/utf16_test.go @@ -2,6 +2,7 @@ package decoders import ( "bytes" + "os" "testing" "github.com/trufflesecurity/trufflehog/v3/pkg/sources" @@ -35,8 +36,8 @@ func TestUTF16Decoder(t *testing.T) { { name: "Invalid UTF-16 input (odd length)", input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0}, - expected: nil, - expectNil: true, + expected: []byte("Hello Worl"), + expectNil: false, }, } @@ -57,12 +58,32 @@ func TestUTF16Decoder(t *testing.T) { return } if !bytes.Equal(decodedChunk.Data, tc.expected) { - t.Errorf("Expected decoded data: %v, got: %v", tc.expected, decodedChunk.Data) + t.Errorf("Expected decoded data: %s, got: %s", tc.expected, decodedChunk.Data) } }) } } +func TestDLL(t *testing.T) { + data, err := os.ReadFile("utf16_test.dll") + if err != nil { + t.Errorf("Failed to read test data: %v", err) + return + } + + chunk := &sources.Chunk{Data: data} + decoder := &UTF16{} + decodedChunk := decoder.FromChunk(chunk) + if decodedChunk == nil { + t.Errorf("Expected chunk with data, got nil") + return + } + if !bytes.Contains(decodedChunk.Data, []byte("aws_secret_access_key")) { + t.Errorf("Expected chunk to have aws_secret_access_key") + return + } +} + func BenchmarkUtf16ToUtf8(b *testing.B) { // Example UTF-16LE encoded data data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0} diff --git a/pkg/decoders/utf8.go b/pkg/decoders/utf8.go index 63a8ea43744a..93090e71febe 100644 --- a/pkg/decoders/utf8.go +++ b/pkg/decoders/utf8.go @@ -26,11 +26,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk { // extacting contigous portions of printable characters that we care // about from some bytes func extractSubstrings(b []byte) []byte { - isValidByte := func(c byte) bool { - // https://www.rapidtables.com/code/text/ascii-table.html - // split on anything that is not ascii space through tilde - return c > 31 && c < 127 - } field := make([]byte, len(b)) fieldLen := 0 @@ -53,3 +48,9 @@ func extractSubstrings(b []byte) []byte { return buf.Bytes() } + +func isValidByte(c byte) bool { + // https://www.rapidtables.com/code/text/ascii-table.html + // split on anything that is not ascii space through tilde + return c > 31 && c < 127 +}