Skip to content

Commit

Permalink
Use heuristic to choose the most likely UTF-16 decoded string (#1381)
Browse files Browse the repository at this point in the history
* Use heuristic to choose the most likely UTF-16 decoded string

* Assume ASCII and include valid BE and LE bytes

* Remove unused code

* Assume ASCII and return nil when not utf16

---------

Co-authored-by: bill-rich <bill.rich@gmail.com>
  • Loading branch information
mcastorina and bill-rich authored Jun 14, 2023
1 parent 3d39549 commit fb76eaf
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 44 deletions.
50 changes: 14 additions & 36 deletions pkg/decoders/utf16.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package decoders
import (
"bytes"
"encoding/binary"
"fmt"
"unicode/utf8"

"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
Expand All @@ -17,6 +16,9 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {
}

if utf16Data, err := utf16ToUTF8(chunk.Data); err == nil {
if len(utf16Data) == 0 {
return nil
}
chunk.Data = utf16Data
return chunk
}
Expand All @@ -26,43 +28,19 @@ func (d *UTF16) FromChunk(chunk *sources.Chunk) *sources.Chunk {

// utf16ToUTF8 converts a byte slice containing UTF-16 encoded data to a UTF-8 encoded byte slice.
func utf16ToUTF8(b []byte) ([]byte, error) {
endianness, err := guessUTF16Endianness(b)
if err != nil {
return nil, err
}

buf := &bytes.Buffer{}
for i := 0; i < len(b); i += 2 {
r := rune(endianness.Uint16(b[i:]))
if utf8.ValidRune(r) {
buf.WriteRune(r)
var bufBE, bufLE bytes.Buffer
for i := 0; i < len(b)-1; i += 2 {
if r := rune(binary.BigEndian.Uint16(b[i:])); b[i] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
bufBE.WriteRune(r)
}
}
}

return buf.Bytes(), nil
}

func guessUTF16Endianness(b []byte) (binary.ByteOrder, error) {
if len(b) < 2 || len(b)%2 != 0 {
return nil, fmt.Errorf("input length must be even and at least 2 bytes long")
}

var evenNullBytes, oddNullBytes int

for i := 0; i < len(b); i += 2 {
if b[i] == 0 {
oddNullBytes++
}
if b[i+1] == 0 {
evenNullBytes++
if r := rune(binary.LittleEndian.Uint16(b[i:])); b[i+1] == 0 && utf8.ValidRune(r) {
if isValidByte(byte(r)) {
bufLE.WriteRune(r)
}
}
}

if evenNullBytes > oddNullBytes {
return binary.LittleEndian, nil
}
if oddNullBytes > evenNullBytes {
return binary.BigEndian, nil
}
return nil, fmt.Errorf("could not determine endianness")
return append(bufLE.Bytes(), bufBE.Bytes()...), nil
}
Binary file added pkg/decoders/utf16_test.dll
Binary file not shown.
27 changes: 24 additions & 3 deletions pkg/decoders/utf16_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package decoders

import (
"bytes"
"os"
"testing"

"github.com/trufflesecurity/trufflehog/v3/pkg/sources"
Expand Down Expand Up @@ -35,8 +36,8 @@ func TestUTF16Decoder(t *testing.T) {
{
name: "Invalid UTF-16 input (odd length)",
input: []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 0},
expected: nil,
expectNil: true,
expected: []byte("Hello Worl"),
expectNil: false,
},
}

Expand All @@ -57,12 +58,32 @@ func TestUTF16Decoder(t *testing.T) {
return
}
if !bytes.Equal(decodedChunk.Data, tc.expected) {
t.Errorf("Expected decoded data: %v, got: %v", tc.expected, decodedChunk.Data)
t.Errorf("Expected decoded data: %s, got: %s", tc.expected, decodedChunk.Data)
}
})
}
}

func TestDLL(t *testing.T) {
data, err := os.ReadFile("utf16_test.dll")
if err != nil {
t.Errorf("Failed to read test data: %v", err)
return
}

chunk := &sources.Chunk{Data: data}
decoder := &UTF16{}
decodedChunk := decoder.FromChunk(chunk)
if decodedChunk == nil {
t.Errorf("Expected chunk with data, got nil")
return
}
if !bytes.Contains(decodedChunk.Data, []byte("aws_secret_access_key")) {
t.Errorf("Expected chunk to have aws_secret_access_key")
return
}
}

func BenchmarkUtf16ToUtf8(b *testing.B) {
// Example UTF-16LE encoded data
data := []byte{72, 0, 101, 0, 108, 0, 108, 0, 111, 0, 32, 0, 87, 0, 111, 0, 114, 0, 108, 0, 100, 0}
Expand Down
11 changes: 6 additions & 5 deletions pkg/decoders/utf8.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,11 +26,6 @@ func (d *UTF8) FromChunk(chunk *sources.Chunk) *sources.Chunk {
// extracting contiguous portions of printable characters that we care
// about from some bytes
func extractSubstrings(b []byte) []byte {
isValidByte := func(c byte) bool {
// https://www.rapidtables.com/code/text/ascii-table.html
// split on anything that is not ascii space through tilde
return c > 31 && c < 127
}

field := make([]byte, len(b))
fieldLen := 0
Expand All @@ -53,3 +48,9 @@ func extractSubstrings(b []byte) []byte {

return buf.Bytes()
}

// isValidByte reports whether c is a printable ASCII character, i.e. lies in
// the inclusive range from space (32) through tilde (126). Callers use it to
// split on, or drop, every byte outside that range.
// Reference: https://www.rapidtables.com/code/text/ascii-table.html
func isValidByte(c byte) bool {
	return ' ' <= c && c <= '~'
}

0 comments on commit fb76eaf

Please sign in to comment.