-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add trigram indexes for string equality search acceleration
This adds a new statistic, TrigramSignature, which is a 12-byte bitset into which all values for each string field are hashed. This gets used for acceleration of string equality queries. This mechanism can also be used to accelerate regex queries, by parsing the query into a bitset operator tree. This patch doesn't go that far yet.
- Loading branch information
Showing
10 changed files
with
222 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package bitset | ||
|
||
// bitsPerByte is the number of addressable bits in each element of a Bitset.
const bitsPerByte = 8

// Bitset is a fixed-size set of bits backed by a byte slice. Bit k lives in
// byte k/8 at position k%8 (least-significant bit first). Methods use value
// receivers; mutations are visible to the caller because the backing array is
// shared.
type Bitset []byte

// New returns a zeroed Bitset that is sizeBytes bytes (sizeBytes*8 bits) long.
func New(sizeBytes int) Bitset {
	return make(Bitset, sizeBytes)
}

// locate maps an arbitrary integer (typically a hash value) onto a byte index
// and bit mask inside b. ok is false when b has no bits to address.
func (b Bitset) locate(i int) (idx int, mask byte, ok bool) {
	// The modulus is the number of bits, not bytes, so that every bit of the
	// backing slice is addressable.
	nbits := uint(len(b)) * bitsPerByte
	if nbits == 0 {
		return 0, 0, false
	}
	// Reducing in unsigned arithmetic keeps the position non-negative even
	// when i is negative (e.g. a 32-bit hash converted to a 32-bit int).
	pos := uint(i) % nbits
	return int(pos / bitsPerByte), byte(1) << (pos % bitsPerByte), true
}

// SetBit sets the bit selected by i, wrapping i around the size of the set.
// It is a no-op on an empty Bitset.
func (b Bitset) SetBit(i int) {
	if idx, mask, ok := b.locate(i); ok {
		b[idx] |= mask
	}
}

// HasBit reports whether the bit selected by i is set, using the same
// wrap-around mapping as SetBit. It returns false for an empty Bitset.
func (b Bitset) HasBit(i int) bool {
	idx, mask, ok := b.locate(i)
	return ok && b[idx]&mask != 0
}

// Contains reports whether every bit set in other is also set in b. Bytes of
// other that lie beyond the end of b are treated as absent from b, so any bit
// set there makes the result false.
func (b Bitset) Contains(other Bitset) bool {
	for i, v := range other {
		if i >= len(b) {
			if v != 0 {
				return false
			}
			continue
		}
		if b[i]&v != v {
			return false
		}
	}
	return true
}

// Union ORs every byte of other into b in place. b cannot grow, so other must
// not be longer than b; a longer other causes an index-out-of-range panic.
func (b Bitset) Union(other Bitset) {
	for i, v := range other {
		b[i] |= v
	}
}

// Serialize returns the underlying bytes of the set. The result aliases b; it
// is not a copy.
func (b Bitset) Serialize() []byte {
	return b
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
package bitset_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
"github.com/wkalt/dp3/util/bitset" | ||
) | ||
|
||
func TestBitset(t *testing.T) { | ||
t.Run("set bit", func(t *testing.T) { | ||
set := bitset.New(12) | ||
set.SetBit(24) | ||
require.True(t, set.HasBit(24)) | ||
}) | ||
|
||
t.Run("contains", func(t *testing.T) { | ||
set1 := bitset.New(12) | ||
set1.SetBit(24) | ||
set1.SetBit(25) | ||
set1.SetBit(26) | ||
set1.SetBit(27) | ||
|
||
set2 := bitset.New(12) | ||
set2.SetBit(24) | ||
set2.SetBit(25) | ||
|
||
require.True(t, set1.Contains(set2)) | ||
}) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
package trigram | ||
|
||
import ( | ||
"encoding/json" | ||
"fmt" | ||
"hash" | ||
|
||
"github.com/spaolacci/murmur3" | ||
"github.com/wkalt/dp3/util/bitset" | ||
) | ||
|
||
type Signature struct { | ||
Bitset bitset.Bitset | ||
Hash32 hash.Hash32 | ||
} | ||
|
||
func (s Signature) MarshalJSON() ([]byte, error) { | ||
bytes := s.Bitset.Serialize() | ||
result, err := json.Marshal(bytes) | ||
if err != nil { | ||
return nil, fmt.Errorf("failed to marshal bitset to JSON: %w", err) | ||
} | ||
return result, nil | ||
} | ||
|
||
func (s *Signature) UnmarshalJSON(data []byte) error { | ||
if err := json.Unmarshal(data, &s.Bitset); err != nil { | ||
return fmt.Errorf("failed to unmarshal bitset: %w", err) | ||
} | ||
s.Hash32 = murmur3.New32() | ||
return nil | ||
} | ||
|
||
func NewSignature(sizeBytes int) Signature { | ||
return Signature{ | ||
Bitset: bitset.New(sizeBytes), | ||
Hash32: murmur3.New32(), | ||
} | ||
} | ||
|
||
func (s Signature) AddTrigram(trgm string) { | ||
s.Hash32.Reset() | ||
_, _ = s.Hash32.Write([]byte(trgm)) | ||
s.Bitset.SetBit(int(s.Hash32.Sum32())) | ||
} | ||
|
||
func (s Signature) AddString(text string) { | ||
for _, t := range ComputeTrigrams(text) { | ||
s.AddTrigram(t) | ||
} | ||
} | ||
|
||
func (s Signature) Contains(other Signature) bool { | ||
return s.Bitset.Contains(other.Bitset) | ||
} | ||
|
||
func (s Signature) Add(other Signature) { | ||
s.Bitset.Union(other.Bitset) | ||
} | ||
|
||
// ComputeTrigrams returns every 3-byte window of text after padding it with
// two leading spaces and one trailing space, in order of appearance. Windows
// are taken over bytes, not runes, and duplicates are not removed. An empty
// input yields an empty, non-nil slice.
func ComputeTrigrams(text string) []string {
	if text == "" {
		return []string{}
	}

	// Two leading spaces and one trailing space, so a string of n bytes
	// produces n+1 trigrams and its first and last bytes each appear in a
	// trigram that marks the string boundary.
	padded := "  " + text + " "

	// Derive the window count from the padded string itself so the slice
	// bounds below can never run past its end.
	count := len(padded) - 2
	result := make([]string, 0, count)
	for i := 0; i < count; i++ {
		result = append(result, padded[i:i+3])
	}
	return result
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
package trigram_test | ||
|
||
import ( | ||
"testing" | ||
|
||
"github.com/stretchr/testify/require" | ||
"github.com/wkalt/dp3/util/trigram" | ||
) | ||
|
||
func TestExtractTrigrams(t *testing.T) { | ||
cases := []struct { | ||
in string | ||
want []string | ||
}{ | ||
{"", []string{}}, | ||
{"cat", []string{" c", " ca", "cat", "at "}}, | ||
{"a", []string{" a", " a "}}, | ||
} | ||
for _, c := range cases { | ||
t.Run(c.in, func(t *testing.T) { | ||
got := trigram.ComputeTrigrams(c.in) | ||
require.Equal(t, c.want, got) | ||
}) | ||
} | ||
} | ||
|
||
func TestSignatureComparisons(t *testing.T) { | ||
s1 := trigram.NewSignature(12) | ||
|
||
s1.AddString("The cat in the hat") | ||
|
||
s2 := trigram.NewSignature(12) | ||
s2.AddString("the hat") | ||
|
||
require.True(t, s1.Contains(s2)) | ||
} |