Skip to content

Commit

Permalink
Add trigram indexes for string equality search acceleration
Browse files Browse the repository at this point in the history
This adds a new statistic, TrigramSignature, which is a 12-byte bitset
into which all values for each string field are hashed. This gets used
for acceleration of string equality queries.

This mechanism can also be used to accelerate regex queries, by parsing
the query into a bitset operator tree. This patch doesn't go that far
yet.
  • Loading branch information
wkalt committed May 28, 2024
1 parent f39bdf7 commit 3553a08
Show file tree
Hide file tree
Showing 10 changed files with 222 additions and 7 deletions.
6 changes: 6 additions & 0 deletions executor/statfilter.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (

"github.com/wkalt/dp3/nodestore"
"github.com/wkalt/dp3/plan"
"github.com/wkalt/dp3/util/trigram"
)

type statfilterfn func(*nodestore.Child) (bool, error)
Expand Down Expand Up @@ -102,6 +103,8 @@ func compileExprEqualsStringFilter(node *plan.Node) (statfilterfn, error) {
if !found {
return passthroughFilter, nil
}
signature := trigram.NewSignature(12)
signature.AddString(s)
return func(child *nodestore.Child) (bool, error) {
textstat, err := child.GetTextStat(fieldname)
if err != nil {
Expand All @@ -110,6 +113,9 @@ func compileExprEqualsStringFilter(node *plan.Node) (statfilterfn, error) {
}
return true, fmt.Errorf("failed to get statistics: %w", err)
}
if !textstat.TrigramSignature.Contains(signature) {
return false, nil
}
return s >= textstat.Min && s <= textstat.Max, nil
}, nil
}
Expand Down
33 changes: 28 additions & 5 deletions executor/statfilter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"github.com/wkalt/dp3/ql"
"github.com/wkalt/dp3/util"
"github.com/wkalt/dp3/util/schema"
"github.com/wkalt/dp3/util/trigram"
)

type statconfig struct {
Expand Down Expand Up @@ -59,9 +60,13 @@ func newChild(t *testing.T, configs ...statconfig) *nodestore.Child {
Max: fieldmax.(float64),
}
case string:
signature := trigram.NewSignature(12)
signature.AddString(fieldmin)
signature.AddString(fieldmax.(string))
statistics.TextStats[idx] = &nodestore.TextSummary{
Min: fieldmin,
Max: fieldmax.(string),
Min: fieldmin,
Max: fieldmax.(string),
TrigramSignature: signature,
}
default:
t.Error("invalid type")
Expand Down Expand Up @@ -170,29 +175,47 @@ func TestStringStatFilters(t *testing.T) {
}

cases := []struct {
operator string
expected []bool
assertion string
operator string
value string
expected []bool
}{
{
"less than",
"<",
"e",
[]bool{true, false, false},
},
{
"less than or equal",
"<=",
"e",
[]bool{true, true, false},
},
{
"greater than",
">",
"e",
[]bool{false, true, true},
},
{
"greater than or equal",
">=",
"e",
[]bool{true, true, true},
},
{
"equal",
"=",
"e",
[]bool{true, true, false},
},
{
"equal can exclude based on trigram",
"=",
"b",
[]bool{false, false, false},
},
}

for _, name := range stringTypes {
Expand All @@ -204,7 +227,7 @@ func TestStringStatFilters(t *testing.T) {

for _, c := range cases {
t.Run(name+" "+c.operator, func(t *testing.T) {
query := basicScan(name, c.operator, "'e'")
query := basicScan(name, c.operator, "'"+c.value+"'")
node := extractWhere(t, query)
filter, err := executor.NewStatFilter(node)
require.NoError(t, err)
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -267,6 +267,7 @@ require (
github.com/minio/minio v0.0.0-20240312195911-24b4f9d748c4
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/relvacode/iso8601 v1.4.0
github.com/spaolacci/murmur3 v1.1.0
golang.org/x/exp v0.0.0-20240222234643-814bf88cf225
golang.org/x/sync v0.6.0
gopkg.in/yaml.v3 v3.0.1 // indirect
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -595,6 +595,8 @@ github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnj
github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE=
github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc=
github.com/smartystreets/goconvey v0.0.0-20190330032615-68dc04aab96a/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA=
github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI=
github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA=
github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ=
github.com/spf13/cast v1.3.0/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE=
github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU=
Expand Down
9 changes: 7 additions & 2 deletions nodestore/statistics.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
fmcap "github.com/foxglove/mcap/go/mcap"
"github.com/wkalt/dp3/util"
"github.com/wkalt/dp3/util/schema"
"github.com/wkalt/dp3/util/trigram"
)

/*
Expand Down Expand Up @@ -104,7 +105,8 @@ type TextSummary struct {
nonempty bool
Min string `json:"min"`
Max string `json:"max"`
// todo: bloom filters, trigrams, etc.

TrigramSignature trigram.Signature `json:"trgmSignature"`
}

func (s *TextSummary) Merge(other *TextSummary) {
Expand All @@ -114,6 +116,7 @@ func (s *TextSummary) Merge(other *TextSummary) {
}
s.Min = min(s.Min, other.Min)
s.Max = max(s.Max, other.Max)
s.TrigramSignature.Add(other.TrigramSignature)
}

func (s *TextSummary) ranges(field string, start, end uint64, schemaHash string) []StatRange {
Expand Down Expand Up @@ -262,7 +265,8 @@ func (s *Statistics) observeNumeric(idx int, v float64) {
func (s *Statistics) observeText(idx int, v string) {
summary, ok := s.TextStats[idx]
if !ok {
summary = &TextSummary{Min: v, Max: v}
summary = &TextSummary{Min: v, Max: v, TrigramSignature: trigram.NewSignature(12)}
summary.TrigramSignature.AddString(v)
s.TextStats[idx] = summary
} else {
if v < summary.Min {
Expand All @@ -271,6 +275,7 @@ func (s *Statistics) observeText(idx int, v string) {
if v > summary.Max {
summary.Max = v
}
summary.TrigramSignature.AddString(v)
}
}

Expand Down
1 change: 1 addition & 0 deletions routes/routes.go
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ func MakeRoutes(tmgr *treemgr.TreeManager) *mux.Router {
mw.WithCORSAllowedOrigins([]string{
"http://localhost:5174",
"http://localhost:5173",
"http://localhost:8080",
}),
)
r.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
Expand Down
36 changes: 36 additions & 0 deletions util/bitset/bitset.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package bitset

// Bitset is a fixed-size set of bit flags backed by a byte slice. Bit k is
// stored in byte k/8 at position k%8 (least-significant bit first), so a
// Bitset of n bytes holds n*8 addressable bits.
type Bitset []byte

// New returns a zeroed Bitset that is sizeBytes bytes (sizeBytes*8 bits) wide.
func New(sizeBytes int) Bitset {
	return make(Bitset, sizeBytes)
}

// position reduces an arbitrary index onto a bit position inside b by taking
// it modulo the bit capacity. ok is false when b has no bits to address.
func (b Bitset) position(i int) (pos int, ok bool) {
	nbits := len(b) * 8
	if nbits == 0 {
		return 0, false
	}
	pos = i % nbits
	if pos < 0 {
		// Go's % keeps the sign of the dividend; shift back into [0, nbits).
		pos += nbits
	}
	return pos, true
}

// SetBit sets the bit selected by i. The index wraps modulo the number of
// bits in b (not the number of bytes), so every bit of the set is reachable.
// It is a no-op on an empty Bitset.
//
// NOTE(review): the wrap used to be modulo the byte count, which confined all
// writes to the first len(b) bits. Any bitsets persisted with that mapping are
// not comparable with ones produced here — confirm none exist.
func (b Bitset) SetBit(i int) {
	pos, ok := b.position(i)
	if !ok {
		return
	}
	b[pos/8] |= 1 << (pos % 8)
}

// HasBit reports whether the bit selected by i is set, using the same
// wrapping rule as SetBit. It returns false for an empty Bitset.
func (b Bitset) HasBit(i int) bool {
	pos, ok := b.position(i)
	if !ok {
		return false
	}
	return b[pos/8]&(1<<(pos%8)) != 0
}

// Contains reports whether every bit set in other is also set in b. The
// comparison walks other byte by byte and indexes b at the same offsets, so
// other must not be longer than b.
func (b Bitset) Contains(other Bitset) bool {
	for i, v := range other {
		if b[i]&v != v {
			return false
		}
	}
	return true
}

// Union ORs other into b in place. b is indexed at other's byte offsets, so
// other must not be longer than b.
func (b Bitset) Union(other Bitset) {
	for i, v := range other {
		b[i] |= v
	}
}

// Serialize returns the underlying bytes. No copy is made: the result aliases
// b, and writes through either are visible in both.
func (b Bitset) Serialize() []byte {
	return b
}
30 changes: 30 additions & 0 deletions util/bitset/bitset_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
package bitset_test

import (
"testing"

"github.com/stretchr/testify/require"
"github.com/wkalt/dp3/util/bitset"
)

// TestBitset checks that individual bits can be set and read back, and that
// Contains behaves as a (non-symmetric) subset test.
func TestBitset(t *testing.T) {
	t.Run("set bit", func(t *testing.T) {
		set := bitset.New(12)
		set.SetBit(24)
		require.True(t, set.HasBit(24))
		// Setting one bit must not make its neighbour appear set.
		require.False(t, set.HasBit(25))
	})

	t.Run("contains", func(t *testing.T) {
		set1 := bitset.New(12)
		set1.SetBit(24)
		set1.SetBit(25)
		set1.SetBit(26)
		set1.SetBit(27)

		set2 := bitset.New(12)
		set2.SetBit(24)
		set2.SetBit(25)

		require.True(t, set1.Contains(set2))
		// set2 lacks bits 26 and 27, so the reverse must not hold.
		require.False(t, set2.Contains(set1))
	})
}
75 changes: 75 additions & 0 deletions util/trigram/trigram.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
package trigram

import (
"encoding/json"
"fmt"
"hash"

"github.com/spaolacci/murmur3"
"github.com/wkalt/dp3/util/bitset"
)

// Signature is a fixed-width, lossy summary of the trigrams seen in a set of
// strings. Each trigram is hashed and the resulting bit is set in Bitset, so
// a clear bit proves a trigram was never added while a set bit only suggests
// that it was.
//
// Methods use value receivers but mutate through the Bitset slice and the
// Hash32 interface value, so copies of a Signature share the same state.
type Signature struct {
	// Bitset holds the hashed trigram bits.
	Bitset bitset.Bitset
	// Hash32 is scratch state: AddTrigram resets and rewrites it on every call.
	Hash32 hash.Hash32
}

// MarshalJSON encodes only the bitset bytes; the hasher is not serialized.
func (s Signature) MarshalJSON() ([]byte, error) {
	bytes := s.Bitset.Serialize()
	result, err := json.Marshal(bytes)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal bitset to JSON: %w", err)
	}
	return result, nil
}

// UnmarshalJSON decodes the bitset bytes written by MarshalJSON and installs
// a fresh murmur3 hasher so the signature can accept further trigrams.
func (s *Signature) UnmarshalJSON(data []byte) error {
	if err := json.Unmarshal(data, &s.Bitset); err != nil {
		return fmt.Errorf("failed to unmarshal bitset: %w", err)
	}
	s.Hash32 = murmur3.New32()
	return nil
}

// NewSignature returns an empty signature whose bitset is sizeBytes wide.
func NewSignature(sizeBytes int) Signature {
	return Signature{
		Bitset: bitset.New(sizeBytes),
		Hash32: murmur3.New32(),
	}
}

// AddTrigram hashes trgm with the signature's hasher and sets the
// corresponding bit.
func (s Signature) AddTrigram(trgm string) {
	s.Hash32.Reset()
	// hash.Hash documents that Write never returns an error.
	_, _ = s.Hash32.Write([]byte(trgm))
	s.Bitset.SetBit(int(s.Hash32.Sum32()))
}

// AddString adds every trigram of text, as produced by ComputeTrigrams.
func (s Signature) AddString(text string) {
	for _, t := range ComputeTrigrams(text) {
		s.AddTrigram(t)
	}
}

// Contains reports whether every bit set in other is also set in s, i.e.
// whether s may have seen all of other's trigrams. A false result is a
// definite "no"; a true result is only a "maybe".
//
// Signatures of different widths (including a zero-value Signature, such as
// one decoded from data that carried no signature) place the same hash at
// different bit positions and cannot be compared. In that case nothing can be
// ruled out, so Contains answers true rather than indexing out of range.
func (s Signature) Contains(other Signature) bool {
	if len(s.Bitset) != len(other.Bitset) {
		return true
	}
	return s.Bitset.Contains(other.Bitset)
}

// Add merges other into s so that s also covers everything other has seen.
//
// When the widths differ, other's bits cannot be mapped onto s. The receiver
// is then saturated (all bits set) so that later Contains calls never exclude
// data on the strength of an incomplete signature.
func (s Signature) Add(other Signature) {
	if len(s.Bitset) != len(other.Bitset) {
		for i := range s.Bitset {
			s.Bitset[i] = 0xff
		}
		return
	}
	s.Bitset.Union(other.Bitset)
}

// ComputeTrigrams returns every window of three consecutive bytes in text
// after padding it with two leading spaces and one trailing space. Trigrams
// are returned in order of occurrence and are not deduplicated. An empty
// input yields an empty, non-nil slice.
//
// Windows are taken over bytes, not runes, so a multi-byte UTF-8 character
// may be split across trigrams.
func ComputeTrigrams(text string) []string {
	if len(text) == 0 {
		return []string{}
	}

	// Two spaces in front and one behind, so the first and last characters
	// each take part in trigrams that mark the string boundaries.
	padded := "  " + text + " "

	// Derive the window count from the padded string itself so the loop can
	// never read past its end.
	count := len(padded) - 2
	result := make([]string, 0, count)
	for i := 0; i < count; i++ {
		result = append(result, padded[i:i+3])
	}
	return result
}
36 changes: 36 additions & 0 deletions util/trigram/trigram_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package trigram_test

import (
"testing"

"github.com/stretchr/testify/require"
"github.com/wkalt/dp3/util/trigram"
)

// TestExtractTrigrams pins the padding convention of ComputeTrigrams: two
// leading spaces and one trailing space, with every result exactly three
// bytes long.
func TestExtractTrigrams(t *testing.T) {
	cases := []struct {
		in   string
		want []string
	}{
		{"", []string{}},
		{"cat", []string{"  c", " ca", "cat", "at "}},
		{"a", []string{"  a", " a "}},
	}
	for _, c := range cases {
		t.Run(c.in, func(t *testing.T) {
			got := trigram.ComputeTrigrams(c.in)
			require.Equal(t, c.want, got)
		})
	}
}

// TestSignatureComparisons checks that a signature built from a string
// contains the signature of a string whose trigrams are a subset of its own.
func TestSignatureComparisons(t *testing.T) {
	s1 := trigram.NewSignature(12)

	// Lower-case on purpose: the needle below starts with "t", so its leading
	// trigram is "  t". A haystack starting with "T" would only yield "  T",
	// and the assertion would then depend on a hash collision.
	s1.AddString("the cat in the hat")

	s2 := trigram.NewSignature(12)
	s2.AddString("the hat")

	require.True(t, s1.Contains(s2))
}

0 comments on commit 3553a08

Please sign in to comment.