diff --git a/executor/statfilter.go b/executor/statfilter.go index 9430bd8..421c908 100644 --- a/executor/statfilter.go +++ b/executor/statfilter.go @@ -7,6 +7,7 @@ import ( "github.com/wkalt/dp3/nodestore" "github.com/wkalt/dp3/plan" + "github.com/wkalt/dp3/util/trigram" ) type statfilterfn func(*nodestore.Child) (bool, error) @@ -102,6 +103,8 @@ func compileExprEqualsStringFilter(node *plan.Node) (statfilterfn, error) { if !found { return passthroughFilter, nil } + signature := trigram.NewSignature(12) + signature.AddString(s) return func(child *nodestore.Child) (bool, error) { textstat, err := child.GetTextStat(fieldname) if err != nil { @@ -110,6 +113,9 @@ func compileExprEqualsStringFilter(node *plan.Node) (statfilterfn, error) { } return true, fmt.Errorf("failed to get statistics: %w", err) } + if !textstat.TrigramSignature.Contains(signature) { + return false, nil + } return s >= textstat.Min && s <= textstat.Max, nil }, nil } diff --git a/executor/statfilter_test.go b/executor/statfilter_test.go index 1849cf3..28baed9 100644 --- a/executor/statfilter_test.go +++ b/executor/statfilter_test.go @@ -11,6 +11,7 @@ import ( "github.com/wkalt/dp3/ql" "github.com/wkalt/dp3/util" "github.com/wkalt/dp3/util/schema" + "github.com/wkalt/dp3/util/trigram" ) type statconfig struct { @@ -59,9 +60,13 @@ func newChild(t *testing.T, configs ...statconfig) *nodestore.Child { Max: fieldmax.(float64), } case string: + signature := trigram.NewSignature(12) + signature.AddString(fieldmin) + signature.AddString(fieldmax.(string)) statistics.TextStats[idx] = &nodestore.TextSummary{ - Min: fieldmin, - Max: fieldmax.(string), + Min: fieldmin, + Max: fieldmax.(string), + TrigramSignature: signature, } default: t.Error("invalid type") @@ -170,29 +175,47 @@ func TestStringStatFilters(t *testing.T) { } cases := []struct { - operator string - expected []bool + assertion string + operator string + value string + expected []bool }{ { + "less than", "<", + "e", []bool{true, false, false}, }, 
{ + "less than or equal", "<=", + "e", []bool{true, true, false}, }, { + "greater than", ">", + "e", []bool{false, true, true}, }, { + "greater than or equal", ">=", + "e", []bool{true, true, true}, }, { + "equal", "=", + "e", []bool{true, true, false}, }, + { + "equal can exclude based on trigram", + "=", + "b", + []bool{false, false, false}, + }, } for _, name := range stringTypes { @@ -204,7 +227,7 @@ func TestStringStatFilters(t *testing.T) { for _, c := range cases { t.Run(name+" "+c.operator, func(t *testing.T) { - query := basicScan(name, c.operator, "'e'") + query := basicScan(name, c.operator, "'"+c.value+"'") node := extractWhere(t, query) filter, err := executor.NewStatFilter(node) require.NoError(t, err) diff --git a/go.mod b/go.mod index bc07e63..ba40ea4 100644 --- a/go.mod +++ b/go.mod @@ -267,6 +267,7 @@ require ( github.com/minio/minio v0.0.0-20240312195911-24b4f9d748c4 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/relvacode/iso8601 v1.4.0 + github.com/spaolacci/murmur3 v1.1.0 golang.org/x/exp v0.0.0-20240222234643-814bf88cf225 golang.org/x/sync v0.6.0 gopkg.in/yaml.v3 v3.0.1 // indirect diff --git a/go.sum b/go.sum index 1b5c6c0..87df5c3 100644 --- a/go.sum +++ b/go.sum @@ -595,6 +595,8 @@ github.com/shoenig/test v0.6.4/go.mod h1:byHiCGXqrVaflBLAMq/srcZIHynQPQgeyvkvXnj github.com/sirupsen/logrus v1.4.2/go.mod h1:tLMulIdttU9McNUspp0xgXVQah82FyeX6MwdIuYE2rE= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v0.0.0-20190330032615-68dc04aab96a/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= +github.com/spaolacci/murmur3 v1.1.0 h1:7c1g84S4BPRrfL5Xrdp6fOJ206sU9y293DDHaoy0bLI= +github.com/spaolacci/murmur3 v1.1.0/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/afero v1.1.2/go.mod h1:j4pytiNVoe2o6bmDsKpLACNPDBIoEAkihy7loJ1B0CQ= github.com/spf13/cast v1.3.0/go.mod 
h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= github.com/spf13/cobra v0.0.5/go.mod h1:3K3wKZymM7VvHMDS9+Akkh4K60UwM26emMESw8tLCHU= diff --git a/nodestore/statistics.go b/nodestore/statistics.go index 9c6c48e..96a3c3a 100644 --- a/nodestore/statistics.go +++ b/nodestore/statistics.go @@ -7,6 +7,7 @@ import ( fmcap "github.com/foxglove/mcap/go/mcap" "github.com/wkalt/dp3/util" "github.com/wkalt/dp3/util/schema" + "github.com/wkalt/dp3/util/trigram" ) /* @@ -104,7 +105,8 @@ type TextSummary struct { nonempty bool Min string `json:"min"` Max string `json:"max"` - // todo: bloom filters, trigrams, etc. + + TrigramSignature trigram.Signature `json:"trgmSignature"` } func (s *TextSummary) Merge(other *TextSummary) { @@ -114,6 +116,7 @@ func (s *TextSummary) Merge(other *TextSummary) { } s.Min = min(s.Min, other.Min) s.Max = max(s.Max, other.Max) + s.TrigramSignature.Add(other.TrigramSignature) } func (s *TextSummary) ranges(field string, start, end uint64, schemaHash string) []StatRange { @@ -262,7 +265,8 @@ func (s *Statistics) observeNumeric(idx int, v float64) { func (s *Statistics) observeText(idx int, v string) { summary, ok := s.TextStats[idx] if !ok { - summary = &TextSummary{Min: v, Max: v} + summary = &TextSummary{Min: v, Max: v, TrigramSignature: trigram.NewSignature(12)} + summary.TrigramSignature.AddString(v) s.TextStats[idx] = summary } else { if v < summary.Min { @@ -271,6 +275,7 @@ func (s *Statistics) observeText(idx int, v string) { if v > summary.Max { summary.Max = v } + summary.TrigramSignature.AddString(v) } } diff --git a/routes/routes.go b/routes/routes.go index 1860c2e..261e68b 100644 --- a/routes/routes.go +++ b/routes/routes.go @@ -37,6 +37,7 @@ func MakeRoutes(tmgr *treemgr.TreeManager) *mux.Router { mw.WithCORSAllowedOrigins([]string{ "http://localhost:5174", "http://localhost:5173", + "http://localhost:8080", }), ) r.HandleFunc("/", http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { diff --git a/util/bitset/bitset.go 
// Bitset is a fixed-size set of bits backed by a byte slice. Bit k is stored
// in byte k/8 at position k%8, least-significant bit first. The slice itself
// is the serialized form.
type Bitset []byte

// New returns a zeroed Bitset that is sizeBytes bytes (8*sizeBytes bits) wide.
func New(sizeBytes int) Bitset {
	return make([]byte, sizeBytes)
}

// locate maps an arbitrary integer onto a byte offset and bit mask inside b.
// The index is reduced modulo the number of BITS (not bytes) so that every
// bit of the set is addressable, and a negative index wraps around instead of
// producing a negative offset. ok is false when b has no bits at all.
func (b Bitset) locate(i int) (offset int, mask byte, ok bool) {
	nbits := len(b) * 8
	if nbits == 0 {
		return 0, 0, false
	}
	m := i % nbits
	if m < 0 {
		// Go's % keeps the sign of the dividend; shift into [0, nbits).
		m += nbits
	}
	return m / 8, byte(1) << uint(m%8), true
}

// SetBit sets bit i, wrapping i into the range of the set. It panics on a
// zero-length Bitset, which has no bit that could be set.
func (b Bitset) SetBit(i int) {
	offset, mask, ok := b.locate(i)
	if !ok {
		panic("bitset: SetBit on zero-length bitset")
	}
	b[offset] |= mask
}

// HasBit reports whether bit i (wrapped into the range of the set) is set.
// A zero-length Bitset has no bits, so the answer is always false.
func (b Bitset) HasBit(i int) bool {
	offset, mask, ok := b.locate(i)
	if !ok {
		return false
	}
	return b[offset]&mask != 0
}

// Contains reports whether every bit set in other is also set in b. Bytes of
// other that lie beyond the end of b can only be contained if they are zero,
// so a shorter (or nil) receiver yields false rather than an out-of-range
// panic. Note that when bits were placed with SetBit, the comparison is only
// meaningful between sets of the same size, since the wrap-around depends on
// the size.
func (b Bitset) Contains(other Bitset) bool {
	for i, v := range other {
		if i >= len(b) {
			if v != 0 {
				return false
			}
			continue
		}
		if b[i]&v != v {
			return false
		}
	}
	return true
}

// Union sets in b every bit that is set in other. b cannot grow, so other
// must not be longer than b; that precondition is checked up front so a
// violation panics before b has been partially modified.
func (b Bitset) Union(other Bitset) {
	if len(other) > len(b) {
		panic("bitset: Union operand is longer than receiver")
	}
	for i, v := range other {
		b[i] |= v
	}
}

// Serialize returns the underlying bytes. The result aliases b; it is not a
// copy.
func (b Bitset) Serialize() []byte {
	return b
}
// ComputeTrigrams returns every consecutive 3-byte window of text after
// padding it with two leading spaces and one trailing space, so that the start
// and end of the string contribute their own trigrams (for "cat": "  c",
// " ca", "cat", "at "). Windows are taken over bytes, not runes.
//
// An empty input yields an empty, non-nil slice.
func ComputeTrigrams(text string) []string {
	if text == "" {
		return []string{}
	}

	// Two spaces in front, one behind: three padding bytes in total.
	padded := "  " + text + " "

	// The bound is derived from the padded string itself, so the number of
	// windows can never disagree with the amount of padding applied.
	result := make([]string, 0, len(padded)-2)
	for i := 0; i+3 <= len(padded); i++ {
		result = append(result, padded[i:i+3])
	}
	return result
}