Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

bleve v2.0.0 proposal PR #1494

Merged
merged 29 commits on Jan 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
bb883f6
initial refactor to remove circular deps
mschoch Nov 10, 2020
81209a8
reduce map access
mschoch Nov 11, 2020
f25d522
remove encodeFieldType from upsidedown
mschoch Nov 16, 2020
c32ba08
reduce index and segment API surface area (#1498)
mschoch Nov 23, 2020
b85c082
move regexp compilation into scorch (#1500)
mschoch Nov 24, 2020
01f884d
move in more things from the segment package (#1501)
mschoch Nov 24, 2020
8cd7ab1
move unadorned into scorch (#1502)
mschoch Nov 24, 2020
9d86800
switch to tagged api versions (#1503)
mschoch Nov 27, 2020
1460d32
update to latest versions (#1506)
mschoch Nov 30, 2020
cf6396d
move the plugin interface into scorch (#1507)
mschoch Dec 2, 2020
9548dc5
update Advanced() method (#1509)
mschoch Dec 3, 2020
9c73f9d
make upsidedown store API ext module (#1512)
mschoch Dec 4, 2020
1fcbc4f
move FieldCache into upsidedown (#1513)
mschoch Dec 4, 2020
010cf8f
make ErrUnknownIndexType upsidedown specific (#1514)
mschoch Dec 4, 2020
bcacd48
move regexp interface to searcher (#1515)
mschoch Dec 4, 2020
66ef9cf
remove use of dump methods from index reader (#1516)
mschoch Dec 6, 2020
0050aac
remove remaining blevex imports (#1520)
mschoch Dec 16, 2020
4da47c3
use the moved indexing options (#1521)
mschoch Dec 16, 2020
1af4ca2
update to work with removed items (#1524)
mschoch Dec 18, 2020
3f65216
do not type assert on document or field (#1525)
mschoch Dec 18, 2020
a6e5e07
update for latest renaming (#1527)
mschoch Dec 22, 2020
44887af
switch default index to scorch with latest zap (#1528)
mschoch Dec 22, 2020
3f670ba
update to release candidate apis (#1531)
mschoch Dec 31, 2020
d464b26
Adding configurable freq/norm processing (#1526)
sreekanth-cb Dec 31, 2020
4f3ee06
restore correct copyright header (#1532)
mschoch Jan 5, 2021
8e14fc9
MatchOperator should be that type instead of int (#1410)
ethervoid Jan 6, 2021
987d561
Merge branch 'master' into fix-circular-deps
mschoch Jan 6, 2021
eedb767
update to v1 apis and zapx versions (#1537)
mschoch Jan 12, 2021
e843f84
declare module v2 and update internal refs (#1538)
mschoch Jan 12, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
The table of contents is too big for display.
Diff view
Diff view
  •  
  •  
  •  
4 changes: 2 additions & 2 deletions analysis/analyzer/custom/custom.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,8 @@ package custom
import (
"fmt"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "custom"
Expand Down
6 changes: 3 additions & 3 deletions analysis/analyzer/keyword/keyword.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
package keyword

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/tokenizer/single"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/single"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "keyword"
Expand Down
8 changes: 4 additions & 4 deletions analysis/analyzer/simple/simple.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
package simple

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/letter"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "simple"
Expand Down
10 changes: 5 additions & 5 deletions analysis/analyzer/standard/standard.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
package standard

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/en"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/unicode"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "standard"
Expand Down
10 changes: 5 additions & 5 deletions analysis/analyzer/web/web.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,11 @@
package web

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/lang/en"
"github.com/blevesearch/bleve/analysis/token/lowercase"
"github.com/blevesearch/bleve/analysis/tokenizer/web"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/lang/en"
"github.com/blevesearch/bleve/v2/analysis/token/lowercase"
"github.com/blevesearch/bleve/v2/analysis/tokenizer/web"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "web"
Expand Down
9 changes: 5 additions & 4 deletions analysis/benchmark_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,11 +15,12 @@
package analysis_test

import (
index "github.com/blevesearch/bleve_index_api"
"testing"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/analyzer/standard"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/analyzer/standard"
"github.com/blevesearch/bleve/v2/registry"
)

func BenchmarkAnalysis(b *testing.B) {
Expand All @@ -32,7 +33,7 @@ func BenchmarkAnalysis(b *testing.B) {
}

ts := analyzer.Analyze(bleveWikiArticle)
freqs := analysis.TokenFrequency(ts, nil, true)
freqs := analysis.TokenFrequency(ts, nil, index.IncludeTermVectors)
if len(freqs) != 511 {
b.Errorf("expected %d freqs, got %d", 511, len(freqs))
}
Expand Down
4 changes: 2 additions & 2 deletions analysis/char/asciifolding/asciifolding.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
package asciifolding

import (
"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "asciifolding"
Expand Down
6 changes: 3 additions & 3 deletions analysis/char/html/html.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ package html
import (
"regexp"

"github.com/blevesearch/bleve/analysis"
regexpCharFilter "github.com/blevesearch/bleve/analysis/char/regexp"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
regexpCharFilter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "html"
Expand Down
4 changes: 2 additions & 2 deletions analysis/char/regexp/regexp.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ import (
"fmt"
"regexp"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "regexp"
Expand Down
6 changes: 3 additions & 3 deletions analysis/char/zerowidthnonjoiner/zerowidthnonjoiner.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ package zerowidthnonjoiner
import (
"regexp"

"github.com/blevesearch/bleve/analysis"
regexpCharFilter "github.com/blevesearch/bleve/analysis/char/regexp"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
regexpCharFilter "github.com/blevesearch/bleve/v2/analysis/char/regexp"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "zero_width_spaces"
Expand Down
4 changes: 2 additions & 2 deletions analysis/datetime/flexible/flexible.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ import (
"fmt"
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "flexiblego"
Expand Down
2 changes: 1 addition & 1 deletion analysis/datetime/flexible/flexible_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import (
"testing"
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/v2/analysis"
)

func TestFlexibleDateTimeParser(t *testing.T) {
Expand Down
6 changes: 3 additions & 3 deletions analysis/datetime/optional/optional.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,9 @@ package optional
import (
"time"

"github.com/blevesearch/bleve/analysis"
"github.com/blevesearch/bleve/analysis/datetime/flexible"
"github.com/blevesearch/bleve/registry"
"github.com/blevesearch/bleve/v2/analysis"
"github.com/blevesearch/bleve/v2/analysis/datetime/flexible"
"github.com/blevesearch/bleve/v2/registry"
)

const Name = "dateTimeOptional"
Expand Down
124 changes: 21 additions & 103 deletions analysis/freq.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,105 +15,18 @@
package analysis

import (
"reflect"

"github.com/blevesearch/bleve/size"
index "github.com/blevesearch/bleve_index_api"
)

var reflectStaticSizeTokenLocation int
var reflectStaticSizeTokenFreq int

func init() {
var tl TokenLocation
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size())
var tf TokenFreq
reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size())
}

// TokenLocation represents one occurrence of a term at a particular location in
// a field. Start, End and Position have the same meaning as in analysis.Token.
// Field and ArrayPositions identify the field value in the source document.
// See document.Field for details.
type TokenLocation struct {
Field string
ArrayPositions []uint64
Start int
End int
Position int
}

func (tl *TokenLocation) Size() int {
rv := reflectStaticSizeTokenLocation
rv += len(tl.ArrayPositions) * size.SizeOfUint64
return rv
}

// TokenFreq represents all the occurrences of a term in all fields of a
// document.
type TokenFreq struct {
Term []byte
Locations []*TokenLocation
frequency int
}

func (tf *TokenFreq) Size() int {
rv := reflectStaticSizeTokenFreq
rv += len(tf.Term)
for _, loc := range tf.Locations {
rv += loc.Size()
}
return rv
}

func (tf *TokenFreq) Frequency() int {
return tf.frequency
}

// TokenFrequencies maps document terms to their combined frequencies from all
// fields.
type TokenFrequencies map[string]*TokenFreq
func TokenFrequency(tokens TokenStream, arrayPositions []uint64, options index.FieldIndexingOptions) index.TokenFrequencies {
rv := make(map[string]*index.TokenFreq, len(tokens))

func (tfs TokenFrequencies) Size() int {
rv := size.SizeOfMap
rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr)
for k, v := range tfs {
rv += len(k)
rv += v.Size()
}
return rv
}

func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) {
// walk the new token frequencies
for tfk, tf := range other {
// set the remoteField value in incoming token freqs
for _, l := range tf.Locations {
l.Field = remoteField
}
existingTf, exists := tfs[tfk]
if exists {
existingTf.Locations = append(existingTf.Locations, tf.Locations...)
existingTf.frequency = existingTf.frequency + tf.frequency
} else {
tfs[tfk] = &TokenFreq{
Term: tf.Term,
frequency: tf.frequency,
Locations: make([]*TokenLocation, len(tf.Locations)),
}
copy(tfs[tfk].Locations, tf.Locations)
}
}
}

func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVectors bool) TokenFrequencies {
rv := make(map[string]*TokenFreq, len(tokens))

if includeTermVectors {
tls := make([]TokenLocation, len(tokens))
if options.IncludeTermVectors() {
tls := make([]index.TokenLocation, len(tokens))
tlNext := 0

for _, token := range tokens {
tls[tlNext] = TokenLocation{
tls[tlNext] = index.TokenLocation{
ArrayPositions: arrayPositions,
Start: token.Start,
End: token.End,
Expand All @@ -123,27 +36,32 @@ func TokenFrequency(tokens TokenStream, arrayPositions []uint64, includeTermVect
curr, ok := rv[string(token.Term)]
if ok {
curr.Locations = append(curr.Locations, &tls[tlNext])
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
curr = &index.TokenFreq{
Term: token.Term,
Locations: []*TokenLocation{&tls[tlNext]},
frequency: 1,
Locations: []*index.TokenLocation{&tls[tlNext]},
}
rv[string(token.Term)] = curr
}

if !options.SkipFreqNorm() {
curr.SetFrequency(curr.Frequency() + 1)
}

tlNext++
}
} else {
for _, token := range tokens {
curr, exists := rv[string(token.Term)]
if exists {
curr.frequency++
} else {
rv[string(token.Term)] = &TokenFreq{
Term: token.Term,
frequency: 1,
if !exists {
curr = &index.TokenFreq{
Term: token.Term,
}
rv[string(token.Term)] = curr
}

if !options.SkipFreqNorm() {
curr.SetFrequency(curr.Frequency() + 1)
}
}
}
Expand Down
Loading