From d8d57e6990c3b928f2832b980158b1a69d8ff773 Mon Sep 17 00:00:00 2001 From: Stanislav Sokolov Date: Thu, 16 Feb 2017 12:44:52 +0500 Subject: [PATCH 001/728] Added Russian analyzer with snowball stemmer --- analysis/lang/ru/analyzer_ru.go | 57 ++ analysis/lang/ru/analyzer_ru_test.go | 70 ++ analysis/lang/ru/snowball/stem_Unicode.go | 737 ++++++++++++++++++++++ analysis/lang/ru/stemmer_ru.go | 50 ++ analysis/lang/ru/stemmer_ru_test.go | 67 ++ analysis/lang/ru/stop_filter_ru.go | 33 + analysis/lang/ru/stop_words_ru.go | 256 ++++++++ analysis/token/snowball/snowball.go | 59 ++ analysis/token/snowball/snowball_test.go | 115 ++++ 9 files changed, 1444 insertions(+) create mode 100644 analysis/lang/ru/analyzer_ru.go create mode 100644 analysis/lang/ru/analyzer_ru_test.go create mode 100644 analysis/lang/ru/snowball/stem_Unicode.go create mode 100644 analysis/lang/ru/stemmer_ru.go create mode 100644 analysis/lang/ru/stemmer_ru_test.go create mode 100644 analysis/lang/ru/stop_filter_ru.go create mode 100644 analysis/lang/ru/stop_words_ru.go create mode 100644 analysis/token/snowball/snowball.go create mode 100644 analysis/token/snowball/snowball_test.go diff --git a/analysis/lang/ru/analyzer_ru.go b/analysis/lang/ru/analyzer_ru.go new file mode 100644 index 000000000..3b3404037 --- /dev/null +++ b/analysis/lang/ru/analyzer_ru.go @@ -0,0 +1,57 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ru + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "ru" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + tokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopRuFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerRuFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: tokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopRuFilter, + stemmerRuFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/ru/analyzer_ru_test.go b/analysis/lang/ru/analyzer_ru_test.go new file mode 100644 index 000000000..a7ffef414 --- /dev/null +++ b/analysis/lang/ru/analyzer_ru_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ru + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestRussianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("километрах"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("километр"), + }, + }, + }, + { + input: []byte("актеров"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("актер"), + }, + }, + }, + // stop word + { + input: []byte("как"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/ru/snowball/stem_Unicode.go b/analysis/lang/ru/snowball/stem_Unicode.go new file mode 100644 index 000000000..dfd2f2eb0 --- /dev/null +++ b/analysis/lang/ru/snowball/stem_Unicode.go @@ -0,0 +1,737 @@ +//! This file was generated automatically by the Snowball to Go compiler +//! 
http://snowballstem.org/ + +package snowball + +import ( + snowballRuntime "github.com/snowballstem/snowball/go" +) + +var A_0 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0432\u0448\u0438\u0441\u044C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0432", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0432", A: 3, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0432", A: 3, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0432\u0448\u0438", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438", A: 6, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438", A: 6, B: 2, F: nil}, +} + +var A_1 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0435\u043C\u0443", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u043C\u0443", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0445", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0445", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u044E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044E\u044E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u044E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u044E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044F\u044F", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0430\u044F", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043C\u0438", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043C\u0438", A: -1, B: 1, F: 
nil}, + &snowballRuntime.Among{Str: "\u044B\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0433\u043E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0433\u043E", A: -1, B: 1, F: nil}, +} + +var A_2 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0432\u0448", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0432\u0448", A: 0, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0432\u0448", A: 0, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0449", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044E\u0449", A: 3, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u044E\u0449", A: 4, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043D\u043D", A: -1, B: 1, F: nil}, +} + +var A_3 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0441\u044C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0441\u044F", A: -1, B: 1, F: nil}, +} + +var A_4 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u044B\u0442", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u044E\u0442", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u044E\u0442", A: 1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u044F\u0442", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0442", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u0435\u0442", A: 4, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0442", A: -1, B: 2, F: nil}, + 
&snowballRuntime.Among{Str: "\u043D\u044B", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043D\u044B", A: 7, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0442\u044C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u0442\u044C", A: 9, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0442\u044C", A: 9, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0448\u044C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0448\u044C", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u044E", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u044E", A: 14, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043B\u0430", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043B\u0430", A: 16, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043B\u0430", A: 16, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043D\u0430", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043D\u0430", A: 19, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0442\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0442\u0435", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0439\u0442\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u0439\u0442\u0435", A: 23, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0439\u0442\u0435", A: 23, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043B\u0438", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043B\u0438", A: 26, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043B\u0438", A: 26, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0443\u0439", A: 29, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0439", A: 29, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043B", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043B", A: 32, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043B", A: 32, B: 2, F: nil}, + 
&snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043D", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043D", A: 38, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043B\u043E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B\u043B\u043E", A: 40, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u043B\u043E", A: 40, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043D\u043E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043D\u043E", A: 43, B: 2, F: nil}, + &snowballRuntime.Among{Str: "\u043D\u043D\u043E", A: 43, B: 1, F: nil}, +} + +var A_5 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0443", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044F\u0445", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u044F\u0445", A: 1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0430\u0445", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044B", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044E", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044C\u044E", A: 6, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u044E", A: 6, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044F", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044C\u044F", A: 9, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u044F", A: 9, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0430", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0432", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0432", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044C\u0435", A: 15, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0435", A: 15, B: 1, F: nil}, + 
&snowballRuntime.Among{Str: "\u0438", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0438", A: 18, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0438", A: 18, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044F\u043C\u0438", A: 18, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u044F\u043C\u0438", A: 21, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0430\u043C\u0438", A: 18, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0439", A: 24, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0435\u0439", A: 25, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0439", A: 24, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0439", A: 24, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044F\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u044F\u043C", A: 29, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0430\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u0438\u0435\u043C", A: 32, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E", A: -1, B: 1, F: nil}, +} + +var A_6 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u043E\u0441\u0442", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043E\u0441\u0442\u044C", A: -1, B: 1, F: nil}, +} + +var A_7 = []*snowballRuntime.Among{ + &snowballRuntime.Among{Str: "\u0435\u0439\u0448", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u044C", A: -1, B: 3, F: nil}, + &snowballRuntime.Among{Str: "\u0435\u0439\u0448\u0435", A: -1, B: 1, F: nil}, + &snowballRuntime.Among{Str: "\u043D", A: -1, B: 2, F: nil}, +} + +var G_v = []byte{33, 65, 8, 232} + +type Context struct { + i_p2 int + i_pV int +} + +func r_mark_regions(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + // (, line 57 + context.i_pV = 
env.Limit + context.i_p2 = env.Limit + // do, line 61 + var v_1 = env.Cursor +lab0: + for { + // (, line 61 + // gopast, line 62 + golab1: + for { + lab2: + for { + if !env.InGrouping(G_v, 1072, 1103) { + break lab2 + } + break golab1 + } + if env.Cursor >= env.Limit { + break lab0 + } + env.NextChar() + } + // setmark pV, line 62 + context.i_pV = env.Cursor + // gopast, line 62 + golab3: + for { + lab4: + for { + if !env.OutGrouping(G_v, 1072, 1103) { + break lab4 + } + break golab3 + } + if env.Cursor >= env.Limit { + break lab0 + } + env.NextChar() + } + // gopast, line 63 + golab5: + for { + lab6: + for { + if !env.InGrouping(G_v, 1072, 1103) { + break lab6 + } + break golab5 + } + if env.Cursor >= env.Limit { + break lab0 + } + env.NextChar() + } + // gopast, line 63 + golab7: + for { + lab8: + for { + if !env.OutGrouping(G_v, 1072, 1103) { + break lab8 + } + break golab7 + } + if env.Cursor >= env.Limit { + break lab0 + } + env.NextChar() + } + // setmark p2, line 63 + context.i_p2 = env.Cursor + break lab0 + } + env.Cursor = v_1 + return true +} + +func r_R2(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + if !(context.i_p2 <= env.Cursor) { + return false + } + return true +} + +func r_perfective_gerund(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 71 + // [, line 72 + env.Ket = env.Cursor + // substring, line 72 + among_var = env.FindAmongB(A_0, context) + if among_var == 0 { + return false + } + // ], line 72 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 76 + // or, line 76 + lab0: + for { + var v_1 = env.Limit - env.Cursor + lab1: + for { + // literal, line 76 + if !env.EqSB("\u0430") { + break lab1 + } + break lab0 + } + env.Cursor = env.Limit - v_1 + // literal, line 76 + if !env.EqSB("\u044F") { + return false + } + break lab0 + } + // delete, line 76 + if !env.SliceDel() { 
+ return false + } + } else if among_var == 2 { + // (, line 83 + // delete, line 83 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_adjective(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 87 + // [, line 88 + env.Ket = env.Cursor + // substring, line 88 + among_var = env.FindAmongB(A_1, context) + if among_var == 0 { + return false + } + // ], line 88 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 97 + // delete, line 97 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_adjectival(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 101 + // call adjective, line 102 + if !r_adjective(env, context) { + return false + } + // try, line 109 + var v_1 = env.Limit - env.Cursor +lab0: + for { + // (, line 109 + // [, line 110 + env.Ket = env.Cursor + // substring, line 110 + among_var = env.FindAmongB(A_2, context) + if among_var == 0 { + env.Cursor = env.Limit - v_1 + break lab0 + } + // ], line 110 + env.Bra = env.Cursor + if among_var == 0 { + env.Cursor = env.Limit - v_1 + break lab0 + } else if among_var == 1 { + // (, line 115 + // or, line 115 + lab1: + for { + var v_2 = env.Limit - env.Cursor + lab2: + for { + // literal, line 115 + if !env.EqSB("\u0430") { + break lab2 + } + break lab1 + } + env.Cursor = env.Limit - v_2 + // literal, line 115 + if !env.EqSB("\u044F") { + env.Cursor = env.Limit - v_1 + break lab0 + } + break lab1 + } + // delete, line 115 + if !env.SliceDel() { + return false + } + } else if among_var == 2 { + // (, line 122 + // delete, line 122 + if !env.SliceDel() { + return false + } + } + break lab0 + } + return true +} + +func r_reflexive(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 128 + // [, line 129 + 
env.Ket = env.Cursor + // substring, line 129 + among_var = env.FindAmongB(A_3, context) + if among_var == 0 { + return false + } + // ], line 129 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 132 + // delete, line 132 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_verb(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 136 + // [, line 137 + env.Ket = env.Cursor + // substring, line 137 + among_var = env.FindAmongB(A_4, context) + if among_var == 0 { + return false + } + // ], line 137 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 143 + // or, line 143 + lab0: + for { + var v_1 = env.Limit - env.Cursor + lab1: + for { + // literal, line 143 + if !env.EqSB("\u0430") { + break lab1 + } + break lab0 + } + env.Cursor = env.Limit - v_1 + // literal, line 143 + if !env.EqSB("\u044F") { + return false + } + break lab0 + } + // delete, line 143 + if !env.SliceDel() { + return false + } + } else if among_var == 2 { + // (, line 151 + // delete, line 151 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_noun(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 159 + // [, line 160 + env.Ket = env.Cursor + // substring, line 160 + among_var = env.FindAmongB(A_5, context) + if among_var == 0 { + return false + } + // ], line 160 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 167 + // delete, line 167 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_derivational(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 175 + // [, line 176 + env.Ket = env.Cursor + // substring, line 176 + among_var = env.FindAmongB(A_6, 
context) + if among_var == 0 { + return false + } + // ], line 176 + env.Bra = env.Cursor + // call R2, line 176 + if !r_R2(env, context) { + return false + } + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 179 + // delete, line 179 + if !env.SliceDel() { + return false + } + } + return true +} + +func r_tidy_up(env *snowballRuntime.Env, ctx interface{}) bool { + context := ctx.(*Context) + _ = context + var among_var int32 + // (, line 183 + // [, line 184 + env.Ket = env.Cursor + // substring, line 184 + among_var = env.FindAmongB(A_7, context) + if among_var == 0 { + return false + } + // ], line 184 + env.Bra = env.Cursor + if among_var == 0 { + return false + } else if among_var == 1 { + // (, line 188 + // delete, line 188 + if !env.SliceDel() { + return false + } + // [, line 189 + env.Ket = env.Cursor + // literal, line 189 + if !env.EqSB("\u043D") { + return false + } + // ], line 189 + env.Bra = env.Cursor + // literal, line 189 + if !env.EqSB("\u043D") { + return false + } + // delete, line 189 + if !env.SliceDel() { + return false + } + } else if among_var == 2 { + // (, line 192 + // literal, line 192 + if !env.EqSB("\u043D") { + return false + } + // delete, line 192 + if !env.SliceDel() { + return false + } + } else if among_var == 3 { + // (, line 194 + // delete, line 194 + if !env.SliceDel() { + return false + } + } + return true +} + +func Stem(env *snowballRuntime.Env) bool { + var context = &Context{ + i_p2: 0, + i_pV: 0, + } + _ = context + // (, line 199 + // do, line 201 + var v_1 = env.Cursor +lab0: + for { + // call mark_regions, line 201 + if !r_mark_regions(env, context) { + break lab0 + } + break lab0 + } + env.Cursor = v_1 + // backwards, line 202 + env.LimitBackward = env.Cursor + env.Cursor = env.Limit + // setlimit, line 202 + var v_2 = env.Limit - env.Cursor + // tomark, line 202 + if env.Cursor < context.i_pV { + return false + } + env.Cursor = context.i_pV + var v_3 = env.LimitBackward + 
env.LimitBackward = env.Cursor + env.Cursor = env.Limit - v_2 + // (, line 202 + // do, line 203 + var v_4 = env.Limit - env.Cursor +lab1: + for { + // (, line 203 + // or, line 204 + lab2: + for { + var v_5 = env.Limit - env.Cursor + lab3: + for { + // call perfective_gerund, line 204 + if !r_perfective_gerund(env, context) { + break lab3 + } + break lab2 + } + env.Cursor = env.Limit - v_5 + // (, line 205 + // try, line 205 + var v_6 = env.Limit - env.Cursor + lab4: + for { + // call reflexive, line 205 + if !r_reflexive(env, context) { + env.Cursor = env.Limit - v_6 + break lab4 + } + break lab4 + } + // or, line 206 + lab5: + for { + var v_7 = env.Limit - env.Cursor + lab6: + for { + // call adjectival, line 206 + if !r_adjectival(env, context) { + break lab6 + } + break lab5 + } + env.Cursor = env.Limit - v_7 + lab7: + for { + // call verb, line 206 + if !r_verb(env, context) { + break lab7 + } + break lab5 + } + env.Cursor = env.Limit - v_7 + // call noun, line 206 + if !r_noun(env, context) { + break lab1 + } + break lab5 + } + break lab2 + } + break lab1 + } + env.Cursor = env.Limit - v_4 + // try, line 209 + var v_8 = env.Limit - env.Cursor +lab8: + for { + // (, line 209 + // [, line 209 + env.Ket = env.Cursor + // literal, line 209 + if !env.EqSB("\u0438") { + env.Cursor = env.Limit - v_8 + break lab8 + } + // ], line 209 + env.Bra = env.Cursor + // delete, line 209 + if !env.SliceDel() { + return false + } + break lab8 + } + // do, line 212 + var v_9 = env.Limit - env.Cursor +lab9: + for { + // call derivational, line 212 + if !r_derivational(env, context) { + break lab9 + } + break lab9 + } + env.Cursor = env.Limit - v_9 + // do, line 213 + var v_10 = env.Limit - env.Cursor +lab10: + for { + // call tidy_up, line 213 + if !r_tidy_up(env, context) { + break lab10 + } + break lab10 + } + env.Cursor = env.Limit - v_10 + env.LimitBackward = v_3 + env.Cursor = env.LimitBackward + return true +} diff --git a/analysis/lang/ru/stemmer_ru.go 
b/analysis/lang/ru/stemmer_ru.go new file mode 100644 index 000000000..6da9095b0 --- /dev/null +++ b/analysis/lang/ru/stemmer_ru.go @@ -0,0 +1,50 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ru + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/lang/ru/snowball" + "github.com/blevesearch/bleve/registry" + + snowballRuntime "github.com/snowballstem/snowball/go" +) + +const SnowballStemmerName = "stemmer_ru_snowball" + +type RussianStemmerFilter struct { +} + +func NewRussianStemmerFilter() *RussianStemmerFilter { + return &RussianStemmerFilter{} +} + +func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + + env := snowballRuntime.NewEnv(string(token.Term)) + snowball.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func RussianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewRussianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, RussianStemmerFilterConstructor) +} diff --git a/analysis/lang/ru/stemmer_ru_test.go b/analysis/lang/ru/stemmer_ru_test.go new file mode 100644 index 000000000..1795497ff --- /dev/null +++ b/analysis/lang/ru/stemmer_ru_test.go @@ -0,0 +1,67 @@ +// Copyright (c) 2015 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ru + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballRussianStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("актеров"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("актер"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("километров"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("километр"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/ru/stop_filter_ru.go b/analysis/lang/ru/stop_filter_ru.go new file mode 100644 index 000000000..5679420a1 --- /dev/null +++ b/analysis/lang/ru/stop_filter_ru.go @@ -0,0 +1,33 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package ru + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/ru/stop_words_ru.go b/analysis/lang/ru/stop_words_ru.go new file mode 100644 index 000000000..60bec0236 --- /dev/null +++ b/analysis/lang/ru/stop_words_ru.go @@ -0,0 +1,256 @@ +package ru + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_ru" + +var RussianStopWords = []byte(` | From http://snowball.tartarus.org/algorithms/russian/stop.txt + + + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. + + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | letter 'ё' is translated to 'е'. 
+ + и | and + в | in/into + во | alternative form + не | not + что | what/that + он | he + на | on/onto + я | i + с | from + со | alternative form + как | how + а | milder form of 'no' (but) + то | conjunction and form of 'that' + все | all + она | she + так | so, thus + его | him + но | but + да | yes/and + ты | thou + к | towards, by + у | around, chez + же | intensifier particle + вы | you + за | beyond, behind + бы | conditional/subj. particle + по | up to, along + только | only + ее | her + мне | to me + было | it was + вот | here is/are, particle + от | away from + меня | me + еще | still, yet, more + нет | no, there isnt/arent + о | about + из | out of + ему | to him + теперь | now + когда | when + даже | even + ну | so, well + вдруг | suddenly + ли | interrogative particle + если | if + уже | already, but homonym of 'narrower' + или | or + ни | neither + быть | to be + был | he was + него | prepositional form of его + до | up to + вас | you accusative + нибудь | indef. suffix preceded by hyphen + опять | again + уж | already, but homonym of 'adder' + вам | to you + сказал | he said + ведь | particle 'after all' + там | there + потом | then + себя | oneself + ничего | nothing + ей | to her + может | usually with 'быть' as 'maybe' + они | they + тут | here + где | where + есть | there is/are + надо | got to, must + ней | prepositional form of ей + для | for + мы | we + тебя | thee + их | them, their + чем | than + была | she was + сам | self + чтоб | in order to + без | without + будто | as if + человек | man, person, one + чего | genitive form of 'what' + раз | once + тоже | also + себе | to oneself + под | beneath + жизнь | life + будет | will be + ж | short form of intensifer particle 'же' + тогда | then + кто | who + этот | this + говорил | was saying + того | genitive form of 'that' + потому | for that reason + этого | genitive form of 'this' + какой | which + совсем | altogether + ним | prepositional form of 'его', 'они' + здесь | here + этом | 
prepositional form of 'этот' + один | one + почти | almost + мой | my + тем | instrumental/dative plural of 'тот', 'то' + чтобы | full form of 'in order that' + нее | her (acc.) + кажется | it seems + сейчас | now + были | they were + куда | where to + зачем | why + сказать | to say + всех | all (acc., gen. preposn. plural) + никогда | never + сегодня | today + можно | possible, one can + при | by + наконец | finally + два | two + об | alternative form of 'о', about + другой | another + хоть | even + после | after + над | above + больше | more + тот | that one (masc.) + через | across, in + эти | these + нас | us + про | about + всего | in all, only, of all + них | prepositional form of 'они' (they) + какая | which, feminine + много | lots + разве | interrogative particle + сказала | she said + три | three + эту | this, acc. fem. sing. + моя | my, feminine + впрочем | moreover, besides + хорошо | good + свою | ones own, acc. fem. sing. + этой | oblique form of 'эта', fem. 'this' + перед | in front of + иногда | sometimes + лучше | better + чуть | a little + том | preposn. form of 'that one' + нельзя | one must not + такой | such a one + им | to them + более | more + всегда | always + конечно | of course + всю | acc. fem. 
sing of 'all' + между | between + + + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs 'to be', 'to have', 'to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(RussianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/token/snowball/snowball.go b/analysis/token/snowball/snowball.go new file mode 100644 index 000000000..ae876137a --- /dev/null +++ b/analysis/token/snowball/snowball.go @@ -0,0 +1,59 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package snowball + +import ( + "fmt" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/kljensen/snowball" +) + +const Name = "stemmer_snowball" + +type SnowballStemmer struct { + langauge string +} + +func NewSnowballStemmer(language string) *SnowballStemmer { + return &SnowballStemmer{ + langauge: language, + } +} + +func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + // if it is not a protected keyword, stem it + if !token.KeyWord { + stemmed, _ := snowball.Stem(string(token.Term), s.langauge, true) + token.Term = []byte(stemmed) + } + } + return input +} + +func SnowballStemmerConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + language, ok := config["language"].(string) + if !ok { + return nil, fmt.Errorf("must specify language") + } + return NewSnowballStemmer(language), nil +} + +func init() { + registry.RegisterTokenFilter(Name, SnowballStemmerConstructor) +} diff --git a/analysis/token/snowball/snowball_test.go b/analysis/token/snowball/snowball_test.go new file mode 100644 index 000000000..80c2f6f47 --- /dev/null +++ b/analysis/token/snowball/snowball_test.go @@ -0,0 +1,115 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package snowball + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" +) + +func TestSnowballStemmer(t *testing.T) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walking"), + }, + &analysis.Token{ + Term: []byte("talked"), + }, + &analysis.Token{ + Term: []byte("business"), + }, + &analysis.Token{ + Term: []byte("protected"), + KeyWord: true, + }, + &analysis.Token{ + Term: []byte("cat"), + }, + &analysis.Token{ + Term: []byte("done"), + }, + // a term which does stem, but does not change length + &analysis.Token{ + Term: []byte("marty"), + }, + } + + expectedTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walk"), + }, + &analysis.Token{ + Term: []byte("talk"), + }, + &analysis.Token{ + Term: []byte("busi"), + }, + &analysis.Token{ + Term: []byte("protected"), + KeyWord: true, + }, + &analysis.Token{ + Term: []byte("cat"), + }, + &analysis.Token{ + Term: []byte("done"), + }, + &analysis.Token{ + Term: []byte("marti"), + }, + } + + filter := NewSnowballStemmer("english") + ouputTokenStream := filter.Filter(inputTokenStream) + if !reflect.DeepEqual(ouputTokenStream, expectedTokenStream) { + t.Errorf("expected %#v got %#v", expectedTokenStream[3], ouputTokenStream[3]) + } +} + +func BenchmarkSnowballStemmer(b *testing.B) { + + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("walking"), + }, + &analysis.Token{ + Term: []byte("talked"), + }, + &analysis.Token{ + Term: []byte("business"), + }, + &analysis.Token{ + Term: 
[]byte("protected"), + KeyWord: true, + }, + &analysis.Token{ + Term: []byte("cat"), + }, + &analysis.Token{ + Term: []byte("done"), + }, + } + + filter := NewSnowballStemmer("english") + b.ResetTimer() + + for i := 0; i < b.N; i++ { + filter.Filter(inputTokenStream) + } + +} From dc9f994d95957101c5041e44096dac1d5e986253 Mon Sep 17 00:00:00 2001 From: Andrey Khomenko Date: Thu, 20 Jul 2017 12:06:45 -0400 Subject: [PATCH 002/728] Update index.go --- index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index.go b/index.go index 293ec9877..e85652d96 100644 --- a/index.go +++ b/index.go @@ -76,7 +76,7 @@ func (b *Batch) SetInternal(key, val []byte) { b.internal.SetInternal(key, val) } -// SetInternal adds the specified delete internal +// DeleteInternal adds the specified delete internal // operation to the batch. NOTE: the bleve Index is // not updated until the batch is executed. func (b *Batch) DeleteInternal(key []byte) { From 4ddc50e86deabe3fc6f23dcc28e869eefbf99250 Mon Sep 17 00:00:00 2001 From: Joachim Schwarm Date: Tue, 21 Nov 2017 16:35:07 +0100 Subject: [PATCH 003/728] typo in documentation --- query.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/query.go b/query.go index 1fecfa25c..523db5ec0 100644 --- a/query.go +++ b/query.go @@ -209,8 +209,8 @@ func NewGeoBoundingBoxQuery(topLeftLon, topLeftLat, bottomRightLon, bottomRightL return query.NewGeoBoundingBoxQuery(topLeftLon, topLeftLat, bottomRightLon, bottomRightLat) } -// NewGeoDistanceQuery creates a new Query for performing geo bounding -// box searches. The arguments describe a position and a distance. Documents +// NewGeoDistanceQuery creates a new Query for performing geo distance +// searches. The arguments describe a position and a distance. Documents // which have an indexed geo point which is less than or equal to the provided // distance from the given position will be returned. 
func NewGeoDistanceQuery(lon, lat float64, distance string) *query.GeoDistanceQuery { From 23f6dc1cc6c83ce028fa9d36d93243130b0c7605 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 29 Sep 2017 12:42:37 -0400 Subject: [PATCH 004/728] working in-memory version --- index/analysis.go | 10 +- index/scorch/README.md | 420 ++++++++ index/scorch/field_dict_test.go | 164 ++++ index/scorch/introducer.go | 85 ++ index/scorch/reader.go | 84 ++ index/scorch/reader_test.go | 511 ++++++++++ index/scorch/scorch.go | 218 +++++ index/scorch/scorch_test.go | 1109 ++++++++++++++++++++++ index/scorch/segment/mem/build.go | 220 +++++ index/scorch/segment/mem/dict.go | 87 ++ index/scorch/segment/mem/posting.go | 160 ++++ index/scorch/segment/mem/segment.go | 132 +++ index/scorch/segment/mem/segment_test.go | 521 ++++++++++ index/scorch/segment/segment.go | 66 ++ index/scorch/snapshot_index.go | 300 ++++++ index/scorch/snapshot_index_dict.go | 78 ++ index/scorch/snapshot_index_doc.go | 53 ++ index/scorch/snapshot_index_tfr.go | 91 ++ index/scorch/snapshot_segment.go | 64 ++ index/scorch/stats.go | 33 + mapping/index.go | 2 +- 21 files changed, 4406 insertions(+), 2 deletions(-) create mode 100644 index/scorch/README.md create mode 100644 index/scorch/field_dict_test.go create mode 100644 index/scorch/introducer.go create mode 100644 index/scorch/reader.go create mode 100644 index/scorch/reader_test.go create mode 100644 index/scorch/scorch.go create mode 100644 index/scorch/scorch_test.go create mode 100644 index/scorch/segment/mem/build.go create mode 100644 index/scorch/segment/mem/dict.go create mode 100644 index/scorch/segment/mem/posting.go create mode 100644 index/scorch/segment/mem/segment.go create mode 100644 index/scorch/segment/mem/segment_test.go create mode 100644 index/scorch/segment/segment.go create mode 100644 index/scorch/snapshot_index.go create mode 100644 index/scorch/snapshot_index_dict.go create mode 100644 index/scorch/snapshot_index_doc.go create mode 100644 
index/scorch/snapshot_index_tfr.go create mode 100644 index/scorch/snapshot_segment.go create mode 100644 index/scorch/stats.go diff --git a/index/analysis.go b/index/analysis.go index b626b9f3e..840dad97a 100644 --- a/index/analysis.go +++ b/index/analysis.go @@ -14,7 +14,10 @@ package index -import "github.com/blevesearch/bleve/document" +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" +) type IndexRow interface { KeySize() int @@ -29,6 +32,11 @@ type IndexRow interface { type AnalysisResult struct { DocID string Rows []IndexRow + + // scorch + Document *document.Document + Analyzed []analysis.TokenFrequencies + Length []int } type AnalysisWork struct { diff --git a/index/scorch/README.md b/index/scorch/README.md new file mode 100644 index 000000000..cec982ebd --- /dev/null +++ b/index/scorch/README.md @@ -0,0 +1,420 @@ +# scorch + +## Definitions + +Batch +- A collection of Documents to mutate in the index. + +Document +- Has a unique identifier (arbitrary bytes). +- Is comprised of a list of fields. + +Field +- Has a name (string). +- Has a type (text, number, date, geopoint). +- Has a value (depending on type). +- Can be indexed, stored, or both. +- If indexed, can be analyzed. +-m If indexed, can optionally store term vectors. + +## Scope + +Scorch *MUST* implement the bleve.index API without requiring any changes to this API. + +Scorch *MAY* introduce new interfaces, which can be discovered to allow use of new capabilities not in the current API. + +## Implementation + +The scorch implementation starts with the concept of a segmented index. + +A segment is simply a slice, subset, or portion of the entire index. A segmented index is one which is composed of one or more segments. Although segments are created in a particular order, knowing this ordering is not required to achieve correct semantics when querying. 
Because there is no ordering, this means that when searching an index, you can (and should) search all the segments concurrently. + +### Internal Wrapper + +In order to accommodate the existing APIs while also improving the implementation, the scorch implementation includes some wrapper functionality that must be described. + +#### \_id field + +In scorch, field 0 is prearranged to be named \_id. All documents have a value for this field, which is the documents external identifier. In this version the field *MUST* be both indexed AND stored. The scorch wrapper adds this field, as it will not be present in the Document from the calling bleve code. + +NOTE: If a document already contains a field \_id, it will be replaced. If this is problematic, the caller must ensure such a scenario does not happen. + +### Proposed Structures + +``` +type Segment interface { + + Dictionary(field string) TermDictionary + +} + +type TermDictionary interface { + + PostingsList(term string, excluding PostingsList) PostingsList + +} + +type PostingsList interface { + + Next() Posting + + And(other PostingsList) PostingsList + Or(other PostingsList) PostingsList + +} + +type Posting interface { + Number() uint64 + + Frequency() uint64 + Norm() float64 + + Locations() Locations +} + +type Locations interface { + Start() uint64 + End() uint64 + Pos() uint64 + ArrayPositions() ... +} + +type DeletedDocs { + +} + +type SegmentSnapshot struct { + segment Segment + deleted PostingsList +} + +type IndexSnapshot struct { + segment []SegmentSnapshot +} +``` +**What about errors?** +**What about memory mgmnt or context?** +**Postings List separate iterator to separate stateful from stateless** +### Mutating the Index + +The bleve.index API has methods for directly making individual mutations (Update/Delete/SetInternal/DeleteInternal), however for this first implementation, we assume that all of these calls can simply be turned into a Batch of size 1. 
This may be highly inefficient, but it will be correct. This decision is made based on the fact that Couchbase FTS always uses Batches. + +NOTE: As a side-effect of this decision, it should be clear that performance tuning may depend on the batch size, which may in-turn require changes in FTS. + +From this point forward, only Batch mutations will be discussed. + +Sequence of Operations: + +1. For each document in the batch, search through all existing segments. The goal is to build up a per-segment bitset which tells us which documents in that segment are obsoleted by the addition of the new segment we're currently building. NOTE: we're not ready for this change to take effect yet, so rather than this operation mutating anything, they simply return bitsets, which we can apply later. Logically, this is something like: + + ``` + foreach segment { + dict := segment.Dictionary("\_id") + postings := empty postings list + foreach docID { + postings = postings.Or(dict.PostingsList(docID, nil)) + } + } + ``` + + NOTE: it is illustrated above as nested for loops, but some or all of these could be done concurrently. The end result is that for each segment, we have a (possibly empty) bitset. + +2. Also concurrent with 1, the documents in the batch are analyzed. This analysis proceeds using the existing analyzer pool. + +3. (after 2 completes) Analyzed documents are fed into a function which builds a new Segment representing this information. + +4. We now have everything we need to update the state of the system to include this new snapshot. + + - Acquire a lock + - Create a new IndexSnapshot + - For each SegmentSnapshot in the IndexSnapshot, take the deleted PostingsList and OR it with the new postings list for this Segment. Construct a new SegmentSnapshot for the segment using this new deleted PostingsList. Append this SegmentSnapshot to the IndexSnapshot. + - Create a new SegmentSnapshot wrapping our new segment with nil deleted docs. 
+ - Append the new SegmentSnapshot to the IndexSnapshot + - Release the lock + +An ASCII art example: + ``` + 0 - Empty Index + + No segments + + IndexSnapshot + segments [] + deleted [] + + + 1 - Index Batch [ A B C ] + + segment 0 + numbers [ 1 2 3 ] + \_id [ A B C ] + + IndexSnapshot + segments [ 0 ] + deleted [ nil ] + + + 2 - Index Batch [ B' ] + + segment 0 1 + numbers [ 1 2 3 ] [ 1 ] + \_id [ A B C ] [ B ] + + Compute bitset segment-0-deleted-by-1: + [ 0 1 0 ] + + OR it with previous (nil) (call it 0-1) + [ 0 1 0 ] + + IndexSnapshot + segments [ 0 1 ] + deleted [ 0-1 nil ] + + 3 - Index Batch [ C' ] + + segment 0 1 2 + numbers [ 1 2 3 ] [ 1 ] [ 1 ] + \_id [ A B C ] [ B ] [ C ] + + Compute bitset segment-0-deleted-by-2: + [ 0 0 1 ] + + OR it with previous ([ 0 1 0 ]) (call it 0-12) + [ 0 1 1 ] + + Compute bitset segment-1-deleted-by-2: + [ 0 0 0 ] + + OR it with previous (nil) + still just nil + + + IndexSnapshot + segments [ 0 1 2 ] + deleted [ 0-12 nil nil ] + ``` + +**is there opportunity to stop early when doc is found in one segment** +**also, more efficient way to find bits for long lists of ids?** + +### Searching + +In the bleve.index API all searching starts by getting an IndexReader, which represents a snapshot of the index at a point in time. + +As described in the section above, our index implementation maintains a pointer to the current IndexSnapshot. When a caller gets an IndexReader, they get a copy of this pointer, and can use it as long as they like. The IndexSnapshot contains SegmentSnapshots, which only contain pointers to immutable segments. The deleted posting lists associated with a segment change over time, but the particular deleted posting list in YOUR snapshot is immutable. This gives a stable view of the data. + +#### Term Search + +Term search is the only searching primitive exposed in today's bleve.index API. 
This ultimately could limit our ability to take advantage of the indexing improvements, but it also means it will be easier to get a first version of this working. + +A term search for term T in field F will look something like this: + +``` + searchResultPostings = empty + foreach segment { + dict := segment.Dictionary(F) + segmentResultPostings = dict.PostingsList(T, segmentSnapshotDeleted) + // make segmentLocal numbers into global numbers, and flip bits in searchResultPostings + } +``` + +The searchResultPostings will be a new implementation of the TermFieldReader interface. + +As a reminder this interface is: + +``` +// TermFieldReader is the interface exposing the enumeration of documents +// containing a given term in a given field. Documents are returned in byte +// lexicographic order over their identifiers. +type TermFieldReader interface { + // Next returns the next document containing the term in this field, or nil + // when it reaches the end of the enumeration. The preAlloced TermFieldDoc + // is optional, and when non-nil, will be used instead of allocating memory. + Next(preAlloced *TermFieldDoc) (*TermFieldDoc, error) + + // Advance resets the enumeration at specified document or its immediate + // follower. + Advance(ID IndexInternalID, preAlloced *TermFieldDoc) (*TermFieldDoc, error) + + // Count returns the number of documents contains the term in this field. + Count() uint64 + Close() error +} +``` + +At first glance this appears problematic: we have no way to return documents in order of their identifiers. But it turns out the wording of this is perhaps too strong, or a bit ambiguous. Originally, this referred to the external identifiers, but with the introduction of a distinction between internal/external identifiers, returning them in order of their internal identifiers is also acceptable. 
**ASIDE**: the reason for this is that most callers just use Next() and literally don't care what the order is, they could be in any order and it would be fine. There is only one search that cares and that is the ConjunctionSearcher, which relies on Next/Advance having very specific semantics. Later in this document we will have a proposal to split into multiple interfaces: + +- The weakest interface, only supports Next() no ordering at all. +- Ordered, supporting Advance() +- And/Or'able capable of internally efficiently doing these ops with like interfaces (if not capable then can always fall back to external walking) + +But, the good news is that we don't even have to do that for our first implementation. As long as the global numbers we use for internal identifiers are consistent within this IndexSnapshot, then Next() will be ordered by ascending document number, and Advance() will still work correctly. + +NOTE: there is another place where we rely on the ordering of these hits, and that is in the "\_id" sort order. Previously this was the natural order, and a NOOP for the collector, now it must be implemented by actually sorting on the "\_id" field. We probably should introduce at least a marker interface to detect this. + +An ASCII art example: + +``` +Let's start with the IndexSnapshot we ended with earlier: + +3 - Index Batch [ C' ] + + segment 0 1 2 + numbers [ 1 2 3 ] [ 1 ] [ 1 ] + \_id [ A B C ] [ B ] [ C ] + + Compute bitset segment-0-deleted-by-2: + [ 0 0 1 ] + + OR it with previous ([ 0 1 0 ]) (call it 0-12) + [ 0 1 1 ] + +Compute bitset segment-1-deleted-by-2: + [ 0 0 0 ] + +OR it with previous (nil) + still just nil + + + IndexSnapshot + segments [ 0 1 2 ] + deleted [ 0-12 nil nil ] + +Now let's search for the term 'cat' in the field 'desc' and let's assume that Document C (both versions) would match it. 
+ +Concurrently: + + - Segment 0 + - Get Term Dictionary For Field 'desc' + - From it get Postings List for term 'cat' EXCLUDING 0-12 + - raw segment matches [ 0 0 1 ] but excluding [ 0 1 1 ] gives [ 0 0 0 ] + - Segment 1 + - Get Term Dictionary For Field 'desc' + - From it get Postings List for term 'cat' excluding nil + - [ 0 ] + - Segment 2 + - Get Term Dictionary For Field 'desc' + - From it get Postings List for term 'cat' excluding nil + - [ 1 ] + +Map local bitsets into global number space (global meaning cross-segment but still unique to this snapshot) + +IndexSnapshot already should have mapping something like: +0 - Offset 0 +1 - Offset 3 (because segment 0 had 3 docs) +2 - Offset 4 (because segment 1 had 1 doc) + +This maps to search result bitset: + +[ 0 0 0 0 1] + +Caller would call Next() and get doc number 5 (assuming 1 based indexing for now) + +Caller could then ask to get term locations, stored fields, external doc ID for document number 5. Internally in the IndexSnapshot, we can now convert that back, and realize doc number 5 comes from segment 2, 5-4=1 so we're looking for doc number 1 in segment 2. That happens to be C... + +``` + +#### Future improvements + +In the future, interfaces to detect these non-serially operating TermFieldReaders could expose their own And() and Or() up to the higher level Conjunction/Disjunction searchers. Doing this alone offers some win, but also means there would be greater burden on the Searcher code rewriting logical expressions for maximum performance. + +Another related topic is that of peak memory usage. With serially operating TermFieldReaders it was necessary to start them all at the same time and operate in unison. However, with these non-serially operating TermFieldReaders we have the option of doing a few at a time, consolidating them, disposing the intermediaries, and then doing a few more. For very complex queries with many clauses this could reduce peak memory usage. 
+ + +### Memory Tracking + +All segments must be able to produce two statistics, an estimate of their explicit memory usage, and their actual size on disk (if any). For in-memory segments, disk usage could be zero, and the memory usage represents the entire information content. For mmap-based disk segments, the memory could be as low as the size of the tracking structure itself (say just a few pointers). + +This would allow the implementation to throttle or block incoming mutations when a threshold memory usage has (or would be) exceeded. + +### Persistence + +Obviously, we want to support (but maybe not require) asynchronous persistence of segments. My expectation is that segments are initially built in memory. At some point they are persisted to disk. This poses some interesting challenges. + +At runtime, the state of an index (its IndexSnapshot) is not only the contents of the segments, but also the bitmasks of deleted documents. These bitmasks indirectly encode an ordering in which the segments were added. The reason is that the bitmasks encode which items have been obsoleted by other (subsequent or more future) segments. In the runtime implementation we compute bitmask deltas and then merge them at the same time we bring the new segment in. One idea is that we could take a similar approach on disk. When we persist a segment, we persist the bitmask deltas of segments known to exist at that time, and eventually these can get merged up into a base segment deleted bitmask. + +This also relates to the topic of rollback, addressed next... + + +### Rollback + +One desirable property in the Couchbase ecosystem is the ability to rollback to some previous (though typically not long ago) state. One idea for keeping this property in this design is to protect some of the most recent segments from merging. Then, if necessary, they could be "undone" to reveal previous states of the system. In these scenarios "undone" has to properly undo the deleted bitmasks on the other segments. 
Again, the current thinking is that rather than "undo" anything, it could be work that was deferred in the first place, thus making it easier to logically undo. + +Another possibly related approach would be to tie this into our existing snapshot mechanism. Perhaps simulating a slow reader (holding onto index snapshots) for some period of time can be the mechanism to achieve the desired end goal. + + +### Internal Storage + +The bleve.index API has support for "internal storage". The ability to store information under a separate name space. + +This is not used for high volume storage, so it is tempting to think we could just put a small k/v store alongside the rest of the index. But, the reality is that this storage is used to maintain key information related to the rollback scenario. Because of this, it's crucial that ordering and overwriting of key/value pairs correspond with actual segment persistence in the index. Based on this, I believe it's important to put the internal key/value pairs inside the segments themselves. But, this also means that they must follow a similar "deleted" bitmask approach to obsolete values in older segments. But, this also seems to substantially increase the complexity of the solution because of the separate name space, it would appear to require its own bitmask. Further, keys aren't numeric, which then implies yet another mapping from internal key to number, etc. + +More thought is required here. + +### Merging + +The segmented index approach requires merging to prevent the number of segments from growing too large. + +Recent experience with LSMs has taught us that having the correct merge strategy can make a huge difference in the overall performance of the system. In particular, a simple merge strategy which merges segments too aggressively can lead to high write amplification and unnecessarily rendering cached data useless. + +A few simple principles have been identified. 
+ +- Roughly we merge multiple smaller segments into a single larger one. +- The larger a segment gets the less likely we should be to ever merge it. +- Segments with large numbers of deleted/obsoleted items are good candidates as the merge will result in a space savings. +- Segments with all items deleted/obsoleted can be dropped. + +Merging of a segment should be able to proceed even if that segment is held by an ongoing snapshot, it should only delay the removal of it. + + +## TODO + +- need reference counting on the segments, to know when we can safely remove? + +- how well will bitmaps perform when large and possibly mmap'd? + + +----- +thinking out loud on storage + +- fields + - field name - field id +- term dictionary + - field id - FST (values postings ids) +- postings + - posting id - postings list +- freqs + - posting id - freqs list +- norms + - posting id - norms list +- stored + - docNum + - field id - field values + + + +---- + +race dialog with steve: + +state: 2, 4, 8 + +- introducing new segment X + - deleted bitmasks, 2, 4, 8 + +- merger, merge 4 and 8 + new segment Y + + +- merger wins + + state: 2, 9 + + introducer: need to recompute bitmask for 9, could lose again and keep losing race + +- introducer wins + + state: 2, 4, 8, X + 2-X, 4-X, 8-X, nil + + merger finishes: new segment Y, is not valid, need to be recomputed diff --git a/index/scorch/field_dict_test.go b/index/scorch/field_dict_test.go new file mode 100644 index 000000000..81285e76a --- /dev/null +++ b/index/scorch/field_dict_test.go @@ -0,0 +1,164 @@ +package scorch + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestIndexFieldDict(t *testing.T) { + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + 
if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test test test"), testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("prefix", []uint64{}, []byte("bob cat cats catting dog doggy zoo"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + dict, err := indexReader.FieldDict("name") + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount := 0 + curr, err := dict.Next() + for err == nil && curr != nil { + termCount++ + if curr.Term != "test" { + t.Errorf("expected term to be 'test', got '%s'", curr.Term) + } + curr, err = dict.Next() + } + if termCount != 1 { + t.Errorf("expected 1 term for this field, got %d", termCount) + } + + dict2, err := indexReader.FieldDict("desc") + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict2.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms := make([]string, 0) + curr, err = dict2.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict2.Next() + } + if termCount != 3 { + t.Errorf("expected 3 term for this field, got %d", 
termCount) + } + expectedTerms := []string{"eat", "more", "rice"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } + // test start and end range + dict3, err := indexReader.FieldDictRange("desc", []byte("fun"), []byte("nice")) + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict3.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms = make([]string, 0) + curr, err = dict3.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict3.Next() + } + if termCount != 1 { + t.Errorf("expected 1 term for this field, got %d", termCount) + } + expectedTerms = []string{"more"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } + + // test use case for prefix + dict4, err := indexReader.FieldDictPrefix("prefix", []byte("cat")) + if err != nil { + t.Errorf("error creating reader: %v", err) + } + defer func() { + err := dict4.Close() + if err != nil { + t.Fatal(err) + } + }() + + termCount = 0 + terms = make([]string, 0) + curr, err = dict4.Next() + for err == nil && curr != nil { + termCount++ + terms = append(terms, curr.Term) + curr, err = dict4.Next() + } + if termCount != 3 { + t.Errorf("expected 3 term for this field, got %d", termCount) + } + expectedTerms = []string{"cat", "cats", "catting"} + if !reflect.DeepEqual(expectedTerms, terms) { + t.Errorf("expected %#v, got %#v", expectedTerms, terms) + } +} diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go new file mode 100644 index 000000000..dc748ad85 --- /dev/null +++ b/index/scorch/introducer.go @@ -0,0 +1,85 @@ +package scorch + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type segmentIntroduction struct { + id uint64 + data segment.Segment + obsoletes map[uint64]*roaring.Bitmap + ids []string + internal 
map[string][]byte + + applied chan struct{} +} + +func (s *Scorch) mainLoop() { + for { + select { + case <-s.closeCh: + return + + case next := <-s.introductions: + + // acquire lock + s.rootLock.Lock() + + // prepare new index snapshot, with curr size + 1 + newSnapshot := &IndexSnapshot{ + segment: make([]*SegmentSnapshot, len(s.root.segment)+1), + offsets: make([]uint64, len(s.root.segment)+1), + internal: make(map[string][]byte, len(s.root.segment)), + } + + // iterate through current segments + var running uint64 + for i := range s.root.segment { + // see if optimistic work included this segment + delta, ok := next.obsoletes[s.root.segment[i].id] + if !ok { + delta = s.root.segment[i].segment.DocNumbers(next.ids) + } + newSnapshot.segment[i] = &SegmentSnapshot{ + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + } + // apply new obsoletions + if s.root.segment[i].deleted == nil { + newSnapshot.segment[i].deleted = delta + } else { + newSnapshot.segment[i].deleted = s.root.segment[i].deleted.Clone() + newSnapshot.segment[i].deleted.Or(delta) + } + + newSnapshot.offsets[i] = running + running += s.root.segment[i].Count() + + } + // put new segment at end + newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ + id: next.id, + segment: next.data, + } + newSnapshot.offsets[len(s.root.segment)] = running + // copy old values + for key, oldVal := range s.root.internal { + newSnapshot.internal[key] = oldVal + } + // set new values and apply deletes + for key, newVal := range next.internal { + if newVal != nil { + newSnapshot.internal[key] = newVal + } else { + delete(newSnapshot.internal, key) + } + } + // swap in new segment + s.root = newSnapshot + // release lock + s.rootLock.Unlock() + close(next.applied) + } + } +} diff --git a/index/scorch/reader.go b/index/scorch/reader.go new file mode 100644 index 000000000..acb01905d --- /dev/null +++ b/index/scorch/reader.go @@ -0,0 +1,84 @@ +package scorch + +import ( + 
"github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +type Reader struct { + root *IndexSnapshot +} + +func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, + includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { + return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors) +} + +// DocIDReader returns an iterator over all doc ids +// The caller must close returned instance to release associated resources. +func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) { + return r.root.DocIDReaderAll() +} + +func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { + return r.root.DocIDReaderOnly(ids) +} + +func (r *Reader) FieldDict(field string) (index.FieldDict, error) { + return r.root.FieldDict(field) +} + +// FieldDictRange is currently defined to include the start and end terms +func (r *Reader) FieldDictRange(field string, startTerm []byte, + endTerm []byte) (index.FieldDict, error) { + return r.root.FieldDictRange(field, startTerm, endTerm) +} + +func (r *Reader) FieldDictPrefix(field string, + termPrefix []byte) (index.FieldDict, error) { + return r.root.FieldDictPrefix(field, termPrefix) +} + +func (r *Reader) Document(id string) (*document.Document, error) { + return r.root.Document(id) +} +func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, + visitor index.DocumentFieldTermVisitor) error { + panic("document visit field terms not implemented") +} + +func (r *Reader) Fields() ([]string, error) { + return r.root.Fields() +} + +func (r *Reader) GetInternal(key []byte) ([]byte, error) { + return r.root.GetInternal(key) +} + +func (r *Reader) DocCount() (uint64, error) { + return r.root.DocCount() +} + +func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) { + return r.root.ExternalID(id) +} + +func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { + return r.root.InternalID(id) +} + 
+func (r *Reader) DumpAll() chan interface{} { + panic("dumpall") +} + +func (r *Reader) DumpDoc(id string) chan interface{} { + panic("dumpdoc") +} + +func (r *Reader) DumpFields() chan interface{} { + panic("dumpfields") +} + +func (r *Reader) Close() error { + return nil +} diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go new file mode 100644 index 000000000..a050bb44a --- /dev/null +++ b/index/scorch/reader_test.go @@ -0,0 +1,511 @@ +package scorch + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestIndexReader(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test test test"), testAnalyzer)) + doc.AddField(document.NewTextFieldCustom("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors, testAnalyzer)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + // first look for a term that doesn't exist + reader, err := indexReader.TermFieldReader([]byte("nope"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + count := reader.Count() + if 
count != 0 { + t.Errorf("Expected doc count to be: %d got: %d", 0, count) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + reader, err = indexReader.TermFieldReader([]byte("test"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + + expectedCount = 2 + count = reader.Count() + if count != expectedCount { + t.Errorf("Exptected doc count to be: %d got: %d", expectedCount, count) + } + + var match *index.TermFieldDoc + var actualCount uint64 + match, err = reader.Next(nil) + for err == nil && match != nil { + match, err = reader.Next(nil) + if err != nil { + t.Errorf("unexpected error reading next") + } + actualCount++ + } + if actualCount != count { + t.Errorf("count was 2, but only saw %d", actualCount) + } + + internalID2, err := indexReader.InternalID("2") + if err != nil { + t.Fatal(err) + } + expectedMatch := &index.TermFieldDoc{ + ID: internalID2, + Freq: 1, + Norm: 0.5773502588272095, + Vectors: []*index.TermFieldVector{ + { + Field: "desc", + Pos: 3, + Start: 9, + End: 13, + }, + }, + } + tfr, err := indexReader.TermFieldReader([]byte("rice"), "desc", true, true, true) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + match, err = tfr.Next(nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !reflect.DeepEqual(expectedMatch, match) { + t.Errorf("got %#v, expected %#v", match, expectedMatch) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now test usage of advance + reader, err = indexReader.TermFieldReader([]byte("test"), "name", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + + match, err = reader.Advance(internalID2, nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match == nil { + t.Fatalf("Expected match, got nil") + } + if !match.ID.Equals(internalID2) { + t.Errorf("Expected ID '2', got '%s'", match.ID) + } + // NOTE: no point in changing this to 
internal id 3, there is no id 3 + // the test is looking for something that doens't exist and this doesn't + match, err = reader.Advance(index.IndexInternalID("3"), nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now test creating a reader for a field that doesn't exist + reader, err = indexReader.TermFieldReader([]byte("water"), "doesnotexist", true, true, true) + if err != nil { + t.Errorf("Error accessing term field reader: %v", err) + } + count = reader.Count() + if count != 0 { + t.Errorf("expected count 0 for reader of non-existent field") + } + match, err = reader.Next(nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + match, err = reader.Advance(index.IndexInternalID("anywhere"), nil) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if match != nil { + t.Errorf("expected nil, got %v", match) + } + +} + +func TestIndexDocIdReader(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test test test"))) + doc.AddField(document.NewTextFieldWithIndexingOptions("desc", []uint64{}, []byte("eat more rice"), document.IndexField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating 
index: %v", err) + } + expectedCount++ + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Error(err) + } + }() + + // first get all doc ids + reader, err := indexReader.DocIDReaderAll() + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader.Close() + if err != nil { + t.Fatal(err) + } + }() + + id, err := reader.Next() + count := uint64(0) + for id != nil { + count++ + id, err = reader.Next() + } + if count != expectedCount { + t.Errorf("expected %d, got %d", expectedCount, count) + } + + // try it again, but jump to the second doc this time + reader2, err := indexReader.DocIDReaderAll() + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader2.Close() + if err != nil { + t.Error(err) + } + }() + + internalID2, err := indexReader.InternalID("2") + if err != nil { + t.Fatal(err) + } + + id, err = reader2.Advance(internalID2) + if err != nil { + t.Error(err) + } + if !id.Equals(internalID2) { + t.Errorf("expected to find id '2', got '%s'", id) + } + + // again 3 doesn't exist cannot use internal id for 3 as there is none + // the important aspect is that this id doesn't exist, so its ok + id, err = reader2.Advance(index.IndexInternalID("3")) + if err != nil { + t.Error(err) + } + if id != nil { + t.Errorf("expected to find id '', got '%s'", id) + } +} + +func TestIndexDocIdOnlyReader(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + doc = document.NewDocument("3") + err = idx.Update(doc) + 
if err != nil { + t.Errorf("Error updating index: %v", err) + } + + doc = document.NewDocument("5") + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + doc = document.NewDocument("7") + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + doc = document.NewDocument("9") + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Error(err) + } + }() + + onlyIds := []string{"1", "5", "9"} + reader, err := indexReader.DocIDReaderOnly(onlyIds) + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader.Close() + if err != nil { + t.Fatal(err) + } + }() + + id, err := reader.Next() + count := uint64(0) + for id != nil { + count++ + id, err = reader.Next() + if err != nil { + t.Fatal(err) + } + } + if count != 3 { + t.Errorf("expected 3, got %d", count) + } + + // commented out because advance works with internal ids + // this test presumes we see items in external doc id order + // which is no longer the case, so simply converting external ids + // to internal ones is not logically correct + // not removing though because we need some way to test Advance() + + // // try it again, but jump + // reader2, err := indexReader.DocIDReaderOnly(onlyIds) + // if err != nil { + // t.Errorf("Error accessing doc id reader: %v", err) + // } + // defer func() { + // err := reader2.Close() + // if err != nil { + // t.Error(err) + // } + // }() + // + // id, err = reader2.Advance(index.IndexInternalID("5")) + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("5")) { + // t.Errorf("expected to find id '5', got '%s'", id) + // } + // + // id, err = reader2.Advance(index.IndexInternalID("a")) + // if err != nil { + // t.Error(err) + // } + // if id != nil { 
+ // t.Errorf("expected to find id '', got '%s'", id) + // } + + // some keys aren't actually there + onlyIds = []string{"0", "2", "4", "5", "6", "8", "a"} + reader3, err := indexReader.DocIDReaderOnly(onlyIds) + if err != nil { + t.Errorf("Error accessing doc id reader: %v", err) + } + defer func() { + err := reader3.Close() + if err != nil { + t.Error(err) + } + }() + + id, err = reader3.Next() + count = uint64(0) + for id != nil { + count++ + id, err = reader3.Next() + } + if count != 1 { + t.Errorf("expected 1, got %d", count) + } + + // commented out because advance works with internal ids + // this test presumes we see items in external doc id order + // which is no longer the case, so simply converting external ids + // to internal ones is not logically correct + // not removing though because we need some way to test Advance() + + // // mix advance and next + // onlyIds = []string{"0", "1", "3", "5", "6", "9"} + // reader4, err := indexReader.DocIDReaderOnly(onlyIds) + // if err != nil { + // t.Errorf("Error accessing doc id reader: %v", err) + // } + // defer func() { + // err := reader4.Close() + // if err != nil { + // t.Error(err) + // } + // }() + // + // // first key is "1" + // id, err = reader4.Next() + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("1")) { + // t.Errorf("expected to find id '1', got '%s'", id) + // } + // + // // advancing to key we dont have gives next + // id, err = reader4.Advance(index.IndexInternalID("2")) + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("3")) { + // t.Errorf("expected to find id '3', got '%s'", id) + // } + // + // // next after advance works + // id, err = reader4.Next() + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("5")) { + // t.Errorf("expected to find id '5', got '%s'", id) + // } + // + // // advancing to key we do have works + // id, err = reader4.Advance(index.IndexInternalID("9")) + // 
if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("9")) { + // t.Errorf("expected to find id '9', got '%s'", id) + // } + // + // // advance backwards at end + // id, err = reader4.Advance(index.IndexInternalID("4")) + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("5")) { + // t.Errorf("expected to find id '5', got '%s'", id) + // } + // + // // next after advance works + // id, err = reader4.Next() + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("9")) { + // t.Errorf("expected to find id '9', got '%s'", id) + // } + // + // // advance backwards to key that exists, but not in only set + // id, err = reader4.Advance(index.IndexInternalID("7")) + // if err != nil { + // t.Error(err) + // } + // if !id.Equals(index.IndexInternalID("9")) { + // t.Errorf("expected to find id '9', got '%s'", id) + // } + +} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go new file mode 100644 index 000000000..d20261f5e --- /dev/null +++ b/index/scorch/scorch.go @@ -0,0 +1,218 @@ +package scorch + +import ( + "encoding/json" + "sync" + "sync/atomic" + "time" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/registry" +) + +const Name = "scorch" + +const Version uint8 = 1 + +type Scorch struct { + version uint8 + storeConfig map[string]interface{} + analysisQueue *index.AnalysisQueue + stats *Stats + nextSegmentID uint64 + + rootLock sync.RWMutex + root *IndexSnapshot + + closeCh chan struct{} + introductions chan *segmentIntroduction +} + +func NewScorch(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { + rv := 
&Scorch{
+		version:       Version,
+		storeConfig:   storeConfig,
+		analysisQueue: analysisQueue,
+		stats:         &Stats{},
+		root:          &IndexSnapshot{},
+	}
+	return rv, nil
+}
+
+func (s *Scorch) Open() error {
+	s.closeCh = make(chan struct{})
+	s.introductions = make(chan *segmentIntroduction)
+	go s.mainLoop()
+	return nil
+}
+
+func (s *Scorch) Close() error {
+	close(s.closeCh)
+	return nil
+}
+
+func (s *Scorch) Update(doc *document.Document) error {
+	b := index.NewBatch()
+	b.Update(doc)
+	return s.Batch(b)
+}
+
+func (s *Scorch) Delete(id string) error {
+	b := index.NewBatch()
+	b.Delete(id)
+	return s.Batch(b)
+}
+
+// Batch applies a batch of changes to the index atomically
+func (s *Scorch) Batch(batch *index.Batch) error {
+
+	analysisStart := time.Now()
+
+	resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps))
+
+	var numUpdates uint64
+	var numPlainTextBytes uint64
+	var ids []string
+	for docID, doc := range batch.IndexOps {
+		if doc != nil {
+			// insert _id field
+			doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil))
+			numUpdates++
+			numPlainTextBytes += doc.NumPlainTextBytes()
+		}
+		ids = append(ids, docID)
+	}
+
+	// FIXME could sort ids list concurrent with analysis?
+ + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(s, doc, resultChan) + // put the work on the queue + s.analysisQueue.Queue(aw) + } + } + }() + + // wait for analysis result + analysisResults := make([]*index.AnalysisResult, int(numUpdates)) + // newRowsMap := make(map[string][]index.IndexRow) + var itemsDeQueued uint64 + for itemsDeQueued < numUpdates { + result := <-resultChan + //newRowsMap[result.DocID] = result.Rows + analysisResults[itemsDeQueued] = result + itemsDeQueued++ + } + close(resultChan) + + atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(analysisStart))) + + var newSegment segment.Segment + if len(analysisResults) > 0 { + newSegment = mem.NewFromAnalyzedDocs(analysisResults) + } else { + newSegment = mem.New() + } + s.prepareSegment(newSegment, ids, batch.InternalOps) + + return nil +} + +func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, + internalOps map[string][]byte) error { + + // new introduction + introduction := &segmentIntroduction{ + id: atomic.AddUint64(&s.nextSegmentID, 1), + data: newSegment, + ids: ids, + obsoletes: make(map[uint64]*roaring.Bitmap), + internal: internalOps, + applied: make(chan struct{}), + } + + // get read lock, to optimistically prepare obsoleted info + s.rootLock.RLock() + for i := range s.root.segment { + delta := s.root.segment[i].segment.DocNumbers(ids) + introduction.obsoletes[s.root.segment[i].id] = delta + } + s.rootLock.RUnlock() + + s.introductions <- introduction + + // block until this segment is applied + <-introduction.applied + + return nil +} + +func (s *Scorch) SetInternal(key, val []byte) error { + b := index.NewBatch() + b.SetInternal(key, val) + return s.Batch(b) +} + +func (s *Scorch) DeleteInternal(key []byte) error { + b := index.NewBatch() + b.DeleteInternal(key) + return s.Batch(b) +} + +// Reader returns a low-level accessor on the index data. Close it to +// release associated resources. 
+func (s *Scorch) Reader() (index.IndexReader, error) { + s.rootLock.RLock() + defer s.rootLock.RUnlock() + return &Reader{ + root: s.root, + }, nil +} + +func (s *Scorch) Stats() json.Marshaler { + return s.stats +} +func (s *Scorch) StatsMap() map[string]interface{} { + return s.stats.statsMap() +} + +func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { + rv := &index.AnalysisResult{ + Document: d, + Analyzed: make([]analysis.TokenFrequencies, len(d.Fields)+len(d.CompositeFields)), + Length: make([]int, len(d.Fields)+len(d.CompositeFields)), + } + + for i, field := range d.Fields { + if field.Options().IsIndexed() { + fieldLength, tokenFreqs := field.Analyze() + rv.Analyzed[i] = tokenFreqs + rv.Length[i] = fieldLength + + if len(d.CompositeFields) > 0 { + // see if any of the composite fields need this + for _, compositeField := range d.CompositeFields { + compositeField.Compose(field.Name(), fieldLength, tokenFreqs) + } + } + } + } + + return rv +} + +func (s *Scorch) Advanced() (store.KVStore, error) { + return nil, nil +} + +func init() { + registry.RegisterIndexType(Name, NewScorch) +} diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go new file mode 100644 index 000000000..48e80c8b8 --- /dev/null +++ b/index/scorch/scorch_test.go @@ -0,0 +1,1109 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "log" + "reflect" + "regexp" + "strconv" + "sync" + "testing" + "time" + + "github.com/blevesearch/bleve/analysis" + regexpTokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +var testAnalyzer = &analysis.Analyzer{ + Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), +} + +func TestIndexInsert(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexInsertThenDelete(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + 
var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc2 := document.NewDocument("2") + doc2.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc2) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + err = idx.Delete("1") + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + expectedCount-- + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + err = idx.Delete("2") + if err != nil { + t.Errorf("Error deleting entry from index: %v", err) + } + expectedCount-- + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } +} + +func 
TestIndexInsertThenUpdate(t *testing.T) {
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewScorch(Name, nil, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	var expectedCount uint64
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	doc := document.NewDocument("1")
+	doc.AddField(document.NewTextField("name", []uint64{}, []byte("test")))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+	expectedCount++
+
+	// this update should overwrite one term, and introduce one new one
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithAnalyzer("name", []uint64{}, []byte("test fail"), testAnalyzer))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	// now do another update that should remove one of the terms
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextField("name", []uint64{}, []byte("fail")))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	reader, err := idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+	docCount, err := reader.DocCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+	err = reader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}
+
+func TestIndexInsertMultiple(t *testing.T) {
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewScorch(Name, nil, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+
+	var expectedCount uint64
+
+	doc := document.NewDocument("1")
+	doc.AddField(document.NewTextField("name", []uint64{}, []byte("test")))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+	
expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexInsertWithStore(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + 
}
+	err = reader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+	defer func() {
+		err := indexReader.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	storedDoc, err := indexReader.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(storedDoc.Fields) != 2 {
+		t.Errorf("expected 2 stored fields, got %d", len(storedDoc.Fields))
+	}
+	for _, field := range storedDoc.Fields {
+		if field.Name() == "name" {
+			textField, ok := field.(*document.TextField)
+			if !ok {
+				t.Errorf("expected text field")
+			}
+			if string(textField.Value()) != "test" {
+				t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
+			}
+		} else if field.Name() == "_id" {
+			textField, ok := field.(*document.TextField)
+			if !ok {
+				t.Errorf("expected text field")
+			}
+			if string(textField.Value()) != "1" {
+				t.Errorf("expected field content '1', got '%s'", string(textField.Value()))
+			}
+		}
+	}
+}
+
+func TestIndexInternalCRUD(t *testing.T) {
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewScorch(Name, nil, analysisQueue)
+	if err != nil {
+		t.Fatal(err)
+	}
+	err = idx.Open()
+	if err != nil {
+		t.Errorf("error opening index: %v", err)
+	}
+	defer func() {
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	// get something that doesn't exist yet
+	val, err := indexReader.GetInternal([]byte("key"))
+	if err != nil {
+		t.Error(err)
+	}
+	if val != nil {
+		t.Errorf("expected nil, got %s", val)
+	}
+
+	err = indexReader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// set
+	err = idx.SetInternal([]byte("key"), []byte("abc"))
+	if err != nil {
+		t.Error(err)
+	}
+
+	indexReader2, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	// get
+	val, err = indexReader2.GetInternal([]byte("key"))
+	if err != nil {
+		t.Error(err)
+	}
+	if string(val) != "abc" {
+		t.Errorf("expected %s, got 
'%s'", "abc", val) + } + + err = indexReader2.Close() + if err != nil { + t.Fatal(err) + } + + // delete + err = idx.DeleteInternal([]byte("key")) + if err != nil { + t.Error(err) + } + + indexReader3, err := idx.Reader() + if err != nil { + t.Error(err) + } + + // get again + val, err = indexReader3.GetInternal([]byte("key")) + if err != nil { + t.Error(err) + } + if val != nil { + t.Errorf("expected nil, got %s", val) + } + + err = indexReader3.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexBatch(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + + // first create 2 docs the old fashioned way + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + // now create a batch which does 3 things + // insert new doc + // update existing doc + // delete existing doc + // net document count change 0 + + batch := index.NewBatch() + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2updated"))) + batch.Update(doc) + batch.Delete("1") + + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + 
if err != nil { + t.Fatal(err) + } + }() + + docCount, err := indexReader.DocCount() + if err != nil { + t.Fatal(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + + docIDReader, err := indexReader.DocIDReaderAll() + if err != nil { + t.Error(err) + } + var docIds []index.IndexInternalID + docID, err := docIDReader.Next() + for docID != nil && err == nil { + docIds = append(docIds, docID) + docID, err = docIDReader.Next() + } + if err != nil { + t.Error(err) + } + externalDocIds := map[string]struct{}{} + // convert back to external doc ids + for _, id := range docIds { + externalID, err := indexReader.ExternalID(id) + if err != nil { + t.Fatal(err) + } + externalDocIds[externalID] = struct{}{} + } + expectedDocIds := map[string]struct{}{ + "2": struct{}{}, + "3": struct{}{}, + } + if !reflect.DeepEqual(externalDocIds, expectedDocIds) { + t.Errorf("expected ids: %v, got ids: %v", expectedDocIds, externalDocIds) + } +} + +func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, 
document.IndexField|document.StoreField))
+	df, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField)
+	if err != nil {
+		t.Error(err)
+	}
+	doc.AddField(df)
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+	expectedCount++
+
+	reader, err = idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+	docCount, err = reader.DocCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+	err = reader.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	storedDoc, err := indexReader.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	err = indexReader.Close()
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(storedDoc.Fields) != 4 {
+		t.Errorf("expected 4 stored fields, got %d", len(storedDoc.Fields))
+	}
+	for _, field := range storedDoc.Fields {
+
+		if field.Name() == "name" {
+			textField, ok := field.(*document.TextField)
+			if !ok {
+				t.Errorf("expected text field")
+			}
+			if string(textField.Value()) != "test" {
+				t.Errorf("expected field content 'test', got '%s'", string(textField.Value()))
+			}
+		} else if field.Name() == "age" {
+			numField, ok := field.(*document.NumericField)
+			if !ok {
+				t.Errorf("expected numeric field")
+			}
+			numFieldNumer, err := numField.Number()
+			if err != nil {
+				t.Error(err)
+			} else {
+				if numFieldNumer != 35.99 {
+					t.Errorf("expected numeric value 35.99, got %f", numFieldNumer)
+				}
+			}
+		} else if field.Name() == "unixEpoch" {
+			dateField, ok := field.(*document.DateTimeField)
+			if !ok {
+				t.Errorf("expected date field")
+			}
+			dateFieldDate, err := dateField.DateTime()
+			if err != nil {
+				t.Error(err)
+			} else {
+				if dateFieldDate != time.Unix(0, 0).UTC() {
+					t.Errorf("expected date value unix epoch, got %v", dateFieldDate)
+				}
+			}
+		}
+
+	}
+
+	// now 
update the document, but omit one of the fields
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("testup"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 36.99, document.IndexField|document.StoreField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	indexReader2, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+
+	// expected doc count shouldn't have changed
+	docCount, err = indexReader2.DocCount()
+	if err != nil {
+		t.Fatal(err)
+	}
+	if docCount != expectedCount {
+		t.Errorf("Expected document count to be %d got %d", expectedCount, docCount)
+	}
+
+	// should only get 3 fields back now though (name, age, _id)
+	storedDoc, err = indexReader2.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+
+	err = indexReader2.Close()
+	if err != nil {
+		t.Error(err)
+	}
+
+	if len(storedDoc.Fields) != 3 {
+		t.Errorf("expected 3 stored fields, got %d", len(storedDoc.Fields))
+	}
+
+	for _, field := range storedDoc.Fields {
+
+		if field.Name() == "name" {
+			textField, ok := field.(*document.TextField)
+			if !ok {
+				t.Errorf("expected text field")
+			}
+			if string(textField.Value()) != "testup" {
+				t.Errorf("expected field content 'testup', got '%s'", string(textField.Value()))
+			}
+		} else if field.Name() == "age" {
+			numField, ok := field.(*document.NumericField)
+			if !ok {
+				t.Errorf("expected numeric field")
+			}
+			numFieldNumer, err := numField.Number()
+			if err != nil {
+				t.Error(err)
+			} else {
+				if numFieldNumer != 36.99 {
+					t.Errorf("expected numeric value 36.99, got %f", numFieldNumer)
+				}
+			}
+		}
+	}
+
+	// now delete the document
+	err = idx.Delete("1")
+	expectedCount--
+
+	// doc count should now reflect the delete
+	reader, err = idx.Reader()
+	if err != nil {
+		t.Fatal(err)
+	}
+	docCount, err = reader.DocCount()
+	if err != nil {
+		t.Error(err)
+	}
+	if docCount != expectedCount {
+		
t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexInsertFields(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField)) + doc.AddField(document.NewNumericFieldWithIndexingOptions("age", []uint64{}, 35.99, document.IndexField|document.StoreField)) + dateField, err := document.NewDateTimeFieldWithIndexingOptions("unixEpoch", []uint64{}, time.Unix(0, 0), document.IndexField|document.StoreField) + if err != nil { + t.Error(err) + } + doc.AddField(dateField) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + fields, err := indexReader.Fields() + if err != nil { + t.Error(err) + } else { + fieldsMap := map[string]struct{}{} + for _, field := range fields { + fieldsMap[field] = struct{}{} + } + expectedFieldsMap := map[string]struct{}{ + "_id": struct{}{}, + "name": struct{}{}, + "age": struct{}{}, + "unixEpoch": struct{}{}, + } + if !reflect.DeepEqual(fieldsMap, expectedFieldsMap) { + t.Errorf("expected fields: %v, got %v", expectedFieldsMap, fieldsMap) + } + } + +} + +func TestIndexUpdateComposites(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() 
{
+		err := idx.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	doc := document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	// now lets update it
+	doc = document.NewDocument("1")
+	doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("testupdated"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("misterupdated"), document.IndexField|document.StoreField))
+	doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField))
+	err = idx.Update(doc)
+	if err != nil {
+		t.Errorf("Error updating index: %v", err)
+	}
+
+	indexReader, err := idx.Reader()
+	if err != nil {
+		t.Error(err)
+	}
+	defer func() {
+		err := indexReader.Close()
+		if err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	// make sure new values are in index
+	storedDoc, err := indexReader.Document("1")
+	if err != nil {
+		t.Error(err)
+	}
+	if len(storedDoc.Fields) != 3 {
+		t.Errorf("expected 3 stored fields, got %d", len(storedDoc.Fields))
+	}
+	for _, field := range storedDoc.Fields {
+		if field.Name() == "name" {
+			textField, ok := field.(*document.TextField)
+			if !ok {
+				t.Errorf("expected text field")
+			}
+			if string(textField.Value()) != "testupdated" {
+				t.Errorf("expected field content 'testupdated', got '%s'", string(textField.Value()))
+			}
+		}
+	}
+}
+
+func TestIndexTermReaderCompositeFields(t *testing.T) {
+	analysisQueue := index.NewAnalysisQueue(1)
+	idx, err := NewScorch(Name, nil, analysisQueue)
+	if err != nil {
+		
t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewCompositeFieldWithIndexingOptions("_all", true, nil, nil, document.IndexField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + termFieldReader, err := indexReader.TermFieldReader([]byte("mister"), "_all", true, true, true) + if err != nil { + t.Error(err) + } + + tfd, err := termFieldReader.Next(nil) + for tfd != nil && err == nil { + externalID, err := indexReader.ExternalID(tfd.ID) + if err != nil { + t.Fatal(err) + } + if externalID != "1" { + t.Errorf("expected to find document id 1") + } + tfd, err = termFieldReader.Next(nil) + } + if err != nil { + t.Error(err) + } +} + +func TestConcurrentUpdate(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // do some concurrent updates + var wg sync.WaitGroup + for i := 0; i < 10; i++ { + wg.Add(1) + go func(i int) { + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions(strconv.Itoa(i), []uint64{}, []byte(strconv.Itoa(i)), 
document.StoreField)) + err := idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + wg.Done() + }(i) + } + wg.Wait() + + // now load the name field and see what we get + r, err := idx.Reader() + if err != nil { + log.Fatal(err) + } + + doc, err := r.Document("1") + if err != nil { + log.Fatal(err) + } + + if len(doc.Fields) > 2 { + t.Errorf("expected no more than 2 fields, found %d", len(doc.Fields)) + } +} + +func TestLargeField(t *testing.T) { + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, nil, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var largeFieldValue []byte + for len(largeFieldValue) < 4096 { + largeFieldValue = append(largeFieldValue, bleveWikiArticle1K...) + } + + d := document.NewDocument("large") + f := document.NewTextFieldWithIndexingOptions("desc", nil, largeFieldValue, document.IndexField|document.StoreField) + d.AddField(f) + + err = idx.Update(d) + if err != nil { + t.Fatal(err) + } +} + +var bleveWikiArticle1K = []byte(`Boiling liquid expanding vapor explosion +From Wikipedia, the free encyclopedia +See also: Boiler explosion and Steam explosion + +Flames subsequent to a flammable liquid BLEVE from a tanker. BLEVEs do not necessarily involve fire. + +This article's tone or style may not reflect the encyclopedic tone used on Wikipedia. See Wikipedia's guide to writing better articles for suggestions. 
(July 2013) +A boiling liquid expanding vapor explosion (BLEVE, /ˈblɛviː/ blev-ee) is an explosion caused by the rupture of a vessel containing a pressurized liquid above its boiling point.[1] +Contents [hide] +1 Mechanism +1.1 Water example +1.2 BLEVEs without chemical reactions +2 Fires +3 Incidents +4 Safety measures +5 See also +6 References +7 External links +Mechanism[edit] + +This section needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. (July 2013) +There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go new file mode 100644 index 000000000..ff6fb6056 --- /dev/null +++ b/index/scorch/segment/mem/build.go @@ -0,0 +1,220 @@ +package mem + +import ( + "math" + "sort" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +// NewFromAnalyzedDocs places the analyzed document mutations into this segment +func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { + s := New() + + // ensure that _id field get fieldID 0 + s.getOrDefineField("_id", false) + + // walk each doc + for _, result := range results { + s.processDocument(result) + } + + // go back and sort the dictKeys + for _, dict := range s.dictKeys { + sort.Strings(dict) + } + + // professional debugging + // + // log.Printf("fields: %v\n", s.fields) + // log.Printf("fieldsInv: %v\n", s.fieldsInv) + // log.Printf("fieldsLoc: %v\n", s.fieldsLoc) + // log.Printf("dicts: %v\n", s.dicts) + // log.Printf("dict keys: %v\n", s.dictKeys) + // for i, posting := range s.postings { + // log.Printf("posting %d: %v\n", i, posting) + // } + // for i, freq := range s.freqs { + // log.Printf("freq %d: %v\n", i, freq) + // } + // for i, norm := range s.norms { + // 
log.Printf("norm %d: %v\n", i, norm) + // } + // for i, field := range s.locfields { + // log.Printf("field %d: %v\n", i, field) + // } + // for i, start := range s.locstarts { + // log.Printf("start %d: %v\n", i, start) + // } + // for i, end := range s.locends { + // log.Printf("end %d: %v\n", i, end) + // } + // for i, pos := range s.locpos { + // log.Printf("pos %d: %v\n", i, pos) + // } + // for i, apos := range s.locarraypos { + // log.Printf("apos %d: %v\n", i, apos) + // } + // log.Printf("stored: %v\n", s.stored) + // log.Printf("stored types: %v\n", s.storedTypes) + // log.Printf("stored pos: %v\n", s.storedPos) + + return s +} + +func (s *Segment) processDocument(result *index.AnalysisResult) { + // used to collate information across fields + docMap := map[uint16]analysis.TokenFrequencies{} + fieldLens := map[uint16]int{} + docNum := uint64(s.addDocument()) + + processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { + fieldLens[field] += l + if existingFreqs, ok := docMap[field]; ok { + existingFreqs.MergeAll(name, tf) + } else { + docMap[field] = tf + } + } + + storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { + s.stored[docNum][field] = append(s.stored[docNum][field], val) + s.storedTypes[docNum][field] = append(s.storedTypes[docNum][field], typ) + s.storedPos[docNum][field] = append(s.storedPos[docNum][field], pos) + } + + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name(), false)) + l, tf := field.Analyze() + processField(fieldID, field.Name(), l, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name(), field.Options().IncludeTermVectors())) + l := result.Length[i] + tf := result.Analyzed[i] + processField(fieldID, field.Name(), l, tf) + if field.Options().IsStored() { + storeField(docNum, fieldID, encodeFieldType(field), 
field.Value(), field.ArrayPositions()) + } + } + + // now that its been rolled up into docMap, walk that + for fieldID, tokenFrequencies := range docMap { + for term, tokenFreq := range tokenFrequencies { + fieldTermPostings := s.dicts[fieldID][term] + + // FIXME this if/else block has duplicate code that has resulted in + // bugs fixed/missed more than once, need to refactor + if fieldTermPostings == 0 { + // need to build new posting + bs := roaring.New() + bs.AddInt(int(docNum)) + + newPostingID := uint64(len(s.postings) + 1) + // add this new bitset to the postings slice + s.postings = append(s.postings, bs) + // add this to the details slice + s.freqs = append(s.freqs, []uint64{uint64(tokenFreq.Frequency())}) + s.norms = append(s.norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))}) + // add to locations + var locfields []uint16 + var locstarts []uint64 + var locends []uint64 + var locpos []uint64 + var locarraypos [][]uint64 + for _, loc := range tokenFreq.Locations { + var locf = fieldID + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field, false)) + } + locfields = append(locfields, locf) + locstarts = append(locstarts, uint64(loc.Start)) + locends = append(locends, uint64(loc.End)) + locpos = append(locpos, uint64(loc.Position)) + if len(loc.ArrayPositions) > 0 { + locarraypos = append(locarraypos, loc.ArrayPositions) + } else { + locarraypos = append(locarraypos, nil) + } + } + s.locfields = append(s.locfields, locfields) + s.locstarts = append(s.locstarts, locstarts) + s.locends = append(s.locends, locends) + s.locpos = append(s.locpos, locpos) + s.locarraypos = append(s.locarraypos, locarraypos) + // record it + s.dicts[fieldID][term] = newPostingID + // this term was new for this field, add it to dictKeys + s.dictKeys[fieldID] = append(s.dictKeys[fieldID], term) + } else { + // posting already started for this field/term + // the actual offset is - 1, because 0 is zero value + bs := s.postings[fieldTermPostings-1] + 
bs.AddInt(int(docNum)) + s.freqs[fieldTermPostings-1] = append(s.freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency())) + s.norms[fieldTermPostings-1] = append(s.norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + for _, loc := range tokenFreq.Locations { + var locf = fieldID + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field, false)) + } + s.locfields[fieldTermPostings-1] = append(s.locfields[fieldTermPostings-1], locf) + s.locstarts[fieldTermPostings-1] = append(s.locstarts[fieldTermPostings-1], uint64(loc.Start)) + s.locends[fieldTermPostings-1] = append(s.locends[fieldTermPostings-1], uint64(loc.End)) + s.locpos[fieldTermPostings-1] = append(s.locpos[fieldTermPostings-1], uint64(loc.Position)) + if len(loc.ArrayPositions) > 0 { + s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], loc.ArrayPositions) + } else { + s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], nil) + } + } + } + } + } +} + +func (s *Segment) getOrDefineField(name string, hasLoc bool) int { + fieldID, ok := s.fields[name] + if !ok { + fieldID = uint16(len(s.fieldsInv) + 1) + s.fields[name] = fieldID + s.fieldsInv = append(s.fieldsInv, name) + s.fieldsLoc = append(s.fieldsLoc, hasLoc) + s.dicts = append(s.dicts, make(map[string]uint64)) + s.dictKeys = append(s.dictKeys, make([]string, 0)) + } + return int(fieldID - 1) +} + +func (s *Segment) addDocument() int { + docNum := len(s.stored) + s.stored = append(s.stored, map[uint16][][]byte{}) + s.storedTypes = append(s.storedTypes, map[uint16][]byte{}) + s.storedPos = append(s.storedPos, map[uint16][][]uint64{}) + return docNum +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case 
*document.GeoPointField: + fieldType = 'g' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go new file mode 100644 index 000000000..a7d609f6d --- /dev/null +++ b/index/scorch/segment/mem/dict.go @@ -0,0 +1,87 @@ +package mem + +import ( + "sort" + "strings" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// Dictionary is the in-memory representation of the term dictionary +type Dictionary struct { + segment *Segment + field string + fieldID uint16 +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) segment.PostingsList { + return &PostingsList{ + dictionary: d, + term: term, + postingsID: d.segment.dicts[d.fieldID][term], + except: except, + } +} + +// Iterator returns an iterator for this dictionary +func (d *Dictionary) Iterator() segment.DictionaryIterator { + return &DictionaryIterator{ + d: d, + } +} + +// PrefixIterator returns an iterator which only visits terms having the +// the specified prefix +func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], prefix) + return &DictionaryIterator{ + d: d, + prefix: prefix, + offset: offset, + } +} + +// RangeIterator returns an iterator which only visits terms between the +// start and end terms. NOTE: bleve.index API specifies the end is inclusive. 
+func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], start) + return &DictionaryIterator{ + d: d, + offset: offset, + end: end, + } +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + prefix string + end string + offset int +} + +// Next returns the next entry in the dictionary +func (d *DictionaryIterator) Next() (*index.DictEntry, error) { + if d.offset > len(d.d.segment.dictKeys[d.d.fieldID])-1 { + return nil, nil + } + next := d.d.segment.dictKeys[d.d.fieldID][d.offset] + // check prefix + if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { + return nil, nil + } + // check end (bleve.index API demands inclusive end) + if d.end != "" && next > d.end { + return nil, nil + } + + d.offset++ + postingID := d.d.segment.dicts[d.d.fieldID][next] + return &index.DictEntry{ + Term: next, + Count: d.d.segment.postings[postingID-1].GetCardinality(), + }, nil +} diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go new file mode 100644 index 000000000..ac8b2fb72 --- /dev/null +++ b/index/scorch/segment/mem/posting.go @@ -0,0 +1,160 @@ +package mem + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// PostingsList is an in-memory represenation of a postings list +type PostingsList struct { + dictionary *Dictionary + term string + postingsID uint64 + except *roaring.Bitmap +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var rv uint64 + if p.postingsID > 0 { + rv = p.dictionary.segment.postings[p.postingsID-1].GetCardinality() + if p.except != nil { + except := p.except.GetCardinality() + if except > rv { + // avoid underflow + except = rv + } + rv -= except + } + } + return rv +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) 
Iterator() segment.PostingsIterator { + rv := &PostingsIterator{ + postings: p, + } + if p.postingsID > 0 { + allbits := p.dictionary.segment.postings[p.postingsID-1] + rv.all = allbits.Iterator() + if p.except != nil { + allExcept := allbits.Clone() + allExcept.AndNot(p.except) + rv.actual = allExcept.Iterator() + } else { + rv.actual = allbits.Iterator() + } + } + + return rv +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntIterable + offset int + locoffset int + actual roaring.IntIterable +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) Next() segment.Posting { + if i.actual == nil || !i.actual.HasNext() { + return nil + } + n := i.actual.Next() + allN := i.all.Next() + + // n is the next actual hit (excluding some postings) + // allN is the next hit in the full postings + // if they don't match, adjust offsets to factor in item we're skipping over + // incr the all iterator, and check again + for allN != n { + i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset]) + i.offset++ + allN = i.all.Next() + } + rv := &Posting{ + iterator: i, + docNum: uint64(n), + offset: i.offset, + locoffset: i.locoffset, + } + + i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset]) + i.offset++ + return rv +} + +// Posting is a single entry in a postings list +type Posting struct { + iterator *PostingsIterator + docNum uint64 + offset int + locoffset int +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequence of occurance of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.iterator.postings.dictionary.segment.freqs[p.iterator.postings.postingsID-1][p.offset] +} + +// Norm returns the normalization factor for 
this posting +func (p *Posting) Norm() float64 { + return float64(p.iterator.postings.dictionary.segment.norms[p.iterator.postings.postingsID-1][p.offset]) +} + +// Locations returns the location information for each occurance +func (p *Posting) Locations() []segment.Location { + if !p.iterator.postings.dictionary.segment.fieldsLoc[p.iterator.postings.dictionary.fieldID] { + return nil + } + freq := int(p.Frequency()) + rv := make([]segment.Location, freq) + for i := 0; i < freq; i++ { + rv[i] = &Location{ + p: p, + offset: p.locoffset + i, + } + } + return rv +} + +// Location represents the location of a single occurance +type Location struct { + p *Posting + offset int +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.p.iterator.postings.dictionary.segment.fieldsInv[l.p.iterator.postings.dictionary.segment.locfields[l.p.iterator.postings.postingsID-1][l.offset]] +} + +// Start returns the start byte offset of this occurance +func (l *Location) Start() uint64 { + return l.p.iterator.postings.dictionary.segment.locstarts[l.p.iterator.postings.postingsID-1][l.offset] +} + +// End returns the end byte offset of this occurance +func (l *Location) End() uint64 { + return l.p.iterator.postings.dictionary.segment.locends[l.p.iterator.postings.postingsID-1][l.offset] +} + +// Pos returns the 1-based phrase position of this occurance +func (l *Location) Pos() uint64 { + return l.p.iterator.postings.dictionary.segment.locpos[l.p.iterator.postings.postingsID-1][l.offset] +} + +// ArrayPositions returns the array position vector associated with this occurance +func (l *Location) ArrayPositions() []uint64 { + return l.p.iterator.postings.dictionary.segment.locarraypos[l.p.iterator.postings.postingsID-1][l.offset] +} diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go new file mode 100644 index 000000000..fdec1b2e6 --- 
/dev/null +++ b/index/scorch/segment/mem/segment.go @@ -0,0 +1,132 @@ +package mem + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// KNOWN ISSUES +// - LIMITATION - we decided whether or not to store term vectors for a field +// at the segment level, based on the first definition of a +// field we see. in normal bleve usage this is fine, all +// instances of a field definition will be the same. however, +// advanced users may violate this and provide unique field +// definitions with each document. this segment does not +// support this usage. + +// TODO +// - need better testing of multiple docs, iterating freqs, locations and +// and verifying the correct results are returned +// - need tests for term dictionary iteration + +// Segment is an in memory implementation of scorch.Segment +type Segment struct { + + // fields name -> id+1 + fields map[string]uint16 + // fields id -> name + fieldsInv []string + // field id -> has location info + fieldsLoc []bool + + // term dictionary + // field id -> term -> posting id + 1 + dicts []map[string]uint64 + + // term dictionary keys + // field id -> []dictionary keys + dictKeys [][]string + + // postings list + // postings list id -> postings bitmap + postings []*roaring.Bitmap + + // term frequencies + // postings list id -> freqs (one for each hit in bitmap) + freqs [][]uint64 + + // field norms + // postings list id -> norms (one for each hit in bitmap) + norms [][]float32 + + // field/start/end/pos/locarraypos + // postings list id -> start/end/pos/locarraypos (one for each freq) + locfields [][]uint16 + locstarts [][]uint64 + locends [][]uint64 + locpos [][]uint64 + locarraypos [][][]uint64 + + // stored field values + // docNum -> field id -> slice of values (each value []byte) + stored []map[uint16][][]byte + + // stored field types + // docNum -> field id -> slice of types (each type byte) + storedTypes []map[uint16][]byte + + // stored field array 
positions + // docNum -> field id -> slice of array positions (each is []uint64) + storedPos []map[uint16][][]uint64 +} + +// New builds a new empty Segment +func New() *Segment { + return &Segment{ + fields: map[string]uint16{}, + } +} + +// Fields returns the field names used in this segment +func (s *Segment) Fields() []string { + return s.fieldsInv +} + +// VisitDocument invokes the DocFieldValueVistor for each stored field +// for the specified doc number +func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + // ensure document number exists + if int(num) > len(s.stored)-1 { + return nil + } + docFields := s.stored[int(num)] + for field, values := range docFields { + for i, value := range values { + keepGoing := visitor(s.fieldsInv[field], s.storedTypes[int(num)][field][i], value, s.storedPos[int(num)][field][i]) + if !keepGoing { + return nil + } + } + } + return nil +} + +// Dictionary returns the term dictionary for the specified field +func (s *Segment) Dictionary(field string) segment.TermDictionary { + return &Dictionary{ + segment: s, + field: field, + fieldID: uint16(s.getOrDefineField(field, false)), + } +} + +// Count returns the number of documents in this segment +// (this has no notion of deleted docs) +func (s *Segment) Count() uint64 { + return uint64(len(s.stored)) +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { + + idDictionary := s.dicts[s.getOrDefineField("_id", false)] + rv := roaring.New() + for _, id := range ids { + postingID := idDictionary[id] + if postingID > 0 { + rv.Or(s.postings[postingID-1]) + } + } + return rv +} diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go new file mode 100644 index 000000000..bc840e147 --- /dev/null +++ b/index/scorch/segment/mem/segment_test.go @@ -0,0 +1,521 @@ +package mem + +import ( + "math" + 
"testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestEmpty(t *testing.T) { + + emptySegment := New() + + if emptySegment.Count() != 0 { + t.Errorf("expected count 0, got %d", emptySegment.Count()) + } + + dict := emptySegment.Dictionary("name") + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList := dict.PostingsList("marty", nil) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting := postingsItr.Next() + for nextPosting != nil { + count++ + nextPosting = postingsItr.Next() + } + + if count != 0 { + t.Errorf("expected count to be 0, got %d", count) + } + + // now try and visit a document + emptySegment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + t.Errorf("document visitor called, not expected") + return true + }) +} + +func TestSingle(t *testing.T) { + + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, nil), + }, + } + + // forge analyzed docs + results := 
[]*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, nil, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := NewFromAnalyzedDocs(results) + if segment == nil { + t.Fatalf("segment nil, not expected") + } + + if segment.Count() != 1 { + t.Errorf("expected count 1, got %d", segment.Count()) + } + + // check the _id field + dict := segment.Dictionary("_id") + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList := dict.PostingsList("a", nil) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting := postingsItr.Next() + for nextPosting != nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + 
} + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + + nextPosting = postingsItr.Next() + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the name field + dict = segment.Dictionary("name") + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList = dict.PostingsList("wow", nil) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting = postingsItr.Next() + for nextPosting != nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + for _, loc := range nextPosting.Locations() { + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + + nextPosting = postingsItr.Next() + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the _all field (composite) + dict = segment.Dictionary("_all") + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList = dict.PostingsList("wow", nil) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got 
nil iterator, expected non-nil") + } + + count = 0 + nextPosting = postingsItr.Next() + for nextPosting != nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + expectedNorm := float32(1.0 / math.Sqrt(float64(6))) + if nextPosting.Norm() != float64(expectedNorm) { + t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) + } + for _, loc := range nextPosting.Locations() { + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + + nextPosting = postingsItr.Next() + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // now try and visit a document + var fieldValuesSeen int + segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldValuesSeen++ + return true + }) + if fieldValuesSeen != 5 { + t.Errorf("expected 5 field values, got %d", fieldValuesSeen) + } + +} + +func TestMultiple(t *testing.T) { + + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + 
document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, nil), + }, + } + + doc2 := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, nil), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + 
Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, nil, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("who"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, nil, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := NewFromAnalyzedDocs(results) + if segment == nil { + t.Fatalf("segment nil, not expected") + } + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + // check the desc field + dict := segment.Dictionary("desc") + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList := dict.PostingsList("thing", nil) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting := postingsItr.Next() + for nextPosting != 
nil { + count++ + nextPosting = postingsItr.Next() + } + + if count != 2 { + t.Errorf("expected count to be 2, got %d", count) + } + + // get docnum of a + exclude := segment.DocNumbers([]string{"a"}) + + // look for term 'thing' excluding doc 'a' + postingsListExcluding := dict.PostingsList("thing", exclude) + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItrExcluding := postingsListExcluding.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting = postingsItrExcluding.Next() + for nextPosting != nil { + count++ + nextPosting = postingsItrExcluding.Next() + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + +} diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go new file mode 100644 index 000000000..5cd3d5d7f --- /dev/null +++ b/index/scorch/segment/segment.go @@ -0,0 +1,66 @@ +package segment + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" +) + +// DocumentFieldValueVisitor defines a callback to be visited for each +// stored field value. The return value determines if the visitor +// should keep going. Returning true continues visiting, false stops. 
+type DocumentFieldValueVisitor func(field string, typ byte, value []byte, pos []uint64) bool + +type Segment interface { + Dictionary(field string) TermDictionary + + VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error + Count() uint64 + + DocNumbers([]string) *roaring.Bitmap + + Fields() []string +} + +type TermDictionary interface { + PostingsList(term string, except *roaring.Bitmap) PostingsList + + Iterator() DictionaryIterator + PrefixIterator(prefix string) DictionaryIterator + RangeIterator(start, end string) DictionaryIterator +} + +type DictionaryIterator interface { + Next() (*index.DictEntry, error) +} + +type PostingsList interface { + Iterator() PostingsIterator + + Count() uint64 + + // NOTE deferred for future work + + // And(other PostingsList) PostingsList + // Or(other PostingsList) PostingsList +} + +type PostingsIterator interface { + Next() Posting +} + +type Posting interface { + Number() uint64 + + Frequency() uint64 + Norm() float64 + + Locations() []Location +} + +type Location interface { + Field() string + Start() uint64 + End() uint64 + Pos() uint64 + ArrayPositions() []uint64 +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go new file mode 100644 index 000000000..a33cc58e1 --- /dev/null +++ b/index/scorch/snapshot_index.go @@ -0,0 +1,300 @@ +package scorch + +import ( + "bytes" + "container/heap" + "encoding/binary" + "fmt" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type IndexSnapshot struct { + segment []*SegmentSnapshot + offsets []uint64 + internal map[string][]byte +} + +func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { + + results := make(chan segment.DictionaryIterator) + for index, segment := range i.segment { + go func(index int, 
segment *SegmentSnapshot) { + dict := segment.Dictionary(field) + results <- makeItr(dict) + }(index, segment) + } + + rv := &IndexSnapshotFieldDict{ + snapshot: i, + cursors: make([]*segmentDictCursor, 0, len(i.segment)), + } + for count := 0; count < len(i.segment); count++ { + di := <-results + next, err := di.Next() + if err != nil { + return nil, err + } + if next != nil { + rv.cursors = append(rv.cursors, &segmentDictCursor{ + itr: di, + curr: next, + }) + } + } + // prepare heap + heap.Init(rv) + + return rv, nil +} + +func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.Iterator() + }) +} + +func (i *IndexSnapshot) FieldDictRange(field string, startTerm []byte, + endTerm []byte) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.RangeIterator(string(startTerm), string(endTerm)) + }) +} + +func (i *IndexSnapshot) FieldDictPrefix(field string, + termPrefix []byte) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.PrefixIterator(string(termPrefix)) + }) +} + +func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { + + type segmentDocNumsResult struct { + index int + docs *roaring.Bitmap + } + + results := make(chan *segmentDocNumsResult) + for index, segment := range i.segment { + go func(index int, segment *SegmentSnapshot) { + docnums := roaring.NewBitmap() + docnums.AddRange(0, segment.Count()) + results <- &segmentDocNumsResult{ + index: index, + docs: docnums, + } + }(index, segment) + } + + rv := &IndexSnapshotDocIDReader{ + snapshot: i, + iterators: make([]roaring.IntIterable, len(i.segment)), + } + for count := 0; count < len(i.segment); count++ { + sdnr := <-results + rv.iterators[sdnr.index] = sdnr.docs.Iterator() + } + + 
return rv, nil +} + +func (i *IndexSnapshot) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { + + type segmentDocNumsResult struct { + index int + docs *roaring.Bitmap + } + + results := make(chan *segmentDocNumsResult) + for index, segment := range i.segment { + go func(index int, segment *SegmentSnapshot) { + docnums := segment.DocNumbers(ids) + results <- &segmentDocNumsResult{ + index: index, + docs: docnums, + } + }(index, segment) + } + + rv := &IndexSnapshotDocIDReader{ + snapshot: i, + iterators: make([]roaring.IntIterable, len(i.segment)), + } + for count := 0; count < len(i.segment); count++ { + sdnr := <-results + rv.iterators[count] = sdnr.docs.Iterator() + } + + return rv, nil +} + +func (i *IndexSnapshot) Fields() ([]string, error) { + // FIXME not making this concurrent for now as it's not used in hot path + // of any searches at the moment (just a debug aid) + fieldsMap := map[string]struct{}{} + for _, segment := range i.segment { + fields := segment.Fields() + for _, field := range fields { + fieldsMap[field] = struct{}{} + } + } + rv := make([]string, 0, len(fieldsMap)) + for k := range fieldsMap { + rv = append(rv, k) + } + return rv, nil +} + +func (i *IndexSnapshot) GetInternal(key []byte) ([]byte, error) { + return i.internal[string(key)], nil +} + +func (i *IndexSnapshot) DocCount() (uint64, error) { + var rv uint64 + for _, segment := range i.segment { + rv += segment.Count() + } + return rv, nil +} + +func (i *IndexSnapshot) Document(id string) (*document.Document, error) { + // FIXME could be done more efficiently directly, but reusing for simplicity + tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) + if err != nil { + return nil, err + } + defer tfr.Close() + + next, err := tfr.Next(nil) + if err != nil { + return nil, err + } + + docNum := docInternalToNumber(next.ID) + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) + + rv := document.NewDocument(id) + 
i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { + switch typ { + case 't': + rv.AddField(document.NewTextField(name, pos, value)) + case 'n': + rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) + case 'd': + rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) + case 'b': + rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) + case 'g': + rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) + } + + return true + }) + + return rv, nil +} + +func (i *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int, uint64) { + var segmentIndex uint64 + for j := 1; j < len(i.offsets); j++ { + if docNum >= i.offsets[j] { + segmentIndex = uint64(j) + } else { + break + } + } + + localDocNum := docNum - i.offsets[segmentIndex] + return int(segmentIndex), localDocNum +} + +func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { + docNum := docInternalToNumber(id) + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) + + var found bool + var rv string + i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { + if field == "_id" { + found = true + rv = string(value) + return false + } + return true + }) + + if found { + return rv, nil + } + return "", fmt.Errorf("document number %d not found", docNum) +} + +func (i *IndexSnapshot) InternalID(id string) (index.IndexInternalID, error) { + // FIXME could be done more efficiently directly, but reusing for simplicity + tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) + if err != nil { + return nil, err + } + defer tfr.Close() + + next, err := tfr.Next(nil) + if err != nil { + return nil, err + } + + return next.ID, nil +} + +func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, + includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { + + 
type segmentPostingResult struct { + index int + postings segment.PostingsList + } + + results := make(chan *segmentPostingResult) + for index, segment := range i.segment { + go func(index int, segment *SegmentSnapshot) { + dict := segment.Dictionary(field) + pl := dict.PostingsList(string(term), nil) + results <- &segmentPostingResult{ + index: index, + postings: pl, + } + }(index, segment) + } + + rv := &IndexSnapshotTermFieldReader{ + snapshot: i, + postings: make([]segment.PostingsList, len(i.segment)), + iterators: make([]segment.PostingsIterator, len(i.segment)), + includeFreq: includeFreq, + includeNorm: includeNorm, + includeTermVectors: includeTermVectors, + } + for count := 0; count < len(i.segment); count++ { + spr := <-results + rv.postings[spr.index] = spr.postings + rv.iterators[spr.index] = spr.postings.Iterator() + } + + return rv, nil +} + +func docNumberToBytes(in uint64) []byte { + + buf := new(bytes.Buffer) + _ = binary.Write(buf, binary.BigEndian, in) + return buf.Bytes() +} + +func docInternalToNumber(in index.IndexInternalID) uint64 { + var res uint64 + binary.Read(bytes.NewReader(in), binary.BigEndian, &res) + return res +} diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go new file mode 100644 index 000000000..443e401e6 --- /dev/null +++ b/index/scorch/snapshot_index_dict.go @@ -0,0 +1,78 @@ +package scorch + +import ( + "container/heap" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type segmentDictCursor struct { + itr segment.DictionaryIterator + curr *index.DictEntry +} + +type IndexSnapshotFieldDict struct { + snapshot *IndexSnapshot + cursors []*segmentDictCursor +} + +func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } +func (i *IndexSnapshotFieldDict) Less(a, b int) bool { + return i.cursors[a].curr.Term < i.cursors[b].curr.Term +} +func (i *IndexSnapshotFieldDict) Swap(a, b int) { + i.cursors[a], i.cursors[b] = i.cursors[b], 
i.cursors[a] +} + +func (i *IndexSnapshotFieldDict) Push(x interface{}) { + i.cursors = append(i.cursors, x.(*segmentDictCursor)) +} + +func (i *IndexSnapshotFieldDict) Pop() interface{} { + n := len(i.cursors) + x := i.cursors[n-1] + i.cursors = i.cursors[0 : n-1] + return x +} + +func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { + if len(i.cursors) <= 0 { + return nil, nil + } + rv := i.cursors[0].curr + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = next + heap.Fix(i, 0) + } + // look for any other entries with the exact same term + for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { + rv.Count += i.cursors[0].curr.Count + next, err := i.cursors[0].itr.Next() + if err != nil { + return nil, err + } + if next == nil { + // at end of this cursor, remove it + heap.Pop(i) + } else { + // modified heap, fix it + i.cursors[0].curr = next + heap.Fix(i, 0) + } + } + + return rv, nil +} + +func (i *IndexSnapshotFieldDict) Close() error { + return nil +} diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go new file mode 100644 index 000000000..2b1144874 --- /dev/null +++ b/index/scorch/snapshot_index_doc.go @@ -0,0 +1,53 @@ +package scorch + +import ( + "bytes" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" +) + +type IndexSnapshotDocIDReader struct { + snapshot *IndexSnapshot + iterators []roaring.IntIterable + segmentOffset int +} + +func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { + for i.segmentOffset < len(i.iterators) { + if !i.iterators[i.segmentOffset].HasNext() { + i.segmentOffset++ + continue + } + next := i.iterators[i.segmentOffset].Next() + // make segment number into global number by adding offset + globalOffset := i.snapshot.offsets[i.segmentOffset] + return 
docNumberToBytes(uint64(next) + globalOffset), nil + } + return nil, nil +} + +func (i *IndexSnapshotDocIDReader) Advance(ID index.IndexInternalID) (index.IndexInternalID, error) { + // FIXME do something better + next, err := i.Next() + if err != nil { + return nil, err + } + if next == nil { + return nil, nil + } + for bytes.Compare(next, ID) < 0 { + next, err = i.Next() + if err != nil { + return nil, err + } + if next == nil { + break + } + } + return next, nil +} + +func (i *IndexSnapshotDocIDReader) Close() error { + return nil +} diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go new file mode 100644 index 000000000..44172f3d0 --- /dev/null +++ b/index/scorch/snapshot_index_tfr.go @@ -0,0 +1,91 @@ +package scorch + +import ( + "bytes" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type IndexSnapshotTermFieldReader struct { + snapshot *IndexSnapshot + postings []segment.PostingsList + iterators []segment.PostingsIterator + segmentOffset int + includeFreq bool + includeNorm bool + includeTermVectors bool +} + +func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { + rv := preAlloced + if rv == nil { + rv = &index.TermFieldDoc{} + } + // find the next hit + for i.segmentOffset < len(i.postings) { + next := i.iterators[i.segmentOffset].Next() + if next != nil { + // make segment number into global number by adding offset + globalOffset := i.snapshot.offsets[i.segmentOffset] + nnum := next.Number() + rv.ID = docNumberToBytes(nnum + globalOffset) + if i.includeFreq { + rv.Freq = next.Frequency() + } + if i.includeNorm { + rv.Norm = next.Norm() + } + if i.includeTermVectors { + locs := next.Locations() + rv.Vectors = make([]*index.TermFieldVector, len(locs)) + for i, loc := range locs { + rv.Vectors[i] = &index.TermFieldVector{ + Start: loc.Start(), + End: loc.End(), + Pos: loc.Pos(), + ArrayPositions: loc.ArrayPositions(), + 
Field: loc.Field(), + } + } + } + + return rv, nil + } + i.segmentOffset++ + } + return nil, nil +} + +func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { + // FIXME do something better + next, err := i.Next(preAlloced) + if err != nil { + return nil, err + } + if next == nil { + return nil, nil + } + for bytes.Compare(next.ID, ID) < 0 { + next, err = i.Next(preAlloced) + if err != nil { + return nil, err + } + if next == nil { + break + } + } + return next, nil +} + +func (i *IndexSnapshotTermFieldReader) Count() uint64 { + var rv uint64 + for _, posting := range i.postings { + rv += posting.Count() + } + return rv +} + +func (i *IndexSnapshotTermFieldReader) Close() error { + return nil +} diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go new file mode 100644 index 000000000..b32725e96 --- /dev/null +++ b/index/scorch/snapshot_segment.go @@ -0,0 +1,64 @@ +package scorch + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" +) + +type SegmentDictionarySnapshot struct { + s *SegmentSnapshot + d segment.TermDictionary +} + +func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) segment.PostingsList { + return s.d.PostingsList(term, s.s.deleted) +} + +func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { + return s.d.Iterator() +} + +func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator { + return s.d.PrefixIterator(prefix) +} + +func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator { + return s.d.RangeIterator(start, end) +} + +type SegmentSnapshot struct { + id uint64 + segment segment.Segment + deleted *roaring.Bitmap +} + +func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + return s.segment.VisitDocument(num, visitor) 
+} + +func (s *SegmentSnapshot) Count() uint64 { + rv := s.segment.Count() + if s.deleted != nil { + rv -= s.deleted.GetCardinality() + } + return rv +} + +func (s *SegmentSnapshot) Dictionary(field string) segment.TermDictionary { + return &SegmentDictionarySnapshot{ + s: s, + d: s.segment.Dictionary(field), + } +} + +func (s *SegmentSnapshot) DocNumbers(docIDs []string) *roaring.Bitmap { + rv := s.segment.DocNumbers(docIDs) + if s.deleted != nil { + rv.AndNot(s.deleted) + } + return rv +} + +func (s *SegmentSnapshot) Fields() []string { + return s.segment.Fields() +} diff --git a/index/scorch/stats.go b/index/scorch/stats.go new file mode 100644 index 000000000..f49c8178c --- /dev/null +++ b/index/scorch/stats.go @@ -0,0 +1,33 @@ +package scorch + +import ( + "encoding/json" + "sync/atomic" +) + +// Stats tracks statistics about the index +type Stats struct { + analysisTime, indexTime uint64 +} + +// FIXME wire up these other stats again +func (s *Stats) statsMap() map[string]interface{} { + m := map[string]interface{}{} + // m["updates"] = atomic.LoadUint64(&i.updates) + // m["deletes"] = atomic.LoadUint64(&i.deletes) + // m["batches"] = atomic.LoadUint64(&i.batches) + // m["errors"] = atomic.LoadUint64(&i.errors) + m["analysis_time"] = atomic.LoadUint64(&s.analysisTime) + m["index_time"] = atomic.LoadUint64(&s.indexTime) + // m["term_searchers_started"] = atomic.LoadUint64(&i.termSearchersStarted) + // m["term_searchers_finished"] = atomic.LoadUint64(&i.termSearchersFinished) + // m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&i.numPlainTextBytesIndexed) + + return m +} + +// MarshalJSON implements json.Marshaler +func (s *Stats) MarshalJSON() ([]byte, error) { + m := s.statsMap() + return json.Marshal(m) +} diff --git a/mapping/index.go b/mapping/index.go index 737f26ffe..cefa59803 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -339,7 +339,7 @@ func (im *IndexMappingImpl) newWalkContext(doc *document.Document, dm *DocumentM doc: doc, im: im, 
dm: dm, - excludedFromAll: []string{}, + excludedFromAll: []string{"_id"}, } } From 848aca4639566905f79f39f688a784f1424185cf Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 29 Nov 2017 13:34:15 -0500 Subject: [PATCH 005/728] fix issues identified by errcheck --- index/scorch/scorch.go | 4 +--- index/scorch/segment/mem/segment_test.go | 10 ++++++-- index/scorch/snapshot_index.go | 30 +++++++++++++++++------- 3 files changed, 31 insertions(+), 13 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d20261f5e..1f99b42fe 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -121,9 +121,7 @@ func (s *Scorch) Batch(batch *index.Batch) error { } else { newSegment = mem.New() } - s.prepareSegment(newSegment, ids, batch.InternalOps) - - return nil + return s.prepareSegment(newSegment, ids, batch.InternalOps) } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index bc840e147..4c056d7ad 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -44,10 +44,13 @@ func TestEmpty(t *testing.T) { } // now try and visit a document - emptySegment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + err := emptySegment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { t.Errorf("document visitor called, not expected") return true }) + if err != nil { + t.Fatal(err) + } } func TestSingle(t *testing.T) { @@ -288,10 +291,13 @@ func TestSingle(t *testing.T) { // now try and visit a document var fieldValuesSeen int - segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + err := segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { fieldValuesSeen++ return true }) + if err != nil { + t.Fatal(err) + } if fieldValuesSeen != 5 { t.Errorf("expected 5 
field values, got %d", fieldValuesSeen) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index a33cc58e1..8c6ea1aaf 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -161,13 +161,17 @@ func (i *IndexSnapshot) DocCount() (uint64, error) { return rv, nil } -func (i *IndexSnapshot) Document(id string) (*document.Document, error) { +func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { // FIXME could be done more efficiently directly, but reusing for simplicity tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) if err != nil { return nil, err } - defer tfr.Close() + defer func() { + if cerr := tfr.Close(); err == nil && cerr != nil { + err = cerr + } + }() next, err := tfr.Next(nil) if err != nil { @@ -177,8 +181,8 @@ func (i *IndexSnapshot) Document(id string) (*document.Document, error) { docNum := docInternalToNumber(next.ID) segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) - rv := document.NewDocument(id) - i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { + rv = document.NewDocument(id) + err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { switch typ { case 't': rv.AddField(document.NewTextField(name, pos, value)) @@ -194,6 +198,9 @@ func (i *IndexSnapshot) Document(id string) (*document.Document, error) { return true }) + if err != nil { + return nil, err + } return rv, nil } @@ -218,7 +225,7 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { var found bool var rv string - i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { + err := i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { if field == "_id" { found = true rv = string(value) @@ -226,6 
+233,9 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } return true }) + if err != nil { + return "", err + } if found { return rv, nil @@ -233,13 +243,17 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { return "", fmt.Errorf("document number %d not found", docNum) } -func (i *IndexSnapshot) InternalID(id string) (index.IndexInternalID, error) { +func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { // FIXME could be done more efficiently directly, but reusing for simplicity tfr, err := i.TermFieldReader([]byte(id), "_id", false, false, false) if err != nil { return nil, err } - defer tfr.Close() + defer func() { + if cerr := tfr.Close(); err == nil && cerr != nil { + err = cerr + } + }() next, err := tfr.Next(nil) if err != nil { @@ -295,6 +309,6 @@ func docNumberToBytes(in uint64) []byte { func docInternalToNumber(in index.IndexInternalID) uint64 { var res uint64 - binary.Read(bytes.NewReader(in), binary.BigEndian, &res) + _ = binary.Read(bytes.NewReader(in), binary.BigEndian, &res) return res } From 67986d41bfab5634bc11d81f7a02bd7279782476 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 30 Nov 2017 08:36:01 -0800 Subject: [PATCH 006/728] scorch InternalID() handles case of unknown docId --- index/scorch/reader_test.go | 8 ++++++++ index/scorch/snapshot_index.go | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index a050bb44a..f673b198b 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -93,6 +93,14 @@ func TestIndexReader(t *testing.T) { t.Errorf("count was 2, but only saw %d", actualCount) } + internalIDBogus, err := indexReader.InternalID("a-bogus-docId") + if err != nil { + t.Fatal(err) + } + if internalIDBogus != nil { + t.Errorf("expected bogus docId to have nil InternalID") + } + internalID2, err := indexReader.InternalID("2") if err != nil { 
t.Fatal(err) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 8c6ea1aaf..11fc063c5 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -256,7 +256,7 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err }() next, err := tfr.Next(nil) - if err != nil { + if err != nil || next == nil { return nil, err } From 398dcb19b3c7bc96e74f42a838784307715a92da Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 30 Nov 2017 10:37:02 -0800 Subject: [PATCH 007/728] scorch introducer uses the roaring.Or(x, y) API Instead of cloning an input bitmap, the roaring.Or(x, y) implementation fills a brand new result bitmap, which should be allow for more efficient packing and memory utilization. --- index/scorch/introducer.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index dc748ad85..37a66dc84 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -49,8 +49,7 @@ func (s *Scorch) mainLoop() { if s.root.segment[i].deleted == nil { newSnapshot.segment[i].deleted = delta } else { - newSnapshot.segment[i].deleted = s.root.segment[i].deleted.Clone() - newSnapshot.segment[i].deleted.Or(delta) + newSnapshot.segment[i].deleted = roaring.Or(s.root.segment[i].deleted, delta) } newSnapshot.offsets[i] = running From 395458ce830fc0b86963a37b8728b03be594c65c Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 07:26:47 -0500 Subject: [PATCH 008/728] refactor to make mem segment contents exported --- index/scorch/segment/mem/build.go | 72 ++++++++++++++--------------- index/scorch/segment/mem/dict.go | 14 +++--- index/scorch/segment/mem/posting.go | 24 +++++----- index/scorch/segment/mem/segment.go | 62 ++++++++++++------------- 4 files changed, 86 insertions(+), 86 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index ff6fb6056..5154c182f 100644 --- 
a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -23,7 +23,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { } // go back and sort the dictKeys - for _, dict := range s.dictKeys { + for _, dict := range s.DictKeys { sort.Strings(dict) } @@ -81,9 +81,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { - s.stored[docNum][field] = append(s.stored[docNum][field], val) - s.storedTypes[docNum][field] = append(s.storedTypes[docNum][field], typ) - s.storedPos[docNum][field] = append(s.storedPos[docNum][field], pos) + s.Stored[docNum][field] = append(s.Stored[docNum][field], val) + s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) + s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) } // walk each composite field @@ -107,7 +107,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { for term, tokenFreq := range tokenFrequencies { - fieldTermPostings := s.dicts[fieldID][term] + fieldTermPostings := s.Dicts[fieldID][term] // FIXME this if/else block has duplicate code that has resulted in // bugs fixed/missed more than once, need to refactor @@ -116,12 +116,12 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { bs := roaring.New() bs.AddInt(int(docNum)) - newPostingID := uint64(len(s.postings) + 1) + newPostingID := uint64(len(s.Postings) + 1) // add this new bitset to the postings slice - s.postings = append(s.postings, bs) + s.Postings = append(s.Postings, bs) // add this to the details slice - s.freqs = append(s.freqs, []uint64{uint64(tokenFreq.Frequency())}) - s.norms = append(s.norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))}) + s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())}) + s.Norms = 
append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))}) // add to locations var locfields []uint16 var locstarts []uint64 @@ -143,35 +143,35 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { locarraypos = append(locarraypos, nil) } } - s.locfields = append(s.locfields, locfields) - s.locstarts = append(s.locstarts, locstarts) - s.locends = append(s.locends, locends) - s.locpos = append(s.locpos, locpos) - s.locarraypos = append(s.locarraypos, locarraypos) + s.Locfields = append(s.Locfields, locfields) + s.Locstarts = append(s.Locstarts, locstarts) + s.Locends = append(s.Locends, locends) + s.Locpos = append(s.Locpos, locpos) + s.Locarraypos = append(s.Locarraypos, locarraypos) // record it - s.dicts[fieldID][term] = newPostingID + s.Dicts[fieldID][term] = newPostingID // this term was new for this field, add it to dictKeys - s.dictKeys[fieldID] = append(s.dictKeys[fieldID], term) + s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) } else { // posting already started for this field/term // the actual offset is - 1, because 0 is zero value - bs := s.postings[fieldTermPostings-1] + bs := s.Postings[fieldTermPostings-1] bs.AddInt(int(docNum)) - s.freqs[fieldTermPostings-1] = append(s.freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency())) - s.norms[fieldTermPostings-1] = append(s.norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency())) + s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) for _, loc := range tokenFreq.Locations { var locf = fieldID if loc.Field != "" { locf = uint16(s.getOrDefineField(loc.Field, false)) } - s.locfields[fieldTermPostings-1] = append(s.locfields[fieldTermPostings-1], locf) - s.locstarts[fieldTermPostings-1] = append(s.locstarts[fieldTermPostings-1], uint64(loc.Start)) - 
s.locends[fieldTermPostings-1] = append(s.locends[fieldTermPostings-1], uint64(loc.End)) - s.locpos[fieldTermPostings-1] = append(s.locpos[fieldTermPostings-1], uint64(loc.Position)) + s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf) + s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start)) + s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End)) + s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position)) if len(loc.ArrayPositions) > 0 { - s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], loc.ArrayPositions) + s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions) } else { - s.locarraypos[fieldTermPostings-1] = append(s.locarraypos[fieldTermPostings-1], nil) + s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil) } } } @@ -180,23 +180,23 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } func (s *Segment) getOrDefineField(name string, hasLoc bool) int { - fieldID, ok := s.fields[name] + fieldID, ok := s.FieldsMap[name] if !ok { - fieldID = uint16(len(s.fieldsInv) + 1) - s.fields[name] = fieldID - s.fieldsInv = append(s.fieldsInv, name) - s.fieldsLoc = append(s.fieldsLoc, hasLoc) - s.dicts = append(s.dicts, make(map[string]uint64)) - s.dictKeys = append(s.dictKeys, make([]string, 0)) + fieldID = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[name] = fieldID + s.FieldsInv = append(s.FieldsInv, name) + s.FieldsLoc = append(s.FieldsLoc, hasLoc) + s.Dicts = append(s.Dicts, make(map[string]uint64)) + s.DictKeys = append(s.DictKeys, make([]string, 0)) } return int(fieldID - 1) } func (s *Segment) addDocument() int { - docNum := len(s.stored) - s.stored = append(s.stored, map[uint16][][]byte{}) - s.storedTypes = append(s.storedTypes, map[uint16][]byte{}) - s.storedPos = append(s.storedPos, map[uint16][][]uint64{}) + 
docNum := len(s.Stored) + s.Stored = append(s.Stored, map[uint16][][]byte{}) + s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{}) + s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{}) return docNum } diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index a7d609f6d..c724493da 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -21,7 +21,7 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) segment.P return &PostingsList{ dictionary: d, term: term, - postingsID: d.segment.dicts[d.fieldID][term], + postingsID: d.segment.Dicts[d.fieldID][term], except: except, } } @@ -36,7 +36,7 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { // PrefixIterator returns an iterator which only visits terms having the // the specified prefix func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], prefix) + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix) return &DictionaryIterator{ d: d, prefix: prefix, @@ -47,7 +47,7 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { // RangeIterator returns an iterator which only visits terms between the // start and end terms. NOTE: bleve.index API specifies the end is inclusive. 
func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.dictKeys[d.fieldID], start) + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start) return &DictionaryIterator{ d: d, offset: offset, @@ -65,10 +65,10 @@ type DictionaryIterator struct { // Next returns the next entry in the dictionary func (d *DictionaryIterator) Next() (*index.DictEntry, error) { - if d.offset > len(d.d.segment.dictKeys[d.d.fieldID])-1 { + if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { return nil, nil } - next := d.d.segment.dictKeys[d.d.fieldID][d.offset] + next := d.d.segment.DictKeys[d.d.fieldID][d.offset] // check prefix if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { return nil, nil @@ -79,9 +79,9 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) { } d.offset++ - postingID := d.d.segment.dicts[d.d.fieldID][next] + postingID := d.d.segment.Dicts[d.d.fieldID][next] return &index.DictEntry{ Term: next, - Count: d.d.segment.postings[postingID-1].GetCardinality(), + Count: d.d.segment.Postings[postingID-1].GetCardinality(), }, nil } diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index ac8b2fb72..fa1c0a00f 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -17,7 +17,7 @@ type PostingsList struct { func (p *PostingsList) Count() uint64 { var rv uint64 if p.postingsID > 0 { - rv = p.dictionary.segment.postings[p.postingsID-1].GetCardinality() + rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality() if p.except != nil { except := p.except.GetCardinality() if except > rv { @@ -36,7 +36,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { postings: p, } if p.postingsID > 0 { - allbits := p.dictionary.segment.postings[p.postingsID-1] + allbits := p.dictionary.segment.Postings[p.postingsID-1] rv.all = allbits.Iterator() if p.except != nil { allExcept := allbits.Clone() @@ -72,7 
+72,7 @@ func (i *PostingsIterator) Next() segment.Posting { // if they don't match, adjust offsets to factor in item we're skipping over // incr the all iterator, and check again for allN != n { - i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset]) + i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) i.offset++ allN = i.all.Next() } @@ -83,7 +83,7 @@ func (i *PostingsIterator) Next() segment.Posting { locoffset: i.locoffset, } - i.locoffset += int(i.postings.dictionary.segment.freqs[i.postings.postingsID-1][i.offset]) + i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) i.offset++ return rv } @@ -103,17 +103,17 @@ func (p *Posting) Number() uint64 { // Frequency returns the frequence of occurance of this term in this doc/field func (p *Posting) Frequency() uint64 { - return p.iterator.postings.dictionary.segment.freqs[p.iterator.postings.postingsID-1][p.offset] + return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset] } // Norm returns the normalization factor for this posting func (p *Posting) Norm() float64 { - return float64(p.iterator.postings.dictionary.segment.norms[p.iterator.postings.postingsID-1][p.offset]) + return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset]) } // Locations returns the location information for each occurance func (p *Posting) Locations() []segment.Location { - if !p.iterator.postings.dictionary.segment.fieldsLoc[p.iterator.postings.dictionary.fieldID] { + if !p.iterator.postings.dictionary.segment.FieldsLoc[p.iterator.postings.dictionary.fieldID] { return nil } freq := int(p.Frequency()) @@ -136,25 +136,25 @@ type Location struct { // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { - return 
l.p.iterator.postings.dictionary.segment.fieldsInv[l.p.iterator.postings.dictionary.segment.locfields[l.p.iterator.postings.postingsID-1][l.offset]] + return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]] } // Start returns the start byte offset of this occurance func (l *Location) Start() uint64 { - return l.p.iterator.postings.dictionary.segment.locstarts[l.p.iterator.postings.postingsID-1][l.offset] + return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset] } // End returns the end byte offset of this occurance func (l *Location) End() uint64 { - return l.p.iterator.postings.dictionary.segment.locends[l.p.iterator.postings.postingsID-1][l.offset] + return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset] } // Pos returns the 1-based phrase position of this occurance func (l *Location) Pos() uint64 { - return l.p.iterator.postings.dictionary.segment.locpos[l.p.iterator.postings.postingsID-1][l.offset] + return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset] } // ArrayPositions returns the array position vector associated with this occurance func (l *Location) ArrayPositions() []uint64 { - return l.p.iterator.postings.dictionary.segment.locarraypos[l.p.iterator.postings.postingsID-1][l.offset] + return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset] } diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index fdec1b2e6..fe71f17c6 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -22,77 +22,77 @@ import ( // Segment is an in memory implementation of scorch.Segment type Segment struct { - // fields name -> id+1 - fields map[string]uint16 + // FieldsMap name -> id+1 + FieldsMap map[string]uint16 // fields 
id -> name - fieldsInv []string + FieldsInv []string // field id -> has location info - fieldsLoc []bool + FieldsLoc []bool // term dictionary // field id -> term -> posting id + 1 - dicts []map[string]uint64 + Dicts []map[string]uint64 // term dictionary keys // field id -> []dictionary keys - dictKeys [][]string + DictKeys [][]string - // postings list - // postings list id -> postings bitmap - postings []*roaring.Bitmap + // Postings list + // Postings list id -> Postings bitmap + Postings []*roaring.Bitmap // term frequencies - // postings list id -> freqs (one for each hit in bitmap) - freqs [][]uint64 + // postings list id -> Freqs (one for each hit in bitmap) + Freqs [][]uint64 - // field norms - // postings list id -> norms (one for each hit in bitmap) - norms [][]float32 + // field Norms + // postings list id -> Norms (one for each hit in bitmap) + Norms [][]float32 // field/start/end/pos/locarraypos // postings list id -> start/end/pos/locarraypos (one for each freq) - locfields [][]uint16 - locstarts [][]uint64 - locends [][]uint64 - locpos [][]uint64 - locarraypos [][][]uint64 + Locfields [][]uint16 + Locstarts [][]uint64 + Locends [][]uint64 + Locpos [][]uint64 + Locarraypos [][][]uint64 - // stored field values + // Stored field values // docNum -> field id -> slice of values (each value []byte) - stored []map[uint16][][]byte + Stored []map[uint16][][]byte // stored field types // docNum -> field id -> slice of types (each type byte) - storedTypes []map[uint16][]byte + StoredTypes []map[uint16][]byte // stored field array positions // docNum -> field id -> slice of array positions (each is []uint64) - storedPos []map[uint16][][]uint64 + StoredPos []map[uint16][][]uint64 } // New builds a new empty Segment func New() *Segment { return &Segment{ - fields: map[string]uint16{}, + FieldsMap: map[string]uint16{}, } } // Fields returns the field names used in this segment func (s *Segment) Fields() []string { - return s.fieldsInv + return s.FieldsInv } // 
VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { // ensure document number exists - if int(num) > len(s.stored)-1 { + if int(num) > len(s.Stored)-1 { return nil } - docFields := s.stored[int(num)] + docFields := s.Stored[int(num)] for field, values := range docFields { for i, value := range values { - keepGoing := visitor(s.fieldsInv[field], s.storedTypes[int(num)][field][i], value, s.storedPos[int(num)][field][i]) + keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i]) if !keepGoing { return nil } @@ -113,19 +113,19 @@ func (s *Segment) Dictionary(field string) segment.TermDictionary { // Count returns the number of documents in this segment // (this has no notion of deleted docs) func (s *Segment) Count() uint64 { - return uint64(len(s.stored)) + return uint64(len(s.Stored)) } // DocNumbers returns a bitset corresponding to the doc numbers of all the // provided _id strings func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { - idDictionary := s.dicts[s.getOrDefineField("_id", false)] + idDictionary := s.Dicts[s.getOrDefineField("_id", false)] rv := roaring.New() for _, id := range ids { postingID := idDictionary[id] if postingID > 0 { - rv.Or(s.postings[postingID-1]) + rv.Or(s.Postings[postingID-1]) } } return rv From bcd4bdc3d10a8dc2cf1887e5459dc12ae4cafbe0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 07:27:04 -0500 Subject: [PATCH 009/728] added initial bolt thought to README --- index/scorch/README.md | 92 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 91 insertions(+), 1 deletion(-) diff --git a/index/scorch/README.md b/index/scorch/README.md index cec982ebd..690e7d3d5 100644 --- a/index/scorch/README.md +++ b/index/scorch/README.md @@ -183,7 +183,7 @@ An ASCII art example: [ 0 1 1 ] Compute bitset 
segment-1-deleted-by-2: - [ 0 0 0 ] + [ 0 ] OR it with previous (nil) still just nil @@ -418,3 +418,93 @@ state: 2, 4, 8 2-X, 4-X, 8-X, nil merger finishes: new segment Y, is not valid, need to be recomputed + + +### Bolt Segment Proposal + +Bucket + +"f" field storage + + Key Val + field name field id (var uint16) + + // TODO field location bits + +"d" term dictionary storage + Key Val + field id (var uint16) Vellum FST (mapping term to posting id uint64) + + +"p" postings list storage + Key Val + posting id (var uint64) Roaring Bitmap Serialization (doc numbers) - see FromBuffer + + +"x" chunked data storage + Key Val + chunk id (var uint64) sub-bucket + + Key Val + posting id (var uint64) sub-bucket + + + ALL Compressed Integer Encoding []uint64 + Key Val + "f" freqs 1 value per hit + "n" norms 1 value per hit + "i" fields values per hit + "s" start values per hit + "e" end values per hit + "p" pos values per hit + "a" array pos + entries + each entry is count + followed by uint64 + +"s" stored field data + Key Val + doc num (var uint64) sub-bucket + + Key Val + "m" mossy-like meta packed + + 16 bits - field id + 8 bits - field type + 2? bits - array pos length + + X bits - offset + X bits - length + + "d" raw []byte data (possibly compressed, need segment level config?) + + "a" array position info, packed slice uint64 + + + + + +Notes: + +It is assumed that each IndexReader (snapshot) starts a new Bolt TX (read-only) immediately, and holds it up until it is no longer needed. This allows us to use (unsafely) the raw bytes coming out of BoltDB as return values. Bolt guarantees they will be safe for the duration of the transaction (which we arrange to be the life of the index snapshot). + +Only physically store the fields in one direction, even though at runtime we need both. Upon opening the index, we can read in all the k/v pairs in the "f" bucket. 
We use the unsafe package to create a []string inverted mapping pointing to the underlying []byte in the BoltDB values. + +The term dictionary is stored opaquely as Vellum FST for each field. When accessing these keys, the []byte return to us is mmap'd by bolt under the hood. We then pass this to vellum using its []byte API, which then operates on it without ever forcing whole thing into memory unless needed. + +We do not need to persist the dictkeys slice since it is only there to support the dictionary iterator prefix/range searches, which are supported directly by the FST. + +Theory of operation of chunked storage is as follows. The postings list iterators only allow starting at the beginning, and have no "advance" capability. In the memory version, this means we always know the Nth hit in the postings list is the Nth entry in some other densely packed slice. However, while OK when everything is in RAM, this is not as suitable for a structure on disk, where wading through detailed info of records you don't care about is too expensive. Instead, we assume some fixed chunking, say 1024. All detailed info for document number N can be found inside of chunk N/1024. Now, the Advance operation still has to Next it's way through the posting list. But, now when it reaches a hit, it knows the chunk index as well as the hit index inside that chunk. Further, we push the chunk offsets to the top of the bolt structure, under the theory that we're likely to access data inside a chunk at the same time. For example, you're likely to access the frequency and norm values for a document hit together, so by organizing by chunk first, we increase the likelihood that this info is nearby on disk. + +The "f" and "n" sub-buckets inside a posting have 1 entry for each hit. (you must next-next-next within the chunk) + +The "i", "s", "e", "p", sub-buckets have entries for each hit. 
(you must have read and know the freq) + +The "a" sub-bucket has groupings, where each grouping starts with a count, followed by entries. + +For example, lets say hit docNum 27 has freq of 2. The first location for the hit has array positions (0, 1) length 2, and the second location for the hit has array positions (1, 3, 2) length 3. The entries in the slice for this hit look like: + +2 0 1 3 1 3 2 +^ ^ +| next entry, number of ints to follow for it +number of ints to follow for this entry From c2047dcdf9cf911a4b2212f03cb5921ce6f26dba Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 08:54:39 -0500 Subject: [PATCH 010/728] refactor doc id reader creation to share more code fix issue identified by steve --- index/scorch/snapshot_index.go | 41 ++++++++++---------------------- index/scorch/snapshot_segment.go | 10 ++++++++ 2 files changed, 23 insertions(+), 28 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 11fc063c5..fa2113ccd 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -72,61 +72,46 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, } func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { - - type segmentDocNumsResult struct { - index int - docs *roaring.Bitmap - } - results := make(chan *segmentDocNumsResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - docnums := roaring.NewBitmap() - docnums.AddRange(0, segment.Count()) results <- &segmentDocNumsResult{ index: index, - docs: docnums, + docs: segment.DocNumbersLive(), } }(index, segment) } - rv := &IndexSnapshotDocIDReader{ - snapshot: i, - iterators: make([]roaring.IntIterable, len(i.segment)), - } - for count := 0; count < len(i.segment); count++ { - sdnr := <-results - rv.iterators[sdnr.index] = sdnr.docs.Iterator() - } - - return rv, nil + return i.newDocIDReader(results) } func (i *IndexSnapshot) DocIDReaderOnly(ids []string) 
(index.DocIDReader, error) { - - type segmentDocNumsResult struct { - index int - docs *roaring.Bitmap - } - results := make(chan *segmentDocNumsResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - docnums := segment.DocNumbers(ids) results <- &segmentDocNumsResult{ index: index, - docs: docnums, + docs: segment.DocNumbers(ids), } }(index, segment) } + return i.newDocIDReader(results) +} + +type segmentDocNumsResult struct { + index int + docs *roaring.Bitmap +} + +func (i *IndexSnapshot) newDocIDReader(results chan *segmentDocNumsResult) (index.DocIDReader, error) { rv := &IndexSnapshotDocIDReader{ snapshot: i, iterators: make([]roaring.IntIterable, len(i.segment)), } for count := 0; count < len(i.segment); count++ { sdnr := <-results - rv.iterators[count] = sdnr.docs.Iterator() + rv.iterators[sdnr.index] = sdnr.docs.Iterator() } return rv, nil diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index b32725e96..ffacb5287 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -59,6 +59,16 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) *roaring.Bitmap { return rv } +// DocNumbersLive returns bitsit containing doc numbers for all live docs +func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { + rv := roaring.NewBitmap() + rv.AddRange(0, s.segment.Count()) + if s.deleted != nil { + rv.AndNot(s.deleted) + } + return rv +} + func (s *SegmentSnapshot) Fields() []string { return s.segment.Fields() } From 7c964de8bfdd93087169e8478064a2e293200e62 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 09:26:51 -0500 Subject: [PATCH 011/728] switch to binary search for finding segment from global doc num added unit tests for this function specifically --- index/scorch/reader_test.go | 104 +++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 13 ++--- 2 files changed, 109 insertions(+), 8 deletions(-) diff --git 
a/index/scorch/reader_test.go b/index/scorch/reader_test.go index f673b198b..6b801faa3 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -517,3 +517,107 @@ func TestIndexDocIdOnlyReader(t *testing.T) { // } } + +func TestSegmentIndexAndLocalDocNumFromGlobal(t *testing.T) { + tests := []struct { + offsets []uint64 + globalDocNum uint64 + segmentIndex int + localDocNum uint64 + }{ + // just 1 segment + { + offsets: []uint64{0}, + globalDocNum: 0, + segmentIndex: 0, + localDocNum: 0, + }, + { + offsets: []uint64{0}, + globalDocNum: 1, + segmentIndex: 0, + localDocNum: 1, + }, + { + offsets: []uint64{0}, + globalDocNum: 25, + segmentIndex: 0, + localDocNum: 25, + }, + // now 2 segments, 30 docs in first + { + offsets: []uint64{0, 30}, + globalDocNum: 0, + segmentIndex: 0, + localDocNum: 0, + }, + { + offsets: []uint64{0, 30}, + globalDocNum: 1, + segmentIndex: 0, + localDocNum: 1, + }, + { + offsets: []uint64{0, 30}, + globalDocNum: 25, + segmentIndex: 0, + localDocNum: 25, + }, + { + offsets: []uint64{0, 30}, + globalDocNum: 30, + segmentIndex: 1, + localDocNum: 0, + }, + { + offsets: []uint64{0, 30}, + globalDocNum: 35, + segmentIndex: 1, + localDocNum: 5, + }, + // lots of segments + { + offsets: []uint64{0, 30, 40, 70, 99, 172, 800, 25000}, + globalDocNum: 0, + segmentIndex: 0, + localDocNum: 0, + }, + { + offsets: []uint64{0, 30, 40, 70, 99, 172, 800, 25000}, + globalDocNum: 25, + segmentIndex: 0, + localDocNum: 25, + }, + { + offsets: []uint64{0, 30, 40, 70, 99, 172, 800, 25000}, + globalDocNum: 35, + segmentIndex: 1, + localDocNum: 5, + }, + { + offsets: []uint64{0, 30, 40, 70, 99, 172, 800, 25000}, + globalDocNum: 100, + segmentIndex: 4, + localDocNum: 1, + }, + { + offsets: []uint64{0, 30, 40, 70, 99, 172, 800, 25000}, + globalDocNum: 825, + segmentIndex: 6, + localDocNum: 25, + }, + } + + for _, test := range tests { + i := &IndexSnapshot{ + offsets: test.offsets, + } + gotSegmentIndex, gotLocalDocNum := 
i.segmentIndexAndLocalDocNumFromGlobal(test.globalDocNum) + if gotSegmentIndex != test.segmentIndex { + t.Errorf("got segment index %d expected %d for offsets %v globalDocNum %d", gotSegmentIndex, test.segmentIndex, test.offsets, test.globalDocNum) + } + if gotLocalDocNum != test.localDocNum { + t.Errorf("got localDocNum %d expected %d for offsets %v globalDocNum %d", gotLocalDocNum, test.localDocNum, test.offsets, test.globalDocNum) + } + } +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index fa2113ccd..2c85b8fee 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -5,6 +5,7 @@ import ( "container/heap" "encoding/binary" "fmt" + "sort" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/document" @@ -191,14 +192,10 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { } func (i *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int, uint64) { - var segmentIndex uint64 - for j := 1; j < len(i.offsets); j++ { - if docNum >= i.offsets[j] { - segmentIndex = uint64(j) - } else { - break - } - } + segmentIndex := sort.Search(len(i.offsets), + func(x int) bool { + return i.offsets[x] > docNum + }) - 1 localDocNum := docNum - i.offsets[segmentIndex] return int(segmentIndex), localDocNum From eb256f78bc63fef2d97a8aec81d06df375345879 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 09:30:07 -0500 Subject: [PATCH 012/728] switch to constant referring to id field id 0 this avoids potentially mutating something that is intended to be immutable --- index/scorch/segment/mem/segment.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index fe71f17c6..e9a361c2f 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -5,6 +5,9 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment" ) +// _id field is 
always guaranteed to have fieldID of 0 +const idFieldID uint16 = 0 + // KNOWN ISSUES // - LIMITATION - we decided whether or not to store term vectors for a field // at the segment level, based on the first definition of a @@ -120,7 +123,7 @@ func (s *Segment) Count() uint64 { // provided _id strings func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { - idDictionary := s.Dicts[s.getOrDefineField("_id", false)] + idDictionary := s.Dicts[idFieldID] rv := roaring.New() for _, id := range ids { postingID := idDictionary[id] From cff14f12121ffd362f0796cbc99621b8c8503d43 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 09:50:27 -0500 Subject: [PATCH 013/728] fix crash in DocNumbers when segment is empty --- index/scorch/segment/mem/segment.go | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index e9a361c2f..f4294ebd7 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -122,13 +122,17 @@ func (s *Segment) Count() uint64 { // DocNumbers returns a bitset corresponding to the doc numbers of all the // provided _id strings func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { - - idDictionary := s.Dicts[idFieldID] rv := roaring.New() - for _, id := range ids { - postingID := idDictionary[id] - if postingID > 0 { - rv.Or(s.Postings[postingID-1]) + + // guard against empty segment + if len(s.FieldsMap) > 0 { + idDictionary := s.Dicts[idFieldID] + + for _, id := range ids { + postingID := idDictionary[id] + if postingID > 0 { + rv.Or(s.Postings[postingID-1]) + } } } return rv From 89aa02cf5b7fb3ad9c36fdb79068e8ce69fd98fe Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 15:12:08 -0500 Subject: [PATCH 014/728] fix highlighting of composite fields updated log statements for refactored names --- index/scorch/segment/mem/build.go | 34 +++++++++++++++---------------- 1 file changed, 17 
insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 5154c182f..d995dd041 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -29,38 +29,38 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { // professional debugging // - // log.Printf("fields: %v\n", s.fields) - // log.Printf("fieldsInv: %v\n", s.fieldsInv) - // log.Printf("fieldsLoc: %v\n", s.fieldsLoc) - // log.Printf("dicts: %v\n", s.dicts) - // log.Printf("dict keys: %v\n", s.dictKeys) - // for i, posting := range s.postings { + // log.Printf("fields: %v\n", s.FieldsMap) + // log.Printf("fieldsInv: %v\n", s.FieldsInv) + // log.Printf("fieldsLoc: %v\n", s.FieldsLoc) + // log.Printf("dicts: %v\n", s.Dicts) + // log.Printf("dict keys: %v\n", s.DictKeys) + // for i, posting := range s.Postings { // log.Printf("posting %d: %v\n", i, posting) // } - // for i, freq := range s.freqs { + // for i, freq := range s.Freqs { // log.Printf("freq %d: %v\n", i, freq) // } - // for i, norm := range s.norms { + // for i, norm := range s.Norms { // log.Printf("norm %d: %v\n", i, norm) // } - // for i, field := range s.locfields { + // for i, field := range s.Locfields { // log.Printf("field %d: %v\n", i, field) // } - // for i, start := range s.locstarts { + // for i, start := range s.Locstarts { // log.Printf("start %d: %v\n", i, start) // } - // for i, end := range s.locends { + // for i, end := range s.Locends { // log.Printf("end %d: %v\n", i, end) // } - // for i, pos := range s.locpos { + // for i, pos := range s.Locpos { // log.Printf("pos %d: %v\n", i, pos) // } - // for i, apos := range s.locarraypos { + // for i, apos := range s.Locarraypos { // log.Printf("apos %d: %v\n", i, apos) // } - // log.Printf("stored: %v\n", s.stored) - // log.Printf("stored types: %v\n", s.storedTypes) - // log.Printf("stored pos: %v\n", s.storedPos) + // log.Printf("stored: %v\n", s.Stored) + // log.Printf("stored 
types: %v\n", s.StoredTypes) + // log.Printf("stored pos: %v\n", s.StoredPos) return s } @@ -88,7 +88,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // walk each composite field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name(), false)) + fieldID := uint16(s.getOrDefineField(field.Name(), true)) l, tf := field.Analyze() processField(fieldID, field.Name(), l, tf) } From b74cf4b08171b5c372f50bd777d8864b338f41bc Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Dec 2017 15:42:50 -0500 Subject: [PATCH 015/728] add copyright header to all new files in scorch --- index/scorch/field_dict_test.go | 14 ++++++++++++++ index/scorch/introducer.go | 14 ++++++++++++++ index/scorch/reader.go | 14 ++++++++++++++ index/scorch/reader_test.go | 14 ++++++++++++++ index/scorch/scorch.go | 14 ++++++++++++++ index/scorch/scorch_test.go | 2 +- index/scorch/segment/mem/build.go | 14 ++++++++++++++ index/scorch/segment/mem/dict.go | 14 ++++++++++++++ index/scorch/segment/mem/posting.go | 14 ++++++++++++++ index/scorch/segment/mem/segment.go | 14 ++++++++++++++ index/scorch/segment/mem/segment_test.go | 14 ++++++++++++++ index/scorch/segment/segment.go | 14 ++++++++++++++ index/scorch/snapshot_index.go | 14 ++++++++++++++ index/scorch/snapshot_index_dict.go | 14 ++++++++++++++ index/scorch/snapshot_index_doc.go | 14 ++++++++++++++ index/scorch/snapshot_index_tfr.go | 14 ++++++++++++++ index/scorch/snapshot_segment.go | 14 ++++++++++++++ index/scorch/stats.go | 14 ++++++++++++++ 18 files changed, 239 insertions(+), 1 deletion(-) diff --git a/index/scorch/field_dict_test.go b/index/scorch/field_dict_test.go index 81285e76a..856f3d6cf 100644 --- a/index/scorch/field_dict_test.go +++ b/index/scorch/field_dict_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 37a66dc84..8b333b309 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/reader.go b/index/scorch/reader.go index acb01905d..0e643f7ca 100644 --- a/index/scorch/reader.go +++ b/index/scorch/reader.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index 6b801faa3..ef5d6d4f5 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 1f99b42fe..90b2fa0d3 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 48e80c8b8..aeb6c997a 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2017 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index d995dd041..dbba39b13 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package mem import ( diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index c724493da..015578949 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package mem import ( diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index fa1c0a00f..2ce889888 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package mem import ( diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index f4294ebd7..5d964e183 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package mem import ( diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 4c056d7ad..612bb947e 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package mem import ( diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 5cd3d5d7f..a6ef16e40 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package segment import ( diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2c85b8fee..bf076b91c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 443e401e6..3c902cad6 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index 2b1144874..4656079b0 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 44172f3d0..c2123a1dd 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index ffacb5287..67cdfb900 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( diff --git a/index/scorch/stats.go b/index/scorch/stats.go index f49c8178c..13668480d 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + package scorch import ( From 22ffc8940e091388f529a0531356e529759c2886 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 4 Dec 2017 18:06:06 -0500 Subject: [PATCH 016/728] update segment API to return error in key places --- index/scorch/introducer.go | 6 +- index/scorch/scorch.go | 5 +- index/scorch/segment/mem/dict.go | 5 +- index/scorch/segment/mem/posting.go | 6 +- index/scorch/segment/mem/segment.go | 8 +- index/scorch/segment/mem/segment_test.go | 118 +++++++++++++++------ index/scorch/segment/segment.go | 8 +- index/scorch/snapshot_index.go | 126 +++++++++++++++-------- index/scorch/snapshot_index_tfr.go | 5 +- index/scorch/snapshot_segment.go | 21 ++-- 10 files changed, 212 insertions(+), 96 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 8b333b309..d2978a2df 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -53,7 +53,11 @@ func (s *Scorch) mainLoop() { // see if optimistic work included this segment delta, ok := next.obsoletes[s.root.segment[i].id] if !ok { - delta = s.root.segment[i].segment.DocNumbers(next.ids) + var err error + delta, err = s.root.segment[i].segment.DocNumbers(next.ids) + if err != nil { + panic(err) + } } newSnapshot.segment[i] = &SegmentSnapshot{ id: s.root.segment[i].id, diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 90b2fa0d3..b80ff2482 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -154,7 +154,10 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, // get read lock, to optimistically prepare obsoleted info s.rootLock.RLock() for i := range s.root.segment { - delta := s.root.segment[i].segment.DocNumbers(ids) + delta, err := s.root.segment[i].segment.DocNumbers(ids) + if err != nil { + return err + } introduction.obsoletes[s.root.segment[i].id] = delta } s.rootLock.RUnlock() diff --git 
a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index 015578949..939c287e9 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -31,13 +31,14 @@ type Dictionary struct { } // PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) segment.PostingsList { +func (d *Dictionary) PostingsList(term string, + except *roaring.Bitmap) (segment.PostingsList, error) { return &PostingsList{ dictionary: d, term: term, postingsID: d.segment.Dicts[d.fieldID][term], except: except, - } + }, nil } // Iterator returns an iterator for this dictionary diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 2ce889888..b6fd0c6a7 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -74,9 +74,9 @@ type PostingsIterator struct { } // Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) Next() segment.Posting { +func (i *PostingsIterator) Next() (segment.Posting, error) { if i.actual == nil || !i.actual.HasNext() { - return nil + return nil, nil } n := i.actual.Next() allN := i.all.Next() @@ -99,7 +99,7 @@ func (i *PostingsIterator) Next() segment.Posting { i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) i.offset++ - return rv + return rv, nil } // Posting is a single entry in a postings list diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 5d964e183..a1eb29e27 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -119,12 +119,12 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi } // Dictionary returns the term dictionary for the specified field -func (s *Segment) Dictionary(field string) segment.TermDictionary { +func (s *Segment) Dictionary(field string) 
(segment.TermDictionary, error) { return &Dictionary{ segment: s, field: field, fieldID: uint16(s.getOrDefineField(field, false)), - } + }, nil } // Count returns the number of documents in this segment @@ -135,7 +135,7 @@ func (s *Segment) Count() uint64 { // DocNumbers returns a bitset corresponding to the doc numbers of all the // provided _id strings -func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { +func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { rv := roaring.New() // guard against empty segment @@ -149,5 +149,5 @@ func (s *Segment) DocNumbers(ids []string) *roaring.Bitmap { } } } - return rv + return rv, nil } diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 612bb947e..2b8452051 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -31,12 +31,18 @@ func TestEmpty(t *testing.T) { t.Errorf("expected count 0, got %d", emptySegment.Count()) } - dict := emptySegment.Dictionary("name") + dict, err := emptySegment.Dictionary("name") + if err != nil { + t.Fatal(err) + } if dict == nil { t.Fatal("got nil dict, expected non-nil") } - postingsList := dict.PostingsList("marty", nil) + postingsList, err := dict.PostingsList("marty", nil) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -47,10 +53,13 @@ func TestEmpty(t *testing.T) { } count := 0 - nextPosting := postingsItr.Next() - for nextPosting != nil { + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { count++ - nextPosting = postingsItr.Next() + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) } if count != 0 { @@ -58,7 +67,7 @@ func TestEmpty(t *testing.T) { } // now try and visit a document - err := emptySegment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + err = emptySegment.VisitDocument(0, func(field string, typ 
byte, value []byte, pos []uint64) bool { t.Errorf("document visitor called, not expected") return true }) @@ -164,12 +173,18 @@ func TestSingle(t *testing.T) { } // check the _id field - dict := segment.Dictionary("_id") + dict, err := segment.Dictionary("_id") + if err != nil { + t.Fatal(err) + } if dict == nil { t.Fatal("got nil dict, expected non-nil") } - postingsList := dict.PostingsList("a", nil) + postingsList, err := dict.PostingsList("a", nil) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -180,8 +195,8 @@ func TestSingle(t *testing.T) { } count := 0 - nextPosting := postingsItr.Next() - for nextPosting != nil { + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { count++ if nextPosting.Frequency() != 1 { t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) @@ -193,7 +208,10 @@ func TestSingle(t *testing.T) { t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) } - nextPosting = postingsItr.Next() + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) } if count != 1 { @@ -201,12 +219,18 @@ func TestSingle(t *testing.T) { } // check the name field - dict = segment.Dictionary("name") + dict, err = segment.Dictionary("name") + if err != nil { + t.Fatal(err) + } if dict == nil { t.Fatal("got nil dict, expected non-nil") } - postingsList = dict.PostingsList("wow", nil) + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -217,8 +241,8 @@ func TestSingle(t *testing.T) { } count = 0 - nextPosting = postingsItr.Next() - for nextPosting != nil { + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { count++ if nextPosting.Frequency() != 1 { t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) @@ -244,7 +268,10 @@ func TestSingle(t *testing.T) { } } - nextPosting = 
postingsItr.Next() + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) } if count != 1 { @@ -252,12 +279,18 @@ func TestSingle(t *testing.T) { } // check the _all field (composite) - dict = segment.Dictionary("_all") + dict, err = segment.Dictionary("_all") + if err != nil { + t.Fatal(err) + } if dict == nil { t.Fatal("got nil dict, expected non-nil") } - postingsList = dict.PostingsList("wow", nil) + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -268,8 +301,8 @@ func TestSingle(t *testing.T) { } count = 0 - nextPosting = postingsItr.Next() - for nextPosting != nil { + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { count++ if nextPosting.Frequency() != 1 { t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) @@ -296,7 +329,10 @@ func TestSingle(t *testing.T) { } } - nextPosting = postingsItr.Next() + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) } if count != 1 { @@ -305,7 +341,7 @@ func TestSingle(t *testing.T) { // now try and visit a document var fieldValuesSeen int - err := segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { fieldValuesSeen++ return true }) @@ -487,12 +523,18 @@ func TestMultiple(t *testing.T) { } // check the desc field - dict := segment.Dictionary("desc") + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } if dict == nil { t.Fatal("got nil dict, expected non-nil") } - postingsList := dict.PostingsList("thing", nil) + postingsList, err := dict.PostingsList("thing", nil) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -503,10 +545,13 @@ func TestMultiple(t *testing.T) { } count := 0 - nextPosting := 
postingsItr.Next() - for nextPosting != nil { + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { count++ - nextPosting = postingsItr.Next() + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) } if count != 2 { @@ -514,10 +559,16 @@ func TestMultiple(t *testing.T) { } // get docnum of a - exclude := segment.DocNumbers([]string{"a"}) + exclude, err := segment.DocNumbers([]string{"a"}) + if err != nil { + t.Fatal(err) + } // look for term 'thing' excluding doc 'a' - postingsListExcluding := dict.PostingsList("thing", exclude) + postingsListExcluding, err := dict.PostingsList("thing", exclude) + if err != nil { + t.Fatal(err) + } if postingsList == nil { t.Fatal("got nil postings list, expected non-nil") } @@ -528,10 +579,13 @@ func TestMultiple(t *testing.T) { } count = 0 - nextPosting = postingsItrExcluding.Next() - for nextPosting != nil { + nextPosting, err = postingsItrExcluding.Next() + for nextPosting != nil && err == nil { count++ - nextPosting = postingsItrExcluding.Next() + nextPosting, err = postingsItrExcluding.Next() + } + if err != nil { + t.Fatal(err) } if count != 1 { diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index a6ef16e40..77ab13857 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -25,18 +25,18 @@ import ( type DocumentFieldValueVisitor func(field string, typ byte, value []byte, pos []uint64) bool type Segment interface { - Dictionary(field string) TermDictionary + Dictionary(field string) (TermDictionary, error) VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error Count() uint64 - DocNumbers([]string) *roaring.Bitmap + DocNumbers([]string) (*roaring.Bitmap, error) Fields() []string } type TermDictionary interface { - PostingsList(term string, except *roaring.Bitmap) PostingsList + PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) Iterator() DictionaryIterator PrefixIterator(prefix 
string) DictionaryIterator @@ -59,7 +59,7 @@ type PostingsList interface { } type PostingsIterator interface { - Next() Posting + Next() (Posting, error) } type Posting interface { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index bf076b91c..c059fe734 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -27,6 +27,17 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment" ) +type asynchSegmentResult struct { + dictItr segment.DictionaryIterator + + index int + docs *roaring.Bitmap + + postings segment.PostingsList + + err error +} + type IndexSnapshot struct { segment []*SegmentSnapshot offsets []uint64 @@ -35,31 +46,44 @@ type IndexSnapshot struct { func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { - results := make(chan segment.DictionaryIterator) + results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - dict := segment.Dictionary(field) - results <- makeItr(dict) + dict, err := segment.Dictionary(field) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{dictItr: makeItr(dict)} + } }(index, segment) } + var err error rv := &IndexSnapshotFieldDict{ snapshot: i, cursors: make([]*segmentDictCursor, 0, len(i.segment)), } for count := 0; count < len(i.segment); count++ { - di := <-results - next, err := di.Next() - if err != nil { - return nil, err - } - if next != nil { - rv.cursors = append(rv.cursors, &segmentDictCursor{ - itr: di, - curr: next, - }) + asr := <-results + if asr.err != nil && err == nil { + err = asr.err + } else { + next, err2 := asr.dictItr.Next() + if err2 != nil && err == nil { + err = err2 + } + if next != nil { + rv.cursors = append(rv.cursors, &segmentDictCursor{ + itr: asr.dictItr, + curr: next, + }) + } } } + // after ensuring 
we've read all items on channel + if err != nil { + return nil, err + } // prepare heap heap.Init(rv) @@ -87,10 +111,10 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, } func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { - results := make(chan *segmentDocNumsResult) + results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - results <- &segmentDocNumsResult{ + results <- &asynchSegmentResult{ index: index, docs: segment.DocNumbersLive(), } @@ -101,12 +125,17 @@ func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { } func (i *IndexSnapshot) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { - results := make(chan *segmentDocNumsResult) + results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - results <- &segmentDocNumsResult{ - index: index, - docs: segment.DocNumbers(ids), + docs, err := segment.DocNumbers(ids) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{ + index: index, + docs: docs, + } } }(index, segment) } @@ -114,19 +143,23 @@ func (i *IndexSnapshot) DocIDReaderOnly(ids []string) (index.DocIDReader, error) return i.newDocIDReader(results) } -type segmentDocNumsResult struct { - index int - docs *roaring.Bitmap -} - -func (i *IndexSnapshot) newDocIDReader(results chan *segmentDocNumsResult) (index.DocIDReader, error) { +func (i *IndexSnapshot) newDocIDReader(results chan *asynchSegmentResult) (index.DocIDReader, error) { rv := &IndexSnapshotDocIDReader{ snapshot: i, iterators: make([]roaring.IntIterable, len(i.segment)), } + var err error for count := 0; count < len(i.segment); count++ { - sdnr := <-results - rv.iterators[sdnr.index] = sdnr.docs.Iterator() + asr := <-results + if asr.err != nil && err != nil { + err = asr.err + } else { + rv.iterators[asr.index] = asr.docs.Iterator() + } + } + + if 
err != nil { + return nil, err } return rv, nil @@ -262,23 +295,27 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { - type segmentPostingResult struct { - index int - postings segment.PostingsList - } - - results := make(chan *segmentPostingResult) + results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - dict := segment.Dictionary(field) - pl := dict.PostingsList(string(term), nil) - results <- &segmentPostingResult{ - index: index, - postings: pl, + dict, err := segment.Dictionary(field) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + pl, err := dict.PostingsList(string(term), nil) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{ + index: index, + postings: pl, + } + } } }(index, segment) } + var err error rv := &IndexSnapshotTermFieldReader{ snapshot: i, postings: make([]segment.PostingsList, len(i.segment)), @@ -288,9 +325,16 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeTermVectors: includeTermVectors, } for count := 0; count < len(i.segment); count++ { - spr := <-results - rv.postings[spr.index] = spr.postings - rv.iterators[spr.index] = spr.postings.Iterator() + asr := <-results + if asr.err != nil && err == nil { + err = asr.err + } else { + rv.postings[asr.index] = asr.postings + rv.iterators[asr.index] = asr.postings.Iterator() + } + } + if err != nil { + return nil, err } return rv, nil diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index c2123a1dd..936704906 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -38,7 +38,10 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced 
*index.TermFieldDoc) (*in } // find the next hit for i.segmentOffset < len(i.postings) { - next := i.iterators[i.segmentOffset].Next() + next, err := i.iterators[i.segmentOffset].Next() + if err != nil { + return nil, err + } if next != nil { // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 67cdfb900..6380a15fd 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -24,7 +24,7 @@ type SegmentDictionarySnapshot struct { d segment.TermDictionary } -func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) segment.PostingsList { +func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { return s.d.PostingsList(term, s.s.deleted) } @@ -58,19 +58,26 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } -func (s *SegmentSnapshot) Dictionary(field string) segment.TermDictionary { +func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) { + d, err := s.segment.Dictionary(field) + if err != nil { + return nil, err + } return &SegmentDictionarySnapshot{ s: s, - d: s.segment.Dictionary(field), - } + d: d, + }, nil } -func (s *SegmentSnapshot) DocNumbers(docIDs []string) *roaring.Bitmap { - rv := s.segment.DocNumbers(docIDs) +func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { + rv, err := s.segment.DocNumbers(docIDs) + if err != nil { + return nil, err + } if s.deleted != nil { rv.AndNot(s.deleted) } - return rv + return rv, nil } // DocNumbersLive returns bitsit containing doc numbers for all live docs From ed067f45dd0562380606d93f50d432891438abf7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 09:31:02 -0500 Subject: [PATCH 017/728] added Close() method to Segment --- index/scorch/segment/mem/segment.go | 5 +++++ 
index/scorch/segment/segment.go | 2 ++ 2 files changed, 7 insertions(+) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index a1eb29e27..a0611947d 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -151,3 +151,8 @@ func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { } return rv, nil } + +// Close releases all resources associated with this segment +func (s *Segment) Close() error { + return nil +} diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 77ab13857..6a9d70730 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -33,6 +33,8 @@ type Segment interface { DocNumbers([]string) (*roaring.Bitmap, error) Fields() []string + + Close() error } type TermDictionary interface { From 87e2627551841cecba0bd25067350eb2f199f5ec Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 09:49:41 -0500 Subject: [PATCH 018/728] added dictionary tests to mem segment --- index/scorch/segment/mem/dict_test.go | 160 ++++++++++++++++++++++++++ 1 file changed, 160 insertions(+) create mode 100644 index/scorch/segment/mem/dict_test.go diff --git a/index/scorch/segment/mem/dict_test.go b/index/scorch/segment/mem/dict_test.go new file mode 100644 index 000000000..adfa4957d --- /dev/null +++ b/index/scorch/segment/mem/dict_test.go @@ -0,0 +1,160 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package mem + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestDictionary(t *testing.T) { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 5, + Position: 1, + Term: []byte("apple"), + }, + &analysis.Token{ + Start: 6, + End: 10, + Position: 2, + Term: []byte("ball"), + }, + &analysis.Token{ + Start: 11, + End: 14, + Position: 3, + Term: []byte("cat"), + }, + &analysis.Token{ + Start: 15, + End: 18, + Position: 4, + Term: []byte("dog"), + }, + &analysis.Token{ + Start: 19, + End: 22, + Position: 5, + Term: []byte("egg"), + }, + &analysis.Token{ + Start: 20, + End: 24, + Position: 6, + Term: []byte("fish"), + }, + &analysis.Token{ + Start: 25, + End: 28, + Position: 7, + Term: []byte("bat"), + }, + }, nil, true), + }, + Length: []int{ + 1, + 7, + }, + }, + } + + segment := NewFromAnalyzedDocs(results) + if segment == nil { + t.Fatalf("segment nil, not expected") + } + + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + + // test basic full iterator + expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"} + var got []string + itr 
:= dict.Iterator() + next, err := itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test prefix iterator + expected = []string{"ball", "bat"} + got = got[:0] + itr = dict.PrefixIterator("b") + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test range iterator + expected = []string{"cat", "dog", "egg"} + got = got[:0] + itr = dict.RangeIterator("cat", "egg") + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } +} From e08fdab54a570fe50bee5b541dd7297dea6c39e3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 10:13:27 -0500 Subject: [PATCH 019/728] remove todo item --- index/scorch/segment/mem/segment.go | 1 - 1 file changed, 1 deletion(-) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index a0611947d..4d3d1d113 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -34,7 +34,6 @@ const idFieldID uint16 = 0 // TODO // - need better testing of multiple docs, iterating freqs, locations and // and verifying the correct results are returned -// - need tests for term dictionary iteration // Segment is an in memory implementation of scorch.Segment type Segment struct { From 7a6b5483f23b119fad5f2bad7e96a2f1a1ae0327 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 11:58:05 -0500 Subject: [PATCH 
020/728] add validation that all locations were seen --- index/scorch/segment/mem/segment_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 2b8452051..74fb870d8 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -253,7 +253,9 @@ func TestSingle(t *testing.T) { if nextPosting.Norm() != 1.0 { t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) } + var numLocs uint64 for _, loc := range nextPosting.Locations() { + numLocs++ if loc.Start() != 0 { t.Errorf("expected loc start to be 0, got %d", loc.Start()) } @@ -267,6 +269,9 @@ func TestSingle(t *testing.T) { t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) } } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } nextPosting, err = postingsItr.Next() } @@ -314,7 +319,9 @@ func TestSingle(t *testing.T) { if nextPosting.Norm() != float64(expectedNorm) { t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) } + var numLocs uint64 for _, loc := range nextPosting.Locations() { + numLocs++ if loc.Start() != 0 { t.Errorf("expected loc start to be 0, got %d", loc.Start()) } @@ -328,6 +335,9 @@ func TestSingle(t *testing.T) { t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) } } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } nextPosting, err = postingsItr.Next() } From 8f0350865b2d12b27f0afcacfa14a4de67ca7f01 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 12:17:56 -0500 Subject: [PATCH 021/728] add test for segment fields method --- index/scorch/segment/mem/segment_test.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 
74fb870d8..0d9ec4534 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -168,6 +168,23 @@ func TestSingle(t *testing.T) { t.Fatalf("segment nil, not expected") } + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "desc": struct{}{}, + "tag": struct{}{}, + } + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } + if segment.Count() != 1 { t.Errorf("expected count 1, got %d", segment.Count()) } From 8d9d45115ff04d843e37f343cd1e7a34bb194365 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 12:20:06 -0500 Subject: [PATCH 022/728] add test of location field --- index/scorch/segment/mem/segment_test.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 0d9ec4534..78c242416 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -273,6 +273,9 @@ func TestSingle(t *testing.T) { var numLocs uint64 for _, loc := range nextPosting.Locations() { numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } if loc.Start() != 0 { t.Errorf("expected loc start to be 0, got %d", loc.Start()) } @@ -339,6 +342,9 @@ func TestSingle(t *testing.T) { var numLocs uint64 for _, loc := range nextPosting.Locations() { numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } if loc.Start() != 0 { t.Errorf("expected loc start to be 0, got %d", loc.Start()) } From 30e9d6daa547cd0196e6e9b63a35e747c3971938 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 12:54:44 -0500 Subject: [PATCH 023/728] 
add better testing of array positions --- index/scorch/segment/mem/segment_test.go | 74 ++++++++++++++++++++++-- 1 file changed, 68 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 78c242416..603fd5a9b 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -16,6 +16,7 @@ package mem import ( "math" + "reflect" "testing" "github.com/blevesearch/bleve/analysis" @@ -134,7 +135,7 @@ func TestSingle(t *testing.T) { Position: 1, Term: []byte("cold"), }, - }, nil, true), + }, []uint64{0}, true), analysis.TokenFrequency(analysis.TokenStream{ &analysis.Token{ Start: 0, @@ -142,7 +143,7 @@ func TestSingle(t *testing.T) { Position: 1, Term: []byte("dark"), }, - }, nil, true), + }, []uint64{1}, true), }, Length: []int{ 1, @@ -372,6 +373,67 @@ func TestSingle(t *testing.T) { t.Errorf("expected count to be 1, got %d", count) } + // now try a field with array positions + dict, err = segment.Dictionary("tag") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = dict.PostingsList("dark", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "tag" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", 
loc.Start()) + } + if loc.End() != 4 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + expectArrayPos := []uint64{1} + if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { + t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + // now try and visit a document var fieldValuesSeen int err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { @@ -459,7 +521,7 @@ func TestMultiple(t *testing.T) { Position: 1, Term: []byte("cold"), }, - }, nil, true), + }, []uint64{0}, true), analysis.TokenFrequency(analysis.TokenStream{ &analysis.Token{ Start: 0, @@ -467,7 +529,7 @@ func TestMultiple(t *testing.T) { Position: 1, Term: []byte("dark"), }, - }, nil, true), + }, []uint64{1}, true), }, Length: []int{ 1, @@ -517,7 +579,7 @@ func TestMultiple(t *testing.T) { Position: 1, Term: []byte("cold"), }, - }, nil, true), + }, []uint64{0}, true), analysis.TokenFrequency(analysis.TokenStream{ &analysis.Token{ Start: 0, @@ -525,7 +587,7 @@ func TestMultiple(t *testing.T) { Position: 1, Term: []byte("dark"), }, - }, nil, true), + }, []uint64{1}, true), }, Length: []int{ 1, From f6be841668e562a623e486b09a41b2389e5b32c7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 13:01:36 -0500 Subject: [PATCH 024/728] add test for postings list count method --- index/scorch/segment/mem/segment_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 603fd5a9b..7eb691476 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -668,6 +668,11 @@ func 
TestMultiple(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } + postingsListExcludingCount := postingsListExcluding.Count() + if postingsListExcludingCount != 1 { + t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) + } + postingsItrExcluding := postingsListExcluding.Iterator() if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") From ece27ef21551f2be790d9fa75a7ce2a798542ea1 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 13:05:12 -0500 Subject: [PATCH 025/728] adding initial version of bolt persisted segment --- index/scorch/segment/bolt/build.go | 500 +++++++++++++++++++++ index/scorch/segment/bolt/build_test.go | 288 ++++++++++++ index/scorch/segment/bolt/dict.go | 161 +++++++ index/scorch/segment/bolt/dict_test.go | 183 ++++++++ index/scorch/segment/bolt/int.go | 94 ++++ index/scorch/segment/bolt/int_test.go | 96 ++++ index/scorch/segment/bolt/posting.go | 323 ++++++++++++++ index/scorch/segment/bolt/segment.go | 309 +++++++++++++ index/scorch/segment/bolt/segment_test.go | 517 ++++++++++++++++++++++ 9 files changed, 2471 insertions(+) create mode 100644 index/scorch/segment/bolt/build.go create mode 100644 index/scorch/segment/bolt/build_test.go create mode 100644 index/scorch/segment/bolt/dict.go create mode 100644 index/scorch/segment/bolt/dict_test.go create mode 100644 index/scorch/segment/bolt/int.go create mode 100644 index/scorch/segment/bolt/int_test.go create mode 100644 index/scorch/segment/bolt/posting.go create mode 100644 index/scorch/segment/bolt/segment.go create mode 100644 index/scorch/segment/bolt/segment_test.go diff --git a/index/scorch/segment/bolt/build.go b/index/scorch/segment/bolt/build.go new file mode 100644 index 000000000..6ed5719d3 --- /dev/null +++ b/index/scorch/segment/bolt/build.go @@ -0,0 +1,500 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bolt + +import ( + "bytes" + "encoding/binary" + "math" + + "github.com/RoaringBitmap/roaring" + "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/boltdb/bolt" + "github.com/couchbaselabs/vellum" + "github.com/golang/snappy" +) + +var fieldsBucket = []byte{'a'} +var dictBucket = []byte{'b'} +var postingsBucket = []byte{'c'} +var postingDetailsBucket = []byte{'d'} +var storedBucket = []byte{'e'} +var configBucket = []byte{'x'} + +var indexLocsKey = []byte{'l'} + +var freqNormKey = []byte{'a'} +var locKey = []byte{'b'} + +var metaKey = []byte{'a'} +var dataKey = []byte{'b'} + +var chunkKey = []byte{'c'} +var versionKey = []byte{'v'} + +var version = 0 + +func persistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { + + db, err := bolt.Open(path, 0777, nil) + if err != nil { + return err + } + defer func() { + if cerr := db.Close(); err == nil && cerr != nil { + err = cerr + } + }() + + tx, err := db.Begin(true) + if err != nil { + return err + } + defer func() { + if err == nil { + err = tx.Commit() + } else { + _ = tx.Rollback() + } + }() + + err = persistFields(memSegment, tx) + if err != nil { + return err + } + + err = persistDictionary(memSegment, tx) + if err != nil { + return err + } + + err = persistPostings(memSegment, tx) + if err != nil { + return err + } + + err = persistPostingsDetails(memSegment, tx, 
chunkFactor) + if err != nil { + return err + } + + err = persistStored(memSegment, tx) + if err != nil { + return err + } + + err = persistConfig(tx, chunkFactor) + if err != nil { + return err + } + + return nil +} + +// persistFields puts the fields as separate k/v pairs in the fields bucket +// makes very little attempt to squeeze a lot of perf because it is expected +// this is usually somewhat small, and when re-opened it will be read once and +// kept on the heap, and not read out of the file subsequently +func persistFields(memSegment *mem.Segment, tx *bolt.Tx) error { + bucket, err := tx.CreateBucket(fieldsBucket) + if err != nil { + return err + } + bucket.FillPercent = 1.0 + + // build/persist a bitset corresponding to the field locs array + indexLocs := roaring.NewBitmap() + for i, indexLoc := range memSegment.FieldsLoc { + if indexLoc { + indexLocs.AddInt(i) + } + } + var indexLocsBuffer bytes.Buffer + _, err = indexLocs.WriteTo(&indexLocsBuffer) + if err != nil { + return err + } + err = bucket.Put(indexLocsKey, indexLocsBuffer.Bytes()) + if err != nil { + return err + } + + // we use special varint which is still guaranteed to sort correctly + fieldBuf := make([]byte, 0, maxVarintSize) + for fieldID, fieldName := range memSegment.FieldsInv { + if fieldID != 0 { + // reset buffer if necessary + fieldBuf = fieldBuf[:0] + } + fieldBuf = EncodeUvarintAscending(fieldBuf, uint64(fieldID)) + err = bucket.Put(fieldBuf, []byte(fieldName)) + if err != nil { + return err + } + } + return nil +} + +func persistDictionary(memSegment *mem.Segment, tx *bolt.Tx) error { + bucket, err := tx.CreateBucket(dictBucket) + if err != nil { + return err + } + bucket.FillPercent = 1.0 + + // TODO consider whether or not there is benefit to building the vellums + // concurrently. While we have to insert them into the bolt in order, + // the (presumably) heavier lifting involved in building the FST could + // be done concurrently. 
+ + fieldBuf := make([]byte, 0, maxVarintSize) + for fieldID, fieldTerms := range memSegment.DictKeys { + if fieldID != 0 { + // reset buffers if necessary + fieldBuf = fieldBuf[:0] + } + // start a new vellum for this field + var buffer bytes.Buffer + builder, err := vellum.New(&buffer, nil) + if err != nil { + return err + } + + dict := memSegment.Dicts[fieldID] + // now walk the dictionary in order of fieldTerms (already sorted) + for i := range fieldTerms { + err = builder.Insert([]byte(fieldTerms[i]), dict[fieldTerms[i]]-1) + if err != nil { + return err + } + } + err = builder.Close() + if err != nil { + return err + } + + // put this FST into bolt + // we use special varint which is still guaranteed to sort correctly + fieldBuf = EncodeUvarintAscending(fieldBuf, uint64(fieldID)) + err = bucket.Put(fieldBuf, buffer.Bytes()) + if err != nil { + return err + } + } + + return nil +} + +func persistPostings(memSegment *mem.Segment, tx *bolt.Tx) error { + bucket, err := tx.CreateBucket(postingsBucket) + if err != nil { + return err + } + bucket.FillPercent = 1.0 + + postingIDBuf := make([]byte, 0, maxVarintSize) + for postingID := range memSegment.Postings { + if postingID != 0 { + // reset buffers if necessary + postingIDBuf = postingIDBuf[:0] + } + postingIDBuf = EncodeUvarintAscending(postingIDBuf, uint64(postingID)) + var postingsBuf bytes.Buffer + _, err := memSegment.Postings[postingID].WriteTo(&postingsBuf) + if err != nil { + return err + } + err = bucket.Put(postingIDBuf, postingsBuf.Bytes()) + if err != nil { + return err + } + } + + return nil +} + +func persistPostingsDetails(memSegment *mem.Segment, tx *bolt.Tx, + chunkFactor uint32) error { + bucket, err := tx.CreateBucket(postingDetailsBucket) + if err != nil { + return err + } + bucket.FillPercent = 1.0 + + postingIDBuf := make([]byte, 0, maxVarintSize) + for postingID := range memSegment.Postings { + if postingID != 0 { + // reset buffers if necessary + postingIDBuf = postingIDBuf[:0] + } + 
postingIDBuf = EncodeUvarintAscending(postingIDBuf, uint64(postingID)) + + // make bucket for posting details + postingBucket, err := bucket.CreateBucket(postingIDBuf) + if err != nil { + return err + } + postingBucket.FillPercent = 1.0 + + err = persistPostingDetails(memSegment, postingBucket, postingID, chunkFactor) + if err != nil { + return err + } + } + + return nil +} + +func persistPostingDetails(memSegment *mem.Segment, postingBucket *bolt.Bucket, + postingID int, chunkFactor uint32) error { + // walk the postings list + var err error + var chunkBucket *bolt.Bucket + var currChunk uint32 + chunkIDBuf := make([]byte, 0, maxVarintSize) + postingsListItr := memSegment.Postings[postingID].Iterator() + var encoder *govarint.Base128Encoder + var locEncoder *govarint.Base128Encoder + + encodingBuf := &bytes.Buffer{} + locEncodingBuf := &bytes.Buffer{} + + var offset int + var locOffset int + for postingsListItr.HasNext() { + docNum := postingsListItr.Next() + chunk := docNum / chunkFactor + + // create new chunk bucket if necessary + if chunkBucket == nil || currChunk != chunk { + + // close out last chunk + if chunkBucket != nil { + + // fix me write freq/norms + encoder.Close() + err = chunkBucket.Put(freqNormKey, encodingBuf.Bytes()) + if err != nil { + return err + } + locEncoder.Close() + err = chunkBucket.Put(locKey, locEncodingBuf.Bytes()) + if err != nil { + return err + } + + // reset for next + chunkIDBuf = chunkIDBuf[:0] + encodingBuf = &bytes.Buffer{} + locEncodingBuf = &bytes.Buffer{} + } + + // prepare next chunk + chunkIDBuf = EncodeUvarintAscending(chunkIDBuf, uint64(chunk)) + chunkBucket, err = postingBucket.CreateBucket(chunkIDBuf) + if err != nil { + return err + } + chunkBucket.FillPercent = 1.0 + currChunk = chunk + + encoder = govarint.NewU64Base128Encoder(encodingBuf) + locEncoder = govarint.NewU64Base128Encoder(locEncodingBuf) + } + + // put freq + _, err = encoder.PutU64(memSegment.Freqs[postingID][offset]) + if err != nil { + return err + 
} + + // put norm + norm := memSegment.Norms[postingID][offset] + normBits := math.Float32bits(norm) + _, err = encoder.PutU32(normBits) + if err != nil { + return err + } + + // put locations + + for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ { + + if len(memSegment.Locfields[postingID]) > 0 { + // put field + _, err = locEncoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset])) + if err != nil { + return err + } + + // put pos + _, err = locEncoder.PutU64(memSegment.Locpos[postingID][locOffset]) + if err != nil { + return err + } + + // put start + _, err = locEncoder.PutU64(memSegment.Locstarts[postingID][locOffset]) + if err != nil { + return err + } + + // put end + _, err = locEncoder.PutU64(memSegment.Locends[postingID][locOffset]) + if err != nil { + return err + } + + // put array positions + num := len(memSegment.Locarraypos[postingID][locOffset]) + + // put the number of array positions to follow + _, err = locEncoder.PutU64(uint64(num)) + if err != nil { + return err + } + + // put each array position + for j := 0; j < num; j++ { + _, err = locEncoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j]) + if err != nil { + return err + } + } + } + + locOffset++ + } + + offset++ + } + + // close out last chunk + + if chunkBucket != nil { + // fix me write freq/norms + encoder.Close() + err = chunkBucket.Put(freqNormKey, encodingBuf.Bytes()) + if err != nil { + return err + } + locEncoder.Close() + err = chunkBucket.Put(locKey, locEncodingBuf.Bytes()) + if err != nil { + return err + } + } + + return nil +} + +func persistStored(memSegment *mem.Segment, tx *bolt.Tx) error { + bucket, err := tx.CreateBucket(storedBucket) + if err != nil { + return err + } + bucket.FillPercent = 1.0 + + var curr int + // we use special varint which is still guaranteed to sort correctly + docNumBuf := make([]byte, 0, maxVarintSize) + for docNum, storedValues := range memSegment.Stored { + var metaBuf bytes.Buffer + var data, compressed []byte + if 
docNum != 0 { + // reset buffer if necessary + docNumBuf = docNumBuf[:0] + curr = 0 + } + // create doc sub-bucket + docNumBuf = EncodeUvarintAscending(docNumBuf, uint64(docNum)) + docBucket, err := bucket.CreateBucket(docNumBuf) + if err != nil { + return err + } + docBucket.FillPercent = 1.0 + + metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + + // encode fields in order + for fieldID := range memSegment.FieldsInv { + if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { + // has stored values for this field + num := len(storedFieldValues) + + // process each value + for i := 0; i < num; i++ { + // encode field + metaEncoder.PutU64(uint64(fieldID)) + // encode type + metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) + // encode start offset + metaEncoder.PutU64(uint64(curr)) + // end len + metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + // encode number of array pos + metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) + // encode all array positions + for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ { + metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) + } + // append data + data = append(data, storedFieldValues[i]...) 
+ // update curr + curr += len(storedFieldValues[i]) + } + } + } + metaEncoder.Close() + + err = docBucket.Put(metaKey, metaBuf.Bytes()) + if err != nil { + return err + } + + // compress data + compressed = snappy.Encode(compressed, data) + + err = docBucket.Put(dataKey, compressed) + if err != nil { + return err + } + + } + + return nil +} + +func persistConfig(tx *bolt.Tx, chunkFactor uint32) error { + bucket, err := tx.CreateBucket(configBucket) + if err != nil { + return err + } + + chunkVal := make([]byte, 4) + binary.BigEndian.PutUint32(chunkVal, chunkFactor) + err = bucket.Put(chunkKey, chunkVal) + if err != nil { + return err + } + + err = bucket.Put(versionKey, []byte{byte(version)}) + if err != nil { + return err + } + + return nil +} diff --git a/index/scorch/segment/bolt/build_test.go b/index/scorch/segment/bolt/build_test.go new file mode 100644 index 000000000..d4b93f159 --- /dev/null +++ b/index/scorch/segment/bolt/build_test.go @@ -0,0 +1,288 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package bolt + +import ( + "os" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment/mem" +) + +func TestBuild(t *testing.T) { + os.RemoveAll("/tmp/scorch.bolt") + + memSegment := buildMemSegment() + err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + if err != nil { + t.Fatal(err) + } +} + +func buildMemSegment() *mem.Segment { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + 
}, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + return mem.NewFromAnalyzedDocs(results) +} + +func buildMemSegmentMulti() *mem.Segment { + + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + doc2 := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, 
nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("who"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + 
&analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := mem.NewFromAnalyzedDocs(results) + + return segment +} diff --git a/index/scorch/segment/bolt/dict.go b/index/scorch/segment/bolt/dict.go new file mode 100644 index 000000000..0d7ab5eca --- /dev/null +++ b/index/scorch/segment/bolt/dict.go @@ -0,0 +1,161 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package bolt + +import ( + "fmt" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbaselabs/vellum" + "github.com/couchbaselabs/vellum/regexp" +) + +// Dictionary is the bolt representation of the term dictionary +type Dictionary struct { + segment *Segment + field string + fieldID uint16 + fst *vellum.FST +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { + return d.postingsList(term, except) +} + +func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { + rv := &PostingsList{ + dictionary: d, + term: term, + except: except, + } + + if d.fst != nil { + postingsID, exists, err := d.fst.Get([]byte(term)) + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if exists { + rv.postingsID = postingsID + postingsIDKey := EncodeUvarintAscending(nil, postingsID) + bucket := d.segment.tx.Bucket(postingsBucket) + if bucket == nil { + return nil, fmt.Errorf("postings bucket missing") + } + + roaringBytes := bucket.Get(postingsIDKey) + if roaringBytes == nil { + return nil, fmt.Errorf("postings for postingsID %d missing", postingsID) + } + bitmap := roaring.NewBitmap() + _, err = bitmap.FromBuffer(roaringBytes) + if err != nil { + return nil, fmt.Errorf("error loading roaring bitmap: %v", err) + } + + rv.postings = bitmap + rv.postingKey = postingsIDKey + } + } + + return rv, nil +} + +// Iterator returns an iterator for this dictionary +func (d *Dictionary) Iterator() segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Iterator(nil, nil) + if err == nil { + rv.itr = itr + } + } + + return rv +} + +// PrefixIterator returns an iterator which only visits terms having the +// the specified prefix +func (d *Dictionary) 
PrefixIterator(prefix string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + r, err := regexp.New(prefix + ".*") + if err == nil { + itr, err := d.fst.Search(r, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + +// RangeIterator returns an iterator which only visits terms between the +// start and end terms. NOTE: bleve.index API specifies the end is inclusive. +func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + // need to increment the end position to be inclusive + endBytes := []byte(end) + if endBytes[len(endBytes)-1] < 0xff { + endBytes[len(endBytes)-1]++ + } else { + endBytes = append(endBytes, 0xff) + } + + if d.fst != nil { + itr, err := d.fst.Iterator([]byte(start), endBytes) + if err == nil { + rv.itr = itr + } + } + + return rv +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + itr vellum.Iterator + err error +} + +// Next returns the next entry in the dictionary +func (i *DictionaryIterator) Next() (*index.DictEntry, error) { + if i.err == vellum.ErrIteratorDone { + return nil, nil + } else if i.err != nil { + return nil, i.err + } + term, count := i.itr.Current() + rv := &index.DictEntry{ + Term: string(term), + Count: count, + } + i.err = i.itr.Next() + return rv, nil +} diff --git a/index/scorch/segment/bolt/dict_test.go b/index/scorch/segment/bolt/dict_test.go new file mode 100644 index 000000000..6b3926a87 --- /dev/null +++ b/index/scorch/segment/bolt/dict_test.go @@ -0,0 +1,183 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bolt + +import ( + "os" + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment/mem" +) + +func buildMemSegmentForDict() *mem.Segment { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 5, + Position: 1, + Term: []byte("apple"), + }, + &analysis.Token{ + Start: 6, + End: 10, + Position: 2, + Term: []byte("ball"), + }, + &analysis.Token{ + Start: 11, + End: 14, + Position: 3, + Term: []byte("cat"), + }, + &analysis.Token{ + Start: 15, + End: 18, + Position: 4, + Term: []byte("dog"), + }, + &analysis.Token{ + Start: 19, + End: 22, + Position: 5, + Term: []byte("egg"), + }, + &analysis.Token{ + Start: 20, + End: 24, + Position: 6, + Term: []byte("fish"), + }, + &analysis.Token{ + Start: 25, + End: 28, + Position: 7, + Term: []byte("bat"), 
+ }, + }, nil, true), + }, + Length: []int{ + 1, + 7, + }, + }, + } + + segment := mem.NewFromAnalyzedDocs(results) + + return segment +} + +func TestDictionary(t *testing.T) { + + _ = os.RemoveAll("/tmp/scorch.bolt") + + memSegment := buildMemSegmentForDict() + err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.bolt") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + + // test basic full iterator + expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"} + var got []string + itr := dict.Iterator() + next, err := itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test prefix iterator + expected = []string{"ball", "bat"} + got = got[:0] + itr = dict.PrefixIterator("b") + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test range iterator + expected = []string{"cat", "dog", "egg"} + got = got[:0] + itr = dict.RangeIterator("cat", "egg") + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } +} diff --git a/index/scorch/segment/bolt/int.go
b/index/scorch/segment/bolt/int.go new file mode 100644 index 000000000..a4af3a7a8 --- /dev/null +++ b/index/scorch/segment/bolt/int.go @@ -0,0 +1,94 @@ +// Copyright 2014 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +// This code originated from: +// https://github.com/cockroachdb/cockroach/blob/2dd65dde5d90c157f4b93f92502ca1063b904e1d/pkg/util/encoding/encoding.go + +// Modified to not use pkg/errors + +package bolt + +import "fmt" + +const ( + maxVarintSize = 9 + + // IntMin is chosen such that the range of int tags does not overlap the + // ascii character set that is frequently used in testing. + IntMin = 0x80 // 128 + intMaxWidth = 8 + intZero = IntMin + intMaxWidth // 136 + intSmall = IntMax - intZero - intMaxWidth // 109 + // IntMax is the maximum int tag value. + IntMax = 0xfd // 253 +) + +// EncodeUvarintAscending encodes the uint64 value using a variable length +// (length-prefixed) representation. The length is encoded as a single +// byte indicating the number of encoded bytes (-8) to follow. See +// EncodeVarintAscending for rationale. The encoded bytes are appended to the +// supplied buffer and the final buffer is returned. 
+func EncodeUvarintAscending(b []byte, v uint64) []byte { + switch { + case v <= intSmall: + return append(b, intZero+byte(v)) + case v <= 0xff: + return append(b, IntMax-7, byte(v)) + case v <= 0xffff: + return append(b, IntMax-6, byte(v>>8), byte(v)) + case v <= 0xffffff: + return append(b, IntMax-5, byte(v>>16), byte(v>>8), byte(v)) + case v <= 0xffffffff: + return append(b, IntMax-4, byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) + case v <= 0xffffffffff: + return append(b, IntMax-3, byte(v>>32), byte(v>>24), byte(v>>16), byte(v>>8), + byte(v)) + case v <= 0xffffffffffff: + return append(b, IntMax-2, byte(v>>40), byte(v>>32), byte(v>>24), byte(v>>16), + byte(v>>8), byte(v)) + case v <= 0xffffffffffffff: + return append(b, IntMax-1, byte(v>>48), byte(v>>40), byte(v>>32), byte(v>>24), + byte(v>>16), byte(v>>8), byte(v)) + default: + return append(b, IntMax, byte(v>>56), byte(v>>48), byte(v>>40), byte(v>>32), + byte(v>>24), byte(v>>16), byte(v>>8), byte(v)) + } +} + +// DecodeUvarintAscending decodes a varint encoded uint64 from the input +// buffer. The remainder of the input buffer and the decoded uint64 +// are returned. +func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) { + if len(b) == 0 { + return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value") + } + length := int(b[0]) - intZero + b = b[1:] // skip length byte + if length <= intSmall { + return b, uint64(length), nil + } + length -= intSmall + if length < 0 || length > 8 { + return nil, 0, fmt.Errorf("invalid uvarint length of %d", length) + } else if len(b) < length { + return nil, 0, fmt.Errorf("insufficient bytes to decode uvarint value: %q", b) + } + var v uint64 + // It is faster to range over the elements in a slice than to index + // into the slice on each loop iteration. 
+ for _, t := range b[:length] { + v = (v << 8) | uint64(t) + } + return b[length:], v, nil +} diff --git a/index/scorch/segment/bolt/int_test.go b/index/scorch/segment/bolt/int_test.go new file mode 100644 index 000000000..e59918c8b --- /dev/null +++ b/index/scorch/segment/bolt/int_test.go @@ -0,0 +1,96 @@ +// Copyright 2014 The Cockroach Authors. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. + +// This code originated from: +// https://github.com/cockroachdb/cockroach/blob/2dd65dde5d90c157f4b93f92502ca1063b904e1d/pkg/util/encoding/encoding_test.go + +// Modified to only test the parts we borrowed + +package bolt + +import ( + "bytes" + "math" + "testing" +) + +type testCaseUint64 struct { + value uint64 + expEnc []byte +} + +func TestEncodeDecodeUvarint(t *testing.T) { + testBasicEncodeDecodeUint64(EncodeUvarintAscending, DecodeUvarintAscending, false, t) + testCases := []testCaseUint64{ + {0, []byte{0x88}}, + {1, []byte{0x89}}, + {109, []byte{0xf5}}, + {110, []byte{0xf6, 0x6e}}, + {1 << 8, []byte{0xf7, 0x01, 0x00}}, + {math.MaxUint64, []byte{0xfd, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}}, + } + testCustomEncodeUint64(testCases, EncodeUvarintAscending, t) +} + +func testBasicEncodeDecodeUint64( + encFunc func([]byte, uint64) []byte, + decFunc func([]byte) ([]byte, uint64, error), + descending bool, t *testing.T, +) { + testCases := []uint64{ + 0, 1, + 1<<8 - 1, 1 << 8, + 1<<16 - 1, 1 << 16, + 1<<24 - 1, 1 << 24, + 1<<32 - 1, 1 << 32, + 1<<40 - 
1, 1 << 40, + 1<<48 - 1, 1 << 48, + 1<<56 - 1, 1 << 56, + math.MaxUint64 - 1, math.MaxUint64, + } + + var lastEnc []byte + for i, v := range testCases { + enc := encFunc(nil, v) + if i > 0 { + if (descending && bytes.Compare(enc, lastEnc) >= 0) || + (!descending && bytes.Compare(enc, lastEnc) < 0) { + t.Errorf("ordered constraint violated for %d: [% x] vs. [% x]", v, enc, lastEnc) + } + } + b, decode, err := decFunc(enc) + if err != nil { + t.Error(err) + continue + } + if len(b) != 0 { + t.Errorf("leftover bytes: [% x]", b) + } + if decode != v { + t.Errorf("decode yielded different value than input: %d vs. %d", decode, v) + } + lastEnc = enc + } +} + +func testCustomEncodeUint64( + testCases []testCaseUint64, encFunc func([]byte, uint64) []byte, t *testing.T, +) { + for _, test := range testCases { + enc := encFunc(nil, test.value) + if !bytes.Equal(enc, test.expEnc) { + t.Errorf("expected [% x]; got [% x] (value: %d)", test.expEnc, enc, test.value) + } + } +} diff --git a/index/scorch/segment/bolt/posting.go b/index/scorch/segment/bolt/posting.go new file mode 100644 index 000000000..e5d6c8938 --- /dev/null +++ b/index/scorch/segment/bolt/posting.go @@ -0,0 +1,323 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package bolt + +import ( + "bytes" + "fmt" + "math" + + "github.com/RoaringBitmap/roaring" + "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/boltdb/bolt" +) + +// PostingsList is an in-memory represenation of a postings list +type PostingsList struct { + dictionary *Dictionary + term string + postingsID uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + postingKey []byte +} + +// Iterator returns an iterator for this postings list +func (p *PostingsList) Iterator() segment.PostingsIterator { + rv := &PostingsIterator{ + postings: p, + } + if p.postings != nil { + detailsBucket := p.dictionary.segment.tx.Bucket(postingDetailsBucket) + rv.detailBucket = detailsBucket.Bucket(p.postingKey) + rv.all = p.postings.Iterator() + if p.except != nil { + allExcept := p.postings.Clone() + allExcept.AndNot(p.except) + rv.actual = allExcept.Iterator() + } else { + rv.actual = p.postings.Iterator() + } + } + + return rv +} + +// Count returns the number of items on this postings list +func (p *PostingsList) Count() uint64 { + var rv uint64 + if p.postings != nil { + rv = p.postings.GetCardinality() + if p.except != nil { + except := p.except.GetCardinality() + if except > rv { + // avoid underflow + except = rv + } + rv -= except + } + } + return rv +} + +// PostingsIterator provides a way to iterate through the postings list +type PostingsIterator struct { + postings *PostingsList + all roaring.IntIterable + offset int + locoffset int + actual roaring.IntIterable + detailBucket *bolt.Bucket + + currChunk uint32 + currChunkFreqNorm []byte + currChunkLoc []byte + freqNormDecoder *govarint.Base128Decoder + locDecoder *govarint.Base128Decoder +} + +func (i *PostingsIterator) loadChunk(chunk int) error { + // load correct chunk bytes + chunkID := EncodeUvarintAscending(nil, uint64(chunk)) + chunkBucket := i.detailBucket.Bucket(chunkID) + if chunkBucket == nil { + return fmt.Errorf("chunk %d missing", chunkID) + } + 
i.currChunkFreqNorm = chunkBucket.Get(freqNormKey) + i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) + i.currChunkLoc = chunkBucket.Get(locKey) + i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) + i.currChunk = uint32(chunk) + return nil +} + +func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { + freq, err := i.freqNormDecoder.GetU64() + if err != nil { + return 0, 0, fmt.Errorf("error reading frequency: %v", err) + } + normBits, err := i.freqNormDecoder.GetU64() + if err != nil { + return 0, 0, fmt.Errorf("error reading norm: %v", err) + } + return freq, normBits, err +} + +// readLocation processes all the integers on the stream representing a single +// location. if you care about it, pass in a non-nil location struct, and we +// will fill it. if you don't care about it, pass in nil and we safely consume +// the contents. +func (i *PostingsIterator) readLocation(l *Location) error { + // read off field + fieldID, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading location field: %v", err) + } + // read off pos + pos, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading location pos: %v", err) + } + // read off start + start, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading location start: %v", err) + } + // read off end + end, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading location end: %v", err) + } + // read off num array pos + numArrayPos, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading location num array pos: %v", err) + } + + // group these together for less branching + if l != nil { + l.field = i.postings.dictionary.segment.fieldsInv[fieldID] + l.pos = pos + l.start = start + l.end = end + if numArrayPos > 0 { + l.ap = make([]uint64, int(numArrayPos)) + } + } + + // read off array positions + for k := 0; k < 
int(numArrayPos); k++ { + ap, err := i.locDecoder.GetU64() + if err != nil { + return fmt.Errorf("error reading array position: %v", err) + } + if l != nil { + l.ap[k] = ap + } + } + + return nil +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) Next() (segment.Posting, error) { + if i.actual == nil || !i.actual.HasNext() { + return nil, nil + } + n := i.actual.Next() + nChunk := n / i.postings.dictionary.segment.chunkFactor + allN := i.all.Next() + allNChunk := allN / i.postings.dictionary.segment.chunkFactor + + // n is the next actual hit (excluding some postings) + // allN is the next hit in the full postings + // if they don't match, adjust offsets to factor in item we're skipping over + // incr the all iterator, and check again + for allN != n { + + // in different chunks, reset offsets + if allNChunk != nChunk { + i.locoffset = 0 + i.offset = 0 + } else { + + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return nil, fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + freq, _, err := i.readFreqNorm() + if err != nil { + return nil, err + } + if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { + for j := 0; j < int(freq); j++ { + err := i.readLocation(nil) + if err != nil { + return nil, err + } + } + } + + // in same chunk, need to account for offsets + i.offset++ + } + + allN = i.all.Next() + } + + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return nil, fmt.Errorf("error loading chunk: %v", err) + } + } + + rv := &Posting{ + iterator: i, + docNum: uint64(n), + } + + var err error + var normBits uint64 + rv.freq, normBits, err = i.readFreqNorm() + if err != nil { + return nil, err + } + rv.norm = math.Float32frombits(uint32(normBits)) + if 
i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { + // read off 'freq' locations + rv.locs = make([]segment.Location, rv.freq) + locs := make([]Location, rv.freq) + for j := 0; j < int(rv.freq); j++ { + err := i.readLocation(&locs[j]) + if err != nil { + return nil, err + } + rv.locs[j] = &locs[j] + } + } + + return rv, nil +} + +// Posting is a single entry in a postings list +type Posting struct { + iterator *PostingsIterator + docNum uint64 + + freq uint64 + norm float32 + locs []segment.Location +} + +// Number returns the document number of this posting in this segment +func (p *Posting) Number() uint64 { + return p.docNum +} + +// Frequency returns the frequence of occurance of this term in this doc/field +func (p *Posting) Frequency() uint64 { + return p.freq +} + +// Norm returns the normalization factor for this posting +func (p *Posting) Norm() float64 { + return float64(p.norm) +} + +// Locations returns the location information for each occurance +func (p *Posting) Locations() []segment.Location { + return p.locs +} + +// Location represents the location of a single occurance +type Location struct { + field string + pos uint64 + start uint64 + end uint64 + ap []uint64 +} + +// Field returns the name of the field (useful in composite fields to know +// which original field the value came from) +func (l *Location) Field() string { + return l.field +} + +// Start returns the start byte offset of this occurance +func (l *Location) Start() uint64 { + return l.start +} + +// End returns the end byte offset of this occurance +func (l *Location) End() uint64 { + return l.end +} + +// Pos returns the 1-based phrase position of this occurance +func (l *Location) Pos() uint64 { + return l.pos +} + +// ArrayPositions returns the array position vector associated with this occurance +func (l *Location) ArrayPositions() []uint64 { + return l.ap +} diff --git a/index/scorch/segment/bolt/segment.go b/index/scorch/segment/bolt/segment.go new file 
mode 100644 index 000000000..835313b87 --- /dev/null +++ b/index/scorch/segment/bolt/segment.go @@ -0,0 +1,309 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bolt + +import ( + "bytes" + "encoding/binary" + "fmt" + "io" + + "github.com/RoaringBitmap/roaring" + "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/boltdb/bolt" + "github.com/couchbaselabs/vellum" + "github.com/golang/snappy" +) + +var readOnlyOptions = &bolt.Options{ + ReadOnly: true, +} + +// _id field is always guaranteed to have fieldID of 0 +const idFieldID uint16 = 0 + +// Open returns a boltdb impl of a segment +func Open(path string) (segment.Segment, error) { + + db, err := bolt.Open(path, 0600, readOnlyOptions) + if err != nil { + return nil, err + } + + tx, err := db.Begin(false) + if err != nil { + _ = db.Close() + return nil, err + } + + rv := &Segment{ + db: db, + tx: tx, + fieldsMap: make(map[string]uint16), + } + + err = rv.loadConfig() + if err != nil { + _ = db.Close() + return nil, err + } + + err = rv.loadFields() + if err != nil { + _ = db.Close() + return nil, err + } + + return rv, nil +} + +// Segment implements a boltdb based implementation of a segment +type Segment struct { + version uint8 + chunkFactor uint32 + db *bolt.DB + tx *bolt.Tx + + fieldsMap map[string]uint16 + fieldsInv []string + fieldsLoc []bool +} + +func (s *Segment) loadConfig() (err error) { + bucket := 
s.tx.Bucket(configBucket) + if bucket == nil { + return fmt.Errorf("config bucket missing") + } + + ver := bucket.Get(versionKey) + if ver == nil { + return fmt.Errorf("version key missing") + } + s.version = ver[0] + + chunk := bucket.Get(chunkKey) + if chunk == nil { + return fmt.Errorf("chunk key is missing") + } + s.chunkFactor = binary.BigEndian.Uint32(chunk) + + return nil +} + +// loadFields reads the fields info from the segment so that we never have to go +// back to disk to access this (small and used frequently) +func (s *Segment) loadFields() (err error) { + + bucket := s.tx.Bucket(fieldsBucket) + if bucket == nil { + return fmt.Errorf("fields bucket missing") + } + + indexLocs := roaring.NewBitmap() + err = bucket.ForEach(func(k []byte, v []byte) error { + + // process index locations bitset + if k[0] == indexLocsKey[0] { + _, err2 := indexLocs.FromBuffer(v) + if err2 != nil { + return fmt.Errorf("error loading indexLocs: %v", err2) + } + } else { + + _, fieldID, err2 := DecodeUvarintAscending(k) + if err2 != nil { + return err2 + } + // we store fieldID+1 in so we can discern the zero value + s.fieldsMap[string(v)] = uint16(fieldID + 1) + } + return nil + }) + if err != nil { + return err + } + + // now setup the inverse (should have same size as map and be keyed 0-(len-1)) + s.fieldsInv = make([]string, len(s.fieldsMap)) + for k, v := range s.fieldsMap { + s.fieldsInv[int(v)-1] = k + } + s.fieldsLoc = make([]bool, len(s.fieldsInv)) + for i := range s.fieldsInv { + if indexLocs.ContainsInt(i) { + s.fieldsLoc[i] = true + } + } + + return nil +} + +// Fields returns the field names used in this segment +func (s *Segment) Fields() []string { + return s.fieldsInv +} + +// Count returns the number of documents in this segment +// (this has no notion of deleted docs) +func (s *Segment) Count() uint64 { + return uint64(s.tx.Bucket(storedBucket).Stats().BucketN - 1) +} + +// Dictionary returns the term dictionary for the specified field +func (s *Segment) 
Dictionary(field string) (segment.TermDictionary, error) { + return s.dictionary(field) +} + +func (s *Segment) dictionary(field string) (*Dictionary, error) { + + rv := &Dictionary{ + segment: s, + field: field, + } + + rv.fieldID = s.fieldsMap[field] + if rv.fieldID > 0 { + rv.fieldID = rv.fieldID - 1 + fieldIDKey := EncodeUvarintAscending(nil, uint64(rv.fieldID)) + bucket := s.tx.Bucket(dictBucket) + if bucket == nil { + return nil, fmt.Errorf("dictionary bucket missing") + } + fstBytes := bucket.Get(fieldIDKey) + if fstBytes == nil { + return nil, fmt.Errorf("dictionary field %s bytes nil", field) + } + if fstBytes != nil { + fst, err := vellum.Load(fstBytes) + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + if err == nil { + rv.fst = fst + } + } + + } + + return rv, nil +} + +// VisitDocument invokes the DocFieldValueVistor for each stored field +// for the specified doc number +func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { + storedBuucket := s.tx.Bucket(storedBucket) + if storedBuucket == nil { + return fmt.Errorf("stored bucket missing") + } + docNumKey := EncodeUvarintAscending(nil, num) + docBucket := storedBuucket.Bucket(docNumKey) + if docBucket == nil { + return fmt.Errorf("segment has no doc number %d", num) + } + metaBytes := docBucket.Get(metaKey) + if metaBytes == nil { + return fmt.Errorf("stored meta bytes for doc number %d is nil", num) + } + dataBytes := docBucket.Get(dataKey) + if dataBytes == nil { + return fmt.Errorf("stored data bytes for doc number %d is nil", num) + } + uncompressed, err := snappy.Decode(nil, dataBytes) + if err != nil { + return err + } + + reader := bytes.NewReader(metaBytes) + decoder := govarint.NewU64Base128Decoder(reader) + + keepGoing := true + for keepGoing { + field, err := decoder.GetU64() + if err == io.EOF { + break + } + if err != nil { + return err + } + typ, err := decoder.GetU64() + if err != nil { + return err 
+ } + offset, err := decoder.GetU64() + if err != nil { + return err + } + l, err := decoder.GetU64() + if err != nil { + return err + } + numap, err := decoder.GetU64() + if err != nil { + return err + } + var arrayPos []uint64 + if numap > 0 { + arrayPos = make([]uint64, numap) + for i := 0; i < int(numap); i++ { + ap, err := decoder.GetU64() + if err != nil { + return err + } + arrayPos[i] = ap + } + } + + value := uncompressed[offset : offset+l] + keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) + } + + return nil +} + +// DocNumbers returns a bitset corresponding to the doc numbers of all the +// provided _id strings +func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { + rv := roaring.New() + + if len(s.fieldsMap) > 0 { + idDict, err := s.dictionary("_id") + if err != nil { + return nil, err + } + + for _, id := range ids { + postings, err := idDict.postingsList(id, nil) + if err != nil { + return nil, err + } + if postings.postings != nil { + rv.Or(postings.postings) + } + } + } + + return rv, nil +} + +// Close releases all resources associated with this segment +func (s *Segment) Close() error { + err := s.tx.Rollback() + if err != nil { + _ = s.db.Close() + return err + } + return s.db.Close() +} diff --git a/index/scorch/segment/bolt/segment_test.go b/index/scorch/segment/bolt/segment_test.go new file mode 100644 index 000000000..b00c71926 --- /dev/null +++ b/index/scorch/segment/bolt/segment_test.go @@ -0,0 +1,517 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package bolt + +import ( + "math" + "os" + "reflect" + "testing" +) + +func TestOpen(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.bolt") + + memSegment := buildMemSegment() + err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.bolt") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "desc": struct{}{}, + "tag": struct{}{}, + } + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } + + docCount := segment.Count() + if docCount != 1 { + t.Errorf("expected count 1, got %d", docCount) + } + + // check the _id field + dict, err := segment.Dictionary("_id") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("a", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 
1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the name field + dict, err = segment.Dictionary("name") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the _all field (composite) + dict, 
err = segment.Dictionary("_all") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + expectedNorm := float32(1.0 / math.Sqrt(float64(5))) + if nextPosting.Norm() != float64(expectedNorm) { + t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // now try a field with array positions + dict, err = segment.Dictionary("tag") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = 
dict.PostingsList("dark", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "tag" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 4 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + expectArrayPos := []uint64{1} + if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { + t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + // now try and visit a document + var fieldValuesSeen int + err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldValuesSeen++ + return true + }) + if err != nil { + t.Fatal(err) + } + if fieldValuesSeen != 5 { + t.Errorf("expected 5 field values, got %d", fieldValuesSeen) + } +} + +func TestOpenMulti(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.bolt") + + memSegment := buildMemSegmentMulti() + err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + 
segment, err := Open("/tmp/scorch.bolt") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + // check the desc field + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("thing", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 2 { + t.Errorf("expected count to be 2, got %d", count) + } + + // get docnum of a + exclude, err := segment.DocNumbers([]string{"a"}) + if err != nil { + t.Fatal(err) + } + + // look for term 'thing' excluding doc 'a' + postingsListExcluding, err := dict.PostingsList("thing", exclude) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsListExcludingCount := postingsListExcluding.Count() + if postingsListExcludingCount != 1 { + t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) + } + + postingsItrExcluding := postingsListExcluding.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItrExcluding.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItrExcluding.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, 
got %d", count) + } +} + +func TestOpenMultiWithTwoChunks(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.bolt") + + memSegment := buildMemSegmentMulti() + err := persistSegment(memSegment, "/tmp/scorch.bolt", 1) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.bolt") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + // check the desc field + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("thing", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 2 { + t.Errorf("expected count to be 2, got %d", count) + } + + // get docnum of a + exclude, err := segment.DocNumbers([]string{"a"}) + if err != nil { + t.Fatal(err) + } + + // look for term 'thing' excluding doc 'a' + postingsListExcluding, err := dict.PostingsList("thing", exclude) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItrExcluding := postingsListExcluding.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItrExcluding.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItrExcluding.Next() + } + if 
err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } +} From 898a6b1e8579d04eb5852eac298371df180c0a73 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 13:32:57 -0500 Subject: [PATCH 026/728] fix errcheck issues --- index/scorch/segment/bolt/build.go | 30 ++++++++++++++++++++----- index/scorch/segment/bolt/build_test.go | 2 +- 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/bolt/build.go b/index/scorch/segment/bolt/build.go index 6ed5719d3..0c68bab59 100644 --- a/index/scorch/segment/bolt/build.go +++ b/index/scorch/segment/bolt/build.go @@ -438,18 +438,36 @@ func persistStored(memSegment *mem.Segment, tx *bolt.Tx) error { // process each value for i := 0; i < num; i++ { // encode field - metaEncoder.PutU64(uint64(fieldID)) + _, err2 := metaEncoder.PutU64(uint64(fieldID)) + if err2 != nil { + return err2 + } // encode type - metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) + _, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) + if err2 != nil { + return err2 + } // encode start offset - metaEncoder.PutU64(uint64(curr)) + _, err2 = metaEncoder.PutU64(uint64(curr)) + if err2 != nil { + return err2 + } // end len - metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if err2 != nil { + return err2 + } // encode number of array pos - metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) + _, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) + if err2 != nil { + return err2 + } // encode all array positions for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ { - metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) + _, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) + if err2 != nil { + return 
err2 + } } // append data data = append(data, storedFieldValues[i]...) diff --git a/index/scorch/segment/bolt/build_test.go b/index/scorch/segment/bolt/build_test.go index d4b93f159..3f869d86a 100644 --- a/index/scorch/segment/bolt/build_test.go +++ b/index/scorch/segment/bolt/build_test.go @@ -25,7 +25,7 @@ import ( ) func TestBuild(t *testing.T) { - os.RemoveAll("/tmp/scorch.bolt") + _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegment() err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) From b1346b4c8a51a2a787c5f157a82087cea9b517ca Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 5 Dec 2017 16:09:00 -0500 Subject: [PATCH 027/728] add readme describing our use of bolt as a segment format --- index/scorch/segment/bolt/README.md | 306 ++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 index/scorch/segment/bolt/README.md diff --git a/index/scorch/segment/bolt/README.md b/index/scorch/segment/bolt/README.md new file mode 100644 index 000000000..2c6cd31d5 --- /dev/null +++ b/index/scorch/segment/bolt/README.md @@ -0,0 +1,306 @@ +# bolt segment format + +## top level key space (all sub-buckets, as bolt has no root bucket) + +We have chosen to letter these starting with 'a' and in the code refer to them with more meaningful names. The reason is that we intend to write them in order, and this lets us rearrange them more easily later. + +- 'a' field storage +- 'b' term dictionaries +- 'c' postings list +- 'd' postings details +- 'e' stored fields +- 'x' configuration + +## variable length integers that sort correctly (insert order same as numeric) + +We use numbers as keys in several places. We want those keys to be small, so we prefer to use a variable length key to minimize space, but we also want to insert these in order, so the encoding has to sort correctly. 
+ 
+We have chosen to use the scheme found in [CockroachDB](https://github.com/cockroachdb/cockroach/blob/2dd65dde5d90c157f4b93f92502ca1063b904e1d/pkg/util/encoding/encoding.go).
+
+In short, the first byte indicates how many bytes will follow, with a few other nice properties.
+- values 0-127 are not used in the first byte (this means we can still use any ASCII values we want and avoid collision)
+- very small values are packed directly into this first byte
+For the full details see the link above.
+
+## field storage bucket
+
+Contains one row for each field, the key is the integer field ID, and the value is the string name associated with the field.
+
+There is one additional row with key 'l'. The value is a binary serialization of a [roaring bitmap](https://github.com/RoaringBitmap/roaring), with bits set for each field id which also index location details with each posting.
+
+## term dictionary bucket
+
+Contains one row for each field, the key is the integer field ID, and the value is a binary serialization of the [Vellum](https://github.com/couchbaselabs/vellum) FST. The Vellum FST maps from term (utf-8 string) to a posting ID (uint64).
+
+## postings list bucket
+
+Contains one row for each postings list, the key is the integer posting ID, the value is a binary serialization of a [roaring bitmap](https://github.com/RoaringBitmap/roaring). The roaring bitmap has bits set for each doc number that used this term in this field.
+
+## posting details bucket
+
+Contains one sub-bucket for each postings list, the name of the sub-bucket is the posting ID.
+
+### individual posting detail sub-bucket
+
+Contains one sub-bucket for each chunk. A chunk contains details for a sub-section of the docNum key space. By default, the chunk size is 1024, so all posting details for the first 1024 docs are in chunk zero, then the next 1024 in chunk one, and so on. 
+ 
+The purpose of the chunking is so that when trying to Seek/Advance through a large number of hits to something much further ahead, we still have to seek through the roaring bitmap, but we can jump to the nearest chunk for details, and only seek within the details of the current chunk.
+
+#### chunk posting detail sub-bucket
+
+Contains two key/value pairs:
+
+Key 'a' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. For each hit in the postings list, there are two values on this list, the first is the term frequency (uint64) and the second is the norm factor (float32).
+
+Key 'b' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. For each location (there will be one location for each 'frequency' in the list above) there will be a variable number of uint64 values as follows:
+
+- field ID (uint16)
+- pos (uint64)
+- start (uint64)
+- end (uint64)
+- number of array position entries that follow (uint64)
+- variable number of array positions (each uint64)
+
+## stored field values sub-bucket
+
+Contains one sub-bucket for each doc number (uint64).
+
+## stored field doc specific sub-bucket
+
+Contains two key/value pairs:
+
+Key 'a' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. For each stored field there are a variable number of uint64 values as follows:
+
+- field ID (uint16)
+- value type (byte) (string/number/date/geo/etc)
+- start offset (in the uncompressed slice of data)
+- length (in the uncompressed slice of data)
+- number of array position entries that follow (uint64)
+- variable number of array positions (each uint64)
+
+Key 'b' contains a [snappy](https://github.com/golang/snappy) compressed sequence of bytes. The input to the snappy compression was a slice of bytes containing the field values, in the same order the metadata slice was created. 
+ +## configuration sub-bucket + +Currently contains two key/value pairs: + +Key 'c' contains a BigEndian encoded uint32 chunk size. This chunk size must be used when computing doc number to chunk conversions in this segment. + +Key 'v' contains a version number, currently 0. + +## Example + +The following is a dump of the boltdb bucket/key/value space for a segment which contains two documents: + +``` +{ + "_id": "a", + "name": "wow", + "desc": "some thing", + "tag": ["cold", "dark"] +} + +{ + "_id": "b", + "name": "who", + "desc": "some thing", + "tag": ["cold", "dark"] +} +``` + +``` +[61] ('a' - field storage) + 6c ('l' - roaring bitmap of field IDs which have index location data) + 3a 30 00 00 01 00 00 00 00 00 03 00 10 00 00 00 01 00 02 00 03 00 04 00 + 88 (field ID 0) + 5f 69 64 (utf-8 string '_id') + 89 (field ID 1) + 5f 61 6c 6c (utf-8 string '_all') + 8a (field ID 2) + 6e 61 6d 65 (utf-8 string 'name') + 8b (field ID 3) + 64 65 73 63 (utf-8 string 'desc') + 8c (field ID 4) + 74 61 67 (utf-8 string 'tag') +[62] ('b' - term dictionary) + 88 (field ID 0) + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0b 05 00 00 62 61 11 02 02 00 00 00 00 00 00 00 17 00 00 00 00 00 00 00 (vellum FST data) + 89 + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 92 cf c4 00 10 a7 c7 c5 00 10 82 d0 c4 00 10 97 cb c8 ce 00 10 84 00 10 8c 00 0d 01 04 6f 68 11 02 00 02 01 04 03 01 0f 15 1a 1f 77 74 73 64 63 11 05 06 00 00 00 00 00 00 00 43 00 00 00 00 00 00 00 + 8a + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 84 00 10 8c 00 06 01 04 6f 68 11 02 06 01 11 8c 02 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 + 8b + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 82 d0 c4 00 10 97 cb c8 ce 08 07 01 07 74 73 11 02 02 00 00 00 00 00 00 00 22 00 00 00 00 00 00 00 + 8c + 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 92 cf c4 00 10 a7 c7 c5 0a 09 01 06 64 63 11 02 02 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 +[63] ('c' - postings lists) + 88 (field ID 0) + 
3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 (roaring bitmap data) + 89 + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 8a + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 8b + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 8c + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 8d + 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 + 8e + 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 + 8f + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 90 + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 91 + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 92 + 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 + 93 + 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 + 94 + 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 + 95 + 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 +[64] ('d' - postings details) + [88] (posting ID 0) + [88] (chunk ID 0) + 61 ('a' term freq/norm data) + 01 ae f2 93 f7 03 + 62 ('b' term location data) + 02 01 00 03 00 + [89] (posting ID 1) + [88] (chunk ID 0) + 61 ('a' term freq/norm data) + 01 ae f2 93 f7 03 + 62 ('b' term location data) + 03 01 00 04 00 + [89] (chunk ID 1) + 61 ('a' term freq/norm data) + 01 ae f2 93 f7 03 + 62 ('b' term location data) + 03 01 00 04 00 + [8a] + [88] + 61 + 01 ae f2 93 f7 03 + 62 + 03 02 05 0a 00 + [89] + 61 + 01 ae f2 93 f7 03 + 62 + 03 02 05 0a 00 + [8b] + [88] + 61 + 01 ae f2 93 f7 03 + 62 + 04 01 00 04 01 00 + [89] + 61 + 01 ae f2 93 f7 03 + 62 + 04 01 00 04 01 00 + [8c] + [88] + 61 + 01 ae f2 93 f7 03 + 62 + 04 01 00 04 01 01 + [89] + 61 + 01 ae f2 93 f7 03 + 62 + 04 01 00 04 01 01 + [8d] + [88] + 61 + 01 80 80 80 fc 03 + 62 + + [8e] + [88] + 61 + 01 80 80 80 fc 03 + 62 + 02 01 00 03 00 + [8f] + [88] + 61 + 01 f3 89 d4 f9 03 + 62 + 03 01 00 04 00 + [89] + 61 + 01 f3 89 d4 f9 03 + 62 + 03 01 00 04 00 + [90] + [88] + 61 + 01 f3 89 d4 f9 03 + 62 + 03 02 05 0a 00 + [89] + 61 + 
01 f3 89 d4 f9 03 + 62 + 03 02 05 0a 00 + [91] + [88] + 61 + 01 f3 89 d4 f9 03 + 62 + 04 01 00 04 01 00 + [89] + 61 + 01 f3 89 d4 f9 03 + 62 + 04 01 00 04 01 00 + [92] + [88] + 61 + 01 f3 89 d4 f9 03 + 62 + 04 01 00 04 01 01 + [89] + 61 + 01 f3 89 d4 f9 03 + 62 + 04 01 00 04 01 01 + [93] + [89] + 61 + 01 80 80 80 fc 03 + 62 + + [94] + [89] + 61 + 01 80 80 80 fc 03 + 62 + 02 01 00 03 00 + [95] + [89] + 61 + 01 ae f2 93 f7 03 + 62 + 02 01 00 03 00 +[65] ('e' - stored fields) + [88] (doc num 0) + 61 ('a' - stored field meta slice) + 00 74 00 01 00 02 74 01 03 00 03 74 04 0a 00 04 74 0e 04 01 00 04 74 12 04 01 01 + 62 ('b' - snappy compressed value bytes) + 16 54 61 77 6f 77 73 6f 6d 65 20 74 68 69 6e 67 63 6f 6c 64 64 61 72 6b + [89] + 61 + 00 74 00 01 00 02 74 01 03 00 03 74 04 0a 00 04 74 0e 04 01 00 04 74 12 04 01 01 + 62 + 16 54 62 77 68 6f 73 6f 6d 65 20 74 68 69 6e 67 63 6f 6c 64 64 61 72 6b +[78] ('x' - configuration) + 63 ('c' - chunk size) + 00 00 00 01 (big endian 1) + 76 ('v' - version) + 00 (single byte 0) +``` From adac4f41db1e8c226b3f5339f9bbe75940ecf09a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 6 Dec 2017 18:33:47 -0500 Subject: [PATCH 028/728] initial version of scorch which persists index to disk --- index/scorch/field_dict_test.go | 18 +- index/scorch/introducer.go | 32 +- index/scorch/persister.go | 329 ++++++++++++++++++++ index/scorch/reader_test.go | 33 +- index/scorch/scorch.go | 106 +++++-- index/scorch/scorch_test.go | 261 ++++++++++++++-- index/scorch/segment/bolt/build.go | 28 +- index/scorch/segment/bolt/build_test.go | 2 +- index/scorch/segment/bolt/dict.go | 2 +- index/scorch/segment/bolt/dict_test.go | 2 +- index/scorch/segment/bolt/posting.go | 2 +- index/scorch/segment/bolt/segment.go | 18 +- index/scorch/segment/bolt/segment_test.go | 6 +- index/scorch/segment/empty.go | 61 ++++ index/scorch/segment/{bolt => }/int.go | 4 +- index/scorch/segment/{bolt => }/int_test.go | 2 +- index/scorch/segment/mem/segment.go | 17 +- 
index/scorch/snapshot_index.go | 1 + index/scorch/snapshot_segment.go | 7 + 19 files changed, 839 insertions(+), 92 deletions(-) create mode 100644 index/scorch/persister.go create mode 100644 index/scorch/segment/empty.go rename index/scorch/segment/{bolt => }/int.go (98%) rename index/scorch/segment/{bolt => }/int_test.go (99%) diff --git a/index/scorch/field_dict_test.go b/index/scorch/field_dict_test.go index 856f3d6cf..a25c5c984 100644 --- a/index/scorch/field_dict_test.go +++ b/index/scorch/field_dict_test.go @@ -23,20 +23,26 @@ import ( ) func TestIndexFieldDict(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) + cerr := idx.Close() + if cerr != nil { + t.Fatal(cerr) } }() @@ -96,7 +102,7 @@ func TestIndexFieldDict(t *testing.T) { dict2, err := indexReader.FieldDict("desc") if err != nil { - t.Errorf("error creating reader: %v", err) + t.Fatalf("error creating reader: %v", err) } defer func() { err := dict2.Close() diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index d2978a2df..0b30044a1 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -15,6 +15,8 @@ package scorch import ( + "fmt" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -26,17 +28,21 @@ type segmentIntroduction struct { ids []string internal map[string][]byte - applied chan struct{} + applied chan error + persisted chan error } func (s *Scorch) mainLoop() { + var notify notificationChan +OUTER: for { select { case <-s.closeCh: - return + break OUTER - case next := <-s.introductions: + case 
notify = <-s.introducerNotifier: + case next := <-s.introductions: // acquire lock s.rootLock.Lock() @@ -45,7 +51,9 @@ func (s *Scorch) mainLoop() { segment: make([]*SegmentSnapshot, len(s.root.segment)+1), offsets: make([]uint64, len(s.root.segment)+1), internal: make(map[string][]byte, len(s.root.segment)), + epoch: s.nextSnapshotEpoch, } + s.nextSnapshotEpoch++ // iterate through current segments var running uint64 @@ -56,12 +64,15 @@ func (s *Scorch) mainLoop() { var err error delta, err = s.root.segment[i].segment.DocNumbers(next.ids) if err != nil { - panic(err) + next.applied <- fmt.Errorf("error computing doc numbers: %v", err) + close(next.applied) + continue OUTER } } newSnapshot.segment[i] = &SegmentSnapshot{ id: s.root.segment[i].id, segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, } // apply new obsoletions if s.root.segment[i].deleted == nil { @@ -80,6 +91,12 @@ func (s *Scorch) mainLoop() { segment: next.data, } newSnapshot.offsets[len(s.root.segment)] = running + if !s.unsafeBatch { + newSnapshot.segment[len(s.root.segment)].notify = append( + newSnapshot.segment[len(s.root.segment)].notify, + next.persisted, + ) + } // copy old values for key, oldVal := range s.root.internal { newSnapshot.internal[key] = oldVal @@ -97,6 +114,13 @@ func (s *Scorch) mainLoop() { // release lock s.rootLock.Unlock() close(next.applied) + + if notify != nil { + close(notify) + notify = nil + } } } + + s.asyncTasks.Done() } diff --git a/index/scorch/persister.go b/index/scorch/persister.go new file mode 100644 index 000000000..0e3318e8c --- /dev/null +++ b/index/scorch/persister.go @@ -0,0 +1,329 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "bytes" + "fmt" + "log" + "os" + "strings" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/segment" + scorchBolt "github.com/blevesearch/bleve/index/scorch/segment/bolt" + "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/boltdb/bolt" +) + +type notificationChan chan struct{} + +func (s *Scorch) persisterLoop() { + var lastPersistedEpoch uint64 +OUTER: + for { + select { + case <-s.closeCh: + break OUTER + + default: + // check to see if there is a new snapshot to persist + s.rootLock.RLock() + ourSnapshot := s.root + s.rootLock.RUnlock() + + //for ourSnapshot.epoch != lastPersistedEpoch { + if ourSnapshot.epoch != lastPersistedEpoch { + // lets get started + err := s.persistSnapshot(ourSnapshot) + if err != nil { + log.Printf("got err persisting snapshot: %v", err) + continue OUTER + } + lastPersistedEpoch = ourSnapshot.epoch + } + + // tell the introducer we're waiting for changes + // first make a notification chan + notifyUs := make(notificationChan) + + // give it to the introducer + select { + case <-s.closeCh: + break OUTER + case s.introducerNotifier <- notifyUs: + } + + // check again + s.rootLock.RLock() + ourSnapshot = s.root + s.rootLock.RUnlock() + if ourSnapshot.epoch != lastPersistedEpoch { + + // lets get started + err := s.persistSnapshot(ourSnapshot) + if err != nil { + log.Printf("got err persisting snapshot: %v", err) + continue OUTER + } + lastPersistedEpoch = ourSnapshot.epoch + } + + // now wait for it (but also detect close) + select { + 
case <-s.closeCh: + break OUTER + case <-notifyUs: + // woken up, next loop should pick up work + } + } + } + s.asyncTasks.Done() +} + +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { + // start a write transaction + tx, err := s.rootBolt.Begin(true) + if err != nil { + return err + } + defer func() { + if err == nil { + err = tx.Commit() + } else { + _ = tx.Rollback() + } + }() + + snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket) + if err != nil { + return err + } + newSnapshotKey := segment.EncodeUvarintAscending(nil, snapshot.epoch) + snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey) + if err != nil { + return err + } + + // persist internal values + internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) + if err != nil { + return err + } + // TODO optimize writing these in order? + for k, v := range snapshot.internal { + internalBucket.Put([]byte(k), v) + } + + newSegmentPaths := make(map[uint64]string) + + // first ensure that each segment in this snapshot has been persisted + for i, segmentSnapshot := range snapshot.segment { + snapshotSegmentKey := segment.EncodeUvarintAscending(nil, uint64(i)) + snapshotSegmentBucket, err2 := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) + if err2 != nil { + return err2 + } + switch seg := segmentSnapshot.segment.(type) { + case *mem.Segment: + // need to persist this to disk + filename := fmt.Sprintf("%x.bolt", segmentSnapshot.id) + path := s.path + string(os.PathSeparator) + filename + err2 := scorchBolt.PersistSegment(seg, path, 1024) + if err2 != nil { + return fmt.Errorf("error persisting segment: %v", err2) + } + newSegmentPaths[segmentSnapshot.id] = path + snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + case *scorchBolt.Segment: + + path := seg.Path() + filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) + snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + default: + return 
fmt.Errorf("unknown segment type: %T", seg) + } + // store current deleted bits + var roaringBuf bytes.Buffer + if segmentSnapshot.deleted != nil { + _, err = segmentSnapshot.deleted.WriteTo(&roaringBuf) + if err != nil { + return fmt.Errorf("error persisting roaring bytes: %v", err) + } + snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes()) + } + } + + // now try to open all the new snapshots + newSegments := make(map[uint64]segment.Segment) + for segmentID, path := range newSegmentPaths { + newSegments[segmentID], err = scorchBolt.Open(path) + if err != nil { + return fmt.Errorf("error opening new segment at %s, %v", path, err) + } + } + + // get write lock and update the current snapshot with disk-based versions + var notifications []chan error + + s.rootLock.Lock() + newIndexSnapshot := &IndexSnapshot{ + epoch: s.root.epoch, + segment: make([]*SegmentSnapshot, len(s.root.segment)), + offsets: make([]uint64, len(s.root.offsets)), + internal: make(map[string][]byte, len(s.root.internal)), + } + for i, segmentSnapshot := range s.root.segment { + // see if this segment has been replaced + if replacement, ok := newSegments[segmentSnapshot.id]; ok { + newSegmentSnapshot := &SegmentSnapshot{ + segment: replacement, + deleted: segmentSnapshot.deleted, + id: segmentSnapshot.id, + } + newIndexSnapshot.segment[i] = newSegmentSnapshot + // add the old segment snapshots notifications to the list + for _, notification := range segmentSnapshot.notify { + notifications = append(notifications, notification) + } + } else { + newIndexSnapshot.segment[i] = s.root.segment[i] + } + newIndexSnapshot.offsets[i] = s.root.offsets[i] + } + for k, v := range s.root.internal { + newIndexSnapshot.internal[k] = v + } + s.root = newIndexSnapshot + s.rootLock.Unlock() + + // now that we've given up the lock, notify everyone that we've safely + // persisted their data + for _, notification := range notifications { + close(notification) + } + + return nil +} + +// bolt snapshot code + 
+var boltSnapshotsBucket = []byte{'s'} +var boltPathKey = []byte{'p'} +var boltDeletedKey = []byte{'d'} +var boltInternalKey = []byte{'i'} + +func (s *Scorch) loadFromBolt() error { + return s.rootBolt.View(func(tx *bolt.Tx) error { + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil + } + c := snapshots.Cursor() + for k, _ := c.Last(); k != nil; k, _ = c.Prev() { + _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) + if err != nil { + log.Printf("unable to parse segment epoch % x, contiuing", k) + continue + } + snapshot := snapshots.Bucket(k) + if snapshot == nil { + log.Printf("snapshot key, but bucket missing % x, continuing", k) + continue + } + indexSnapshot, err := s.loadSnapshot(snapshot) + if err != nil { + log.Printf("unable to load snapshot, %v continuing", err) + continue + } + indexSnapshot.epoch = snapshotEpoch + // set the nextSegmentID + for _, segment := range indexSnapshot.segment { + if segment.id > s.nextSegmentID { + s.nextSegmentID = segment.id + } + } + s.nextSegmentID++ + s.nextSnapshotEpoch = snapshotEpoch + 1 + s.root = indexSnapshot + break + } + return nil + }) +} + +func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { + + rv := &IndexSnapshot{ + internal: make(map[string][]byte), + } + var running uint64 + c := snapshot.Cursor() + for k, _ := c.First(); k != nil; k, _ = c.Next() { + if k[0] == boltInternalKey[0] { + internalBucket := snapshot.Bucket(k) + internalBucket.ForEach(func(key []byte, val []byte) error { + copiedVal := append([]byte(nil), val...) 
+ rv.internal[string(key)] = copiedVal + return nil + }) + } else { + segmentBucket := snapshot.Bucket(k) + if segmentBucket == nil { + return nil, fmt.Errorf("segment key, but bucket missing % x", k) + } + segmentSnapshot, err := s.loadSegment(segmentBucket) + if err != nil { + return nil, fmt.Errorf("failed to load segment: %v", err) + } + _, segmentSnapshot.id, err = segment.DecodeUvarintAscending(k) + if err != nil { + return nil, fmt.Errorf("failed to decode segment id: %v", err) + } + rv.segment = append(rv.segment, segmentSnapshot) + rv.offsets = append(rv.offsets, running) + running += segmentSnapshot.segment.Count() + } + } + return rv, nil +} + +func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, error) { + pathBytes := segmentBucket.Get(boltPathKey) + if pathBytes == nil { + return nil, fmt.Errorf("segment path missing") + } + segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) + segment, err := scorchBolt.Open(segmentPath) + if err != nil { + return nil, fmt.Errorf("error opening bolt segment: %v", err) + } + + rv := &SegmentSnapshot{ + segment: segment, + } + deletedBytes := segmentBucket.Get(boltDeletedKey) + if deletedBytes != nil { + deletedBitmap := roaring.NewBitmap() + r := bytes.NewReader(deletedBytes) + _, err := deletedBitmap.ReadFrom(r) + if err != nil { + return nil, fmt.Errorf("error reading deleted bytes: %v", err) + } + rv.deleted = deletedBitmap + } + + return rv, nil +} diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index ef5d6d4f5..2cd42fe47 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -23,14 +23,21 @@ import ( ) func TestIndexReader(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != 
nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -205,14 +212,21 @@ func TestIndexReader(t *testing.T) { } func TestIndexDocIdReader(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -309,14 +323,21 @@ func TestIndexDocIdReader(t *testing.T) { } func TestIndexDocIdOnlyReader(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index b80ff2482..678349216 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -16,6 +16,8 @@ package scorch import ( "encoding/json" + "fmt" + "os" "sync" "sync/atomic" "time" @@ -28,6 +30,7 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" + "github.com/boltdb/bolt" ) const Name = "scorch" @@ -35,40 +38,95 @@ const Name = "scorch" const Version uint8 = 1 type Scorch struct { - version uint8 - storeConfig map[string]interface{} - analysisQueue *index.AnalysisQueue - stats *Stats - nextSegmentID uint64 + version uint8 + config map[string]interface{} + analysisQueue *index.AnalysisQueue + stats *Stats + nextSegmentID uint64 + 
nextSnapshotEpoch uint64 + path string + + unsafeBatch bool rootLock sync.RWMutex root *IndexSnapshot - closeCh chan struct{} - introductions chan *segmentIntroduction + closeCh chan struct{} + introductions chan *segmentIntroduction + introducerNotifier chan notificationChan + rootBolt *bolt.DB + asyncTasks sync.WaitGroup } -func NewScorch(storeName string, storeConfig map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { +func NewScorch(storeName string, config map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { rv := &Scorch{ - version: Version, - storeConfig: storeConfig, - analysisQueue: analysisQueue, - stats: &Stats{}, - root: &IndexSnapshot{}, + version: Version, + config: config, + analysisQueue: analysisQueue, + stats: &Stats{}, + root: &IndexSnapshot{}, + nextSnapshotEpoch: 1, } return rv, nil } func (s *Scorch) Open() error { + var ok bool + s.path, ok = s.config["path"].(string) + if !ok { + return fmt.Errorf("must specify path") + } + if s.path == "" { + return os.ErrInvalid + } + + err := os.MkdirAll(s.path, 0700) + if err != nil { + return err + } + + rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt" + + s.rootBolt, err = bolt.Open(rootBoltPath, 0600, nil) + if err != nil { + return err + } + + // now see if there is any existing state to load + err = s.loadFromBolt() + if err != nil { + return err + } + s.closeCh = make(chan struct{}) s.introductions = make(chan *segmentIntroduction) + s.introducerNotifier = make(chan notificationChan) + + s.asyncTasks.Add(1) go s.mainLoop() + s.asyncTasks.Add(1) + go s.persisterLoop() + return nil } -func (s *Scorch) Close() error { +func (s *Scorch) Close() (err error) { + // signal to async tasks we want to close close(s.closeCh) - return nil + // wait for them to close + s.asyncTasks.Wait() + // now close the root bolt + + err = s.rootBolt.Close() + s.rootLock.Lock() + for _, segment := range s.root.segment { + cerr := segment.Close() + 
if err == nil { + err = cerr + } + } + + return } func (s *Scorch) Update(doc *document.Document) error { @@ -85,7 +143,6 @@ func (s *Scorch) Delete(id string) error { // Batch applices a batch of changes to the index atomically func (s *Scorch) Batch(batch *index.Batch) error { - analysisStart := time.Now() resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps)) @@ -148,7 +205,11 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, ids: ids, obsoletes: make(map[uint64]*roaring.Bitmap), internal: internalOps, - applied: make(chan struct{}), + applied: make(chan error), + } + + if !s.unsafeBatch { + introduction.persisted = make(chan error) } // get read lock, to optimistically prepare obsoleted info @@ -165,9 +226,16 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, s.introductions <- introduction // block until this segment is applied - <-introduction.applied + err := <-introduction.applied + if err != nil { + return err + } - return nil + if !s.unsafeBatch { + err = <-introduction.persisted + } + + return err } func (s *Scorch) SetInternal(key, val []byte) error { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index aeb6c997a..52a86ab2e 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -16,6 +16,7 @@ package scorch import ( "log" + "os" "reflect" "regexp" "strconv" @@ -29,13 +30,85 @@ import ( "github.com/blevesearch/bleve/index" ) +func DestroyTest() error { + return os.RemoveAll("/tmp/bleve-scorch-test") +} + +var testConfig = map[string]interface{}{ + "path": "/tmp/bleve-scorch-test", +} + var testAnalyzer = &analysis.Analyzer{ Tokenizer: regexpTokenizer.NewRegexpTokenizer(regexp.MustCompile(`\w+`)), } -func TestIndexInsert(t *testing.T) { +func TestIndexOpenReopen(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, 
analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // insert a doc + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + idx, err = NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } @@ -43,6 +116,48 @@ func TestIndexInsert(t *testing.T) { if err != nil { t.Errorf("error opening index: %v", err) } + + // check the doc count again after reopening it + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } +} + +func TestIndexInsert(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, 
err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } defer func() { err := idx.Close() if err != nil { @@ -93,14 +208,21 @@ func TestIndexInsert(t *testing.T) { } func TestIndexInsertThenDelete(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -204,8 +326,15 @@ func TestIndexInsertThenDelete(t *testing.T) { } func TestIndexInsertThenUpdate(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } @@ -213,7 +342,7 @@ func TestIndexInsertThenUpdate(t *testing.T) { var expectedCount uint64 err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -264,15 +393,28 @@ func TestIndexInsertThenUpdate(t *testing.T) { } func TestIndexInsertMultiple(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() var expectedCount uint64 @@ -318,19 
+460,26 @@ func TestIndexInsertMultiple(t *testing.T) { } func TestIndexInsertWithStore(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { - err := idx.Close() + cerr := idx.Close() if err != nil { - t.Fatal(err) + t.Fatal(cerr) } }() @@ -416,14 +565,21 @@ func TestIndexInsertWithStore(t *testing.T) { } func TestIndexInternalCRUD(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -503,14 +659,21 @@ func TestIndexInternalCRUD(t *testing.T) { } func TestIndexBatch(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -609,14 +772,21 @@ func TestIndexBatch(t *testing.T) { } func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, 
testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -817,14 +987,21 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { } func TestIndexInsertFields(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -879,14 +1056,21 @@ func TestIndexInsertFields(t *testing.T) { } func TestIndexUpdateComposites(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -947,14 +1131,21 @@ func TestIndexUpdateComposites(t *testing.T) { } func TestIndexTermReaderCompositeFields(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -1005,14 +1196,21 @@ func TestIndexTermReaderCompositeFields(t *testing.T) { } func TestConcurrentUpdate(t *testing.T) { + defer func() { 
+ err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() @@ -1054,14 +1252,21 @@ func TestConcurrentUpdate(t *testing.T) { } func TestLargeField(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, nil, analysisQueue) + idx, err := NewScorch(Name, testConfig, analysisQueue) if err != nil { t.Fatal(err) } err = idx.Open() if err != nil { - t.Errorf("error opening index: %v", err) + t.Fatalf("error opening index: %v", err) } defer func() { err := idx.Close() diff --git a/index/scorch/segment/bolt/build.go b/index/scorch/segment/bolt/build.go index 0c68bab59..ac01f9b8d 100644 --- a/index/scorch/segment/bolt/build.go +++ b/index/scorch/segment/bolt/build.go @@ -21,6 +21,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/boltdb/bolt" "github.com/couchbaselabs/vellum" @@ -47,8 +48,7 @@ var versionKey = []byte{'v'} var version = 0 -func persistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { - +func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { db, err := bolt.Open(path, 0777, nil) if err != nil { return err @@ -133,13 +133,13 @@ func persistFields(memSegment *mem.Segment, tx *bolt.Tx) error { } // we use special varint which is still guaranteed to sort correctly - fieldBuf := make([]byte, 0, maxVarintSize) + fieldBuf := make([]byte, 0, segment.MaxVarintSize) for fieldID, fieldName := range 
memSegment.FieldsInv { if fieldID != 0 { // reset buffer if necessary fieldBuf = fieldBuf[:0] } - fieldBuf = EncodeUvarintAscending(fieldBuf, uint64(fieldID)) + fieldBuf = segment.EncodeUvarintAscending(fieldBuf, uint64(fieldID)) err = bucket.Put(fieldBuf, []byte(fieldName)) if err != nil { return err @@ -160,7 +160,7 @@ func persistDictionary(memSegment *mem.Segment, tx *bolt.Tx) error { // the (presumably) heavier lifting involved in building the FST could // be done concurrently. - fieldBuf := make([]byte, 0, maxVarintSize) + fieldBuf := make([]byte, 0, segment.MaxVarintSize) for fieldID, fieldTerms := range memSegment.DictKeys { if fieldID != 0 { // reset buffers if necessary @@ -188,7 +188,7 @@ func persistDictionary(memSegment *mem.Segment, tx *bolt.Tx) error { // put this FST into bolt // we use special varint which is still guaranteed to sort correctly - fieldBuf = EncodeUvarintAscending(fieldBuf, uint64(fieldID)) + fieldBuf = segment.EncodeUvarintAscending(fieldBuf, uint64(fieldID)) err = bucket.Put(fieldBuf, buffer.Bytes()) if err != nil { return err @@ -205,13 +205,13 @@ func persistPostings(memSegment *mem.Segment, tx *bolt.Tx) error { } bucket.FillPercent = 1.0 - postingIDBuf := make([]byte, 0, maxVarintSize) + postingIDBuf := make([]byte, 0, segment.MaxVarintSize) for postingID := range memSegment.Postings { if postingID != 0 { // reset buffers if necessary postingIDBuf = postingIDBuf[:0] } - postingIDBuf = EncodeUvarintAscending(postingIDBuf, uint64(postingID)) + postingIDBuf = segment.EncodeUvarintAscending(postingIDBuf, uint64(postingID)) var postingsBuf bytes.Buffer _, err := memSegment.Postings[postingID].WriteTo(&postingsBuf) if err != nil { @@ -234,13 +234,13 @@ func persistPostingsDetails(memSegment *mem.Segment, tx *bolt.Tx, } bucket.FillPercent = 1.0 - postingIDBuf := make([]byte, 0, maxVarintSize) + postingIDBuf := make([]byte, 0, segment.MaxVarintSize) for postingID := range memSegment.Postings { if postingID != 0 { // reset buffers if 
necessary postingIDBuf = postingIDBuf[:0] } - postingIDBuf = EncodeUvarintAscending(postingIDBuf, uint64(postingID)) + postingIDBuf = segment.EncodeUvarintAscending(postingIDBuf, uint64(postingID)) // make bucket for posting details postingBucket, err := bucket.CreateBucket(postingIDBuf) @@ -264,7 +264,7 @@ func persistPostingDetails(memSegment *mem.Segment, postingBucket *bolt.Bucket, var err error var chunkBucket *bolt.Bucket var currChunk uint32 - chunkIDBuf := make([]byte, 0, maxVarintSize) + chunkIDBuf := make([]byte, 0, segment.MaxVarintSize) postingsListItr := memSegment.Postings[postingID].Iterator() var encoder *govarint.Base128Encoder var locEncoder *govarint.Base128Encoder @@ -303,7 +303,7 @@ func persistPostingDetails(memSegment *mem.Segment, postingBucket *bolt.Bucket, } // prepare next chunk - chunkIDBuf = EncodeUvarintAscending(chunkIDBuf, uint64(chunk)) + chunkIDBuf = segment.EncodeUvarintAscending(chunkIDBuf, uint64(chunk)) chunkBucket, err = postingBucket.CreateBucket(chunkIDBuf) if err != nil { return err @@ -410,7 +410,7 @@ func persistStored(memSegment *mem.Segment, tx *bolt.Tx) error { var curr int // we use special varint which is still guaranteed to sort correctly - docNumBuf := make([]byte, 0, maxVarintSize) + docNumBuf := make([]byte, 0, segment.MaxVarintSize) for docNum, storedValues := range memSegment.Stored { var metaBuf bytes.Buffer var data, compressed []byte @@ -420,7 +420,7 @@ func persistStored(memSegment *mem.Segment, tx *bolt.Tx) error { curr = 0 } // create doc sub-bucket - docNumBuf = EncodeUvarintAscending(docNumBuf, uint64(docNum)) + docNumBuf = segment.EncodeUvarintAscending(docNumBuf, uint64(docNum)) docBucket, err := bucket.CreateBucket(docNumBuf) if err != nil { return err diff --git a/index/scorch/segment/bolt/build_test.go b/index/scorch/segment/bolt/build_test.go index 3f869d86a..deb4157d4 100644 --- a/index/scorch/segment/bolt/build_test.go +++ b/index/scorch/segment/bolt/build_test.go @@ -28,7 +28,7 @@ func 
TestBuild(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegment() - err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) if err != nil { t.Fatal(err) } diff --git a/index/scorch/segment/bolt/dict.go b/index/scorch/segment/bolt/dict.go index 0d7ab5eca..0f38a3d60 100644 --- a/index/scorch/segment/bolt/dict.go +++ b/index/scorch/segment/bolt/dict.go @@ -51,7 +51,7 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting } if exists { rv.postingsID = postingsID - postingsIDKey := EncodeUvarintAscending(nil, postingsID) + postingsIDKey := segment.EncodeUvarintAscending(nil, postingsID) bucket := d.segment.tx.Bucket(postingsBucket) if bucket == nil { return nil, fmt.Errorf("postings bucket missing") diff --git a/index/scorch/segment/bolt/dict_test.go b/index/scorch/segment/bolt/dict_test.go index 6b3926a87..2df57d67f 100644 --- a/index/scorch/segment/bolt/dict_test.go +++ b/index/scorch/segment/bolt/dict_test.go @@ -109,7 +109,7 @@ func TestDictionary(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegmentForDict() - err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) if err != nil { t.Fatalf("error persisting segment: %v", err) } diff --git a/index/scorch/segment/bolt/posting.go b/index/scorch/segment/bolt/posting.go index e5d6c8938..bd038a575 100644 --- a/index/scorch/segment/bolt/posting.go +++ b/index/scorch/segment/bolt/posting.go @@ -91,7 +91,7 @@ type PostingsIterator struct { func (i *PostingsIterator) loadChunk(chunk int) error { // load correct chunk bytes - chunkID := EncodeUvarintAscending(nil, uint64(chunk)) + chunkID := segment.EncodeUvarintAscending(nil, uint64(chunk)) chunkBucket := i.detailBucket.Bucket(chunkID) if chunkBucket == nil { return fmt.Errorf("chunk %d missing", chunkID) diff --git a/index/scorch/segment/bolt/segment.go 
b/index/scorch/segment/bolt/segment.go index 835313b87..f53a98fe6 100644 --- a/index/scorch/segment/bolt/segment.go +++ b/index/scorch/segment/bolt/segment.go @@ -123,7 +123,7 @@ func (s *Segment) loadFields() (err error) { } } else { - _, fieldID, err2 := DecodeUvarintAscending(k) + _, fieldID, err2 := segment.DecodeUvarintAscending(k) if err2 != nil { return err2 } @@ -164,7 +164,11 @@ func (s *Segment) Count() uint64 { // Dictionary returns the term dictionary for the specified field func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { - return s.dictionary(field) + dict, err := s.dictionary(field) + if err == nil && dict == nil { + return &segment.EmptyDictionary{}, nil + } + return dict, err } func (s *Segment) dictionary(field string) (*Dictionary, error) { @@ -177,7 +181,7 @@ func (s *Segment) dictionary(field string) (*Dictionary, error) { rv.fieldID = s.fieldsMap[field] if rv.fieldID > 0 { rv.fieldID = rv.fieldID - 1 - fieldIDKey := EncodeUvarintAscending(nil, uint64(rv.fieldID)) + fieldIDKey := segment.EncodeUvarintAscending(nil, uint64(rv.fieldID)) bucket := s.tx.Bucket(dictBucket) if bucket == nil { return nil, fmt.Errorf("dictionary bucket missing") @@ -196,6 +200,8 @@ func (s *Segment) dictionary(field string) (*Dictionary, error) { } } + } else { + return nil, nil } return rv, nil @@ -208,7 +214,7 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi if storedBuucket == nil { return fmt.Errorf("stored bucket missing") } - docNumKey := EncodeUvarintAscending(nil, num) + docNumKey := segment.EncodeUvarintAscending(nil, num) docBucket := storedBuucket.Bucket(docNumKey) if docBucket == nil { return fmt.Errorf("segment has no doc number %d", num) @@ -307,3 +313,7 @@ func (s *Segment) Close() error { } return s.db.Close() } + +func (s *Segment) Path() string { + return s.db.Path() +} diff --git a/index/scorch/segment/bolt/segment_test.go b/index/scorch/segment/bolt/segment_test.go index 
b00c71926..16ac2cadd 100644 --- a/index/scorch/segment/bolt/segment_test.go +++ b/index/scorch/segment/bolt/segment_test.go @@ -25,7 +25,7 @@ func TestOpen(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegment() - err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -325,7 +325,7 @@ func TestOpenMulti(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegmentMulti() - err := persistSegment(memSegment, "/tmp/scorch.bolt", 1024) + err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -425,7 +425,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.bolt") memSegment := buildMemSegmentMulti() - err := persistSegment(memSegment, "/tmp/scorch.bolt", 1) + err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1) if err != nil { t.Fatalf("error persisting segment: %v", err) } diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go new file mode 100644 index 000000000..0913eeba8 --- /dev/null +++ b/index/scorch/segment/empty.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package segment + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" +) + +type EmptyDictionary struct{} + +func (e *EmptyDictionary) PostingsList(term string, + except *roaring.Bitmap) (PostingsList, error) { + return &EmptyPostingsList{}, nil +} + +func (e *EmptyDictionary) Iterator() DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +func (e *EmptyDictionary) PrefixIterator(prefix string) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +type EmptyDictionaryIterator struct{} + +func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { + return nil, nil +} + +type EmptyPostingsList struct{} + +func (e *EmptyPostingsList) Iterator() PostingsIterator { + return &EmptyPostingsIterator{} +} + +func (e *EmptyPostingsList) Count() uint64 { + return 0 +} + +type EmptyPostingsIterator struct{} + +func (e *EmptyPostingsIterator) Next() (Posting, error) { + return nil, nil +} diff --git a/index/scorch/segment/bolt/int.go b/index/scorch/segment/int.go similarity index 98% rename from index/scorch/segment/bolt/int.go rename to index/scorch/segment/int.go index a4af3a7a8..a4836ebf8 100644 --- a/index/scorch/segment/bolt/int.go +++ b/index/scorch/segment/int.go @@ -17,12 +17,12 @@ // Modified to not use pkg/errors -package bolt +package segment import "fmt" const ( - maxVarintSize = 9 + MaxVarintSize = 9 // IntMin is chosen such that the range of int tags does not overlap the // ascii character set that is frequently used in testing. 
diff --git a/index/scorch/segment/bolt/int_test.go b/index/scorch/segment/int_test.go similarity index 99% rename from index/scorch/segment/bolt/int_test.go rename to index/scorch/segment/int_test.go index e59918c8b..3d2ab6fd7 100644 --- a/index/scorch/segment/bolt/int_test.go +++ b/index/scorch/segment/int_test.go @@ -17,7 +17,7 @@ // Modified to only test the parts we borrowed -package bolt +package segment import ( "bytes" diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 4d3d1d113..4940bb4cf 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -15,6 +15,8 @@ package mem import ( + "fmt" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -117,12 +119,25 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi return nil } +func (s *Segment) getField(name string) (int, error) { + fieldID, ok := s.FieldsMap[name] + if !ok { + return 0, fmt.Errorf("no field named %s", name) + } + return int(fieldID - 1), nil +} + // Dictionary returns the term dictionary for the specified field func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { + fieldID, err := s.getField(field) + if err != nil { + // no such field, return empty dictionary + return &segment.EmptyDictionary{}, nil + } return &Dictionary{ segment: s, field: field, - fieldID: uint16(s.getOrDefineField(field, false)), + fieldID: uint16(fieldID), }, nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index c059fe734..c96082c54 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -42,6 +42,7 @@ type IndexSnapshot struct { segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte + epoch uint64 } func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { diff --git 
a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 6380a15fd..1a50eb6cc 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -44,6 +44,12 @@ type SegmentSnapshot struct { id uint64 segment segment.Segment deleted *roaring.Bitmap + + notify []chan error +} + +func (s *SegmentSnapshot) Close() error { + return s.segment.Close() } func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { @@ -51,6 +57,7 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel } func (s *SegmentSnapshot) Count() uint64 { + rv := s.segment.Count() if s.deleted != nil { rv -= s.deleted.GetCardinality() From e47010563583f05ea5e72062725fd92d9d3c6447 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 6 Dec 2017 18:36:14 -0500 Subject: [PATCH 029/728] fix issues identified by errcheck --- index/scorch/persister.go | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 0e3318e8c..44393e961 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -124,7 +124,10 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } // TODO optimize writing these in order? 
for k, v := range snapshot.internal { - internalBucket.Put([]byte(k), v) + err = internalBucket.Put([]byte(k), v) + if err != nil { + return err + } } newSegmentPaths := make(map[uint64]string) @@ -146,12 +149,18 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { return fmt.Errorf("error persisting segment: %v", err2) } newSegmentPaths[segmentSnapshot.id] = path - snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + if err != nil { + return err + } case *scorchBolt.Segment: path := seg.Path() filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) - snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + if err != nil { + return err + } default: return fmt.Errorf("unknown segment type: %T", seg) } @@ -162,7 +171,10 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { if err != nil { return fmt.Errorf("error persisting roaring bytes: %v", err) } - snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes()) + err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes()) + if err != nil { + return err + } } } @@ -274,11 +286,14 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { for k, _ := c.First(); k != nil; k, _ = c.Next() { if k[0] == boltInternalKey[0] { internalBucket := snapshot.Bucket(k) - internalBucket.ForEach(func(key []byte, val []byte) error { + err := internalBucket.ForEach(func(key []byte, val []byte) error { copiedVal := append([]byte(nil), val...) 
rv.internal[string(key)] = copiedVal return nil }) + if err != nil { + return nil, err + } } else { segmentBucket := snapshot.Bucket(k) if segmentBucket == nil { From ff2e6b98e4673c427fb526dc5d5101f10c70f2e4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 9 Dec 2017 12:43:02 -0500 Subject: [PATCH 030/728] added empty segment --- index/scorch/segment/empty.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 0913eeba8..724195007 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -19,6 +19,33 @@ import ( "github.com/blevesearch/bleve/index" ) +type EmptySegment struct{} + +func (e *EmptySegment) Dictionary(field string) (TermDictionary, error) { + return &EmptyDictionary{}, nil +} + +func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error { + return nil +} + +func (e *EmptySegment) Count() uint64 { + return 0 +} + +func (e *EmptySegment) DocNumbers([]string) (*roaring.Bitmap, error) { + r := roaring.NewBitmap() + return r, nil +} + +func (e *EmptySegment) Fields() []string { + return []string{} +} + +func (e *EmptySegment) Close() error { + return nil +} + type EmptyDictionary struct{} func (e *EmptyDictionary) PostingsList(term string, From 9781d9b08963160181d706b8ec0f1805452187e8 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 9 Dec 2017 14:28:33 -0500 Subject: [PATCH 031/728] add initial version of zap file format --- index/scorch/segment/zap/README.md | 120 ++++ index/scorch/segment/zap/build.go | 615 ++++++++++++++++++ index/scorch/segment/zap/build_test.go | 288 ++++++++ index/scorch/segment/zap/cmd/zap/README.md | 3 + index/scorch/segment/zap/cmd/zap/cmd/dict.go | 72 ++ .../scorch/segment/zap/cmd/zap/cmd/explore.go | 124 ++++ .../scorch/segment/zap/cmd/zap/cmd/footer.go | 43 ++ index/scorch/segment/zap/cmd/zap/cmd/root.go | 58 ++ .../scorch/segment/zap/cmd/zap/cmd/stored.go | 73 +++ 
index/scorch/segment/zap/cmd/zap/main.go | 23 + index/scorch/segment/zap/count.go | 55 ++ index/scorch/segment/zap/dict.go | 165 +++++ index/scorch/segment/zap/dict_test.go | 183 ++++++ index/scorch/segment/zap/posting.go | 362 +++++++++++ index/scorch/segment/zap/segment.go | 352 ++++++++++ index/scorch/segment/zap/segment_test.go | 517 +++++++++++++++ 16 files changed, 3053 insertions(+) create mode 100644 index/scorch/segment/zap/README.md create mode 100644 index/scorch/segment/zap/build.go create mode 100644 index/scorch/segment/zap/build_test.go create mode 100644 index/scorch/segment/zap/cmd/zap/README.md create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/dict.go create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/explore.go create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/footer.go create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/root.go create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/stored.go create mode 100644 index/scorch/segment/zap/cmd/zap/main.go create mode 100644 index/scorch/segment/zap/count.go create mode 100644 index/scorch/segment/zap/dict.go create mode 100644 index/scorch/segment/zap/dict_test.go create mode 100644 index/scorch/segment/zap/posting.go create mode 100644 index/scorch/segment/zap/segment.go create mode 100644 index/scorch/segment/zap/segment_test.go diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md new file mode 100644 index 000000000..079eef9b3 --- /dev/null +++ b/index/scorch/segment/zap/README.md @@ -0,0 +1,120 @@ +# zap file format + +## stored fields section + +- for each document + - preparation phase: + - produce a slice of metadata bytes and data bytes + - produce these slices in field id order + - field value is appended to the data slice + - metadata slice is govarint encoded with the following values for each field value + - field id (uint16) + - field type (byte) + - field value start offset in uncompressed data slice (uint64) + - field value length 
(uint64) + - field number of array positions (uint64) + - one additional value for each array position (uint64) + - compress the data slice using snappy + - file writing phase: + - remember the start offset for this document + - write out meta data length (varint uint64) + - write out compressed data length (varint uint64) + - write out the metadata bytes + - write out the compressed data bytes + +## stored fields idx + +- for each document + - write start offset (remembered from previous section) of stored data (big endian uint64) + +With this index and a known document number, we have direct access to all the stored field data. + +## posting details (freq/norm) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is govarint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode term frequency (uint64) + - encode norm factor (float32) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
+ +## posting details (location) section + +- for each posting list + - produce a slice containing multiple consecutive chunks (each chunk is govarint stream) + - produce a slice remembering offsets of where each chunk starts + - preparation phase: + - for each hit in the posting list + - if this hit is in next chunk close out encoding of last chunk and record offset start of next + - encode field (uint16) + - encode field pos (uint64) + - encode field start (uint64) + - encode field end (uint64) + - encode number of array positions to follow (uint64) + - encode each array position (each uint64) + - file writing phase: + - remember start position for this posting list details + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
+ +## postings list section + +- for each posting list + - preparation phase: + - encode roaring bitmap posting list to bytes (so we know the length) + - file writing phase: + - remember the start position for this posting list + - write freq/norm details offset (remembered from previous, as varint uint64) + - write location details offset (remembered from previous, as varint uint64) + - write length of encoded roaring bitmap + - write the serialized roaring bitmap data + +## dictionary + +- for each field + - preparation phase: + - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous) + - file writing phase: + - remember the start position of this persistDictionary + - write length of vellum data (varint uint64) + - write out vellum data + +## fields section + +- for each field + - file writing phase: + - remember start offset for each field + - write 1 if field has location info indexed, 0 if not (varint uint64) + - write dictionary address (remembered from previous) (varint uint64) + - write length of field name (varint uint64) + - write field name bytes + +## fields idx + +- for each field + - file writing phase: + - write big endian uint64 of start offset for each field + +NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. 
+ +## footer + +- file writing phase + - write number of docs (big endian uint64) + - write stored field index location (big endian uint64) + - write field index location (big endian uint64) + - write out chunk factor (big endian uint32) + - write out version (big endian uint32) + - write out file CRC of everything preceding this (big endian uint32) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go new file mode 100644 index 000000000..1ed95e80f --- /dev/null +++ b/index/scorch/segment/zap/build.go @@ -0,0 +1,615 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "math" + "os" + + "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/couchbaselabs/vellum" + "github.com/golang/snappy" +) + +var version uint32 + +// PersistSegment takes the in-memory segment and persists it to the specified +// path in the zap file format. 
+func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { + + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + // bufer the output + br := bufio.NewWriter(f) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriter(br) + + var storedIndexOffset uint64 + storedIndexOffset, err = persistStored(memSegment, cr) + if err != nil { + return err + } + + var freqOffsets, locOffsets []uint64 + freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor) + if err != nil { + return err + } + + var postingsLocs []uint64 + postingsLocs, err = persistPostingsLists(memSegment, cr, freqOffsets, locOffsets) + if err != nil { + return err + } + + var dictLocs []uint64 + dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) + if err != nil { + return err + } + + var fieldIndexStart uint64 + fieldIndexStart, err = persistFields(memSegment, cr, dictLocs) + if err != nil { + return err + } + + err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset, + fieldIndexStart, chunkFactor, cr) + if err != nil { + return err + } + + err = br.Flush() + if err != nil { + return err + } + + err = f.Close() + if err != nil { + return err + } + + return nil +} + +func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { + + var curr int + var metaBuf bytes.Buffer + var data, compressed []byte + + docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) + + for docNum, storedValues := range memSegment.Stored { + if docNum != 0 { + // reset buffer if necessary + metaBuf.Reset() + data = data[:0] + compressed = compressed[:0] + curr = 0 + } + + metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + + // encode fields in order + for fieldID := range memSegment.FieldsInv { + if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { + // has stored values for this field + num := len(storedFieldValues) + + // 
process each value + for i := 0; i < num; i++ { + // encode field + _, err2 := metaEncoder.PutU64(uint64(fieldID)) + if err2 != nil { + return 0, err2 + } + // encode type + _, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) + if err2 != nil { + return 0, err2 + } + // encode start offset + _, err2 = metaEncoder.PutU64(uint64(curr)) + if err2 != nil { + return 0, err2 + } + // end len + _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if err2 != nil { + return 0, err2 + } + // encode number of array pos + _, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) + if err2 != nil { + return 0, err2 + } + // encode all array positions + for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ { + _, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) + if err2 != nil { + return 0, err2 + } + } + // append data + data = append(data, storedFieldValues[i]...) + // update curr + curr += len(storedFieldValues[i]) + } + } + } + metaEncoder.Close() + + metaBytes := metaBuf.Bytes() + + // compress the data + compressed = snappy.Encode(compressed, data) + + // record where we're about to start writing + docNumOffsets[docNum] = uint64(w.Count()) + + buf := make([]byte, binary.MaxVarintLen64) + // write out the meta length + n := binary.PutUvarint(buf, uint64(len(metaBytes))) + _, err := w.Write(buf[:n]) + if err != nil { + return 0, err + } + // write out the compressed data length + n = binary.PutUvarint(buf, uint64(len(compressed))) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, err + } + // now write the meta + _, err = w.Write(metaBytes) + if err != nil { + return 0, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, err + } + } + + // return value is the start of the stored index + rv := uint64(w.Count()) + // now write out the stored doc index + for docNum := range 
memSegment.Stored { + err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { + var freqOffsets, locOfffsets []uint64 + for postingID := range memSegment.Postings { + postingsListItr := memSegment.Postings[postingID].Iterator() + + total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1 + + var freqNormBuf []byte + var offset int + + var encodingBuf bytes.Buffer + encoder := govarint.NewU64Base128Encoder(&encodingBuf) + + chunkLens := make([]uint64, total) + var currChunk uint64 + for postingsListItr.HasNext() { + docNum := postingsListItr.Next() + chunk := uint64(docNum) / uint64(chunkFactor) + + if chunk != currChunk { + // starting a new chunk + if encoder != nil { + // close out last + encoder.Close() + encodingBytes := encodingBuf.Bytes() + chunkLens[currChunk] = uint64(len(encodingBytes)) + freqNormBuf = append(freqNormBuf, encodingBytes...) + encodingBuf.Reset() + encoder = govarint.NewU64Base128Encoder(&encodingBuf) + } + + currChunk = chunk + } + + // put freq + _, err := encoder.PutU64(memSegment.Freqs[postingID][offset]) + if err != nil { + return nil, nil, err + } + + // put norm + norm := memSegment.Norms[postingID][offset] + normBits := math.Float32bits(norm) + _, err = encoder.PutU32(normBits) + if err != nil { + return nil, nil, err + } + + offset++ + } + + // close out last chunk + if encoder != nil { + // fix me write freq/norms + encoder.Close() + encodingBytes := encodingBuf.Bytes() + chunkLens[currChunk] = uint64(len(encodingBytes)) + freqNormBuf = append(freqNormBuf, encodingBytes...) 
+ } + + // record where this postings freq info starts + freqOffsets = append(freqOffsets, uint64(w.Count())) + + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(total)) + _, err := w.Write(buf[:n]) + if err != nil { + return nil, nil, err + } + // write out the chunk lens + for _, chunkLen := range chunkLens { + n := binary.PutUvarint(buf, uint64(chunkLen)) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, nil, err + } + } + // write out the data + _, err = w.Write(freqNormBuf) + if err != nil { + return nil, nil, err + } + + } + + // now do it again for the locations + for postingID := range memSegment.Postings { + postingsListItr := memSegment.Postings[postingID].Iterator() + + total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1 + + var locBuf []byte + var offset int + var locOffset int + + var encodingBuf bytes.Buffer + encoder := govarint.NewU64Base128Encoder(&encodingBuf) + + chunkLens := make([]uint64, total) + var currChunk uint64 + for postingsListItr.HasNext() { + docNum := postingsListItr.Next() + chunk := uint64(docNum) / uint64(chunkFactor) + + if chunk != currChunk { + // starting a new chunk + if encoder != nil { + // close out last + encoder.Close() + encodingBytes := encodingBuf.Bytes() + chunkLens[currChunk] = uint64(len(encodingBytes)) + locBuf = append(locBuf, encodingBytes...) 
+ encodingBuf.Reset() + encoder = govarint.NewU64Base128Encoder(&encodingBuf) + } + currChunk = chunk + } + + for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ { + + if len(memSegment.Locfields[postingID]) > 0 { + // put field + _, err := encoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset])) + if err != nil { + return nil, nil, err + } + + // put pos + _, err = encoder.PutU64(memSegment.Locpos[postingID][locOffset]) + if err != nil { + return nil, nil, err + } + + // put start + _, err = encoder.PutU64(memSegment.Locstarts[postingID][locOffset]) + if err != nil { + return nil, nil, err + } + + // put end + _, err = encoder.PutU64(memSegment.Locends[postingID][locOffset]) + if err != nil { + return nil, nil, err + } + + // put array positions + num := len(memSegment.Locarraypos[postingID][locOffset]) + + // put the number of array positions to follow + _, err = encoder.PutU64(uint64(num)) + if err != nil { + return nil, nil, err + } + + // put each array position + for j := 0; j < num; j++ { + _, err = encoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j]) + if err != nil { + return nil, nil, err + } + } + } + + locOffset++ + } + offset++ + } + + // close out last chunk + if encoder != nil { + // fix me write freq/norms + encoder.Close() + encodingBytes := encodingBuf.Bytes() + chunkLens[currChunk] = uint64(len(encodingBytes)) + locBuf = append(locBuf, encodingBytes...) 
+ } + + // record where this postings loc info starts + locOfffsets = append(locOfffsets, uint64(w.Count())) + + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(total)) + _, err := w.Write(buf[:n]) + if err != nil { + return nil, nil, err + } + // write out the chunk lens + for _, chunkLen := range chunkLens { + n := binary.PutUvarint(buf, uint64(chunkLen)) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, nil, err + } + } + // write out the data + _, err = w.Write(locBuf) + if err != nil { + return nil, nil, err + } + + } + return freqOffsets, locOfffsets, nil +} + +func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffsets, locOffsets []uint64) ([]uint64, error) { + var rv []uint64 + + var postingsBuf bytes.Buffer + for postingID := range memSegment.Postings { + if postingID != 0 { + postingsBuf.Reset() + } + + // record where we start this posting list + rv = append(rv, uint64(w.Count())) + + // write out postings list to memory so we know the len + postingsListLen, err := memSegment.Postings[postingID].WriteTo(&postingsBuf) + if err != nil { + return nil, err + } + + // write out the start of the term info + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, freqOffsets[postingID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the start of the loc info + n = binary.PutUvarint(buf, locOffsets[postingID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the length of this postings list + n = binary.PutUvarint(buf, uint64(postingsListLen)) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the postings list itself + _, err = w.Write(postingsBuf.Bytes()) + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { + var 
rv []uint64 + + var buffer bytes.Buffer + for fieldID, fieldTerms := range memSegment.DictKeys { + if fieldID != 0 { + buffer.Reset() + } + + // start a new vellum for this field + builder, err := vellum.New(&buffer, nil) + if err != nil { + return nil, err + } + + dict := memSegment.Dicts[fieldID] + // now walk the dictionary in order of fieldTerms (already sorted) + for i := range fieldTerms { + postingID := dict[fieldTerms[i]] - 1 + postingsAddr := postingsLocs[postingID] + err = builder.Insert([]byte(fieldTerms[i]), postingsAddr) + if err != nil { + return nil, err + } + } + err = builder.Close() + if err != nil { + return nil, err + } + + // record where this dictionary starts + rv = append(rv, uint64(w.Count())) + + vellumData := buffer.Bytes() + + // write out the length of the vellum data + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint64) (uint64, error) { + var rv uint64 + + var fieldStarts []uint64 + for fieldID, fieldName := range memSegment.FieldsInv { + + // record start of this field + fieldStarts = append(fieldStarts, uint64(w.Count())) + + buf := make([]byte, binary.MaxVarintLen64) + // write out if the field has indexed locs (0 or 1) + var indexedLoc uint64 + if memSegment.FieldsLoc[fieldID] { + indexedLoc = 1 + } + n := binary.PutUvarint(buf, indexedLoc) + _, err := w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out dict location for this field + n = binary.PutUvarint(buf, dictLocs[fieldID]) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the length of the field name + n = binary.PutUvarint(buf, uint64(len(fieldName))) + _, err = 
w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + } + + // now write out the fields index + rv = uint64(w.Count()) + + // now write out the stored doc index + for fieldID := range memSegment.FieldsInv { + err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} + +// NOTE: update if you make the footer bigger +// crc + ver + chunk + field offset + stored offset + num docs +const footerSize = 4 + 4 + 4 + 8 + 8 + 8 + +func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, + chunkFactor uint32, w *CountHashWriter) error { + // write out the number of docs + err := binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return err + } + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + // write out the field index location + err = binary.Write(w, binary.BigEndian, fieldIndexOffset) + if err != nil { + return err + } + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkFactor) + if err != nil { + return err + } + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, version) + if err != nil { + return err + } + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.Sum32()) + if err != nil { + return err + } + return nil +} diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go new file mode 100644 index 000000000..da0e12ba6 --- /dev/null +++ b/index/scorch/segment/zap/build_test.go @@ -0,0 +1,288 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "os" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment/mem" +) + +func TestBuild(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := buildMemSegment() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatal(err) + } +} + +func buildMemSegment() *mem.Segment { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + 
Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + return mem.NewFromAnalyzedDocs(results) +} + +func buildMemSegmentMulti() *mem.Segment { + + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + doc2 := 
&document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + 
analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("who"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := mem.NewFromAnalyzedDocs(results) + + return segment +} diff --git a/index/scorch/segment/zap/cmd/zap/README.md b/index/scorch/segment/zap/cmd/zap/README.md new file mode 100644 index 000000000..99f55d365 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/README.md @@ -0,0 +1,3 @@ +# zap command line utility + +Kind of a hack just put together quickly to let me debug some issues. diff --git a/index/scorch/segment/zap/cmd/zap/cmd/dict.go b/index/scorch/segment/zap/cmd/zap/cmd/dict.go new file mode 100644 index 000000000..74e59e902 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/dict.go @@ -0,0 +1,72 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/binary" + "fmt" + + "github.com/couchbaselabs/vellum" + "github.com/spf13/cobra" +) + +// dictCmd represents the dict command +var dictCmd = &cobra.Command{ + Use: "dict [path] [field]", + Short: "dict prints the term dictionary for the specified field", + Long: `The dict command lets you print the term dictionary for the specified field.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(args) < 2 { + return fmt.Errorf("must specify field") + } + + data := segment.Data() + + addr, err := segment.DictAddr(args[1]) + if err != nil { + return fmt.Errorf("error determing address: %v", err) + } + fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) + + vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) + fmt.Printf("vellum length: %d\n", vellumLen) + fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] + fmt.Printf("raw vellum data % x\n", fstBytes) + fmt.Printf("dictionary:\n\n") + if fstBytes != nil { + fst, err := vellum.Load(fstBytes) + if err != nil { + return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err) + } + + itr, err := fst.Iterator(nil, nil) + for err == nil { + currTerm, currVal := itr.Current() + fmt.Printf("%s - %d (%x)\n", currTerm, currVal, currVal) + err = itr.Next() + } + if err != nil && err != vellum.ErrIteratorDone { + return fmt.Errorf("error iterating dictionary: %v", err) + } + + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(dictCmd) +} diff --git 
a/index/scorch/segment/zap/cmd/zap/cmd/explore.go b/index/scorch/segment/zap/cmd/zap/cmd/explore.go new file mode 100644 index 000000000..42ab82732 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/explore.go @@ -0,0 +1,124 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "encoding/binary" + "fmt" + "log" + + "github.com/couchbaselabs/vellum" + "github.com/spf13/cobra" +) + +// exploreCmd represents the explore command +var exploreCmd = &cobra.Command{ + Use: "explore [path] [field] ", + Short: "explores the index by field, then term (optional), and then docNum (optional)", + Long: `The explore command lets you explore the index in order of field, then optionally by term, then optionally again by doc number.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(args) < 2 { + return fmt.Errorf("must specify field") + } + + data := segment.Data() + + addr, err := segment.DictAddr(args[1]) + if err != nil { + return fmt.Errorf("error determing address: %v", err) + } + fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) + + vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) + fmt.Printf("vellum length: %d\n", vellumLen) + fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] + fmt.Printf("raw vellum data % x\n", fstBytes) + + if len(args) >= 3 { + if fstBytes != nil { + fst, err := vellum.Load(fstBytes) + if err != nil { + 
return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err) + } + postingsAddr, exists, err := fst.Get([]byte(args[2])) + if err != nil { + return fmt.Errorf("error looking for term : %v", err) + } + if exists { + fmt.Printf("postings list begins at %d (%x)\n", postingsAddr, postingsAddr) + + var n uint64 + freqAddr, read := binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64]) + n += uint64(read) + + var locAddr uint64 + locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingListLen uint64 + postingListLen, _ = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + + fmt.Printf("Posting List Length: %d\n", postingListLen) + + fmt.Printf("Freq details at: %d (%x)\n", freqAddr, freqAddr) + numChunks, r2 := binary.Uvarint(data[freqAddr : freqAddr+binary.MaxVarintLen64]) + n = uint64(r2) + + var freqOffsets []uint64 + for j := uint64(0); j < numChunks; j++ { + chunkLen, r3 := binary.Uvarint(data[freqAddr+n : freqAddr+n+binary.MaxVarintLen64]) + n += uint64(r3) + freqOffsets = append(freqOffsets, chunkLen) + } + running := freqAddr + n + for k, offset := range freqOffsets { + fmt.Printf("freq chunk: %d, len %d, start at %d (%x) end %d (%x)\n", k, offset, running, running, running+offset, running+offset) + running += offset + } + + fmt.Printf("Loc details at: %d (%x)\n", locAddr, locAddr) + numLChunks, r4 := binary.Uvarint(data[locAddr : locAddr+binary.MaxVarintLen64]) + n = uint64(r4) + fmt.Printf("there are %d loc chunks\n", numLChunks) + + var locOffsets []uint64 + for j := uint64(0); j < numLChunks; j++ { + log.Printf("reading from %d(%x)\n", locAddr+n, locAddr+n) + log.Printf("data i see here: % x\n", data[locAddr+n:locAddr+n+binary.MaxVarintLen64]) + lchunkLen, r4 := binary.Uvarint(data[locAddr+n : locAddr+n+binary.MaxVarintLen64]) + n += uint64(r4) + log.Printf("see chunk len %d(%x)\n", lchunkLen, lchunkLen) + locOffsets = append(locOffsets, 
lchunkLen) + } + + running2 := locAddr + n + for k, offset := range locOffsets { + fmt.Printf("loc chunk: %d, len %d(%x), start at %d (%x) end %d (%x)\n", k, offset, offset, running2, running2, running2+offset, running2+offset) + running2 += offset + } + + } else { + fmt.Printf("dictionary does not contain term '%s'\n", args[2]) + } + } + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(exploreCmd) +} diff --git a/index/scorch/segment/zap/cmd/zap/cmd/footer.go b/index/scorch/segment/zap/cmd/zap/cmd/footer.go new file mode 100644 index 000000000..177f4e71b --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/footer.go @@ -0,0 +1,43 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +// footerCmd represents the footer command +var footerCmd = &cobra.Command{ + Use: "footer [path]", + Short: "prints the contents of the zap footer", + Long: `The footer command will print the contents of the footer.`, + RunE: func(cmd *cobra.Command, args []string) error { + data := segment.Data() + fmt.Printf("Length: %d\n", len(data)) + fmt.Printf("CRC: %#x\n", segment.CRC()) + fmt.Printf("Version: %d\n", segment.Version()) + fmt.Printf("Chunk Factor: %d\n", segment.ChunkFactor()) + fmt.Printf("Fields Idx: %d (%#x)\n", segment.FieldsIndexOffset(), segment.FieldsIndexOffset()) + fmt.Printf("Stored Idx: %d (%#x)\n", segment.StoredIndexOffset(), segment.StoredIndexOffset()) + fmt.Printf("Num Docs: %d\n", segment.NumDocs()) + return nil + }, +} + +func init() { + RootCmd.AddCommand(footerCmd) +} diff --git a/index/scorch/segment/zap/cmd/zap/cmd/root.go b/index/scorch/segment/zap/cmd/zap/cmd/root.go new file mode 100644 index 000000000..f969bbf13 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/root.go @@ -0,0 +1,58 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "fmt" + "os" + + "github.com/blevesearch/bleve/index/scorch/segment/zap" + "github.com/spf13/cobra" +) + +var segment *zap.Segment + +// RootCmd represents the base command when called without any subcommands +var RootCmd = &cobra.Command{ + Use: "zap", + Short: "command-line tool to interact with a zap file", + Long: `Zap is a command-line tool to interact with a zap file.`, + PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 1 { + return fmt.Errorf("must specify path to zap file") + } + + segInf, err := zap.Open(args[0]) + if err != nil { + return fmt.Errorf("error opening zap file: %v", err) + } + segment = segInf.(*zap.Segment) + + return nil + }, + PersistentPostRunE: func(cmd *cobra.Command, args []string) error { + return nil + }, +} + +// Execute adds all child commands to the root command sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + if err := RootCmd.Execute(); err != nil { + fmt.Println(err) + os.Exit(-1) + } +} diff --git a/index/scorch/segment/zap/cmd/zap/cmd/stored.go b/index/scorch/segment/zap/cmd/zap/cmd/stored.go new file mode 100644 index 000000000..64e42c7e6 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/stored.go @@ -0,0 +1,73 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "encoding/binary" + "fmt" + "strconv" + + "github.com/golang/snappy" + "github.com/spf13/cobra" +) + +// storedCmd represents the stored command +var storedCmd = &cobra.Command{ + Use: "stored [path] [docNum]", + Short: "prints the stored section for a doc number", + Long: `The stored command will print the raw stored data bytes for the specified document number.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(args) < 2 { + return fmt.Errorf("must specify doc number") + } + docNum, err := strconv.Atoi(args[1]) + if err != nil { + return fmt.Errorf("unable to parse doc number: %v", err) + } + if docNum >= int(segment.NumDocs()) { + return fmt.Errorf("invalid doc number %d (valid 0 - %d)", docNum, segment.NumDocs()-1) + } + data := segment.Data() + storedIdx := segment.StoredIndexOffset() + // read docNum entry in the index + indexPos := storedIdx + (8 * uint64(docNum)) + storedStartAddr := binary.BigEndian.Uint64(data[indexPos : indexPos+8]) + fmt.Printf("Stored field starts at %d (%#x)\n", storedStartAddr, storedStartAddr) + + var n uint64 + metaLen, read := binary.Uvarint(data[storedStartAddr : storedStartAddr+binary.MaxVarintLen64]) + n += uint64(read) + fmt.Printf("Meta Len: %d\n", metaLen) + var dataLen uint64 + dataLen, read = binary.Uvarint(data[storedStartAddr+n : storedStartAddr+n+binary.MaxVarintLen64]) + n += uint64(read) + fmt.Printf("Data Len: %d\n", dataLen) + meta := data[storedStartAddr+n : storedStartAddr+n+metaLen] + fmt.Printf("Raw meta: % x\n", meta) + raw := data[storedStartAddr+n+metaLen : storedStartAddr+n+metaLen+dataLen] + fmt.Printf("Raw data (len %d): % x\n", len(raw), raw) + uncompressed, err := snappy.Decode(nil, raw) + if err != nil { + panic(err) + } + fmt.Printf("Uncompressed data (len %d): % x\n", len(uncompressed), uncompressed) + + return nil + }, +} + +func init() { + RootCmd.AddCommand(storedCmd) +} diff --git a/index/scorch/segment/zap/cmd/zap/main.go 
b/index/scorch/segment/zap/cmd/zap/main.go new file mode 100644 index 000000000..23c500a33 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/main.go @@ -0,0 +1,23 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "github.com/blevesearch/bleve/index/scorch/segment/zap/cmd/zap/cmd" +) + +func main() { + cmd.Execute() +} diff --git a/index/scorch/segment/zap/count.go b/index/scorch/segment/zap/count.go new file mode 100644 index 000000000..2f0b92de2 --- /dev/null +++ b/index/scorch/segment/zap/count.go @@ -0,0 +1,55 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "hash" + "hash/crc32" + "io" +) + +// CountHashWriter is a wrapper around a Writer which counts the number of +// bytes which have been written +type CountHashWriter struct { + w io.Writer + h hash.Hash32 + n int +} + +// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer +func NewCountHashWriter(w io.Writer) *CountHashWriter { + return &CountHashWriter{ + w: w, + h: crc32.NewIEEE(), + } +} + +// Write writes the provided bytes to the wrapped writer and counts the bytes +func (c *CountHashWriter) Write(b []byte) (int, error) { + n, err := c.w.Write(b) + c.n += n + _, _ = c.h.Write(b) + return n, err +} + +// Count returns the number of bytes written +func (c *CountHashWriter) Count() int { + return c.n +} + +// Sum32 returns the CRC-32 hash of the content written to this writer +func (c *CountHashWriter) Sum32() uint32 { + return c.h.Sum32() +} diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go new file mode 100644 index 000000000..d69195958 --- /dev/null +++ b/index/scorch/segment/zap/dict.go @@ -0,0 +1,165 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "encoding/binary" + "fmt" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbaselabs/vellum" + "github.com/couchbaselabs/vellum/regexp" +) + +// Dictionary is the zap representation of the term dictionary +type Dictionary struct { + segment *Segment + field string + fieldID uint16 + fst *vellum.FST +} + +// PostingsList returns the postings list for the specified term +func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { + return d.postingsList(term, except) +} + +func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { + rv := &PostingsList{ + dictionary: d, + term: term, + except: except, + } + + if d.fst != nil { + postingsOffset, exists, err := d.fst.Get([]byte(term)) + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if exists { + rv.postingsOffset = postingsOffset + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := d.segment.mm[postingsOffset+n : postingsOffset+n+postingsLen] + + bitmap := roaring.NewBitmap() + _, err = bitmap.FromBuffer(roaringBytes) + if err != nil { + return nil, fmt.Errorf("error loading roaring bitmap: %v", err) + } + + rv.postings = bitmap + } + } + + return rv, nil +} + +// Iterator returns an iterator for this dictionary +func (d *Dictionary) Iterator() segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil 
{ + itr, err := d.fst.Iterator(nil, nil) + if err == nil { + rv.itr = itr + } + } + + return rv +} + +// PrefixIterator returns an iterator which only visits terms having the +// the specified prefix +func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + r, err := regexp.New(prefix + ".*") + if err == nil { + itr, err := d.fst.Search(r, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + +// RangeIterator returns an iterator which only visits terms between the +// start and end terms. NOTE: bleve.index API specifies the end is inclusive. +func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + // need to increment the end position to be inclusive + endBytes := []byte(end) + if endBytes[len(endBytes)-1] < 0xff { + endBytes[len(endBytes)-1]++ + } else { + endBytes = append(endBytes, 0xff) + } + + if d.fst != nil { + itr, err := d.fst.Iterator([]byte(start), endBytes) + if err == nil { + rv.itr = itr + } + } + + return rv +} + +// DictionaryIterator is an iterator for term dictionary +type DictionaryIterator struct { + d *Dictionary + itr vellum.Iterator + err error +} + +// Next returns the next entry in the dictionary +func (i *DictionaryIterator) Next() (*index.DictEntry, error) { + if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil + } else if i.err != nil { + return nil, i.err + } + term, count := i.itr.Current() + rv := &index.DictEntry{ + Term: string(term), + Count: count, + } + i.err = i.itr.Next() + return rv, nil +} diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go new file mode 100644 index 000000000..336fb37ca --- /dev/null +++ b/index/scorch/segment/zap/dict_test.go @@ -0,0 +1,183 @@ +// Copyright (c) 2017 Couchbase, Inc. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"os"
	"reflect"
	"testing"

	"github.com/blevesearch/bleve/analysis"
	"github.com/blevesearch/bleve/document"
	"github.com/blevesearch/bleve/index"
	"github.com/blevesearch/bleve/index/scorch/segment/mem"
)

// buildMemSegmentForDict forges an analyzed single-document in-memory
// segment whose "desc" field contains seven known terms, used by the
// dictionary iterator test below.
func buildMemSegmentForDict() *mem.Segment {
	doc := &document.Document{
		ID: "a",
		Fields: []document.Field{
			document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil),
			document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil),
		},
	}

	// forge analyzed docs (token byte offsets are hand-written fixtures;
	// the dictionary test only depends on the Term values)
	results := []*index.AnalysisResult{
		&index.AnalysisResult{
			Document: doc,
			Analyzed: []analysis.TokenFrequencies{
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      1,
						Position: 1,
						Term:     []byte("a"),
					},
				}, nil, false),
				analysis.TokenFrequency(analysis.TokenStream{
					&analysis.Token{
						Start:    0,
						End:      5,
						Position: 1,
						Term:     []byte("apple"),
					},
					&analysis.Token{
						Start:    6,
						End:      10,
						Position: 2,
						Term:     []byte("ball"),
					},
					&analysis.Token{
						Start:    11,
						End:      14,
						Position: 3,
						Term:     []byte("cat"),
					},
					&analysis.Token{
						Start:    15,
						End:      18,
						Position: 4,
						Term:     []byte("dog"),
					},
					&analysis.Token{
						Start:    19,
						End:      22,
						Position: 5,
						Term:     []byte("egg"),
					},
					&analysis.Token{
						Start:    20,
						End:      24,
						Position: 6,
						Term:     []byte("fish"),
					},
					&analysis.Token{
						Start:    25,
						End:      28,
						Position: 7,
						Term:     []byte("bat"),
					},
				}, nil, true),
			},
			Length: []int{
				1,
				7,
			},
		},
	}

	segment := mem.NewFromAnalyzedDocs(results)

	return segment
}

// TestDictionary persists the forged segment, reopens it, and verifies the
// full, prefix, and range dictionary iterators return the expected terms
// in sorted order.
func TestDictionary(t *testing.T) {

	_ = os.RemoveAll("/tmp/scorch.zap")

	memSegment := buildMemSegmentForDict()
	err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024)
	if err != nil {
		t.Fatalf("error persisting segment: %v", err)
	}

	segment, err := Open("/tmp/scorch.zap")
	if err != nil {
		t.Fatalf("error opening segment: %v", err)
	}
	defer func() {
		cerr := segment.Close()
		if cerr != nil {
			t.Fatalf("error closing segment: %v", err)
		}
	}()

	dict, err := segment.Dictionary("desc")
	if err != nil {
		t.Fatal(err)
	}

	// test basic full iterator
	expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"}
	var got []string
	itr := dict.Iterator()
	next, err := itr.Next()
	for next != nil && err == nil {
		got = append(got, next.Term)
		next, err = itr.Next()
	}
	if err != nil {
		t.Fatalf("dict itr error: %v", err)
	}

	if !reflect.DeepEqual(expected, got) {
		t.Errorf("expected: %v, got: %v", expected, got)
	}

	// test prefix iterator
	expected = []string{"ball", "bat"}
	got = got[:0]
	itr = dict.PrefixIterator("b")
	next, err = itr.Next()
	for next != nil && err == nil {
		got = append(got, next.Term)
		next, err = itr.Next()
	}
	if err != nil {
		t.Fatalf("dict itr error: %v", err)
	}

	if !reflect.DeepEqual(expected, got) {
		t.Errorf("expected: %v, got: %v", expected, got)
	}

	// test range iterator (end is inclusive)
	expected = []string{"cat", "dog", "egg"}
	got = got[:0]
	itr = dict.RangeIterator("cat", "egg")
	next, err = itr.Next()
	for next != nil && err == nil {
		got = append(got, next.Term)
		next, err = itr.Next()
	}
	if err != nil {
		t.Fatalf("dict itr error: %v", err)
	}

	if !reflect.DeepEqual(expected, got) {
		t.Errorf("expected: %v, got: %v", expected, got)
	}
}

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package zap

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"math"

	"github.com/RoaringBitmap/roaring"
	"github.com/Smerity/govarint"
	"github.com/blevesearch/bleve/index/scorch/segment"
)

// PostingsList is an in-memory representation of a postings list
type PostingsList struct {
	dictionary     *Dictionary
	term           string
	postingsOffset uint64
	freqOffset     uint64
	locOffset      uint64
	postings       *roaring.Bitmap
	except         *roaring.Bitmap
	postingKey     []byte
}

// Iterator returns an iterator for this postings list. It eagerly decodes
// the freq and loc chunk-length tables from the mmap'd data so the
// iterator can seek to any chunk.
func (p *PostingsList) Iterator() segment.PostingsIterator {
	rv := &PostingsIterator{
		postings: p,
	}
	if p.postings != nil {
		// prepare the freq chunk details: a uvarint count followed by one
		// uvarint length per chunk
		var n uint64
		var read int
		var numFreqChunks uint64
		numFreqChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		rv.freqChunkLens = make([]uint64, int(numFreqChunks))
		for i := 0; i < int(numFreqChunks); i++ {
			rv.freqChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		rv.freqChunkStart = p.freqOffset + n

		// prepare the loc chunk details (same layout as the freq chunks)
		n = 0
		var numLocChunks uint64
		numLocChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
		n += uint64(read)
		rv.locChunkLens = make([]uint64, int(numLocChunks))
		for i := 0; i < int(numLocChunks); i++ {
			rv.locChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64])
			n += uint64(read)
		}
		rv.locChunkStart = p.locOffset + n

		rv.all = p.postings.Iterator()
		if p.except != nil {
			allExcept := p.postings.Clone()
			allExcept.AndNot(p.except)
			rv.actual = allExcept.Iterator()
		} else {
			rv.actual = p.postings.Iterator()
		}
	}

	return rv
}

// Count returns the number of items on this postings list
func (p *PostingsList) Count() uint64 {
	var rv uint64
	if p.postings != nil {
		rv = p.postings.GetCardinality()
		if p.except != nil {
			except := p.except.GetCardinality()
			if except > rv {
				// avoid underflow
				except = rv
			}
			rv -= except
		}
	}
	return rv
}

// PostingsIterator provides a way to iterate through the postings list.
// It walks two roaring iterators in lockstep: 'all' (every posting) and
// 'actual' (postings minus the except set), decoding freq/norm/loc data
// chunk by chunk as it advances.
type PostingsIterator struct {
	postings  *PostingsList
	all       roaring.IntIterable
	offset    int
	locoffset int
	actual    roaring.IntIterable

	currChunk         uint32
	currChunkFreqNorm []byte
	currChunkLoc      []byte
	freqNormDecoder   *govarint.Base128Decoder
	locDecoder        *govarint.Base128Decoder

	freqChunkLens  []uint64
	freqChunkStart uint64

	locChunkLens  []uint64
	locChunkStart uint64
}

// loadChunk positions the freq/norm and loc decoders at the start of the
// requested chunk by summing the preceding chunk lengths.
func (i *PostingsIterator) loadChunk(chunk int) error {
	if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) {
		return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens))
	}
	// load correct chunk bytes
	start := i.freqChunkStart
	for j := 0; j < chunk; j++ {
		start += i.freqChunkLens[j]
	}
	end := start + i.freqChunkLens[chunk]
	i.currChunkFreqNorm = i.postings.dictionary.segment.mm[start:end]
	i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm))

	start = i.locChunkStart
	for j := 0; j < chunk; j++ {
		start += i.locChunkLens[j]
	}
	end = start + i.locChunkLens[chunk]
	i.currChunkLoc = i.postings.dictionary.segment.mm[start:end]
	i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc))
	i.currChunk = uint32(chunk)
	return nil
}

// readFreqNorm reads the next freq and norm-bits pair from the current
// freq/norm chunk.
func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) {
	freq, err := i.freqNormDecoder.GetU64()
	if err != nil {
		return 0, 0, fmt.Errorf("error reading frequency: %v", err)
	}
	normBits, err := i.freqNormDecoder.GetU64()
	if err != nil {
		return 0, 0, fmt.Errorf("error reading norm: %v", err)
	}
	return freq, normBits, err
}

// readLocation processes all the integers on the stream representing a single
// location. if you care about it, pass in a non-nil location struct, and we
// will fill it. if you don't care about it, pass in nil and we safely consume
// the contents.
func (i *PostingsIterator) readLocation(l *Location) error {
	// read off field
	fieldID, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location field: %v", err)
	}
	// read off pos
	pos, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location pos: %v", err)
	}
	// read off start
	start, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location start: %v", err)
	}
	// read off end
	end, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location end: %v", err)
	}
	// read off num array pos
	numArrayPos, err := i.locDecoder.GetU64()
	if err != nil {
		return fmt.Errorf("error reading location num array pos: %v", err)
	}

	// group these together for less branching
	if l != nil {
		l.field = i.postings.dictionary.segment.fieldsInv[fieldID]
		l.pos = pos
		l.start = start
		l.end = end
		if numArrayPos > 0 {
			l.ap = make([]uint64, int(numArrayPos))
		}
	}

	// read off array positions (must be consumed even when l is nil, to
	// keep the decoder aligned with the stream)
	for k := 0; k < int(numArrayPos); k++ {
		ap, err := i.locDecoder.GetU64()
		if err != nil {
			return fmt.Errorf("error reading array position: %v", err)
		}
		if l != nil {
			l.ap[k] = ap
		}
	}

	return nil
}

// Next returns the next posting on the postings list, or nil at the end
func (i *PostingsIterator) Next() (segment.Posting, error) {
	if i.actual == nil || !i.actual.HasNext() {
		return nil, nil
	}
	n := i.actual.Next()
	nChunk := n / i.postings.dictionary.segment.chunkFactor
	allN := i.all.Next()
	allNChunk := allN / i.postings.dictionary.segment.chunkFactor

	// n is the next actual hit (excluding some postings)
	// allN is the next hit in the full postings
	// if they don't match, adjust offsets to factor in item we're skipping over
	// incr the all iterator, and check again
	// NOTE(review): allNChunk is computed once before the loop and not
	// recomputed as allN advances — confirm skipped items spanning a chunk
	// boundary are handled as intended
	for allN != n {

		// in different chunks, reset offsets
		if allNChunk != nChunk {
			i.locoffset = 0
			i.offset = 0
		} else {

			if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
				err := i.loadChunk(int(nChunk))
				if err != nil {
					return nil, fmt.Errorf("error loading chunk: %v", err)
				}
			}

			// read off freq/offsets even though we don't care about them
			freq, _, err := i.readFreqNorm()
			if err != nil {
				return nil, err
			}
			if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
				for j := 0; j < int(freq); j++ {
					err := i.readLocation(nil)
					if err != nil {
						return nil, err
					}
				}
			}

			// in same chunk, need to account for offsets
			i.offset++
		}

		allN = i.all.Next()
	}

	if i.currChunk != nChunk || i.currChunkFreqNorm == nil {
		err := i.loadChunk(int(nChunk))
		if err != nil {
			return nil, fmt.Errorf("error loading chunk: %v", err)
		}
	}

	rv := &Posting{
		iterator: i,
		docNum:   uint64(n),
	}

	var err error
	var normBits uint64
	rv.freq, normBits, err = i.readFreqNorm()
	if err != nil {
		return nil, err
	}
	rv.norm = math.Float32frombits(uint32(normBits))
	if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] {
		// read off 'freq' locations
		rv.locs = make([]segment.Location, rv.freq)
		locs := make([]Location, rv.freq)
		for j := 0; j < int(rv.freq); j++ {
			err := i.readLocation(&locs[j])
			if err != nil {
				return nil, err
			}
			rv.locs[j] = &locs[j]
		}
	}

	return rv, nil
}

// Posting is a single entry in a postings list
type Posting struct {
	iterator *PostingsIterator
	docNum   uint64

	freq uint64
	norm float32
	locs []segment.Location
}

// Number returns the document number of this posting in this segment
func (p *Posting) Number() uint64 {
	return p.docNum
}

// Frequency returns the frequency of occurrence of this term in this doc/field
func (p *Posting) Frequency() uint64 {
	return p.freq
}

// Norm returns the normalization factor for this posting
func (p *Posting) Norm() float64 {
	return float64(p.norm)
}

// Locations returns the location information for each occurrence
func (p *Posting) Locations() []segment.Location {
	return p.locs
}

// Location represents the location of a single occurrence
type Location struct {
	field string
	pos   uint64
	start uint64
	end   uint64
	ap    []uint64
}

// Field returns the name of the field (useful in composite fields to know
// which original field the value came from)
func (l *Location) Field() string {
	return l.field
}

// Start returns the start byte offset of this occurrence
func (l *Location) Start() uint64 {
	return l.start
}

// End returns the end byte offset of this occurrence
func (l *Location) End() uint64 {
	return l.end
}

// Pos returns the 1-based phrase position of this occurrence
func (l *Location) Pos() uint64 {
	return l.pos
}

// ArrayPositions returns the array position vector associated with this occurrence
func (l *Location) ArrayPositions() []uint64 {
	return l.ap
}

// Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
package zap

import (
	"bytes"
	"encoding/binary"
	"fmt"
	"io"
	"os"

	"github.com/RoaringBitmap/roaring"
	"github.com/Smerity/govarint"
	"github.com/blevesearch/bleve/index/scorch/segment"
	"github.com/couchbaselabs/vellum"
	mmap "github.com/edsrzf/mmap-go"
	"github.com/golang/snappy"
)

// Open returns a zap impl of a segment, mmap'ing the file at path and
// loading the footer config and field index eagerly.
func Open(path string) (segment.Segment, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	mm, err := mmap.Map(f, mmap.RDONLY, 0)
	if err != nil {
		// mmap failed, try to close the file
		_ = f.Close()
		return nil, err
	}

	rv := &Segment{
		f:         f,
		mm:        mm,
		path:      path,
		fieldsMap: make(map[string]uint16),
	}

	err = rv.loadConfig()
	if err != nil {
		_ = rv.Close()
		return nil, err
	}

	err = rv.loadFields()
	if err != nil {
		_ = rv.Close()
		return nil, err
	}

	return rv, nil
}

// Segment implements the segment.Segment interface over top the zap file format
type Segment struct {
	f                 *os.File
	mm                mmap.MMap
	path              string
	crc               uint32
	version           uint32
	chunkFactor       uint32
	numDocs           uint64
	storedIndexOffset uint64
	fieldsIndexOffset uint64

	// fieldsMap maps field name -> fieldID+1 (0 means "not present")
	fieldsMap     map[string]uint16
	fieldsInv     []string
	fieldsLoc     []bool
	fieldsOffsets []uint64
}

// loadConfig reads the fixed-size footer fields (CRC, version, chunk
// factor, fields/stored index offsets, doc count) working backward from
// the end of the mmap'd file.
func (s *Segment) loadConfig() error {
	crcOffset := len(s.mm) - 4
	s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4])
	verOffset := crcOffset - 4
	s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4])
	if s.version != version {
		return fmt.Errorf("unsupported version %d", s.version)
	}
	chunkOffset := verOffset - 4
	s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4])
	fieldsOffset := chunkOffset - 8
	s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8])
	storedOffset := fieldsOffset - 8
	s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8])
	docNumOffset := storedOffset - 8
	s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8])
	return nil

}

// loadFields walks the fields index, populating the per-field name,
// has-locations flag, and dictionary offset tables.
func (s *Segment) loadFields() error {
	// NOTE for now we assume the fields index immediately precedes the footer
	// if this changes, need to adjust accordingly (or store explicit length)
	fieldsIndexEnd := uint64(len(s.mm) - footerSize)

	// iterate through fields index
	var fieldID uint64
	for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd {
		addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8])
		var n uint64
		hasStoredLoc, read := binary.Uvarint(s.mm[addr:fieldsIndexEnd])
		n += uint64(read)
		if hasStoredLoc == 1 {
			s.fieldsLoc = append(s.fieldsLoc, true)
		} else {
			s.fieldsLoc = append(s.fieldsLoc, false)
		}

		var dictLoc uint64
		dictLoc, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)
		s.fieldsOffsets = append(s.fieldsOffsets, dictLoc)

		var nameLen uint64
		nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd])
		n += uint64(read)

		name := string(s.mm[addr+n : addr+n+nameLen])
		s.fieldsInv = append(s.fieldsInv, name)
		// stored as fieldID+1 so that 0 can mean "unknown field"
		s.fieldsMap[name] = uint16(fieldID + 1)

		fieldID++
	}
	return nil
}

// Dictionary returns the term dictionary for the specified field
func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) {
	dict, err := s.dictionary(field)
	if err == nil && dict == nil {
		return &segment.EmptyDictionary{}, nil
	}
	return dict, err
}

// dictionary loads the vellum FST for the field, or returns nil if the
// field is unknown in this segment.
func (s *Segment) dictionary(field string) (*Dictionary, error) {
	rv := &Dictionary{
		segment: s,
		field:   field,
	}

	rv.fieldID = s.fieldsMap[field]
	if rv.fieldID > 0 {
		rv.fieldID = rv.fieldID - 1

		dictStart := s.fieldsOffsets[rv.fieldID]

		// read the length of the vellum data
		vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64])
		fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen]
		if fstBytes != nil {
			fst, err := vellum.Load(fstBytes)
			if err != nil {
				return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err)
			}
			if err == nil {
				rv.fst = fst
			}
		}

	} else {
		return nil, nil
	}

	return rv, nil
}

// VisitDocument invokes the DocFieldValueVistor for each stored field
// for the specified doc number
func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error {
	// first make sure this is a valid number in this segment
	if num < s.numDocs {
		docStoredStartAddr := s.storedIndexOffset + (8 * num)
		docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8])
		var n uint64
		metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64])
		n += uint64(read)
		var dataLen uint64
		dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64])
		n += uint64(read)
		meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen]
		data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen]
		// NOTE(review): decode failure panics rather than returning an
		// error — confirm whether corrupt stored data should be fatal here
		uncompressed, err := snappy.Decode(nil, data)
		if err != nil {
			panic(err)
		}
		// now decode meta and process: each entry is
		// field, type, offset, length, num-array-positions, [array positions...]
		reader := bytes.NewReader(meta)
		decoder := govarint.NewU64Base128Decoder(reader)

		keepGoing := true
		for keepGoing {
			field, err := decoder.GetU64()
			if err == io.EOF {
				break
			}
			if err != nil {
				return err
			}
			typ, err := decoder.GetU64()
			if err != nil {
				return err
			}
			offset, err := decoder.GetU64()
			if err != nil {
				return err
			}
			l, err := decoder.GetU64()
			if err != nil {
				return err
			}
			numap, err := decoder.GetU64()
			if err != nil {
				return err
			}
			var arrayPos []uint64
			if numap > 0 {
				arrayPos = make([]uint64, numap)
				for i := 0; i < int(numap); i++ {
					ap, err := decoder.GetU64()
					if err != nil {
						return err
					}
					arrayPos[i] = ap
				}
			}

			value := uncompressed[offset : offset+l]
			// visitor returning false terminates the iteration early
			keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos)
		}
	}

	return nil
}

// Count returns the number of documents in this segment.
func (s *Segment) Count() uint64 {
	return s.numDocs
}

// DocNumbers returns a bitset corresponding to the doc numbers of all the
// provided _id strings
func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) {
	rv := roaring.New()

	if len(s.fieldsMap) > 0 {
		idDict, err := s.dictionary("_id")
		if err != nil {
			return nil, err
		}

		for _, id := range ids {
			postings, err := idDict.postingsList(id, nil)
			if err != nil {
				return nil, err
			}
			if postings.postings != nil {
				rv.Or(postings.postings)
			}
		}
	}

	return rv, nil
}

// Fields returns the field names used in this segment
func (s *Segment) Fields() []string {
	return s.fieldsInv
}

// Path returns the path of this segment on disk
func (s *Segment) Path() string {
	return s.path
}

// Close releases all resources associated with this segment
func (s *Segment) Close() (err error) {
	if s.mm != nil {
		err = s.mm.Unmap()
	}
	// try to close file even if unmap failed
	if s.f != nil {
		err2 := s.f.Close()
		if err == nil {
			// try to return first error
			err = err2
		}
	}
	return
}

// some helpers i started adding for the command-line utility

// Data returns the underlying mmaped data slice
func (s *Segment) Data() []byte {
	return s.mm
}

// CRC returns the CRC value stored in the file footer
func (s *Segment) CRC() uint32 {
	return s.crc
}

// Version returns the file version in the file footer
func (s *Segment) Version() uint32 {
	return s.version
}

// ChunkFactor returns the chunk factor in the file footer
func (s *Segment) ChunkFactor() uint32 {
	return s.chunkFactor
}

// FieldsIndexOffset returns the fields index offset in the file footer
func (s *Segment) FieldsIndexOffset() uint64 {
	return s.fieldsIndexOffset
}

// StoredIndexOffset returns the stored value index offset in the file footer
func (s *Segment) StoredIndexOffset() uint64 {
+ return s.storedIndexOffset +} + +// NumDocs returns the number of documents in the file footer +func (s *Segment) NumDocs() uint64 { + return s.numDocs +} + +// DictAddr is a helper function to compute the file offset where the +// dictionary is stored for the specified field. +func (s *Segment) DictAddr(field string) (uint64, error) { + var fieldID uint16 + var ok bool + if fieldID, ok = s.fieldsMap[field]; !ok { + return 0, fmt.Errorf("no such field '%s'", field) + } + + return s.fieldsOffsets[fieldID-1], nil +} diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go new file mode 100644 index 000000000..d4241c1d9 --- /dev/null +++ b/index/scorch/segment/zap/segment_test.go @@ -0,0 +1,517 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "math" + "os" + "reflect" + "testing" +) + +func TestOpen(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := buildMemSegment() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "desc": struct{}{}, + "tag": struct{}{}, + } + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } + + docCount := segment.Count() + if docCount != 1 { + t.Errorf("expected count 1, got %d", docCount) + } + + // check the _id field + dict, err := segment.Dictionary("_id") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("a", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + + nextPosting, err = postingsItr.Next() + 
} + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the name field + dict, err = segment.Dictionary("name") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + if nextPosting.Norm() != 1.0 { + t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // check the _all field (composite) + dict, err = segment.Dictionary("_all") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, 
expected non-nil") + } + + postingsList, err = dict.PostingsList("wow", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + expectedNorm := float32(1.0 / math.Sqrt(float64(5))) + if nextPosting.Norm() != float64(expectedNorm) { + t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "name" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 3 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + if loc.ArrayPositions() != nil { + t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } + + // now try a field with array positions + dict, err = segment.Dictionary("tag") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err = dict.PostingsList("dark", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, 
expected non-nil") + } + + postingsItr = postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + nextPosting, err = postingsItr.Next() + for nextPosting != nil && err == nil { + + if nextPosting.Frequency() != 1 { + t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) + } + if nextPosting.Number() != 0 { + t.Errorf("expected doc number 0, got %d", nextPosting.Number()) + } + var numLocs uint64 + for _, loc := range nextPosting.Locations() { + numLocs++ + if loc.Field() != "tag" { + t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) + } + if loc.Start() != 0 { + t.Errorf("expected loc start to be 0, got %d", loc.Start()) + } + if loc.End() != 4 { + t.Errorf("expected loc end to be 3, got %d", loc.End()) + } + if loc.Pos() != 1 { + t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) + } + expectArrayPos := []uint64{1} + if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { + t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) + } + } + if numLocs != nextPosting.Frequency() { + t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) + } + + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + // now try and visit a document + var fieldValuesSeen int + err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldValuesSeen++ + return true + }) + if err != nil { + t.Fatal(err) + } + if fieldValuesSeen != 5 { + t.Errorf("expected 5 field values, got %d", fieldValuesSeen) + } +} + +func TestOpenMulti(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := 
segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + // check the desc field + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("thing", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 2 { + t.Errorf("expected count to be 2, got %d", count) + } + + // get docnum of a + exclude, err := segment.DocNumbers([]string{"a"}) + if err != nil { + t.Fatal(err) + } + + // look for term 'thing' excluding doc 'a' + postingsListExcluding, err := dict.PostingsList("thing", exclude) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsListExcludingCount := postingsListExcluding.Count() + if postingsListExcludingCount != 1 { + t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) + } + + postingsItrExcluding := postingsListExcluding.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItrExcluding.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItrExcluding.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } +} + +func TestOpenMultiWithTwoChunks(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := 
buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + // check the desc field + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + if dict == nil { + t.Fatal("got nil dict, expected non-nil") + } + + postingsList, err := dict.PostingsList("thing", nil) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItr := postingsList.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count := 0 + nextPosting, err := postingsItr.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItr.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 2 { + t.Errorf("expected count to be 2, got %d", count) + } + + // get docnum of a + exclude, err := segment.DocNumbers([]string{"a"}) + if err != nil { + t.Fatal(err) + } + + // look for term 'thing' excluding doc 'a' + postingsListExcluding, err := dict.PostingsList("thing", exclude) + if err != nil { + t.Fatal(err) + } + if postingsList == nil { + t.Fatal("got nil postings list, expected non-nil") + } + + postingsItrExcluding := postingsListExcluding.Iterator() + if postingsItr == nil { + t.Fatal("got nil iterator, expected non-nil") + } + + count = 0 + nextPosting, err = postingsItrExcluding.Next() + for nextPosting != nil && err == nil { + count++ + nextPosting, err = postingsItrExcluding.Next() + } + if err != nil { + t.Fatal(err) + } + + if count != 1 { + t.Errorf("expected count to be 1, got %d", count) + } +} From 
414899618b647f72f9633960be20586416f118f3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 9 Dec 2017 14:28:50 -0500 Subject: [PATCH 032/728] switch from bolt format to zap in the persister --- index/scorch/persister.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 44393e961..8a171b311 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -23,8 +23,8 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" - scorchBolt "github.com/blevesearch/bleve/index/scorch/segment/bolt" "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/boltdb/bolt" ) @@ -142,9 +142,9 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { switch seg := segmentSnapshot.segment.(type) { case *mem.Segment: // need to persist this to disk - filename := fmt.Sprintf("%x.bolt", segmentSnapshot.id) + filename := fmt.Sprintf("%x.zap", segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename - err2 := scorchBolt.PersistSegment(seg, path, 1024) + err2 := zap.PersistSegment(seg, path, 1024) if err2 != nil { return fmt.Errorf("error persisting segment: %v", err2) } @@ -153,8 +153,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { if err != nil { return err } - case *scorchBolt.Segment: - + case *zap.Segment: path := seg.Path() filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) @@ -181,7 +180,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { // now try to open all the new snapshots newSegments := make(map[uint64]segment.Segment) for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = scorchBolt.Open(path) + newSegments[segmentID], err = zap.Open(path) if err != nil { return fmt.Errorf("error opening new 
segment at %s, %v", path, err) } @@ -321,7 +320,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - segment, err := scorchBolt.Open(segmentPath) + segment, err := zap.Open(segmentPath) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } From e0d9828cd05bfd0ee9d118cec196af4aca7b8a42 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 9 Dec 2017 14:42:36 -0500 Subject: [PATCH 033/728] add more detail to the readme --- index/scorch/segment/zap/README.md | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index 079eef9b3..39735d34d 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -1,5 +1,25 @@ # zap file format +The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. + +Current usage: + +- mmap the entire file +- crc-32 bytes and version are in fixed position at end of the file +- reading remainder of footer could be version specific +- remainder of footer gives us: + - 2 important offsets (fields index and stored data index) + - 2 important values (number of docs and chunk factor) +- field data is processed once and memoized onto the heap so that we never have to go back to disk for it +- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. 
+- access to all other indexed data follows the following pattern: + - first know the field name -> convert to id + - next navigate to term dictionary for that field + - some operations stop here and do dictionary ops + - next use dictionary to navigate to posting list for a specific term + - walk posting list + - if necessary, walk posting details as we go + ## stored fields section - for each document From dc0adc882739a6a1c819aa7b7c144ce4fbfe05a0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 9 Dec 2017 20:52:01 -0500 Subject: [PATCH 034/728] add fsync --- index/scorch/segment/zap/build.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 1ed95e80f..bef263ac1 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -87,6 +87,11 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e return err } + err = f.Sync() + if err != nil { + return err + } + err = f.Close() if err != nil { return err From 690cd39921945253fe3227649fec7cec89074575 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 10 Dec 2017 08:55:59 -0500 Subject: [PATCH 035/728] add crazy slow but functional DocumentVisitFieldTerms --- index/scorch/reader.go | 2 +- index/scorch/scorch_test.go | 60 ++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 9 +++++ index/scorch/snapshot_segment.go | 45 ++++++++++++++++++++++++ 4 files changed, 115 insertions(+), 1 deletion(-) diff --git a/index/scorch/reader.go b/index/scorch/reader.go index 0e643f7ca..e4abbce64 100644 --- a/index/scorch/reader.go +++ b/index/scorch/reader.go @@ -58,7 +58,7 @@ func (r *Reader) Document(id string) (*document.Document, error) { } func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, visitor index.DocumentFieldTermVisitor) error { - panic("document visit field terms not implemented") + return r.root.DocumentVisitFieldTerms(id, fields, 
visitor) } func (r *Reader) Fields() ([]string, error) { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 52a86ab2e..25854bcd3 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1195,6 +1195,66 @@ func TestIndexTermReaderCompositeFields(t *testing.T) { } } +func TestIndexDocumentVisitFieldTerms(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + fieldTerms := make(index.FieldTerms) + + err = indexReader.DocumentVisitFieldTerms(index.IndexInternalID("1"), []string{"name", "title"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms := index.FieldTerms{ + "name": []string{"test"}, + "title": []string{"mister"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } +} + func TestConcurrentUpdate(t *testing.T) { defer func() { err := DestroyTest() diff --git 
a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index c96082c54..d2efa2dcd 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -341,6 +341,15 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return rv, nil } +func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, + visitor index.DocumentFieldTermVisitor) error { + + docNum := docInternalToNumber(id) + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) + + return i.segment[segmentIndex].DocumentVisitFieldTerms(localDocNum, fields, visitor) +} + func docNumberToBytes(in uint64) []byte { buf := new(bytes.Buffer) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 1a50eb6cc..14c49450b 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -16,6 +16,7 @@ package scorch import ( "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -56,6 +57,50 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel return s.segment.VisitDocument(num, visitor) } +func (s *SegmentSnapshot) DocumentVisitFieldTerms(num uint64, fields []string, + visitor index.DocumentFieldTermVisitor) error { + collection := make(map[string][][]byte) + // collect field indexed values + for _, field := range fields { + dict, err := s.Dictionary(field) + if err != nil { + return err + } + dictItr := dict.Iterator() + var next *index.DictEntry + next, err = dictItr.Next() + for next != nil && err == nil { + postings, err2 := dict.PostingsList(next.Term, nil) + if err2 != nil { + return err2 + } + postingsItr := postings.Iterator() + nextPosting, err2 := postingsItr.Next() + for err2 == nil && nextPosting != nil && nextPosting.Number() <= num { + if nextPosting.Number() == num { + // got what we're looking for + collection[field] = 
append(collection[field], []byte(next.Term)) + } + nextPosting, err = postingsItr.Next() + } + if err2 != nil { + return err + } + next, err = dictItr.Next() + } + if err != nil { + return err + } + } + // invoke callback + for field, values := range collection { + for _, value := range values { + visitor(field, value) + } + } + return nil +} + func (s *SegmentSnapshot) Count() uint64 { rv := s.segment.Count() From e8cc7ac0bff37727e4dc8043e4bb4da09523c341 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 09:05:50 -0500 Subject: [PATCH 036/728] add new fields command to zap cmd-line util --- .../scorch/segment/zap/cmd/zap/cmd/fields.go | 70 +++++++++++++++++++ 1 file changed, 70 insertions(+) create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/fields.go diff --git a/index/scorch/segment/zap/cmd/zap/cmd/fields.go b/index/scorch/segment/zap/cmd/zap/cmd/fields.go new file mode 100644 index 000000000..472d966f5 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/fields.go @@ -0,0 +1,70 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "encoding/binary" + "fmt" + + "github.com/blevesearch/bleve/index/scorch/segment/zap" + "github.com/spf13/cobra" +) + +// fieldsCmd represents the fields command +var fieldsCmd = &cobra.Command{ + Use: "fields [path]", + Short: "fields prints the fields in the specified file", + Long: `The fields command lets you print the fields in the specified file.`, + RunE: func(cmd *cobra.Command, args []string) error { + + data := segment.Data() + + crcOffset := len(data) - 4 + verOffset := crcOffset - 4 + chunkOffset := verOffset - 4 + fieldsOffset := chunkOffset - 8 + fieldsIndexOffset := binary.BigEndian.Uint64(data[fieldsOffset : fieldsOffset+8]) + fieldsIndexEnd := uint64(len(data) - zap.FooterSize) + + // iterate through fields index + var fieldID uint64 + for fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*fieldID) : fieldsIndexOffset+(8*fieldID)+8]) + var n uint64 + indexedLoc, read := binary.Uvarint(data[addr:fieldsIndexEnd]) + n += uint64(read) + + var dictLoc uint64 + dictLoc, read = binary.Uvarint(data[addr+n : fieldsIndexEnd]) + n += uint64(read) + + var nameLen uint64 + nameLen, read = binary.Uvarint(data[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(data[addr+n : addr+n+nameLen]) + + fmt.Printf("field %d '%s' indexedLoc: %t starts at %d (%x)\n", fieldID, name, indexedLoc == 1, dictLoc, dictLoc) + + fieldID++ + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(fieldsCmd) +} From 8280859bb8a5c1d83b8db2eca1f19c65920bfb29 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 09:07:01 -0500 Subject: [PATCH 037/728] handle read-only and in-mem only cases --- index/scorch/scorch.go | 64 +++++++++++++++++++---------- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/segment.go | 2 +- 3 files changed, 44 insertions(+), 24 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 678349216..d95cb05bc 
100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -38,6 +38,7 @@ const Name = "scorch" const Version uint8 = 1 type Scorch struct { + readOnly bool version uint8 config map[string]interface{} analysisQueue *index.AnalysisQueue @@ -67,6 +68,10 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i root: &IndexSnapshot{}, nextSnapshotEpoch: 1, } + ro, ok := config["read_only"].(bool) + if ok { + rv.readOnly = ro + } return rv, nil } @@ -77,25 +82,35 @@ func (s *Scorch) Open() error { return fmt.Errorf("must specify path") } if s.path == "" { - return os.ErrInvalid + s.unsafeBatch = true } - err := os.MkdirAll(s.path, 0700) - if err != nil { - return err + var rootBoltOpt *bolt.Options + if s.readOnly { + rootBoltOpt = &bolt.Options{ + ReadOnly: true, + } + } else { + if s.path != "" { + err := os.MkdirAll(s.path, 0700) + if err != nil { + return err + } + } } - rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt" + var err error + if s.path != "" { + s.rootBolt, err = bolt.Open(rootBoltPath, 0600, rootBoltOpt) + if err != nil { + return err + } - s.rootBolt, err = bolt.Open(rootBoltPath, 0600, nil) - if err != nil { - return err - } - - // now see if there is any existing state to load - err = s.loadFromBolt() - if err != nil { - return err + // now see if there is any existing state to load + err = s.loadFromBolt() + if err != nil { + return err + } } s.closeCh = make(chan struct{}) @@ -104,8 +119,11 @@ func (s *Scorch) Open() error { s.asyncTasks.Add(1) go s.mainLoop() - s.asyncTasks.Add(1) - go s.persisterLoop() + + if !s.readOnly && s.path != "" { + s.asyncTasks.Add(1) + go s.persisterLoop() + } return nil } @@ -117,12 +135,14 @@ func (s *Scorch) Close() (err error) { s.asyncTasks.Wait() // now close the root bolt - err = s.rootBolt.Close() - s.rootLock.Lock() - for _, segment := range s.root.segment { - cerr := segment.Close() - if err == nil { - err = cerr + if s.rootBolt != nil { + err = 
s.rootBolt.Close() + s.rootLock.Lock() + for _, segment := range s.root.segment { + cerr := segment.Close() + if err == nil { + err = cerr + } } } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index bef263ac1..0ccb6cb0c 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -582,7 +582,7 @@ func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint6 // NOTE: update if you make the footer bigger // crc + ver + chunk + field offset + stored offset + num docs -const footerSize = 4 + 4 + 4 + 8 + 8 + 8 +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, chunkFactor uint32, w *CountHashWriter) error { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index c8e8f389b..b8784921a 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -105,7 +105,7 @@ func (s *Segment) loadConfig() error { func (s *Segment) loadFields() error { // NOTE for now we assume the fields index immediately preceeds the footer // if this changes, need to adjust accordingly (or store epxlicit length) - fieldsIndexEnd := uint64(len(s.mm) - footerSize) + fieldsIndexEnd := uint64(len(s.mm) - FooterSize) // iterate through fields index var fieldID uint64 From eada7b209b30c188133877b344c4ea0addfbfdf8 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 10:16:56 -0500 Subject: [PATCH 038/728] fix test issue identified by sreekanth --- index/scorch/scorch_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 25854bcd3..b46a5ffd9 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1240,7 +1240,12 @@ func TestIndexDocumentVisitFieldTerms(t *testing.T) { fieldTerms := make(index.FieldTerms) - err = 
indexReader.DocumentVisitFieldTerms(index.IndexInternalID("1"), []string{"name", "title"}, func(field string, term []byte) { + internalID, err := indexReader.GetInternal([]byte("1")) + if err != nil { + t.Fatal(err) + } + + err = indexReader.DocumentVisitFieldTerms(internalID, []string{"name", "title"}, func(field string, term []byte) { fieldTerms[field] = append(fieldTerms[field], string(term)) }) if err != nil { From d7eb223e146b1b2b048fd2e1689a4986ac45ade5 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 10:20:26 -0500 Subject: [PATCH 039/728] remove bolt segment format upcomning breaking changes and no desire to maintain --- index/scorch/segment/bolt/README.md | 306 ------------- index/scorch/segment/bolt/build.go | 518 ---------------------- index/scorch/segment/bolt/build_test.go | 288 ------------ index/scorch/segment/bolt/dict.go | 161 ------- index/scorch/segment/bolt/dict_test.go | 183 -------- index/scorch/segment/bolt/posting.go | 323 -------------- index/scorch/segment/bolt/segment.go | 319 ------------- index/scorch/segment/bolt/segment_test.go | 517 --------------------- 8 files changed, 2615 deletions(-) delete mode 100644 index/scorch/segment/bolt/README.md delete mode 100644 index/scorch/segment/bolt/build.go delete mode 100644 index/scorch/segment/bolt/build_test.go delete mode 100644 index/scorch/segment/bolt/dict.go delete mode 100644 index/scorch/segment/bolt/dict_test.go delete mode 100644 index/scorch/segment/bolt/posting.go delete mode 100644 index/scorch/segment/bolt/segment.go delete mode 100644 index/scorch/segment/bolt/segment_test.go diff --git a/index/scorch/segment/bolt/README.md b/index/scorch/segment/bolt/README.md deleted file mode 100644 index 2c6cd31d5..000000000 --- a/index/scorch/segment/bolt/README.md +++ /dev/null @@ -1,306 +0,0 @@ -# bolt segment format - -## top level key space (all sub-buckets, as bolt has no root bucket) - -We have chosen to letter these starting with 'a' and in the code refer to them 
with more meaningful names. The reason is that we intend to write them in order, and this lets us rearrange them more easily later. - -- 'a' field storage -- 'b' term dictionaries -- 'c' postings list -- 'd' postings details -- 'e' stored fields -- 'x' configuration - -## variable length integers that sort correctly (insert order same as numeric) - -We use numbers as keys in several places. We want those keys to be small, so we prefer to use a variable length key to minimize space, but we also want to insert these in order, so the encoding has to sort correctly. - -We have chosen to the the scheme found in [CockroachDB](https://github.com/cockroachdb/cockroach/blob/2dd65dde5d90c157f4b93f92502ca1063b904e1d/pkg/util/encoding/encoding.go). - -In short, the first byte indicates how many bytes will follow, with a few other nice properties. -- values 0-127 are not used in the first byte (this means we can still use any ASCII values we want and avoid collision) -- very small values are packed directly into this first byte -For the full details see the link above. - -## field storage bucket - -Contains one row for each field, the key is the integer field ID, and the value is the string name associated with the field. - -There is one additional row with key 'l'. The value is a binary serialization of a [roaring bitmap](https://github.com/RoaringBitmap/roaring), with bits set for each field id which also index location details with each posting. - -## term dictionary bucket - -Contains one row for each field, the key is the integer field ID, and the value is a binary serialization of the [Vellum](https://github.com/couchbaselabs/vellum) FST. The Vellum FST maps from term (utf-8 string) to a posting ID (uint64). - -## postings list bucket - -Contains one row for each postings list, the key is the integer posting ID, the value is a binary serialization of a [roaring bitmap](https://github.com/RoaringBitmap/roaring). 
The roaring bitmap has bits set for each doc number that used this term in this field. - -## posting details bucket - -Contains one sub-bucket for each postings list, the name of the sub-bucket is the posting ID. - -### individual posting detail sub-bucket - -Contains one sub-bucket for each chunk. A chunk contains details for sub-section of the docNum key space. By default, the chunk size is 1024, so all posting details for the first 1024 docs are in chunk zero, then the next 1024 in chunk one, and so on. - -The purpose of the chunking is so that when trying to Seek/Advance through a large number of hits to something much further ahead, we have to keep seeking through the roaring bitmap, but we can jump to the nearest chunk for details, and only seek within the details of the current chunk. - -#### chunk posting detail sub-bucket - -Contains two key/value pairs: - -Key 'a' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. For each hit in the postings list, there are two values on this list, the first is the term frequency (uint64) and the second is the norm factor (float32). - -Key 'b' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. For each location (there will be one location for each 'frequency' in the list above) there will be a variable number of uint64 values as follows: - -- field ID (uint16) -- pos (uint64) -- start (uint64) -- end (uint64) -- number of array position entries that follow (uint64) -- variable number of array positions (each uint64) - -## stored field values sub-bucket - -Contains one sub-bucket for each doc number (uint64). - -## stored field doc specific sub-bucket - -Contains two key/value pairs: - -Key 'a' contains a [govarint](https://github.com/Smerity/govarint) compressed slice of uint64 values. 
For each stored field there are a variable number of uint64 values as follows: - -- field ID (uint16) -- value type (byte) (string/number/date/geo/etc) -- start offset (in the uncompressed slice of data) -- length (in the uncompressed slice of data) -- number of array position entries that follow (uint64) -- variable number of array positions (each uint64) - -Key 'b' contains a [snappy]() compressed sequence of bytes. The input to the snappy compression was a slice of bytes containing the field values, in the same order the metadata slice was created. - -## configuration sub-bucket - -Currently contains two key/value pairs: - -Key 'c' contains a BigEndian encoded uint32 chunk size. This chunk size must be used when computing doc number to chunk conversions in this segment. - -Key 'v' contains a version number, currently 0. - -## Example - -The following is a dump of the boltdb bucket/key/value space for a segment which contains two documents: - -``` -{ - "_id": "a", - "name": "wow", - "desc": "some thing", - "tag": ["cold", "dark"] -} - -{ - "_id": "b", - "name": "who", - "desc": "some thing", - "tag": ["cold", "dark"] -} -``` - -``` -[61] ('a' - field storage) - 6c ('l' - roaring bitmap of field IDs which have index location data) - 3a 30 00 00 01 00 00 00 00 00 03 00 10 00 00 00 01 00 02 00 03 00 04 00 - 88 (field ID 0) - 5f 69 64 (utf-8 string '_id') - 89 (field ID 1) - 5f 61 6c 6c (utf-8 string '_all') - 8a (field ID 2) - 6e 61 6d 65 (utf-8 string 'name') - 8b (field ID 3) - 64 65 73 63 (utf-8 string 'desc') - 8c (field ID 4) - 74 61 67 (utf-8 string 'tag') -[62] ('b' - term dictionary) - 88 (field ID 0) - 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 0b 05 00 00 62 61 11 02 02 00 00 00 00 00 00 00 17 00 00 00 00 00 00 00 (vellum FST data) - 89 - 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 92 cf c4 00 10 a7 c7 c5 00 10 82 d0 c4 00 10 97 cb c8 ce 00 10 84 00 10 8c 00 0d 01 04 6f 68 11 02 00 02 01 04 03 01 0f 15 1a 1f 77 74 73 64 63 11 05 06 00 00 00 
00 00 00 00 43 00 00 00 00 00 00 00 - 8a - 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 84 00 10 8c 00 06 01 04 6f 68 11 02 06 01 11 8c 02 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 - 8b - 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 82 d0 c4 00 10 97 cb c8 ce 08 07 01 07 74 73 11 02 02 00 00 00 00 00 00 00 22 00 00 00 00 00 00 00 - 8c - 01 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 10 92 cf c4 00 10 a7 c7 c5 0a 09 01 06 64 63 11 02 02 00 00 00 00 00 00 00 21 00 00 00 00 00 00 00 -[63] ('c' - postings lists) - 88 (field ID 0) - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 (roaring bitmap data) - 89 - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 8a - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 8b - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 8c - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 8d - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 - 8e - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 00 00 - 8f - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 90 - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 91 - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 92 - 3a 30 00 00 01 00 00 00 00 00 01 00 10 00 00 00 00 00 01 00 - 93 - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 - 94 - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 - 95 - 3a 30 00 00 01 00 00 00 00 00 00 00 10 00 00 00 01 00 -[64] ('d' - postings details) - [88] (posting ID 0) - [88] (chunk ID 0) - 61 ('a' term freq/norm data) - 01 ae f2 93 f7 03 - 62 ('b' term location data) - 02 01 00 03 00 - [89] (posting ID 1) - [88] (chunk ID 0) - 61 ('a' term freq/norm data) - 01 ae f2 93 f7 03 - 62 ('b' term location data) - 03 01 00 04 00 - [89] (chunk ID 1) - 61 ('a' term freq/norm data) - 01 ae f2 93 f7 03 - 62 ('b' term location data) - 03 01 00 04 00 - [8a] - [88] - 61 - 01 ae f2 93 f7 03 - 62 - 03 02 05 0a 00 - [89] - 61 - 01 ae f2 93 f7 03 - 
62 - 03 02 05 0a 00 - [8b] - [88] - 61 - 01 ae f2 93 f7 03 - 62 - 04 01 00 04 01 00 - [89] - 61 - 01 ae f2 93 f7 03 - 62 - 04 01 00 04 01 00 - [8c] - [88] - 61 - 01 ae f2 93 f7 03 - 62 - 04 01 00 04 01 01 - [89] - 61 - 01 ae f2 93 f7 03 - 62 - 04 01 00 04 01 01 - [8d] - [88] - 61 - 01 80 80 80 fc 03 - 62 - - [8e] - [88] - 61 - 01 80 80 80 fc 03 - 62 - 02 01 00 03 00 - [8f] - [88] - 61 - 01 f3 89 d4 f9 03 - 62 - 03 01 00 04 00 - [89] - 61 - 01 f3 89 d4 f9 03 - 62 - 03 01 00 04 00 - [90] - [88] - 61 - 01 f3 89 d4 f9 03 - 62 - 03 02 05 0a 00 - [89] - 61 - 01 f3 89 d4 f9 03 - 62 - 03 02 05 0a 00 - [91] - [88] - 61 - 01 f3 89 d4 f9 03 - 62 - 04 01 00 04 01 00 - [89] - 61 - 01 f3 89 d4 f9 03 - 62 - 04 01 00 04 01 00 - [92] - [88] - 61 - 01 f3 89 d4 f9 03 - 62 - 04 01 00 04 01 01 - [89] - 61 - 01 f3 89 d4 f9 03 - 62 - 04 01 00 04 01 01 - [93] - [89] - 61 - 01 80 80 80 fc 03 - 62 - - [94] - [89] - 61 - 01 80 80 80 fc 03 - 62 - 02 01 00 03 00 - [95] - [89] - 61 - 01 ae f2 93 f7 03 - 62 - 02 01 00 03 00 -[65] ('e' - stored fields) - [88] (doc num 0) - 61 ('a' - stored field meta slice) - 00 74 00 01 00 02 74 01 03 00 03 74 04 0a 00 04 74 0e 04 01 00 04 74 12 04 01 01 - 62 ('b' - snappy compressed value bytes) - 16 54 61 77 6f 77 73 6f 6d 65 20 74 68 69 6e 67 63 6f 6c 64 64 61 72 6b - [89] - 61 - 00 74 00 01 00 02 74 01 03 00 03 74 04 0a 00 04 74 0e 04 01 00 04 74 12 04 01 01 - 62 - 16 54 62 77 68 6f 73 6f 6d 65 20 74 68 69 6e 67 63 6f 6c 64 64 61 72 6b -[78] ('x' - configuration) - 63 ('c' - chunk size) - 00 00 00 01 (big endian 1) - 76 ('v' - version) - 00 (single byte 0) -``` diff --git a/index/scorch/segment/bolt/build.go b/index/scorch/segment/bolt/build.go deleted file mode 100644 index ac01f9b8d..000000000 --- a/index/scorch/segment/bolt/build.go +++ /dev/null @@ -1,518 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package bolt - -import ( - "bytes" - "encoding/binary" - "math" - - "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/mem" - "github.com/boltdb/bolt" - "github.com/couchbaselabs/vellum" - "github.com/golang/snappy" -) - -var fieldsBucket = []byte{'a'} -var dictBucket = []byte{'b'} -var postingsBucket = []byte{'c'} -var postingDetailsBucket = []byte{'d'} -var storedBucket = []byte{'e'} -var configBucket = []byte{'x'} - -var indexLocsKey = []byte{'l'} - -var freqNormKey = []byte{'a'} -var locKey = []byte{'b'} - -var metaKey = []byte{'a'} -var dataKey = []byte{'b'} - -var chunkKey = []byte{'c'} -var versionKey = []byte{'v'} - -var version = 0 - -func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { - db, err := bolt.Open(path, 0777, nil) - if err != nil { - return err - } - defer func() { - if cerr := db.Close(); err == nil && cerr != nil { - err = cerr - } - }() - - tx, err := db.Begin(true) - if err != nil { - return err - } - defer func() { - if err == nil { - err = tx.Commit() - } else { - _ = tx.Rollback() - } - }() - - err = persistFields(memSegment, tx) - if err != nil { - return err - } - - err = persistDictionary(memSegment, tx) - if err != nil { - return err - } - - err = persistPostings(memSegment, tx) - if err != nil { - return err - } - - err = persistPostingsDetails(memSegment, tx, chunkFactor) - if err != nil { - return err - } - - err = persistStored(memSegment, tx) - if 
err != nil { - return err - } - - err = persistConfig(tx, chunkFactor) - if err != nil { - return err - } - - return nil -} - -// persistFields puts the fields as separate k/v pairs in the fields bucket -// makes very little attempt to squeeze a lot of perf because it is expected -// this is usually somewhat small, and when re-opened it will be read once and -// kept on the heap, and not read out of the file subsequently -func persistFields(memSegment *mem.Segment, tx *bolt.Tx) error { - bucket, err := tx.CreateBucket(fieldsBucket) - if err != nil { - return err - } - bucket.FillPercent = 1.0 - - // build/persist a bitset corresponding to the field locs array - indexLocs := roaring.NewBitmap() - for i, indexLoc := range memSegment.FieldsLoc { - if indexLoc { - indexLocs.AddInt(i) - } - } - var indexLocsBuffer bytes.Buffer - _, err = indexLocs.WriteTo(&indexLocsBuffer) - if err != nil { - return err - } - err = bucket.Put(indexLocsKey, indexLocsBuffer.Bytes()) - if err != nil { - return err - } - - // we use special varint which is still guaranteed to sort correctly - fieldBuf := make([]byte, 0, segment.MaxVarintSize) - for fieldID, fieldName := range memSegment.FieldsInv { - if fieldID != 0 { - // reset buffer if necessary - fieldBuf = fieldBuf[:0] - } - fieldBuf = segment.EncodeUvarintAscending(fieldBuf, uint64(fieldID)) - err = bucket.Put(fieldBuf, []byte(fieldName)) - if err != nil { - return err - } - } - return nil -} - -func persistDictionary(memSegment *mem.Segment, tx *bolt.Tx) error { - bucket, err := tx.CreateBucket(dictBucket) - if err != nil { - return err - } - bucket.FillPercent = 1.0 - - // TODO consider whether or not there is benefit to building the vellums - // concurrently. While we have to insert them into the bolt in order, - // the (presumably) heavier lifting involved in building the FST could - // be done concurrently. 
- - fieldBuf := make([]byte, 0, segment.MaxVarintSize) - for fieldID, fieldTerms := range memSegment.DictKeys { - if fieldID != 0 { - // reset buffers if necessary - fieldBuf = fieldBuf[:0] - } - // start a new vellum for this field - var buffer bytes.Buffer - builder, err := vellum.New(&buffer, nil) - if err != nil { - return err - } - - dict := memSegment.Dicts[fieldID] - // now walk the dictionary in order of fieldTerms (already sorted) - for i := range fieldTerms { - err = builder.Insert([]byte(fieldTerms[i]), dict[fieldTerms[i]]-1) - if err != nil { - return err - } - } - err = builder.Close() - if err != nil { - return err - } - - // put this FST into bolt - // we use special varint which is still guaranteed to sort correctly - fieldBuf = segment.EncodeUvarintAscending(fieldBuf, uint64(fieldID)) - err = bucket.Put(fieldBuf, buffer.Bytes()) - if err != nil { - return err - } - } - - return nil -} - -func persistPostings(memSegment *mem.Segment, tx *bolt.Tx) error { - bucket, err := tx.CreateBucket(postingsBucket) - if err != nil { - return err - } - bucket.FillPercent = 1.0 - - postingIDBuf := make([]byte, 0, segment.MaxVarintSize) - for postingID := range memSegment.Postings { - if postingID != 0 { - // reset buffers if necessary - postingIDBuf = postingIDBuf[:0] - } - postingIDBuf = segment.EncodeUvarintAscending(postingIDBuf, uint64(postingID)) - var postingsBuf bytes.Buffer - _, err := memSegment.Postings[postingID].WriteTo(&postingsBuf) - if err != nil { - return err - } - err = bucket.Put(postingIDBuf, postingsBuf.Bytes()) - if err != nil { - return err - } - } - - return nil -} - -func persistPostingsDetails(memSegment *mem.Segment, tx *bolt.Tx, - chunkFactor uint32) error { - bucket, err := tx.CreateBucket(postingDetailsBucket) - if err != nil { - return err - } - bucket.FillPercent = 1.0 - - postingIDBuf := make([]byte, 0, segment.MaxVarintSize) - for postingID := range memSegment.Postings { - if postingID != 0 { - // reset buffers if necessary - 
postingIDBuf = postingIDBuf[:0] - } - postingIDBuf = segment.EncodeUvarintAscending(postingIDBuf, uint64(postingID)) - - // make bucket for posting details - postingBucket, err := bucket.CreateBucket(postingIDBuf) - if err != nil { - return err - } - postingBucket.FillPercent = 1.0 - - err = persistPostingDetails(memSegment, postingBucket, postingID, chunkFactor) - if err != nil { - return err - } - } - - return nil -} - -func persistPostingDetails(memSegment *mem.Segment, postingBucket *bolt.Bucket, - postingID int, chunkFactor uint32) error { - // walk the postings list - var err error - var chunkBucket *bolt.Bucket - var currChunk uint32 - chunkIDBuf := make([]byte, 0, segment.MaxVarintSize) - postingsListItr := memSegment.Postings[postingID].Iterator() - var encoder *govarint.Base128Encoder - var locEncoder *govarint.Base128Encoder - - encodingBuf := &bytes.Buffer{} - locEncodingBuf := &bytes.Buffer{} - - var offset int - var locOffset int - for postingsListItr.HasNext() { - docNum := postingsListItr.Next() - chunk := docNum / chunkFactor - - // create new chunk bucket if necessary - if chunkBucket == nil || currChunk != chunk { - - // close out last chunk - if chunkBucket != nil { - - // fix me write freq/norms - encoder.Close() - err = chunkBucket.Put(freqNormKey, encodingBuf.Bytes()) - if err != nil { - return err - } - locEncoder.Close() - err = chunkBucket.Put(locKey, locEncodingBuf.Bytes()) - if err != nil { - return err - } - - // reset for next - chunkIDBuf = chunkIDBuf[:0] - encodingBuf = &bytes.Buffer{} - locEncodingBuf = &bytes.Buffer{} - } - - // prepare next chunk - chunkIDBuf = segment.EncodeUvarintAscending(chunkIDBuf, uint64(chunk)) - chunkBucket, err = postingBucket.CreateBucket(chunkIDBuf) - if err != nil { - return err - } - chunkBucket.FillPercent = 1.0 - currChunk = chunk - - encoder = govarint.NewU64Base128Encoder(encodingBuf) - locEncoder = govarint.NewU64Base128Encoder(locEncodingBuf) - } - - // put freq - _, err = 
encoder.PutU64(memSegment.Freqs[postingID][offset]) - if err != nil { - return err - } - - // put norm - norm := memSegment.Norms[postingID][offset] - normBits := math.Float32bits(norm) - _, err = encoder.PutU32(normBits) - if err != nil { - return err - } - - // put locations - - for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ { - - if len(memSegment.Locfields[postingID]) > 0 { - // put field - _, err = locEncoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset])) - if err != nil { - return err - } - - // put pos - _, err = locEncoder.PutU64(memSegment.Locpos[postingID][locOffset]) - if err != nil { - return err - } - - // put start - _, err = locEncoder.PutU64(memSegment.Locstarts[postingID][locOffset]) - if err != nil { - return err - } - - // put end - _, err = locEncoder.PutU64(memSegment.Locends[postingID][locOffset]) - if err != nil { - return err - } - - // put array positions - num := len(memSegment.Locarraypos[postingID][locOffset]) - - // put the number of array positions to follow - _, err = locEncoder.PutU64(uint64(num)) - if err != nil { - return err - } - - // put each array position - for j := 0; j < num; j++ { - _, err = locEncoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j]) - if err != nil { - return err - } - } - } - - locOffset++ - } - - offset++ - } - - // close out last chunk - - if chunkBucket != nil { - // fix me write freq/norms - encoder.Close() - err = chunkBucket.Put(freqNormKey, encodingBuf.Bytes()) - if err != nil { - return err - } - locEncoder.Close() - err = chunkBucket.Put(locKey, locEncodingBuf.Bytes()) - if err != nil { - return err - } - } - - return nil -} - -func persistStored(memSegment *mem.Segment, tx *bolt.Tx) error { - bucket, err := tx.CreateBucket(storedBucket) - if err != nil { - return err - } - bucket.FillPercent = 1.0 - - var curr int - // we use special varint which is still guaranteed to sort correctly - docNumBuf := make([]byte, 0, segment.MaxVarintSize) - for docNum, 
storedValues := range memSegment.Stored { - var metaBuf bytes.Buffer - var data, compressed []byte - if docNum != 0 { - // reset buffer if necessary - docNumBuf = docNumBuf[:0] - curr = 0 - } - // create doc sub-bucket - docNumBuf = segment.EncodeUvarintAscending(docNumBuf, uint64(docNum)) - docBucket, err := bucket.CreateBucket(docNumBuf) - if err != nil { - return err - } - docBucket.FillPercent = 1.0 - - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) - - // encode fields in order - for fieldID := range memSegment.FieldsInv { - if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { - // has stored values for this field - num := len(storedFieldValues) - - // process each value - for i := 0; i < num; i++ { - // encode field - _, err2 := metaEncoder.PutU64(uint64(fieldID)) - if err2 != nil { - return err2 - } - // encode type - _, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) - if err2 != nil { - return err2 - } - // encode start offset - _, err2 = metaEncoder.PutU64(uint64(curr)) - if err2 != nil { - return err2 - } - // end len - _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) - if err2 != nil { - return err2 - } - // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) - if err2 != nil { - return err2 - } - // encode all array positions - for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ { - _, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) - if err2 != nil { - return err2 - } - } - // append data - data = append(data, storedFieldValues[i]...) 
- // update curr - curr += len(storedFieldValues[i]) - } - } - } - metaEncoder.Close() - - err = docBucket.Put(metaKey, metaBuf.Bytes()) - if err != nil { - return err - } - - // compress data - compressed = snappy.Encode(compressed, data) - - err = docBucket.Put(dataKey, compressed) - if err != nil { - return err - } - - } - - return nil -} - -func persistConfig(tx *bolt.Tx, chunkFactor uint32) error { - bucket, err := tx.CreateBucket(configBucket) - if err != nil { - return err - } - - chunkVal := make([]byte, 4) - binary.BigEndian.PutUint32(chunkVal, chunkFactor) - err = bucket.Put(chunkKey, chunkVal) - if err != nil { - return err - } - - err = bucket.Put(versionKey, []byte{byte(version)}) - if err != nil { - return err - } - - return nil -} diff --git a/index/scorch/segment/bolt/build_test.go b/index/scorch/segment/bolt/build_test.go deleted file mode 100644 index deb4157d4..000000000 --- a/index/scorch/segment/bolt/build_test.go +++ /dev/null @@ -1,288 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package bolt - -import ( - "os" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment/mem" -) - -func TestBuild(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.bolt") - - memSegment := buildMemSegment() - err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) - if err != nil { - t.Fatal(err) - } -} - -func buildMemSegment() *mem.Segment { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - 
}, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - return mem.NewFromAnalyzedDocs(results) -} - -func buildMemSegmentMulti() *mem.Segment { - - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - doc2 := &document.Document{ - ID: "b", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), 
document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - &index.AnalysisResult{ - Document: doc2, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("b"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("who"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, 
- End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - segment := mem.NewFromAnalyzedDocs(results) - - return segment -} diff --git a/index/scorch/segment/bolt/dict.go b/index/scorch/segment/bolt/dict.go deleted file mode 100644 index 0f38a3d60..000000000 --- a/index/scorch/segment/bolt/dict.go +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package bolt - -import ( - "fmt" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbaselabs/vellum" - "github.com/couchbaselabs/vellum/regexp" -) - -// Dictionary is the bolt representation of the term dictionary -type Dictionary struct { - segment *Segment - field string - fieldID uint16 - fst *vellum.FST -} - -// PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return d.postingsList(term, except) -} - -func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { - rv := &PostingsList{ - dictionary: d, - term: term, - except: except, - } - - if d.fst != nil { - postingsID, exists, err := d.fst.Get([]byte(term)) - if err != nil { - return nil, fmt.Errorf("vellum err: %v", err) - } - if exists { - rv.postingsID = postingsID - postingsIDKey := segment.EncodeUvarintAscending(nil, postingsID) - bucket := d.segment.tx.Bucket(postingsBucket) - if bucket == nil { - return nil, fmt.Errorf("postings bucket missing") - } - - roaringBytes := bucket.Get(postingsIDKey) - if roaringBytes == nil { - return nil, fmt.Errorf("postings for postingsID %d missing", postingsID) - } - bitmap := roaring.NewBitmap() - _, err = bitmap.FromBuffer(roaringBytes) - if err != nil { - return nil, fmt.Errorf("error loading roaring bitmap: %v", err) - } - - rv.postings = bitmap - rv.postingKey = postingsIDKey - } - } - - return rv, nil -} - -// Iterator returns an iterator for this dictionary -func (d *Dictionary) Iterator() segment.DictionaryIterator { - - rv := &DictionaryIterator{ - d: d, - } - - if d.fst != nil { - itr, err := d.fst.Iterator(nil, nil) - if err == nil { - rv.itr = itr - } - } - - return rv -} - -// PrefixIterator returns an iterator which only visits terms having the -// the specified prefix -func (d *Dictionary) 
PrefixIterator(prefix string) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - if d.fst != nil { - r, err := regexp.New(prefix + ".*") - if err == nil { - itr, err := d.fst.Search(r, nil, nil) - if err == nil { - rv.itr = itr - } - } - } - - return rv -} - -// RangeIterator returns an iterator which only visits terms between the -// start and end terms. NOTE: bleve.index API specifies the end is inclusive. -func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - // need to increment the end position to be inclusive - endBytes := []byte(end) - if endBytes[len(endBytes)-1] < 0xff { - endBytes[len(endBytes)-1]++ - } else { - endBytes = append(endBytes, 0xff) - } - - if d.fst != nil { - itr, err := d.fst.Iterator([]byte(start), endBytes) - if err == nil { - rv.itr = itr - } - } - - return rv -} - -// DictionaryIterator is an iterator for term dictionary -type DictionaryIterator struct { - d *Dictionary - itr vellum.Iterator - err error -} - -// Next returns the next entry in the dictionary -func (i *DictionaryIterator) Next() (*index.DictEntry, error) { - if i.err == vellum.ErrIteratorDone { - return nil, nil - } else if i.err != nil { - return nil, i.err - } - term, count := i.itr.Current() - rv := &index.DictEntry{ - Term: string(term), - Count: count, - } - i.err = i.itr.Next() - return rv, nil -} diff --git a/index/scorch/segment/bolt/dict_test.go b/index/scorch/segment/bolt/dict_test.go deleted file mode 100644 index 2df57d67f..000000000 --- a/index/scorch/segment/bolt/dict_test.go +++ /dev/null @@ -1,183 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package bolt - -import ( - "os" - "reflect" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment/mem" -) - -func buildMemSegmentForDict() *mem.Segment { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 5, - Position: 1, - Term: []byte("apple"), - }, - &analysis.Token{ - Start: 6, - End: 10, - Position: 2, - Term: []byte("ball"), - }, - &analysis.Token{ - Start: 11, - End: 14, - Position: 3, - Term: []byte("cat"), - }, - &analysis.Token{ - Start: 15, - End: 18, - Position: 4, - Term: []byte("dog"), - }, - &analysis.Token{ - Start: 19, - End: 22, - Position: 5, - Term: []byte("egg"), - }, - &analysis.Token{ - Start: 20, - End: 24, - Position: 6, - Term: []byte("fish"), - }, - &analysis.Token{ - Start: 25, - End: 28, - Position: 7, - Term: []byte("bat"), 
- }, - }, nil, true), - }, - Length: []int{ - 1, - 7, - }, - }, - } - - segment := mem.NewFromAnalyzedDocs(results) - - return segment -} - -func TestDictionary(t *testing.T) { - - _ = os.RemoveAll("/tmp/scorch.bolt") - - memSegment := buildMemSegmentForDict() - err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.bolt") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - - // test basic full iterator - expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"} - var got []string - itr := dict.Iterator() - next, err := itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test prefix iterator - expected = []string{"ball", "bat"} - got = got[:0] - itr = dict.PrefixIterator("b") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test range iterator - expected = []string{"cat", "dog", "egg"} - got = got[:0] - itr = dict.RangeIterator("cat", "egg") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } -} diff --git a/index/scorch/segment/bolt/posting.go 
b/index/scorch/segment/bolt/posting.go deleted file mode 100644 index bd038a575..000000000 --- a/index/scorch/segment/bolt/posting.go +++ /dev/null @@ -1,323 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package bolt - -import ( - "bytes" - "fmt" - "math" - - "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/boltdb/bolt" -) - -// PostingsList is an in-memory represenation of a postings list -type PostingsList struct { - dictionary *Dictionary - term string - postingsID uint64 - postings *roaring.Bitmap - except *roaring.Bitmap - postingKey []byte -} - -// Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator() segment.PostingsIterator { - rv := &PostingsIterator{ - postings: p, - } - if p.postings != nil { - detailsBucket := p.dictionary.segment.tx.Bucket(postingDetailsBucket) - rv.detailBucket = detailsBucket.Bucket(p.postingKey) - rv.all = p.postings.Iterator() - if p.except != nil { - allExcept := p.postings.Clone() - allExcept.AndNot(p.except) - rv.actual = allExcept.Iterator() - } else { - rv.actual = p.postings.Iterator() - } - } - - return rv -} - -// Count returns the number of items on this postings list -func (p *PostingsList) Count() uint64 { - var rv uint64 - if p.postings != nil { - rv = p.postings.GetCardinality() - if p.except != nil { - except := p.except.GetCardinality() - if 
except > rv { - // avoid underflow - except = rv - } - rv -= except - } - } - return rv -} - -// PostingsIterator provides a way to iterate through the postings list -type PostingsIterator struct { - postings *PostingsList - all roaring.IntIterable - offset int - locoffset int - actual roaring.IntIterable - detailBucket *bolt.Bucket - - currChunk uint32 - currChunkFreqNorm []byte - currChunkLoc []byte - freqNormDecoder *govarint.Base128Decoder - locDecoder *govarint.Base128Decoder -} - -func (i *PostingsIterator) loadChunk(chunk int) error { - // load correct chunk bytes - chunkID := segment.EncodeUvarintAscending(nil, uint64(chunk)) - chunkBucket := i.detailBucket.Bucket(chunkID) - if chunkBucket == nil { - return fmt.Errorf("chunk %d missing", chunkID) - } - i.currChunkFreqNorm = chunkBucket.Get(freqNormKey) - i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) - i.currChunkLoc = chunkBucket.Get(locKey) - i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) - i.currChunk = uint32(chunk) - return nil -} - -func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { - freq, err := i.freqNormDecoder.GetU64() - if err != nil { - return 0, 0, fmt.Errorf("error reading frequency: %v", err) - } - normBits, err := i.freqNormDecoder.GetU64() - if err != nil { - return 0, 0, fmt.Errorf("error reading norm: %v", err) - } - return freq, normBits, err -} - -// readLocation processes all the integers on the stream representing a single -// location. if you care about it, pass in a non-nil location struct, and we -// will fill it. if you don't care about it, pass in nil and we safely consume -// the contents. 
-func (i *PostingsIterator) readLocation(l *Location) error { - // read off field - fieldID, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading location field: %v", err) - } - // read off pos - pos, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading location pos: %v", err) - } - // read off start - start, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading location start: %v", err) - } - // read off end - end, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading location end: %v", err) - } - // read off num array pos - numArrayPos, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading location num array pos: %v", err) - } - - // group these together for less branching - if l != nil { - l.field = i.postings.dictionary.segment.fieldsInv[fieldID] - l.pos = pos - l.start = start - l.end = end - if numArrayPos > 0 { - l.ap = make([]uint64, int(numArrayPos)) - } - } - - // read off array positions - for k := 0; k < int(numArrayPos); k++ { - ap, err := i.locDecoder.GetU64() - if err != nil { - return fmt.Errorf("error reading array position: %v", err) - } - if l != nil { - l.ap[k] = ap - } - } - - return nil -} - -// Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { - return nil, nil - } - n := i.actual.Next() - nChunk := n / i.postings.dictionary.segment.chunkFactor - allN := i.all.Next() - allNChunk := allN / i.postings.dictionary.segment.chunkFactor - - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - // incr the all iterator, and check again - for allN != n { - - // in different chunks, reset offsets - if allNChunk != nChunk { - i.locoffset = 0 - i.offset = 0 - } 
else { - - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) - } - } - - // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() - if err != nil { - return nil, err - } - if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { - for j := 0; j < int(freq); j++ { - err := i.readLocation(nil) - if err != nil { - return nil, err - } - } - } - - // in same chunk, need to account for offsets - i.offset++ - } - - allN = i.all.Next() - } - - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) - } - } - - rv := &Posting{ - iterator: i, - docNum: uint64(n), - } - - var err error - var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() - if err != nil { - return nil, err - } - rv.norm = math.Float32frombits(uint32(normBits)) - if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { - // read off 'freq' locations - rv.locs = make([]segment.Location, rv.freq) - locs := make([]Location, rv.freq) - for j := 0; j < int(rv.freq); j++ { - err := i.readLocation(&locs[j]) - if err != nil { - return nil, err - } - rv.locs[j] = &locs[j] - } - } - - return rv, nil -} - -// Posting is a single entry in a postings list -type Posting struct { - iterator *PostingsIterator - docNum uint64 - - freq uint64 - norm float32 - locs []segment.Location -} - -// Number returns the document number of this posting in this segment -func (p *Posting) Number() uint64 { - return p.docNum -} - -// Frequency returns the frequence of occurance of this term in this doc/field -func (p *Posting) Frequency() uint64 { - return p.freq -} - -// Norm returns the normalization factor for this posting -func (p *Posting) Norm() float64 { - return float64(p.norm) -} - -// Locations returns the location 
information for each occurance -func (p *Posting) Locations() []segment.Location { - return p.locs -} - -// Location represents the location of a single occurance -type Location struct { - field string - pos uint64 - start uint64 - end uint64 - ap []uint64 -} - -// Field returns the name of the field (useful in composite fields to know -// which original field the value came from) -func (l *Location) Field() string { - return l.field -} - -// Start returns the start byte offset of this occurance -func (l *Location) Start() uint64 { - return l.start -} - -// End returns the end byte offset of this occurance -func (l *Location) End() uint64 { - return l.end -} - -// Pos returns the 1-based phrase position of this occurance -func (l *Location) Pos() uint64 { - return l.pos -} - -// ArrayPositions returns the array position vector associated with this occurance -func (l *Location) ArrayPositions() []uint64 { - return l.ap -} diff --git a/index/scorch/segment/bolt/segment.go b/index/scorch/segment/bolt/segment.go deleted file mode 100644 index f53a98fe6..000000000 --- a/index/scorch/segment/bolt/segment.go +++ /dev/null @@ -1,319 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package bolt - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - - "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/boltdb/bolt" - "github.com/couchbaselabs/vellum" - "github.com/golang/snappy" -) - -var readOnlyOptions = &bolt.Options{ - ReadOnly: true, -} - -// _id field is always guaranteed to have fieldID of 0 -const idFieldID uint16 = 0 - -// Open returns a boltdb impl of a segment -func Open(path string) (segment.Segment, error) { - - db, err := bolt.Open(path, 0600, readOnlyOptions) - if err != nil { - return nil, err - } - - tx, err := db.Begin(false) - if err != nil { - _ = db.Close() - return nil, err - } - - rv := &Segment{ - db: db, - tx: tx, - fieldsMap: make(map[string]uint16), - } - - err = rv.loadConfig() - if err != nil { - _ = db.Close() - return nil, err - } - - err = rv.loadFields() - if err != nil { - _ = db.Close() - return nil, err - } - - return rv, nil -} - -// Segment implements a boltdb based implementation of a segment -type Segment struct { - version uint8 - chunkFactor uint32 - db *bolt.DB - tx *bolt.Tx - - fieldsMap map[string]uint16 - fieldsInv []string - fieldsLoc []bool -} - -func (s *Segment) loadConfig() (err error) { - bucket := s.tx.Bucket(configBucket) - if bucket == nil { - return fmt.Errorf("config bucket missing") - } - - ver := bucket.Get(versionKey) - if ver == nil { - return fmt.Errorf("version key missing") - } - s.version = ver[0] - - chunk := bucket.Get(chunkKey) - if chunk == nil { - return fmt.Errorf("chunk key is missing") - } - s.chunkFactor = binary.BigEndian.Uint32(chunk) - - return nil -} - -// loadFields reads the fields info from the segment so that we never have to go -// back to disk to access this (small and used frequently) -func (s *Segment) loadFields() (err error) { - - bucket := s.tx.Bucket(fieldsBucket) - if bucket == nil { - return fmt.Errorf("fields bucket missing") - } - - indexLocs := 
roaring.NewBitmap() - err = bucket.ForEach(func(k []byte, v []byte) error { - - // process index locations bitset - if k[0] == indexLocsKey[0] { - _, err2 := indexLocs.FromBuffer(v) - if err2 != nil { - return fmt.Errorf("error loading indexLocs: %v", err2) - } - } else { - - _, fieldID, err2 := segment.DecodeUvarintAscending(k) - if err2 != nil { - return err2 - } - // we store fieldID+1 in so we can discern the zero value - s.fieldsMap[string(v)] = uint16(fieldID + 1) - } - return nil - }) - if err != nil { - return err - } - - // now setup the inverse (should have same size as map and be keyed 0-(len-1)) - s.fieldsInv = make([]string, len(s.fieldsMap)) - for k, v := range s.fieldsMap { - s.fieldsInv[int(v)-1] = k - } - s.fieldsLoc = make([]bool, len(s.fieldsInv)) - for i := range s.fieldsInv { - if indexLocs.ContainsInt(i) { - s.fieldsLoc[i] = true - } - } - - return nil -} - -// Fields returns the field names used in this segment -func (s *Segment) Fields() []string { - return s.fieldsInv -} - -// Count returns the number of documents in this segment -// (this has no notion of deleted docs) -func (s *Segment) Count() uint64 { - return uint64(s.tx.Bucket(storedBucket).Stats().BucketN - 1) -} - -// Dictionary returns the term dictionary for the specified field -func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { - dict, err := s.dictionary(field) - if err == nil && dict == nil { - return &segment.EmptyDictionary{}, nil - } - return dict, err -} - -func (s *Segment) dictionary(field string) (*Dictionary, error) { - - rv := &Dictionary{ - segment: s, - field: field, - } - - rv.fieldID = s.fieldsMap[field] - if rv.fieldID > 0 { - rv.fieldID = rv.fieldID - 1 - fieldIDKey := segment.EncodeUvarintAscending(nil, uint64(rv.fieldID)) - bucket := s.tx.Bucket(dictBucket) - if bucket == nil { - return nil, fmt.Errorf("dictionary bucket missing") - } - fstBytes := bucket.Get(fieldIDKey) - if fstBytes == nil { - return nil, fmt.Errorf("dictionary field 
%s bytes nil", field) - } - if fstBytes != nil { - fst, err := vellum.Load(fstBytes) - if err != nil { - return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) - } - if err == nil { - rv.fst = fst - } - } - - } else { - return nil, nil - } - - return rv, nil -} - -// VisitDocument invokes the DocFieldValueVistor for each stored field -// for the specified doc number -func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - storedBuucket := s.tx.Bucket(storedBucket) - if storedBuucket == nil { - return fmt.Errorf("stored bucket missing") - } - docNumKey := segment.EncodeUvarintAscending(nil, num) - docBucket := storedBuucket.Bucket(docNumKey) - if docBucket == nil { - return fmt.Errorf("segment has no doc number %d", num) - } - metaBytes := docBucket.Get(metaKey) - if metaBytes == nil { - return fmt.Errorf("stored meta bytes for doc number %d is nil", num) - } - dataBytes := docBucket.Get(dataKey) - if dataBytes == nil { - return fmt.Errorf("stored data bytes for doc number %d is nil", num) - } - uncompressed, err := snappy.Decode(nil, dataBytes) - if err != nil { - return err - } - - reader := bytes.NewReader(metaBytes) - decoder := govarint.NewU64Base128Decoder(reader) - - keepGoing := true - for keepGoing { - field, err := decoder.GetU64() - if err == io.EOF { - break - } - if err != nil { - return err - } - typ, err := decoder.GetU64() - if err != nil { - return err - } - offset, err := decoder.GetU64() - if err != nil { - return err - } - l, err := decoder.GetU64() - if err != nil { - return err - } - numap, err := decoder.GetU64() - if err != nil { - return err - } - var arrayPos []uint64 - if numap > 0 { - arrayPos = make([]uint64, numap) - for i := 0; i < int(numap); i++ { - ap, err := decoder.GetU64() - if err != nil { - return err - } - arrayPos[i] = ap - } - } - - value := uncompressed[offset : offset+l] - keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) - } - - return nil -} - 
-// DocNumbers returns a bitset corresponding to the doc numbers of all the -// provided _id strings -func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { - rv := roaring.New() - - if len(s.fieldsMap) > 0 { - idDict, err := s.dictionary("_id") - if err != nil { - return nil, err - } - - for _, id := range ids { - postings, err := idDict.postingsList(id, nil) - if err != nil { - return nil, err - } - if postings.postings != nil { - rv.Or(postings.postings) - } - } - } - - return rv, nil -} - -// Close releases all resources associated with this segment -func (s *Segment) Close() error { - err := s.tx.Rollback() - if err != nil { - _ = s.db.Close() - return err - } - return s.db.Close() -} - -func (s *Segment) Path() string { - return s.db.Path() -} diff --git a/index/scorch/segment/bolt/segment_test.go b/index/scorch/segment/bolt/segment_test.go deleted file mode 100644 index 16ac2cadd..000000000 --- a/index/scorch/segment/bolt/segment_test.go +++ /dev/null @@ -1,517 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package bolt - -import ( - "math" - "os" - "reflect" - "testing" -) - -func TestOpen(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.bolt") - - memSegment := buildMemSegment() - err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.bolt") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "desc": struct{}{}, - "tag": struct{}{}, - } - fields := segment.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) - } - for _, field := range fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } - - docCount := segment.Count() - if docCount != 1 { - t.Errorf("expected count 1, got %d", docCount) - } - - // check the _id field - dict, err := segment.Dictionary("_id") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("a", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - - nextPosting, err = 
postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the name field - dict, err = segment.Dictionary("name") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("wow", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the _all field (composite) - dict, err = segment.Dictionary("_all") - if err != nil { - t.Fatal(err) - } - if dict == nil { - 
t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("wow", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - expectedNorm := float32(1.0 / math.Sqrt(float64(5))) - if nextPosting.Norm() != float64(expectedNorm) { - t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // now try a field with array positions - dict, err = segment.Dictionary("tag") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("dark", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - 
t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "tag" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 4 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - expectArrayPos := []uint64{1} - if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { - t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - // now try and visit a document - var fieldValuesSeen int - err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { - fieldValuesSeen++ - return true - }) - if err != nil { - t.Fatal(err) - } - if fieldValuesSeen != 5 { - t.Errorf("expected 5 field values, got %d", fieldValuesSeen) - } -} - -func TestOpenMulti(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.bolt") - - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1024) - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.bolt") - if err != nil { - t.Fatalf("error opening segment: %v", 
err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - // check the desc field - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("thing", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 2 { - t.Errorf("expected count to be 2, got %d", count) - } - - // get docnum of a - exclude, err := segment.DocNumbers([]string{"a"}) - if err != nil { - t.Fatal(err) - } - - // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsListExcludingCount := postingsListExcluding.Count() - if postingsListExcludingCount != 1 { - t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) - } - - postingsItrExcluding := postingsListExcluding.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItrExcluding.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItrExcluding.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } -} - -func TestOpenMultiWithTwoChunks(t *testing.T) { - _ = 
os.RemoveAll("/tmp/scorch.bolt") - - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.bolt", 1) - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.bolt") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - // check the desc field - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("thing", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 2 { - t.Errorf("expected count to be 2, got %d", count) - } - - // get docnum of a - exclude, err := segment.DocNumbers([]string{"a"}) - if err != nil { - t.Fatal(err) - } - - // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItrExcluding := postingsListExcluding.Iterator() - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItrExcluding.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItrExcluding.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to 
be 1, got %d", count) - } -} From 00722aa2997272e4ac043d91359465e0169a01e7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 15:38:44 -0500 Subject: [PATCH 040/728] disable http unit test which relied on debug functionality --- http/handlers_test.go | 27 --------------------------- 1 file changed, 27 deletions(-) diff --git a/http/handlers_test.go b/http/handlers_test.go index 830260ab8..b6911b0b5 100644 --- a/http/handlers_test.go +++ b/http/handlers_test.go @@ -500,33 +500,6 @@ func TestHandlers(t *testing.T) { Status: http.StatusNotFound, ResponseBody: []byte(`no such index 'tix'`), }, - { - Desc: "debug doc", - Handler: debugHandler, - Path: "/ti1/a/debug", - Method: "GET", - Params: url.Values{ - "indexName": []string{"ti1"}, - "docID": []string{"a"}, - }, - Status: http.StatusOK, - ResponseMatch: map[string]bool{ - `"key"`: true, - `"val"`: true, - }, - }, - { - Desc: "debug doc invalid index", - Handler: debugHandler, - Path: "/ti1/a/debug", - Method: "GET", - Params: url.Values{ - "indexName": []string{"tix"}, - "docID": []string{"a"}, - }, - Status: http.StatusNotFound, - ResponseBody: []byte(`no such index 'tix'`), - }, { Desc: "create alias", Handler: aliasHandler, From f13b786609ba6b13480b05fa9c1dcf8d74b1b2d1 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 15:47:41 -0500 Subject: [PATCH 041/728] fix up issues to get all bleve unit tests passing for scorch make scorch default --- config.go | 4 +- index/scorch/reader.go | 18 ++++- index/scorch/segment/mem/build.go | 23 ++++-- index/scorch/segment/mem/posting.go | 6 +- index/scorch/segment/mem/segment.go | 5 +- index/scorch/segment/zap/build.go | 70 ++++++++++++++----- .../scorch/segment/zap/cmd/zap/cmd/fields.go | 8 +-- index/scorch/segment/zap/dict.go | 2 + index/scorch/segment/zap/posting.go | 36 +++++++--- index/scorch/segment/zap/segment.go | 11 +-- index/scorch/snapshot_index.go | 7 +- index/scorch/snapshot_index_tfr.go | 59 +++++++++++----- 12 files changed, 172 
insertions(+), 77 deletions(-) diff --git a/config.go b/config.go index 74d407fdd..c1475db74 100644 --- a/config.go +++ b/config.go @@ -21,8 +21,8 @@ import ( "time" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch" "github.com/blevesearch/bleve/index/store/gtreap" - "github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search/highlight/highlighter/html" ) @@ -69,7 +69,7 @@ func init() { Config.DefaultMemKVStore = gtreap.Name // default index - Config.DefaultIndexType = upsidedown.Name + Config.DefaultIndexType = scorch.Name bootDuration := time.Since(bootStart) bleveExpVar.Add("bootDuration", int64(bootDuration)) diff --git a/index/scorch/reader.go b/index/scorch/reader.go index e4abbce64..9a20aa013 100644 --- a/index/scorch/reader.go +++ b/index/scorch/reader.go @@ -82,15 +82,27 @@ func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { } func (r *Reader) DumpAll() chan interface{} { - panic("dumpall") + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv } func (r *Reader) DumpDoc(id string) chan interface{} { - panic("dumpdoc") + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv } func (r *Reader) DumpFields() chan interface{} { - panic("dumpfields") + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv } func (r *Reader) Close() error { diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index dbba39b13..e111ce4f7 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -29,7 +29,7 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { s := New() // ensure that _id field get fieldID 0 - s.getOrDefineField("_id", false) + s.getOrDefineField("_id") // walk each doc for _, result := range results { @@ -102,14 +102,14 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // walk each composite 
field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name(), true)) + fieldID := uint16(s.getOrDefineField(field.Name())) l, tf := field.Analyze() processField(fieldID, field.Name(), l, tf) } // walk each field for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name(), field.Options().IncludeTermVectors())) + fieldID := uint16(s.getOrDefineField(field.Name())) l := result.Length[i] tf := result.Analyzed[i] processField(fieldID, field.Name(), l, tf) @@ -133,6 +133,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { newPostingID := uint64(len(s.Postings) + 1) // add this new bitset to the postings slice s.Postings = append(s.Postings, bs) + + locationBS := roaring.New() + s.PostingsLocs = append(s.PostingsLocs, locationBS) // add this to the details slice s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())}) s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))}) @@ -142,10 +145,13 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { var locends []uint64 var locpos []uint64 var locarraypos [][]uint64 + if len(tokenFreq.Locations) > 0 { + locationBS.AddInt(int(docNum)) + } for _, loc := range tokenFreq.Locations { var locf = fieldID if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field, false)) + locf = uint16(s.getOrDefineField(loc.Field)) } locfields = append(locfields, locf) locstarts = append(locstarts, uint64(loc.Start)) @@ -171,12 +177,16 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // the actual offset is - 1, because 0 is zero value bs := s.Postings[fieldTermPostings-1] bs.AddInt(int(docNum)) + locationBS := s.PostingsLocs[fieldTermPostings-1] s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency())) s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], 
float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + if len(tokenFreq.Locations) > 0 { + locationBS.AddInt(int(docNum)) + } for _, loc := range tokenFreq.Locations { var locf = fieldID if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field, false)) + locf = uint16(s.getOrDefineField(loc.Field)) } s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf) s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start)) @@ -193,13 +203,12 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } } -func (s *Segment) getOrDefineField(name string, hasLoc bool) int { +func (s *Segment) getOrDefineField(name string) int { fieldID, ok := s.FieldsMap[name] if !ok { fieldID = uint16(len(s.FieldsInv) + 1) s.FieldsMap[name] = fieldID s.FieldsInv = append(s.FieldsInv, name) - s.FieldsLoc = append(s.FieldsLoc, hasLoc) s.Dicts = append(s.Dicts, make(map[string]uint64)) s.DictKeys = append(s.DictKeys, make([]string, 0)) } diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index b6fd0c6a7..d91a00561 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -51,6 +51,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { } if p.postingsID > 0 { allbits := p.dictionary.segment.Postings[p.postingsID-1] + rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] rv.all = allbits.Iterator() if p.except != nil { allExcept := allbits.Clone() @@ -68,6 +69,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { type PostingsIterator struct { postings *PostingsList all roaring.IntIterable + locations *roaring.Bitmap offset int locoffset int actual roaring.IntIterable @@ -95,6 +97,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { docNum: uint64(n), offset: i.offset, locoffset: i.locoffset, + hasLoc: i.locations.Contains(n), } i.locoffset += 
int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) @@ -108,6 +111,7 @@ type Posting struct { docNum uint64 offset int locoffset int + hasLoc bool } // Number returns the document number of this posting in this segment @@ -127,7 +131,7 @@ func (p *Posting) Norm() float64 { // Locations returns the location information for each occurance func (p *Posting) Locations() []segment.Location { - if !p.iterator.postings.dictionary.segment.FieldsLoc[p.iterator.postings.dictionary.fieldID] { + if !p.hasLoc { return nil } freq := int(p.Frequency()) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 4940bb4cf..cdbff5839 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -44,8 +44,6 @@ type Segment struct { FieldsMap map[string]uint16 // fields id -> name FieldsInv []string - // field id -> has location info - FieldsLoc []bool // term dictionary // field id -> term -> posting id + 1 @@ -59,6 +57,9 @@ type Segment struct { // Postings list id -> Postings bitmap Postings []*roaring.Bitmap + // Postings List has locations + PostingsLocs []*roaring.Bitmap + // term frequencies // postings list id -> Freqs (one for each hit in bitmap) Freqs [][]uint64 diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 0ccb6cb0c..a5d2bec25 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -27,7 +27,7 @@ import ( "github.com/golang/snappy" ) -var version uint32 +const version uint32 = 1 // PersistSegment takes the in-memory segment and persists it to the specified // path in the zap file format. 
@@ -58,8 +58,14 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e return err } + var postingsListLocs []uint64 + postingsListLocs, err = persistPostingsLocs(memSegment, cr) + if err != nil { + return err + } + var postingsLocs []uint64 - postingsLocs, err = persistPostingsLists(memSegment, cr, freqOffsets, locOffsets) + postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) if err != nil { return err } @@ -420,7 +426,43 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac return freqOffsets, locOfffsets, nil } -func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffsets, locOffsets []uint64) ([]uint64, error) { +func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) ([]uint64, error) { + var rv []uint64 + + var postingsBuf bytes.Buffer + for postingID := range memSegment.PostingsLocs { + if postingID != 0 { + postingsBuf.Reset() + } + + // record where we start this posting loc + rv = append(rv, uint64(w.Count())) + + // write out postings locs to memory so we know the len + postingsLocLen, err := memSegment.PostingsLocs[postingID].WriteTo(&postingsBuf) + if err != nil { + return nil, err + } + + buf := make([]byte, binary.MaxVarintLen64) + // write out the length of this postings locs + n := binary.PutUvarint(buf, uint64(postingsLocLen)) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the postings list itself + _, err = w.Write(postingsBuf.Bytes()) + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) ([]uint64, error) { var rv []uint64 var postingsBuf bytes.Buffer @@ -453,6 +495,13 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, freqOffse return nil, err } + // write out the start of the loc posting list + n = 
binary.PutUvarint(buf, postingsListLocs[postingID]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + // write out the length of this postings list n = binary.PutUvarint(buf, uint64(postingsListLen)) _, err = w.Write(buf[:n]) @@ -534,20 +583,9 @@ func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint6 fieldStarts = append(fieldStarts, uint64(w.Count())) buf := make([]byte, binary.MaxVarintLen64) - // write out if the field has indexed locs (0 or 1) - var indexedLoc uint64 - if memSegment.FieldsLoc[fieldID] { - indexedLoc = 1 - } - n := binary.PutUvarint(buf, indexedLoc) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - // write out dict location for this field - n = binary.PutUvarint(buf, dictLocs[fieldID]) - _, err = w.Write(buf[:n]) + n := binary.PutUvarint(buf, dictLocs[fieldID]) + _, err := w.Write(buf[:n]) if err != nil { return 0, err } diff --git a/index/scorch/segment/zap/cmd/zap/cmd/fields.go b/index/scorch/segment/zap/cmd/zap/cmd/fields.go index 472d966f5..98cdf9d73 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/fields.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/fields.go @@ -43,11 +43,7 @@ var fieldsCmd = &cobra.Command{ for fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*fieldID) : fieldsIndexOffset+(8*fieldID)+8]) var n uint64 - indexedLoc, read := binary.Uvarint(data[addr:fieldsIndexEnd]) - n += uint64(read) - - var dictLoc uint64 - dictLoc, read = binary.Uvarint(data[addr+n : fieldsIndexEnd]) + dictLoc, read := binary.Uvarint(data[addr+n : fieldsIndexEnd]) n += uint64(read) var nameLen uint64 @@ -56,7 +52,7 @@ var fieldsCmd = &cobra.Command{ name := string(data[addr+n : addr+n+nameLen]) - fmt.Printf("field %d '%s' indexedLoc: %t starts at %d (%x)\n", fieldID, name, indexedLoc == 1, dictLoc, dictLoc) + fmt.Printf("field %d '%s' starts at %d (%x)\n", fieldID, name, dictLoc, dictLoc) fieldID++ } diff --git 
a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index d69195958..e4824851e 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -60,6 +60,8 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting n += uint64(read) rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) + rv.locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) var postingsLen uint64 postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f29020093..051cfef8e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -27,14 +27,15 @@ import ( // PostingsList is an in-memory represenation of a postings list type PostingsList struct { - dictionary *Dictionary - term string - postingsOffset uint64 - freqOffset uint64 - locOffset uint64 - postings *roaring.Bitmap - except *roaring.Bitmap - postingKey []byte + dictionary *Dictionary + term string + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + locBitmapOffset uint64 + postings *roaring.Bitmap + except *roaring.Bitmap + postingKey []byte } // Iterator returns an iterator for this postings list @@ -68,6 +69,18 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { } rv.locChunkStart = p.locOffset + n + var locBitmapLen uint64 + locBitmapLen, read = binary.Uvarint(p.dictionary.segment.mm[p.locBitmapOffset : p.locBitmapOffset+binary.MaxVarintLen64]) + roaringBytes := p.dictionary.segment.mm[p.locBitmapOffset+uint64(read) : p.locBitmapOffset+uint64(read)+locBitmapLen] + bitmap := roaring.NewBitmap() + _, err := bitmap.FromBuffer(roaringBytes) + if err != nil { + // return nil, fmt.Errorf("error loading roaring 
bitmap: %v", err) + // FIXME dont break api yet + panic("i died") + } + rv.locBitmap = bitmap + rv.all = p.postings.Iterator() if p.except != nil { allExcept := p.postings.Clone() @@ -86,6 +99,7 @@ func (p *PostingsList) Count() uint64 { var rv uint64 if p.postings != nil { rv = p.postings.GetCardinality() + if p.except != nil { except := p.except.GetCardinality() if except > rv { @@ -117,6 +131,8 @@ type PostingsIterator struct { locChunkLens []uint64 locChunkStart uint64 + + locBitmap *roaring.Bitmap } func (i *PostingsIterator) loadChunk(chunk int) error { @@ -245,7 +261,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { if err != nil { return nil, err } - if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { + if i.locBitmap.Contains(allN) { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { @@ -280,7 +296,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return nil, err } rv.norm = math.Float32frombits(uint32(normBits)) - if i.postings.dictionary.segment.fieldsLoc[i.postings.dictionary.fieldID] { + if i.locBitmap.Contains(n) { // read off 'freq' locations rv.locs = make([]segment.Location, rv.freq) locs := make([]Location, rv.freq) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index b8784921a..2c6d0bfed 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -78,7 +78,6 @@ type Segment struct { fieldsMap map[string]uint16 fieldsInv []string - fieldsLoc []bool fieldsOffsets []uint64 } @@ -112,16 +111,8 @@ func (s *Segment) loadFields() error { for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) var n uint64 - hasStoredLoc, read := binary.Uvarint(s.mm[addr:fieldsIndexEnd]) - n += uint64(read) - if hasStoredLoc == 1 { - s.fieldsLoc = append(s.fieldsLoc, true) - } else { - s.fieldsLoc = 
append(s.fieldsLoc, false) - } - var dictLoc uint64 - dictLoc, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) + dictLoc, read := binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) n += uint64(read) s.fieldsOffsets = append(s.fieldsOffsets, dictLoc) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index d2efa2dcd..10d208efd 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -212,6 +212,11 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { return nil, err } + if next == nil { + // no such doc exists + return nil, nil + } + docNum := docInternalToNumber(next.ID) segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) @@ -318,6 +323,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, var err error rv := &IndexSnapshotTermFieldReader{ + term: term, snapshot: i, postings: make([]segment.PostingsList, len(i.segment)), iterators: make([]segment.PostingsIterator, len(i.segment)), @@ -337,7 +343,6 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, if err != nil { return nil, err } - return rv, nil } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 936704906..1fbabdfbb 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -22,6 +22,7 @@ import ( ) type IndexSnapshotTermFieldReader struct { + term []byte snapshot *IndexSnapshot postings []segment.PostingsList iterators []segment.PostingsIterator @@ -29,6 +30,8 @@ type IndexSnapshotTermFieldReader struct { includeFreq bool includeNorm bool includeTermVectors bool + currPosting segment.Posting + currID index.IndexInternalID } func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { @@ -47,26 +50,11 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in globalOffset := 
i.snapshot.offsets[i.segmentOffset] nnum := next.Number() rv.ID = docNumberToBytes(nnum + globalOffset) - if i.includeFreq { - rv.Freq = next.Frequency() - } - if i.includeNorm { - rv.Norm = next.Norm() - } - if i.includeTermVectors { - locs := next.Locations() - rv.Vectors = make([]*index.TermFieldVector, len(locs)) - for i, loc := range locs { - rv.Vectors[i] = &index.TermFieldVector{ - Start: loc.Start(), - End: loc.End(), - Pos: loc.Pos(), - ArrayPositions: loc.ArrayPositions(), - Field: loc.Field(), - } - } - } + i.postingToTermFieldDoc(next, rv) + + i.currID = rv.ID + i.currPosting = next return rv, nil } i.segmentOffset++ @@ -74,7 +62,40 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in return nil, nil } +func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Posting, rv *index.TermFieldDoc) { + if i.includeFreq { + rv.Freq = next.Frequency() + } + if i.includeNorm { + rv.Norm = next.Norm() + } + if i.includeTermVectors { + locs := next.Locations() + rv.Vectors = make([]*index.TermFieldVector, len(locs)) + for i, loc := range locs { + rv.Vectors[i] = &index.TermFieldVector{ + Start: loc.Start(), + End: loc.End(), + Pos: loc.Pos(), + ArrayPositions: loc.ArrayPositions(), + Field: loc.Field(), + } + } + } +} + +// Advance go fuck yourself editor func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { + // first make sure we aren't already pointing at the right thing, (due to way searchers work) + if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { + rv := preAlloced + if rv == nil { + rv = &index.TermFieldDoc{} + } + rv.ID = i.currID + i.postingToTermFieldDoc(i.currPosting, rv) + return rv, nil + } // FIXME do something better next, err := i.Next(preAlloced) if err != nil { From 74b2eeb14d4a27c68f0ed0c25e4d40cad7dc27e1 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 15:59:36 -0500 Subject: [PATCH 
042/728] refactor where we do some work so we can return error --- index/scorch/segment/zap/dict.go | 15 +++++++++++++- index/scorch/segment/zap/posting.go | 31 ++++++++++------------------- 2 files changed, 24 insertions(+), 22 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index e4824851e..5d3c160ba 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -60,8 +60,21 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting n += uint64(read) rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + + var locBitmapOffset uint64 + locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) + + // go ahead and load loc bitmap + var locBitmapLen uint64 + locBitmapLen, read = binary.Uvarint(d.segment.mm[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) + locRoaringBytes := d.segment.mm[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] + rv.locBitmap = roaring.NewBitmap() + _, err := rv.locBitmap.FromBuffer(locRoaringBytes) + if err != nil { + return nil, fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) + } + var postingsLen uint64 postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 051cfef8e..eb5218d28 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -27,15 +27,15 @@ import ( // PostingsList is an in-memory represenation of a postings list type PostingsList struct { - dictionary *Dictionary - term string - postingsOffset uint64 - freqOffset uint64 
- locOffset uint64 - locBitmapOffset uint64 - postings *roaring.Bitmap - except *roaring.Bitmap - postingKey []byte + dictionary *Dictionary + term string + postingsOffset uint64 + freqOffset uint64 + locOffset uint64 + locBitmap *roaring.Bitmap + postings *roaring.Bitmap + except *roaring.Bitmap + postingKey []byte } // Iterator returns an iterator for this postings list @@ -68,18 +68,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { n += uint64(read) } rv.locChunkStart = p.locOffset + n - - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(p.dictionary.segment.mm[p.locBitmapOffset : p.locBitmapOffset+binary.MaxVarintLen64]) - roaringBytes := p.dictionary.segment.mm[p.locBitmapOffset+uint64(read) : p.locBitmapOffset+uint64(read)+locBitmapLen] - bitmap := roaring.NewBitmap() - _, err := bitmap.FromBuffer(roaringBytes) - if err != nil { - // return nil, fmt.Errorf("error loading roaring bitmap: %v", err) - // FIXME dont break api yet - panic("i died") - } - rv.locBitmap = bitmap + rv.locBitmap = p.locBitmap rv.all = p.postings.Iterator() if p.except != nil { From f246e0e4c012ae87c9b4c86686a803ab329a85f9 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 16:22:29 -0500 Subject: [PATCH 043/728] update README for zap file format changes --- index/scorch/segment/zap/README.md | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index 39735d34d..b7a1b9e67 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -19,6 +19,7 @@ Current usage: - next use dictionary to navigate to posting list for a specific term - walk posting list - if necessary, walk posting details as we go + - if location info is desired, consult location bitmap to see if it is there ## stored fields section @@ -89,6 +90,16 @@ If you know the doc number you're interested in, this format lets you jump to th If you know the doc 
number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. +## bitmaps of hits with location info + +- for each posting list + - preparation phase: + - encode roaring bitmap (inidicating which hits have location details indexed) posting list to bytes (so we know the length) + - file writing phase: + - remember the start position for this bitmap + - write length of encoded roaring bitmap + - write the serialized roaring bitmap data + ## postings list section - for each posting list @@ -98,6 +109,7 @@ If you know the doc number you're interested in, this format lets you jump to th - remember the start position for this posting list - write freq/norm details offset (remembered from previous, as varint uint64) - write location details offset (remembered from previous, as varint uint64) + - write location bitmap offset (remembered from pervious, as varint uint64) - write length of encoded roaring bitmap - write the serialized roaring bitmap data @@ -116,7 +128,6 @@ If you know the doc number you're interested in, this format lets you jump to th - for each field - file writing phase: - remember start offset for each field - - write 1 if field has location info indexed, 0 if not (varint uint64) - write dictionary address (remembered from previous) (varint uint64) - write length of field name (varint uint64) - write field name bytes From 58ef21a88aecbf245353615f652a5accfc756e2f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 16:24:46 -0500 Subject: [PATCH 044/728] fix golint issue --- index/scorch/segment/zap/build.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index a5d2bec25..2bc520f63 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -618,8 +618,8 @@ func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs 
[]uint6 return rv, nil } -// NOTE: update if you make the footer bigger -// crc + ver + chunk + field offset + stored offset + num docs +// FooterSize is the size of the footer record in bytes +// crc + ver + chunk + field offset + stored offset + num docs const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, From 3461fb741f1fb2b11d5fd45c262b2d1d0e3e1eb8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 11 Dec 2017 10:40:35 -0800 Subject: [PATCH 045/728] mergeplan: a placeholder planner that merges all segments A stepping stone to fleshing out the API contract. --- index/scorch/mergeplan/merge_plan.go | 275 ++++++++++++++++++++++ index/scorch/mergeplan/merge_plan_test.go | 92 ++++++++ index/scorch/mergeplan/sort.go | 27 +++ 3 files changed, 394 insertions(+) create mode 100644 index/scorch/mergeplan/merge_plan.go create mode 100644 index/scorch/mergeplan/merge_plan_test.go create mode 100644 index/scorch/mergeplan/sort.go diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go new file mode 100644 index 000000000..ff50cff37 --- /dev/null +++ b/index/scorch/mergeplan/merge_plan.go @@ -0,0 +1,275 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mergeplan + +import ( + "math" + "sort" +) + +// A Segment represents the information that the planner needs to +// calculate segment merging. 
+type Segment interface { + // Unique id of the segment -- used for sorting. + Id() uint64 + + // Full segment size (the size before any logical deletions). + FullSize() int64 + + // Size of the live data of the segment; i.e., FullSize() minus + // any logical deletions. + LiveSize() int64 +} + +// Plan() will functionally compute a merge plan. A segment will be +// assigned to at most a single MergeTask in the output MergePlan. A +// segment not assigned to any MergeTask means the segment should +// remain unmerged. +func Plan(segments []Segment, o *MergePlanOptions) ( + result *MergePlan, err error) { + if len(segments) <= 1 { + return nil, nil + } + + // TODO: PLACEHOLDER implementation for now, that always merges + // all the candidates. + return &MergePlan{ + Tasks: []*MergeTask{ + &MergeTask{ + Segments: segments, + }, + }, + }, nil +} + +// A MergePlan is the result of the Plan() API. +// +// The planner doesn’t know how or whether these tasks are executed -- +// that’s up to a separate merge execution system, which might execute +// these tasks concurrently or not, and which might execute all the +// tasks or not. +type MergePlan struct { + Tasks []*MergeTask +} + +// A MergeTask represents several segments that should be merged +// together into a single segment. +type MergeTask struct { + Segments []Segment +} + +// The MergePlanOptions is designed to be reusable between planning calls. +type MergePlanOptions struct { + // Max # segments per logarithmic tier, or max width of any + // logarithmic “step”. Smaller values mean more merging but fewer + // segments. Should be >= SegmentsPerMergeTask, else you'll have + // too much merging. + MaxSegmentsPerTier int + + // Max size of any segment produced after merging. Actual + // merging, however, may produce segment sizes different than the + // planner’s predicted sizes. + MaxSegmentSize int64 + + // The number of segments in any resulting MergeTask. 
e.g., + // len(result.Tasks[ * ].Segments) == SegmentsPerMergeTask. + SegmentsPerMergeTask int + + // Small segments are rounded up to this size, i.e., treated as + // equal (floor) size for consideration. This is to prevent lots + // of tiny segments from resulting in a long tail in the index. + FloorSegmentSize int64 + + // Controls how aggressively merges that reclaim more deletions + // are favored. Higher values will more aggressively target + // merges that reclaim deletions, but be careful not to go so high + // that way too much merging takes place; a value of 3.0 is + // probably nearly too high. A value of 0.0 means deletions don't + // impact merge selection. + ReclaimDeletesWeight float64 + + // Only consider a segment for merging if its delete percentage is + // over this threshold. + MinDeletesPct float64 + + // Optional, defaults to mergeplan.CalcBudget(). + CalcBudget func(totalSize int64, firstTierSize int64, + o *MergePlanOptions) (budgetNumSegments int) + + // Optional, defaults to mergeplan.ScoreSegments(). + ScoreSegments func(segments []Segment, o *MergePlanOptions) float64 + + // Optional. + Logger func(string) +} + +// Returns the higher of the input or FloorSegmentSize. +func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { + if s > o.FloorSegmentSize { + return s + } + return o.FloorSegmentSize +} + +// Suggested default options. +var DefaultMergePlanOptions = MergePlanOptions{ + MaxSegmentsPerTier: 10, + MaxSegmentSize: 5000000, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 2000, + ReclaimDeletesWeight: 2.0, + MinDeletesPct: 10.0, +} + +// ------------------------------------------- + +func plan(segmentsIn []Segment, o *MergePlanOptions) ( + result *MergePlan, err error) { + if len(segmentsIn) <= 1 { + return nil, nil + } + + if o == nil { + o = &DefaultMergePlanOptions + } + + segments := append([]Segment(nil), segmentsIn...) // Copy. 
+ + sort.Sort(byLiveSizeDescending(segments)) + + var segmentsLiveSize int64 + + var minLiveSize int64 = math.MaxInt64 + + var eligible []Segment + var eligibleLiveSize int64 + + for _, segment := range segments { + segmentsLiveSize += segment.LiveSize() + + if minLiveSize > segment.LiveSize() { + minLiveSize = segment.LiveSize() + } + + // Only small-enough segments are eligible. + if segment.LiveSize() < o.MaxSegmentSize/2 { + eligible = append(eligible, segment) + eligibleLiveSize += segment.LiveSize() + } + } + + minLiveSize = o.RaiseToFloorSegmentSize(minLiveSize) + + calcBudget := o.CalcBudget + if calcBudget == nil { + calcBudget = CalcBudget + } + + budgetNumSegments := CalcBudget(eligibleLiveSize, minLiveSize, o) + + scoreSegments := o.ScoreSegments + if scoreSegments == nil { + scoreSegments = ScoreSegments + } + + rv := &MergePlan{} + + // While we’re over budget, keep looping, which might produce + // another MergeTask. + for len(eligible) > budgetNumSegments { + // Track a current best roster as we examine and score + // potential rosters of merges. + var bestRoster []Segment + var bestRosterScore float64 // Lower score is better. + + for startIdx := 0; startIdx < len(eligible)-o.SegmentsPerMergeTask; startIdx++ { + var roster []Segment + var rosterLiveSize int64 + + for idx := startIdx; idx < len(eligible) && len(roster) < o.SegmentsPerMergeTask; idx++ { + rosterCandidate := eligible[idx] + + if rosterLiveSize+rosterCandidate.LiveSize() > o.MaxSegmentSize { + // NOTE: We continue the loop, to try to “pack” + // the roster with smaller segments to get closer + // to the max size; but, we aren't doing full, + // comprehensive "bin-packing" permutations. 
+ continue + } + + roster = append(roster, rosterCandidate) + rosterLiveSize += rosterCandidate.LiveSize() + } + + rosterScore := scoreSegments(roster, o) + + if len(bestRoster) <= 0 || rosterScore < bestRosterScore { + bestRoster = roster + bestRosterScore = rosterScore + } + } + + if len(bestRoster) <= 0 { + return rv, nil + } + + rv.Tasks = append(rv.Tasks, &MergeTask{ + Segments: bestRoster, + }) + + eligible = removeSegments(eligible, bestRoster) + } + + return rv, nil +} + +// Compute the number of segments that would be needed to cover the +// totalSize, by climbing up a logarithmic staircase of segment tiers. +func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) ( + budgetNumSegments int) { + tierSize := firstTierSize + + for totalSize > 0 { + segmentsInTier := float64(totalSize) / float64(tierSize) + if segmentsInTier < float64(o.MaxSegmentsPerTier) { + budgetNumSegments += int(math.Ceil(segmentsInTier)) + break + } + + budgetNumSegments += o.MaxSegmentsPerTier + totalSize -= int64(o.MaxSegmentsPerTier) * tierSize + tierSize *= int64(o.SegmentsPerMergeTask) + } + + return budgetNumSegments +} + +// removeSegments() keeps the ordering of the result segments stable. +func removeSegments(segments []Segment, toRemove []Segment) (rv []Segment) { +OUTER: + for _, segment := range segments { + for _, r := range toRemove { + if segment == r { + continue OUTER + } + } + rv = append(rv, segment) + } + return rv +} + +func ScoreSegments(segments []Segment, o *MergePlanOptions) float64 { + return 0 // TODO. Bogus score. +} diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go new file mode 100644 index 000000000..0f0f0fcca --- /dev/null +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -0,0 +1,92 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mergeplan + +import ( + "reflect" + "testing" +) + +// Implements the Segment interface for testing, +type segment struct { + id uint64 + fullSize int64 + liveSize int64 +} + +func (s *segment) Id() uint64 { return s.id } +func (s *segment) FullSize() int64 { return s.fullSize } +func (s *segment) LiveSize() int64 { return s.liveSize } + +func makeLinearSegments(n int) (rv []Segment) { + for i := 0; i < n; i++ { + rv = append(rv, &segment{ + id: uint64(i), + fullSize: int64(i), + liveSize: int64(i), + }) + } + return rv +} + +func TestSimplePlan(t *testing.T) { + segs := makeLinearSegments(10) + + tests := []struct { + desc string + segments []Segment + expectPlan *MergePlan + expectErr error + }{ + {"nil candidates", + nil, nil, nil}, + {"empty candidates", + []Segment{}, nil, nil}, + {"1 candidate", + []Segment{segs[0]}, + nil, + nil, + }, + {"2 candidates", + []Segment{ + segs[0], + segs[1], + }, + &MergePlan{ + []*MergeTask{ + &MergeTask{ + Segments: []Segment{ + segs[0], + segs[1], + }, + }, + }, + }, + nil, + }, + } + + for testi, test := range tests { + plan, err := Plan(test.segments, &DefaultMergePlanOptions) + if err != test.expectErr { + t.Errorf("testi: %d, test: %v, got err: %v", + testi, test, err) + } + if !reflect.DeepEqual(plan, test.expectPlan) { + t.Errorf("testi: %d, test: %v, got plan: %v", + testi, test, plan) + } + } +} diff --git a/index/scorch/mergeplan/sort.go b/index/scorch/mergeplan/sort.go new file mode 100644 index 000000000..4eabc31db --- /dev/null +++ b/index/scorch/mergeplan/sort.go @@ -0,0 +1,27 @@ +// 
Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package mergeplan + +type byLiveSizeDescending []Segment + +func (a byLiveSizeDescending) Len() int { return len(a) } +func (a byLiveSizeDescending) Swap(i, j int) { a[i], a[j] = a[j], a[i] } + +func (a byLiveSizeDescending) Less(i, j int) bool { + if a[i].LiveSize() != a[j].LiveSize() { + return a[i].LiveSize() < a[j].LiveSize() + } + return a[i].Id() < a[j].Id() +} From 3873237e46a1192a27b04c4381602698779f7170 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 11 Dec 2017 22:09:26 -0500 Subject: [PATCH 046/728] try newer version of bolt (seeing random crashes on travis) --- vendor/manifest | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vendor/manifest b/vendor/manifest index 839e6fe5e..60c5fbb5e 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -21,7 +21,7 @@ "importpath": "github.com/boltdb/bolt", "repository": "https://github.com/boltdb/bolt", "vcs": "", - "revision": "144418e1475d8bf7abbdc48583500f1a20c62ea7", + "revision": "9da31745363232bc1e27dbab3569e77383a51585", "branch": "master", "notests": true }, @@ -127,4 +127,4 @@ "notests": true } ] -} \ No newline at end of file +} From 927216df8ccdd7f1cfe8eaa24664b9909f44f9e0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 12 Dec 2017 08:42:13 -0500 Subject: [PATCH 047/728] fix postings list count impl --- index/scorch/segment/zap/posting.go | 13 +++---------- 1 file changed, 3 
insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index eb5218d28..68db04299 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -85,20 +85,13 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { - var rv uint64 if p.postings != nil { - rv = p.postings.GetCardinality() - if p.except != nil { - except := p.except.GetCardinality() - if except > rv { - // avoid underflow - except = rv - } - rv -= except + return roaring.AndNot(p.postings, p.except).GetCardinality() } + return p.postings.GetCardinality() } - return rv + return 0 } // PostingsIterator provides a way to iterate through the postings list From 665c3c80ffddeedaf219cc9878618def4e00e7a2 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 12 Dec 2017 11:21:55 -0500 Subject: [PATCH 048/728] initial cut of zap segment merging --- index/scorch/segment/zap/intcoder.go | 111 +++++ index/scorch/segment/zap/intcoder_test.go | 59 +++ index/scorch/segment/zap/merge.go | 526 ++++++++++++++++++++++ index/scorch/segment/zap/merge_test.go | 280 ++++++++++++ 4 files changed, 976 insertions(+) create mode 100644 index/scorch/segment/zap/intcoder.go create mode 100644 index/scorch/segment/zap/intcoder_test.go create mode 100644 index/scorch/segment/zap/merge.go create mode 100644 index/scorch/segment/zap/merge_test.go diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go new file mode 100644 index 000000000..a682740f1 --- /dev/null +++ b/index/scorch/segment/zap/intcoder.go @@ -0,0 +1,111 @@ +package zap + +import ( + "bytes" + "encoding/binary" + "io" + + "github.com/Smerity/govarint" +) + +type chunkedIntCoder struct { + final []byte + maxDocNum uint64 + chunkSize uint64 + chunkBuf bytes.Buffer + encoder *govarint.Base128Encoder + chunkLens []uint64 + currChunk uint64 +} 
+ +// newChunkedIntCoder returns a new chunk int coder which packs data into +// chunks based on the provided chunkSize and supports up to the specified +// maxDocNum +func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedIntCoder{ + chunkSize: chunkSize, + maxDocNum: maxDocNum, + chunkLens: make([]uint64, total), + } + rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) + + return rv +} + +// Reset lets you reuse this chunked int coder. buffers are reset and reused +// from previous use. you cannot change the chunk size or max doc num. +func (c *chunkedIntCoder) Reset() { + c.final = c.final[:0] + c.chunkBuf.Reset() + c.currChunk = 0 + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } +} + +// Add encodes the provided integers into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. +func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + if c.encoder != nil { + // close out last + c.encoder.Close() + encodingBytes := c.chunkBuf.Bytes() + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) + c.chunkBuf.Reset() + c.encoder = govarint.NewU64Base128Encoder(&c.chunkBuf) + } + c.currChunk = chunk + } + + for _, val := range vals { + _, err := c.encoder.PutU64(val) + if err != nil { + return err + } + } + + return nil +} + +// Close indicates you are done calling Add() this allows the final chunk +// to be encoded. +func (c *chunkedIntCoder) Close() { + c.encoder.Close() + encodingBytes := c.chunkBuf.Bytes() + c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) + c.final = append(c.final, encodingBytes...) +} + +// Write commits all the encoded chunked integers to the provided writer. 
+func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { + var tw int + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) + nw, err := w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + // write out the chunk lens + for _, chunkLen := range c.chunkLens { + n := binary.PutUvarint(buf, uint64(chunkLen)) + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + // write out the data + nw, err = w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go new file mode 100644 index 000000000..f2623a548 --- /dev/null +++ b/index/scorch/segment/zap/intcoder_test.go @@ -0,0 +1,59 @@ +package zap + +import ( + "bytes" + "reflect" + "testing" +) + +func TestChunkIntCoder(t *testing.T) { + tests := []struct { + maxDocNum uint64 + chunkSize uint64 + docNums []uint64 + vals [][]uint64 + expected []byte + }{ + { + maxDocNum: 0, + chunkSize: 1, + docNums: []uint64{0}, + vals: [][]uint64{ + []uint64{3}, + }, + // 1 chunk, chunk-0 length 1, value 3 + expected: []byte{0x1, 0x1, 0x3}, + }, + { + maxDocNum: 1, + chunkSize: 1, + docNums: []uint64{0, 1}, + vals: [][]uint64{ + []uint64{3}, + []uint64{7}, + }, + // 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7 + expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7}, + }, + } + + for _, test := range tests { + + cic := newChunkedIntCoder(test.chunkSize, test.maxDocNum) + for i, docNum := range test.docNums { + err := cic.Add(docNum, test.vals[i]...) 
+ if err != nil { + t.Fatalf("error adding to intcoder: %v", err) + } + } + cic.Close() + var actual bytes.Buffer + _, err := cic.Write(&actual) + if err != nil { + t.Fatalf("error writing: %v", err) + } + if !reflect.DeepEqual(test.expected, actual.Bytes()) { + t.Errorf("got % x, expected % x", actual.Bytes(), test.expected) + } + } +} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go new file mode 100644 index 000000000..fe5155d16 --- /dev/null +++ b/index/scorch/segment/zap/merge.go @@ -0,0 +1,526 @@ +package zap + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "io" + "math" + "os" + + "github.com/RoaringBitmap/roaring" + "github.com/Smerity/govarint" + "github.com/couchbaselabs/vellum" + "github.com/golang/snappy" +) + +// Merge takes a slice of zap segments, bit masks describing which documents +// from the may be dropped, and creates a new segment containing the remaining +// data. This new segment is built at the specified path, with the provided +// chunkFactor. 
+func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFactor uint32) error { + + flag := os.O_RDWR | os.O_CREATE + + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } + + // bufer the output + br := bufio.NewWriter(f) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriter(br) + + fieldsInv := mergeFields(segments) + fieldsMap := mapFields(fieldsInv) + + newSegDocCount := computeNewDocCount(segments, drops) + + var newDocNums [][]uint64 + var storedIndexOffset uint64 + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, newSegDocCount, cr) + if err != nil { + return err + } + + // FIXME temp until computed + //dictLocs := make([]uint64, len(fieldsInv)) + + var dictLocs []uint64 + dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + newDocNums, newSegDocCount, cr) + if err != nil { + return err + } + + var fieldsIndexOffset uint64 + fieldsIndexOffset, err = persistMergedFields(fieldsInv, cr, dictLocs) + if err != nil { + return err + } + + err = persistFooter(newSegDocCount, storedIndexOffset, + fieldsIndexOffset, chunkFactor, cr) + if err != nil { + return err + } + + err = br.Flush() + if err != nil { + return err + } + + err = f.Sync() + if err != nil { + return err + } + + err = f.Close() + if err != nil { + return err + } + + return nil +} + +func mapFields(fields []string) map[string]uint16 { + rv := make(map[string]uint16) + for i, fieldName := range fields { + rv[fieldName] = uint16(i) + } + return rv +} + +func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { + var newSegDocCount uint64 + for segI, segment := range segments { + segIAfterDrop := segment.NumDocs() + if drops[segI] != nil { + segIAfterDrop -= drops[segI].GetCardinality() + } + newSegDocCount += segIAfterDrop + } + return newSegDocCount +} + +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { + var buffer bytes.Buffer 
+ // write out postings list to memory so we know the len + postingsListLen, err := r.WriteTo(&buffer) + if err != nil { + return 0, err + } + var tw int + // write out the length of this postings list + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(postingsListLen)) + nw, err := w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the postings list itself + nw, err = w.Write(buffer.Bytes()) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64, + w *CountHashWriter) ([]uint64, error) { + + rv := make([]uint64, len(fieldsInv)) + + var vellumBuf bytes.Buffer + // for each field + for fieldID, fieldName := range fieldsInv { + if fieldID != 0 { + vellumBuf.Reset() + } + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, err + } + + // collect FTS iterators from all segments for this field + var dicts []*Dictionary + var itrs []vellum.Iterator + for _, segment := range segments { + dict, err2 := segment.dictionary(fieldName) + if err2 != nil { + return nil, err2 + } + dicts = append(dicts, dict) + + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil { + return nil, err2 + } + itrs = append(itrs, itr) + } + + // create merging iterator + mergeItr, err := vellum.NewMergeIterator(itrs, func(postingOffsets []uint64) uint64 { + // we don't actually use the merged value + return 0 + }) + + tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + for err == nil { + term, _ := mergeItr.Current() + + newRoaring := roaring.NewBitmap() + newRoaringLocs := roaring.NewBitmap() + tfEncoder.Reset() + locEncoder.Reset() + + // now go back and get posting list for this term + // but pass in the deleted docs for that segment + for dictI, dict := 
range dicts { + postings, err2 := dict.postingsList(string(term), drops[dictI]) + if err2 != nil { + return nil, err2 + } + + postItr := postings.Iterator() + next, err2 := postItr.Next() + for next != nil && err2 == nil { + hitNewDocNum := newDocNums[dictI][next.Number()] + if hitNewDocNum == docDropped { + return nil, fmt.Errorf("see hit with dropped doc num") + } + newRoaring.Add(uint32(hitNewDocNum)) + // encode norm bits + norm := next.Norm() + normBits := math.Float32bits(float32(norm)) + err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) + if err3 != nil { + return nil, err3 + } + locs := next.Locations() + if len(locs) > 0 { + newRoaringLocs.Add(uint32(hitNewDocNum)) + for _, loc := range locs { + args := make([]uint64, 0, 5+len(loc.ArrayPositions())) + args = append(args, uint64(fieldsMap[loc.Field()])) + args = append(args, loc.Pos()) + args = append(args, loc.Start()) + args = append(args, loc.End()) + args = append(args, uint64(len(loc.ArrayPositions()))) + args = append(args, loc.ArrayPositions()...) + locEncoder.Add(hitNewDocNum, args...) 
+ } + } + next, err2 = postItr.Next() + } + if err != nil { + return nil, err + } + + } + tfEncoder.Close() + locEncoder.Close() + + if newRoaring.GetCardinality() > 0 { + // this field/term actually has hits in the new segment, lets write it down + freqOffset := uint64(w.Count()) + _, err = tfEncoder.Write(w) + if err != nil { + return nil, err + } + locOffset := uint64(w.Count()) + _, err = locEncoder.Write(w) + if err != nil { + return nil, err + } + postingLocOffset := uint64(w.Count()) + _, err = writeRoaringWithLen(newRoaringLocs, w) + if err != nil { + return nil, err + } + postingOffset := uint64(w.Count()) + // write out the start of the term info + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, freqOffset) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the start of the loc info + n = binary.PutUvarint(buf, locOffset) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write out the start of the loc posting list + n = binary.PutUvarint(buf, postingLocOffset) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + _, err = writeRoaringWithLen(newRoaring, w) + if err != nil { + return nil, err + } + + newVellum.Insert(term, postingOffset) + } + + err = mergeItr.Next() + } + if err != nil && err != vellum.ErrIteratorDone { + return nil, err + } + + dictOffset := uint64(w.Count()) + err = newVellum.Close() + if err != nil { + return nil, err + } + vellumData := vellumBuf.Bytes() + + // write out the length of the vellum data + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, err + } + + // write this vellum to disk + _, err = w.Write(vellumData) + if err != nil { + return nil, err + } + + rv[fieldID] = dictOffset + } + + return rv, nil +} + +const docDropped = math.MaxUint64 + +func mergeStoredAndRemap(segments 
[]*Segment, drops []*roaring.Bitmap, + fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, + w *CountHashWriter) (uint64, [][]uint64, error) { + var rv [][]uint64 + var newDocNum int + + var curr int + var metaBuf bytes.Buffer + var data, compressed []byte + + docNumOffsets := make([]uint64, newSegDocCount) + + // for each segment + for segI, segment := range segments { + var segNewDocNums []uint64 + + // for each doc num + for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + metaBuf.Reset() + data = data[:0] + compressed = compressed[:0] + curr = 0 + + metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + + if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { + segNewDocNums = append(segNewDocNums, docDropped) + } else { + segNewDocNums = append(segNewDocNums, uint64(newDocNum)) + // collect all the data + vals := make(map[uint16][][]byte) + typs := make(map[uint16][]byte) + poss := make(map[uint16][][]uint64) + err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := fieldsMap[field] + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + poss[fieldID] = append(poss[fieldID], pos) + return true + }) + if err != nil { + return 0, nil, err + } + + // now walk the fields in order + for fieldID := range fieldsInv { + + if storedFieldValues, ok := vals[uint16(fieldID)]; ok { + + // has stored values for this field + num := len(storedFieldValues) + + // process each value + for i := 0; i < num; i++ { + // encode field + _, err2 := metaEncoder.PutU64(uint64(fieldID)) + if err2 != nil { + return 0, nil, err2 + } + // encode type + _, err2 = metaEncoder.PutU64(uint64(typs[uint16(fieldID)][i])) + if err2 != nil { + return 0, nil, err2 + } + // encode start offset + _, err2 = metaEncoder.PutU64(uint64(curr)) + if err2 != nil { + return 0, nil, err2 + } + // end len + _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if 
err2 != nil { + return 0, nil, err2 + } + // encode number of array pos + _, err2 = metaEncoder.PutU64(uint64(len(poss[uint16(fieldID)][i]))) + if err2 != nil { + return 0, nil, err2 + } + // encode all array positions + for j := 0; j < len(poss[uint16(fieldID)][i]); j++ { + _, err2 = metaEncoder.PutU64(poss[uint16(fieldID)][i][j]) + if err2 != nil { + return 0, nil, err2 + } + } + // append data + data = append(data, storedFieldValues[i]...) + // update curr + curr += len(storedFieldValues[i]) + + } + } + } + + metaEncoder.Close() + metaBytes := metaBuf.Bytes() + compressed = snappy.Encode(compressed, data) + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) + + buf := make([]byte, binary.MaxVarintLen64) + // write out the meta length + n := binary.PutUvarint(buf, uint64(len(metaBytes))) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + // write out the compressed data length + n = binary.PutUvarint(buf, uint64(len(compressed))) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + // now write the meta + _, err = w.Write(metaBytes) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ + } + } + rv = append(rv, segNewDocNums) + } + + // return value is the start of the stored index + offset := uint64(w.Count()) + // now write out the stored doc index + for docNum := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) + if err != nil { + return 0, nil, err + } + } + + return offset, rv, nil +} + +// mergeFields builds a unified list of fields used across all the input segments +func mergeFields(segments []*Segment) []string { + fieldsMap := map[string]struct{}{} + + for _, segment := range segments { + fields := segment.Fields() + for _, field := range fields { + fieldsMap[field] = struct{}{} + } + } + rv := make([]string, 0, 
len(fieldsMap)) + // ensure _id stays first + rv = append(rv, "_id") + for k := range fieldsMap { + if k != "_id" { + rv = append(rv, k) + } + } + + return rv +} + +func persistMergedFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { + var rv uint64 + + var fieldStarts []uint64 + for fieldID, fieldName := range fieldsInv { + + // record start of this field + fieldStarts = append(fieldStarts, uint64(w.Count())) + + buf := make([]byte, binary.MaxVarintLen64) + // write out dict location for this field + n := binary.PutUvarint(buf, dictLocs[fieldID]) + _, err := w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the length of the field name + n = binary.PutUvarint(buf, uint64(len(fieldName))) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + } + + // now write out the fields index + rv = uint64(w.Count()) + + // now write out the stored doc index + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go new file mode 100644 index 000000000..53bcde7fb --- /dev/null +++ b/index/scorch/segment/zap/merge_test.go @@ -0,0 +1,280 @@ +package zap + +import ( + "os" + "testing" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment/mem" +) + +func TestMerge(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + _ = os.RemoveAll("/tmp/scorch2.zap") + _ = os.RemoveAll("/tmp/scorch3.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatal(err) + } + + memSegment2 := 
buildMemSegmentMulti2() + err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + if err != nil { + t.Fatal(err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segment2, err := Open("/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment2.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segsToMerge := make([]*Segment, 2) + segsToMerge[0] = segment.(*Segment) + segsToMerge[1] = segment2.(*Segment) + + err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + if err != nil { + t.Fatal(err) + } +} + +func TestMergeAndDrop(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + _ = os.RemoveAll("/tmp/scorch2.zap") + _ = os.RemoveAll("/tmp/scorch3.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatal(err) + } + + memSegment2 := buildMemSegmentMulti2() + err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + if err != nil { + t.Fatal(err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segment2, err := Open("/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment2.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segsToMerge := make([]*Segment, 2) + segsToMerge[0] = segment.(*Segment) + segsToMerge[1] = segment2.(*Segment) + + docsToDrop := make([]*roaring.Bitmap, 2) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(1) + docsToDrop[1] = roaring.NewBitmap() + 
docsToDrop[1].AddInt(1) + + err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024) + if err != nil { + t.Fatal(err) + } +} + +func buildMemSegmentMulti2() *mem.Segment { + + doc := &document.Document{ + ID: "c", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("c"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("mat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + doc2 := &document.Document{ + ID: "d", + Fields: []document.Field{ + document.NewTextFieldCustom("_id", nil, []byte("d"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("name", nil, []byte("joa"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + // forge analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + 
Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("c"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("mat"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("d"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("joa"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("dark"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range 
results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := mem.NewFromAnalyzedDocs(results) + + return segment +} From 57121e40a85624b112fd4333bc6c30ba2a7064bb Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 12 Dec 2017 11:41:14 -0500 Subject: [PATCH 049/728] fix issues identified by errcheck --- index/scorch/segment/zap/merge.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index fe5155d16..b41dc4275 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -218,7 +218,10 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, args = append(args, loc.End()) args = append(args, uint64(len(loc.ArrayPositions()))) args = append(args, loc.ArrayPositions()...) - locEncoder.Add(hitNewDocNum, args...) + err = locEncoder.Add(hitNewDocNum, args...) 
+ if err != nil { + return nil, err + } } } next, err2 = postItr.Next() @@ -275,7 +278,10 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, return nil, err } - newVellum.Insert(term, postingOffset) + err = newVellum.Insert(term, postingOffset) + if err != nil { + return nil, err + } } err = mergeItr.Next() From 59a1e26300606112431b1c04b7a077dc0d164a45 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 11 Dec 2017 15:03:29 -0800 Subject: [PATCH 050/728] mergeplan: scoring implemented --- index/scorch/mergeplan/merge_plan.go | 126 ++++--- index/scorch/mergeplan/merge_plan_test.go | 422 ++++++++++++++++++++-- index/scorch/mergeplan/sort.go | 5 +- 3 files changed, 465 insertions(+), 88 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index ff50cff37..18bb3ec16 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -37,21 +37,8 @@ type Segment interface { // assigned to at most a single MergeTask in the output MergePlan. A // segment not assigned to any MergeTask means the segment should // remain unmerged. -func Plan(segments []Segment, o *MergePlanOptions) ( - result *MergePlan, err error) { - if len(segments) <= 1 { - return nil, nil - } - - // TODO: PLACEHOLDER implementation for now, that always merges - // all the candidates. - return &MergePlan{ - Tasks: []*MergeTask{ - &MergeTask{ - Segments: segments, - }, - }, - }, nil +func Plan(segments []Segment, o *MergePlanOptions) (*MergePlan, error) { + return plan(segments, o) } // A MergePlan is the result of the Plan() API. @@ -100,10 +87,6 @@ type MergePlanOptions struct { // impact merge selection. ReclaimDeletesWeight float64 - // Only consider a segment for merging if its delete percentage is - // over this threshold. - MinDeletesPct float64 - // Optional, defaults to mergeplan.CalcBudget(). 
CalcBudget func(totalSize int64, firstTierSize int64, o *MergePlanOptions) (budgetNumSegments int) @@ -130,13 +113,11 @@ var DefaultMergePlanOptions = MergePlanOptions{ SegmentsPerMergeTask: 10, FloorSegmentSize: 2000, ReclaimDeletesWeight: 2.0, - MinDeletesPct: 10.0, } // ------------------------------------------- -func plan(segmentsIn []Segment, o *MergePlanOptions) ( - result *MergePlan, err error) { +func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { if len(segmentsIn) <= 1 { return nil, nil } @@ -153,8 +134,8 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( var minLiveSize int64 = math.MaxInt64 - var eligible []Segment - var eligibleLiveSize int64 + var eligibles []Segment + var eligiblesLiveSize int64 for _, segment := range segments { segmentsLiveSize += segment.LiveSize() @@ -165,8 +146,8 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( // Only small-enough segments are eligible. if segment.LiveSize() < o.MaxSegmentSize/2 { - eligible = append(eligible, segment) - eligibleLiveSize += segment.LiveSize() + eligibles = append(eligibles, segment) + eligiblesLiveSize += segment.LiveSize() } } @@ -177,7 +158,7 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( calcBudget = CalcBudget } - budgetNumSegments := CalcBudget(eligibleLiveSize, minLiveSize, o) + budgetNumSegments := CalcBudget(eligiblesLiveSize, minLiveSize, o) scoreSegments := o.ScoreSegments if scoreSegments == nil { @@ -188,36 +169,32 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( // While we’re over budget, keep looping, which might produce // another MergeTask. - for len(eligible) > budgetNumSegments { + for len(eligibles) > budgetNumSegments { // Track a current best roster as we examine and score // potential rosters of merges. var bestRoster []Segment var bestRosterScore float64 // Lower score is better. 
- for startIdx := 0; startIdx < len(eligible)-o.SegmentsPerMergeTask; startIdx++ { + for startIdx := 0; startIdx < len(eligibles)-o.SegmentsPerMergeTask; startIdx++ { var roster []Segment var rosterLiveSize int64 - for idx := startIdx; idx < len(eligible) && len(roster) < o.SegmentsPerMergeTask; idx++ { - rosterCandidate := eligible[idx] + for idx := startIdx; idx < len(eligibles) && len(roster) < o.SegmentsPerMergeTask; idx++ { + eligible := eligibles[idx] - if rosterLiveSize+rosterCandidate.LiveSize() > o.MaxSegmentSize { - // NOTE: We continue the loop, to try to “pack” - // the roster with smaller segments to get closer - // to the max size; but, we aren't doing full, - // comprehensive "bin-packing" permutations. - continue + if rosterLiveSize+eligible.LiveSize() < o.MaxSegmentSize { + roster = append(roster, eligible) + rosterLiveSize += eligible.LiveSize() } - - roster = append(roster, rosterCandidate) - rosterLiveSize += rosterCandidate.LiveSize() } - rosterScore := scoreSegments(roster, o) + if len(roster) > 0 { + rosterScore := scoreSegments(roster, o) - if len(bestRoster) <= 0 || rosterScore < bestRosterScore { - bestRoster = roster - bestRosterScore = rosterScore + if len(bestRoster) <= 0 || rosterScore < bestRosterScore { + bestRoster = roster + bestRosterScore = rosterScore + } } } @@ -225,11 +202,9 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( return rv, nil } - rv.Tasks = append(rv.Tasks, &MergeTask{ - Segments: bestRoster, - }) + rv.Tasks = append(rv.Tasks, &MergeTask{Segments: bestRoster}) - eligible = removeSegments(eligible, bestRoster) + eligibles = removeSegments(eligibles, bestRoster) } return rv, nil @@ -240,24 +215,38 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) ( func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) ( budgetNumSegments int) { tierSize := firstTierSize + if tierSize < 1 { + tierSize = 1 + } + + maxSegmentsPerTier := o.MaxSegmentsPerTier + if maxSegmentsPerTier < 1 { + 
maxSegmentsPerTier = 1 + } + + segmentsPerMergeTask := int64(o.SegmentsPerMergeTask) + if segmentsPerMergeTask < 2 { + segmentsPerMergeTask = 2 + } for totalSize > 0 { segmentsInTier := float64(totalSize) / float64(tierSize) - if segmentsInTier < float64(o.MaxSegmentsPerTier) { + if segmentsInTier < float64(maxSegmentsPerTier) { budgetNumSegments += int(math.Ceil(segmentsInTier)) break } - budgetNumSegments += o.MaxSegmentsPerTier - totalSize -= int64(o.MaxSegmentsPerTier) * tierSize - tierSize *= int64(o.SegmentsPerMergeTask) + budgetNumSegments += maxSegmentsPerTier + totalSize -= int64(maxSegmentsPerTier) * tierSize + tierSize *= segmentsPerMergeTask } return budgetNumSegments } -// removeSegments() keeps the ordering of the result segments stable. -func removeSegments(segments []Segment, toRemove []Segment) (rv []Segment) { +// Of note, removeSegments() keeps the ordering of the results stable. +func removeSegments(segments []Segment, toRemove []Segment) []Segment { + rv := make([]Segment, 0, len(segments)-len(toRemove)) OUTER: for _, segment := range segments { for _, r := range toRemove { @@ -270,6 +259,33 @@ OUTER: return rv } +// Smaller result score is better. func ScoreSegments(segments []Segment, o *MergePlanOptions) float64 { - return 0 // TODO. Bogus score. + var totBeforeSize int64 + var totAfterSize int64 + var totAfterSizeFloored int64 + + for _, segment := range segments { + totBeforeSize += segment.FullSize() + totAfterSize += segment.LiveSize() + totAfterSizeFloored += o.RaiseToFloorSegmentSize(segment.LiveSize()) + } + + // Roughly guess the "balance" of the segments -- whether the + // segments are about the same size. + balance := + float64(o.RaiseToFloorSegmentSize(segments[0].LiveSize())) / + float64(totAfterSizeFloored) + + // Gently favor smaller merges over bigger ones. We don't want to + // make the exponent too large else we end up with poor merges of + // small segments in order to avoid the large merges. 
+ score := balance * math.Pow(float64(totAfterSize), 0.05) + + // Strongly favor merges that reclaim deletes. + nonDelRatio := float64(totAfterSize) / float64(totBeforeSize) + + score *= math.Pow(nonDelRatio, o.ReclaimDeletesWeight) + + return score } diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 0f0f0fcca..c1bca8459 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -15,61 +15,99 @@ package mergeplan import ( + "encoding/json" + "fmt" + "os" "reflect" + "sort" + "strings" "testing" ) // Implements the Segment interface for testing, type segment struct { - id uint64 - fullSize int64 - liveSize int64 + MyId uint64 + MyFullSize int64 + MyLiveSize int64 } -func (s *segment) Id() uint64 { return s.id } -func (s *segment) FullSize() int64 { return s.fullSize } -func (s *segment) LiveSize() int64 { return s.liveSize } +func (s *segment) Id() uint64 { return s.MyId } +func (s *segment) FullSize() int64 { return s.MyFullSize } +func (s *segment) LiveSize() int64 { return s.MyLiveSize } func makeLinearSegments(n int) (rv []Segment) { for i := 0; i < n; i++ { rv = append(rv, &segment{ - id: uint64(i), - fullSize: int64(i), - liveSize: int64(i), + MyId: uint64(i), + MyFullSize: int64(i), + MyLiveSize: int64(i), }) } return rv } +// ---------------------------------------- + func TestSimplePlan(t *testing.T) { segs := makeLinearSegments(10) tests := []struct { - desc string - segments []Segment - expectPlan *MergePlan - expectErr error + Desc string + Segments []Segment + Options *MergePlanOptions + ExpectPlan *MergePlan + ExpectErr error }{ - {"nil candidates", - nil, nil, nil}, - {"empty candidates", - []Segment{}, nil, nil}, - {"1 candidate", - []Segment{segs[0]}, + {"nil segments", + nil, nil, nil, nil}, + {"empty segments", + []Segment{}, nil, nil, nil}, + {"1 segment", + []Segment{segs[1]}, + nil, + nil, + nil, + }, + {"2 segments", + []Segment{ + segs[1], + 
segs[2], + }, nil, + &MergePlan{}, nil, }, - {"2 candidates", + {"3 segments", []Segment{ - segs[0], segs[1], + segs[2], + segs[9], + }, + nil, + &MergePlan{}, + nil, + }, + {"many segments", + []Segment{ + segs[1], + segs[2], + segs[3], + segs[4], + segs[5], + segs[6], + }, + &MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1000, + SegmentsPerMergeTask: 2, + FloorSegmentSize: 1, }, &MergePlan{ - []*MergeTask{ + Tasks: []*MergeTask{ &MergeTask{ Segments: []Segment{ - segs[0], - segs[1], + segs[6], + segs[5], }, }, }, @@ -79,14 +117,336 @@ func TestSimplePlan(t *testing.T) { } for testi, test := range tests { - plan, err := Plan(test.segments, &DefaultMergePlanOptions) - if err != test.expectErr { - t.Errorf("testi: %d, test: %v, got err: %v", - testi, test, err) + plan, err := Plan(test.Segments, test.Options) + if err != test.ExpectErr { + testj, _ := json.Marshal(&test) + t.Errorf("testi: %d, test: %s, got err: %v", + testi, testj, err) } - if !reflect.DeepEqual(plan, test.expectPlan) { - t.Errorf("testi: %d, test: %v, got plan: %v", - testi, test, plan) + if !reflect.DeepEqual(plan, test.ExpectPlan) { + testj, _ := json.Marshal(&test) + planj, _ := json.Marshal(&plan) + t.Errorf("testi: %d, test: %s, got plan: %s", + testi, testj, planj) } } } + +// ---------------------------------------- + +func TestSort(t *testing.T) { + segs := makeLinearSegments(10) + + sort.Sort(byLiveSizeDescending(segs)) + + for i := 1; i < len(segs); i++ { + if segs[i].LiveSize() >= segs[i-1].LiveSize() { + t.Errorf("not descending") + } + } +} + +// ---------------------------------------- + +func TestCalcBudget(t *testing.T) { + tests := []struct { + totalSize int64 + firstTierSize int64 + o MergePlanOptions + expect int + }{ + {0, 0, MergePlanOptions{}, 0}, + {1, 0, MergePlanOptions{}, 1}, + {9, 0, MergePlanOptions{}, 4}, + {1, 1, + MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1000, + SegmentsPerMergeTask: 2, + FloorSegmentSize: 1, + }, + 1, + }, + {21, 
1, + MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1000, + SegmentsPerMergeTask: 2, + FloorSegmentSize: 1, + }, + 5, + }, + {21, 1, + MergePlanOptions{ + MaxSegmentsPerTier: 2, + MaxSegmentSize: 1000, + SegmentsPerMergeTask: 2, + FloorSegmentSize: 1, + }, + 7, + }, + } + + for testi, test := range tests { + res := CalcBudget(test.totalSize, test.firstTierSize, &test.o) + if res != test.expect { + t.Errorf("testi: %d, test: %#v, res: %v", + testi, test, res) + } + } +} + +// ---------------------------------------- + +func TestInsert1SameSizedSegmentBetweenMerges(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1000, + MaxSegmentsPerTier: 3, + SegmentsPerMergeTask: 3, + } + + spec := testCyclesSpec{ + descrip: "i1sssbm", + verbose: os.Getenv("VERBOSE") == "i1sssbm" || os.Getenv("VERBOSE") == "y", + n: 200, + o: o, + beforePlan: func(spec *testCyclesSpec) { + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: 1, + MyLiveSize: 1, + }) + spec.nextSegmentId++ + }, + } + + spec.runCycles(t) +} + +func TestInsertManySameSizedSegmentsBetweenMerges(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1000, + MaxSegmentsPerTier: 3, + SegmentsPerMergeTask: 3, + } + + spec := testCyclesSpec{ + descrip: "imsssbm", + verbose: os.Getenv("VERBOSE") == "imsssbm" || os.Getenv("VERBOSE") == "y", + n: 20, + o: o, + beforePlan: func(spec *testCyclesSpec) { + for i := 0; i < 10; i++ { + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: 1, + MyLiveSize: 1, + }) + spec.nextSegmentId++ + } + }, + } + + spec.runCycles(t) +} + +func TestInsertManyDifferentSizedSegmentsBetweenMerges(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1000, + MaxSegmentsPerTier: 3, + SegmentsPerMergeTask: 3, + } + + spec := testCyclesSpec{ + descrip: "imdssbm", + verbose: os.Getenv("VERBOSE") == "imdssbm" || os.Getenv("VERBOSE") == "y", + n: 20, + o: o, + beforePlan: func(spec 
*testCyclesSpec) { + for i := 0; i < 10; i++ { + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: int64(1 + (i % 5)), + MyLiveSize: int64(1 + (i % 5)), + }) + spec.nextSegmentId++ + } + }, + } + + spec.runCycles(t) +} + +func TestManySameSizedSegmentsWithDeletesBetweenMerges(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1000, + MaxSegmentsPerTier: 3, + SegmentsPerMergeTask: 3, + } + + var numPlansWithTasks int + + spec := testCyclesSpec{ + descrip: "mssswdbm", + verbose: os.Getenv("VERBOSE") == "mssswdbm" || os.Getenv("VERBOSE") == "y", + n: 20, + o: o, + beforePlan: func(spec *testCyclesSpec) { + // Deletions are a shrinking of the live size. + for i, seg := range spec.segments { + if (spec.cycle+i)%5 == 0 { + s := seg.(*segment) + if s.MyLiveSize > 0 { + s.MyLiveSize -= 1 + } + } + } + + for i := 0; i < 10; i++ { + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: 1, + MyLiveSize: 1, + }) + spec.nextSegmentId++ + } + }, + afterPlan: func(spec *testCyclesSpec, plan *MergePlan) { + if plan != nil && len(plan.Tasks) > 0 { + numPlansWithTasks++ + } + }, + } + + spec.runCycles(t) + + if numPlansWithTasks <= 0 { + t.Errorf("expected some plans with tasks") + } +} + +// ---------------------------------------- + +type testCyclesSpec struct { + descrip string + verbose bool + + n int // Number of cycles to run. 
+ o *MergePlanOptions + + beforePlan func(*testCyclesSpec) + afterPlan func(*testCyclesSpec, *MergePlan) + + cycle int + segments []Segment + nextSegmentId uint64 +} + +func (spec *testCyclesSpec) runCycles(t *testing.T) { + numPlansWithTasks := 0 + + for spec.cycle < spec.n { + if spec.verbose { + emit(spec.descrip, spec.cycle, 0, spec.segments, nil) + } + + if spec.beforePlan != nil { + spec.beforePlan(spec) + } + + if spec.verbose { + emit(spec.descrip, spec.cycle, 1, spec.segments, nil) + } + + plan, err := Plan(spec.segments, spec.o) + if err != nil { + t.Fatalf("expected no err, got: %v", err) + } + + if spec.afterPlan != nil { + spec.afterPlan(spec, plan) + } + + if spec.verbose { + emit(spec.descrip, spec.cycle, 2, spec.segments, plan) + } + + if plan != nil { + if len(plan.Tasks) > 0 { + numPlansWithTasks++ + } + + for _, task := range plan.Tasks { + spec.segments = removeSegments(spec.segments, task.Segments) + + var totLiveSize int64 + for _, segment := range task.Segments { + totLiveSize += segment.LiveSize() + } + + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: totLiveSize, + MyLiveSize: totLiveSize, + }) + spec.nextSegmentId++ + } + } + + spec.cycle++ + } + + if numPlansWithTasks <= 0 { + t.Errorf("expected some plans with tasks") + } +} + +func emit(descrip string, cycle int, step int, segments []Segment, plan *MergePlan) { + if os.Getenv("VERBOSE") == "" { + return + } + + suffix := "" + if plan != nil && len(plan.Tasks) > 0 { + suffix = "hasPlan" + } + + fmt.Printf("%s %d-%d ---------- %s\n", descrip, cycle, step, suffix) + + var maxFullSize int64 + for _, segment := range segments { + if maxFullSize < segment.FullSize() { + maxFullSize = segment.FullSize() + } + } + + barMax := 100 + + for _, segment := range segments { + barFull := int(segment.FullSize()) + barLive := int(segment.LiveSize()) + + if maxFullSize > int64(barMax) { + barFull = int(float64(barMax) * float64(barFull) / float64(maxFullSize)) + 
barLive = int(float64(barMax) * float64(barLive) / float64(maxFullSize)) + } + + var barChar = "." + + if plan != nil { + TASK_LOOP: + for taski, task := range plan.Tasks { + for _, taskSegment := range task.Segments { + if taskSegment == segment { + barChar = fmt.Sprintf("%d", taski) + break TASK_LOOP + } + } + } + } + + bar := strings.Repeat(barChar, barLive) + strings.Repeat("x", barFull-barLive) + + fmt.Printf("%s %5d %5d - %s\n", descrip, segment.Id(), segment.FullSize(), bar) + } +} diff --git a/index/scorch/mergeplan/sort.go b/index/scorch/mergeplan/sort.go index 4eabc31db..d044b8d7c 100644 --- a/index/scorch/mergeplan/sort.go +++ b/index/scorch/mergeplan/sort.go @@ -16,12 +16,13 @@ package mergeplan type byLiveSizeDescending []Segment -func (a byLiveSizeDescending) Len() int { return len(a) } +func (a byLiveSizeDescending) Len() int { return len(a) } + func (a byLiveSizeDescending) Swap(i, j int) { a[i], a[j] = a[j], a[i] } func (a byLiveSizeDescending) Less(i, j int) bool { if a[i].LiveSize() != a[j].LiveSize() { - return a[i].LiveSize() < a[j].LiveSize() + return a[i].LiveSize() > a[j].LiveSize() } return a[i].Id() < a[j].Id() } From be7dd36ac650b80cfc416731717db77877e8a2f1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 12 Dec 2017 10:36:16 -0800 Subject: [PATCH 051/728] mergeplan: more tests and bargraph tweaks --- index/scorch/mergeplan/merge_plan.go | 8 +-- index/scorch/mergeplan/merge_plan_test.go | 70 +++++++++++++++++++---- 2 files changed, 64 insertions(+), 14 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index 18bb3ec16..f3f7b9e3b 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -130,16 +130,12 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { sort.Sort(byLiveSizeDescending(segments)) - var segmentsLiveSize int64 - var minLiveSize int64 = math.MaxInt64 var eligibles []Segment var eligiblesLiveSize int64 for _, 
segment := range segments { - segmentsLiveSize += segment.LiveSize() - if minLiveSize > segment.LiveSize() { minLiveSize = segment.LiveSize() } @@ -271,6 +267,10 @@ func ScoreSegments(segments []Segment, o *MergePlanOptions) float64 { totAfterSizeFloored += o.RaiseToFloorSegmentSize(segment.LiveSize()) } + if totBeforeSize <= 0 || totAfterSize <= 0 || totAfterSizeFloored <= 0 { + return 0 + } + // Roughly guess the "balance" of the segments -- whether the // segments are about the same size. balance := diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index c1bca8459..05dcaaf9f 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +// The segment merge planning approach was inspired by Lucene's +// TieredMergePolicy.java and descriptions like +// http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html package mergeplan import ( @@ -250,6 +253,43 @@ func TestInsertManySameSizedSegmentsBetweenMerges(t *testing.T) { spec.runCycles(t) } +func TestInsertManySameSizedSegmentsWithDeletionsBetweenMerges(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1000, + MaxSegmentsPerTier: 3, + SegmentsPerMergeTask: 3, + } + + spec := testCyclesSpec{ + descrip: "imssswdbm", + verbose: os.Getenv("VERBOSE") == "imssswdbm" || os.Getenv("VERBOSE") == "y", + n: 20, + o: o, + beforePlan: func(spec *testCyclesSpec) { + for i := 0; i < 10; i++ { + // Deletions are a shrinking of the live size. 
+ for i, seg := range spec.segments { + if (spec.cycle+i)%5 == 0 { + s := seg.(*segment) + if s.MyLiveSize > 0 { + s.MyLiveSize -= 1 + } + } + } + + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: 1, + MyLiveSize: 1, + }) + spec.nextSegmentId++ + } + }, + } + + spec.runCycles(t) +} + func TestInsertManyDifferentSizedSegmentsBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, @@ -384,12 +424,14 @@ func (spec *testCyclesSpec) runCycles(t *testing.T) { totLiveSize += segment.LiveSize() } - spec.segments = append(spec.segments, &segment{ - MyId: spec.nextSegmentId, - MyFullSize: totLiveSize, - MyLiveSize: totLiveSize, - }) - spec.nextSegmentId++ + if totLiveSize > 0 { + spec.segments = append(spec.segments, &segment{ + MyId: spec.nextSegmentId, + MyFullSize: totLiveSize, + MyLiveSize: totLiveSize, + }) + spec.nextSegmentId++ + } } } @@ -411,7 +453,7 @@ func emit(descrip string, cycle int, step int, segments []Segment, plan *MergePl suffix = "hasPlan" } - fmt.Printf("%s %d-%d ---------- %s\n", descrip, cycle, step, suffix) + fmt.Printf("%s %d.%d ---------- %s\n", descrip, cycle, step, suffix) var maxFullSize int64 for _, segment := range segments { @@ -431,13 +473,15 @@ func emit(descrip string, cycle int, step int, segments []Segment, plan *MergePl barLive = int(float64(barMax) * float64(barLive) / float64(maxFullSize)) } - var barChar = "." + barKind := " " + barChar := "." 
if plan != nil { TASK_LOOP: for taski, task := range plan.Tasks { for _, taskSegment := range task.Segments { if taskSegment == segment { + barKind = "*" barChar = fmt.Sprintf("%d", taski) break TASK_LOOP } @@ -445,8 +489,14 @@ func emit(descrip string, cycle int, step int, segments []Segment, plan *MergePl } } - bar := strings.Repeat(barChar, barLive) + strings.Repeat("x", barFull-barLive) + bar := + strings.Repeat(barChar, barLive)[0:barLive] + + strings.Repeat("x", barFull-barLive)[0:barFull-barLive] - fmt.Printf("%s %5d %5d - %s\n", descrip, segment.Id(), segment.FullSize(), bar) + fmt.Printf("%s %5d: %5d /%5d - %s %s\n", descrip, + segment.Id(), + segment.LiveSize(), + segment.FullSize(), + barKind, bar) } } From c15c3c11cd93dc473ef89afae5c479f09442d206 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 13:31:18 -0500 Subject: [PATCH 052/728] extra protection if dict address is 0 (empty segment) --- index/scorch/segment/zap/segment.go | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 2c6d0bfed..fce80f933 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -150,16 +150,18 @@ func (s *Segment) dictionary(field string) (*Dictionary, error) { dictStart := s.fieldsOffsets[rv.fieldID] - // read the length of the vellum data - vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64]) - fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] - if fstBytes != nil { - fst, err := vellum.Load(fstBytes) - if err != nil { - return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) - } - if err == nil { - rv.fst = fst + if dictStart > 0 { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := s.mm[dictStart+uint64(read) : 
dictStart+uint64(read)+vellumLen] + if fstBytes != nil { + fst, err := vellum.Load(fstBytes) + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) + } + if err == nil { + rv.fst = fst + } } } From f83c9f2a2022a2cd1c45ac34b6cd020d9ab2f089 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 13:41:03 -0500 Subject: [PATCH 053/728] initial cut of merger that actually introduces changes --- index/scorch/introducer.go | 76 +++++++++++ index/scorch/merge.go | 167 +++++++++++++++++++++++++ index/scorch/persister.go | 10 ++ index/scorch/scorch.go | 6 + index/scorch/segment/zap/merge.go | 56 +++++---- index/scorch/segment/zap/merge_test.go | 4 +- index/scorch/snapshot_segment.go | 12 ++ 7 files changed, 303 insertions(+), 28 deletions(-) create mode 100644 index/scorch/merge.go diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 0b30044a1..7998eae3b 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -42,6 +42,82 @@ OUTER: case notify = <-s.introducerNotifier: + case nextMerge := <-s.merges: + // acquire lock + s.rootLock.Lock() + + // prepare new index snapshot + currSize := len(s.root.segment) + newSize := currSize + 1 - len(nextMerge.old) + newSnapshot := &IndexSnapshot{ + segment: make([]*SegmentSnapshot, 0, newSize), + offsets: make([]uint64, 0, newSize), + internal: make(map[string][]byte, len(s.root.segment)), + epoch: s.nextSnapshotEpoch, + } + s.nextSnapshotEpoch++ + + // iterate through current segments + newSegmentDeleted := roaring.NewBitmap() + var running uint64 + for i := range s.root.segment { + segmentID := s.root.segment[i].id + if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { + // this segment is going away, see if anything else was deleted since we started the merge + if s.root.segment[i].deleted != nil { + // assume all these deletes are new + deletedSince := s.root.segment[i].deleted + // if we already knew about some of them, remove + if 
segSnapAtMerge.deleted != nil { + deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) + } + deletedSinceItr := deletedSince.Iterator() + for deletedSinceItr.HasNext() { + oldDocNum := deletedSinceItr.Next() + newDocNum := nextMerge.oldNewDocNums[segmentID][oldDocNum] + newSegmentDeleted.Add(uint32(newDocNum)) + } + } + } else { + // this segment is staying + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, + deleted: s.root.segment[i].deleted, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + running += s.root.segment[i].Count() + } + } + + // put new segment at end + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: nextMerge.id, + segment: nextMerge.new, + deleted: newSegmentDeleted, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + + // copy old values + for key, oldVal := range s.root.internal { + newSnapshot.internal[key] = oldVal + } + + // swap in new segment + s.root = newSnapshot + // release lock + s.rootLock.Unlock() + + // notify merger we incorporated this + close(nextMerge.notify) + + // notify persister + if notify != nil { + close(notify) + notify = nil + } + case next := <-s.introductions: // acquire lock s.rootLock.Lock() diff --git a/index/scorch/merge.go b/index/scorch/merge.go new file mode 100644 index 000000000..aef216646 --- /dev/null +++ b/index/scorch/merge.go @@ -0,0 +1,167 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "log" + "os" + "sync/atomic" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index/scorch/mergeplan" + "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/index/scorch/segment/zap" +) + +func (s *Scorch) mergerLoop() { + var lastEpochMergePlanned uint64 +OUTER: + for { + select { + case <-s.closeCh: + break OUTER + + default: + // check to see if there is a new snapshot to persist + s.rootLock.RLock() + ourSnapshot := s.root + s.rootLock.RUnlock() + + if ourSnapshot.epoch != lastEpochMergePlanned { + // lets get started + err := s.planMergeAtSnapshot(ourSnapshot) + if err != nil { + log.Printf("merging err: %v", err) + continue OUTER + } + lastEpochMergePlanned = ourSnapshot.epoch + } + + // tell the persister we're waiting for changes + // first make a notification chan + notifyUs := make(notificationChan) + + // give it to the persister + select { + case <-s.closeCh: + break OUTER + case s.persisterNotifier <- notifyUs: + } + + // check again + s.rootLock.RLock() + ourSnapshot = s.root + s.rootLock.RUnlock() + + if ourSnapshot.epoch != lastEpochMergePlanned { + // lets get started + err := s.planMergeAtSnapshot(ourSnapshot) + if err != nil { + continue OUTER + } + lastEpochMergePlanned = ourSnapshot.epoch + } + + // now wait for it (but also detect close) + select { + case <-s.closeCh: + break OUTER + case <-notifyUs: + // woken up, next loop should pick up work + } + } + } + s.asyncTasks.Done() +} + +func (s *Scorch) planMergeAtSnapshot(ourSnapshot 
*IndexSnapshot) error { + // build list of zap segments in this snapshot + var onlyZapSnapshots []mergeplan.Segment + for _, segmentSnapshot := range ourSnapshot.segment { + if _, ok := segmentSnapshot.segment.(*zap.Segment); ok { + onlyZapSnapshots = append(onlyZapSnapshots, segmentSnapshot) + } + } + + // give this list to the planner + resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, nil) + if err != nil { + return fmt.Errorf("merge planning err: %v", err) + } + if resultMergePlan == nil { + // nothing to do + return nil + } + + // process tasks in serial for now + var notifications []notificationChan + for _, task := range resultMergePlan.Tasks { + oldMap := make(map[uint64]*SegmentSnapshot) + newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) + segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) + docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) + for _, planSegment := range task.Segments { + if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { + oldMap[segSnapshot.id] = segSnapshot + if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { + segmentsToMerge = append(segmentsToMerge, zapSeg) + docsToDrop = append(docsToDrop, segSnapshot.deleted) + } + } + } + + filename := fmt.Sprintf("%x.zap", newSegmentID) + path := s.path + string(os.PathSeparator) + filename + newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + if err != nil { + return fmt.Errorf("merging failed: %v", err) + } + segment, err := zap.Open(path) + if err != nil { + return err + } + sm := &segmentMerge{ + id: newSegmentID, + old: oldMap, + oldNewDocNums: make(map[uint64][]uint64), + new: segment, + notify: make(notificationChan), + } + notifications = append(notifications, sm.notify) + for i, segNewDocNums := range newDocNums { + sm.oldNewDocNums[task.Segments[i].Id()] = segNewDocNums + } + + // give it to the introducer + select { + case <-s.closeCh: + return nil + case s.merges <- sm: + } + } + for _, notification := range 
notifications { + <-notification + } + return nil +} + +type segmentMerge struct { + id uint64 + old map[uint64]*SegmentSnapshot + oldNewDocNums map[uint64][]uint64 + new segment.Segment + notify notificationChan +} diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 8a171b311..78a1092ba 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -31,12 +31,14 @@ import ( type notificationChan chan struct{} func (s *Scorch) persisterLoop() { + var notify notificationChan var lastPersistedEpoch uint64 OUTER: for { select { case <-s.closeCh: break OUTER + case notify = <-s.persisterNotifier: default: // check to see if there is a new snapshot to persist @@ -53,6 +55,10 @@ OUTER: continue OUTER } lastPersistedEpoch = ourSnapshot.epoch + if notify != nil { + close(notify) + notify = nil + } } // tell the introducer we're waiting for changes @@ -79,6 +85,10 @@ OUTER: continue OUTER } lastPersistedEpoch = ourSnapshot.epoch + if notify != nil { + close(notify) + notify = nil + } } // now wait for it (but also detect close) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d95cb05bc..7373e6b43 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -54,7 +54,9 @@ type Scorch struct { closeCh chan struct{} introductions chan *segmentIntroduction + merges chan *segmentMerge introducerNotifier chan notificationChan + persisterNotifier chan notificationChan rootBolt *bolt.DB asyncTasks sync.WaitGroup } @@ -115,7 +117,9 @@ func (s *Scorch) Open() error { s.closeCh = make(chan struct{}) s.introductions = make(chan *segmentIntroduction) + s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan notificationChan) + s.persisterNotifier = make(chan notificationChan) s.asyncTasks.Add(1) go s.mainLoop() @@ -123,6 +127,8 @@ func (s *Scorch) Open() error { if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) go s.persisterLoop() + s.asyncTasks.Add(1) + go s.mergerLoop() } return nil diff --git 
a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index b41dc4275..7652c2210 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -19,13 +19,13 @@ import ( // from the may be dropped, and creates a new segment containing the remaining // data. This new segment is built at the specified path, with the provided // chunkFactor. -func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFactor uint32) error { - +func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, + chunkFactor uint32) ([][]uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) if err != nil { - return err + return nil, err } // bufer the output @@ -41,50 +41,49 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFacto var newDocNums [][]uint64 var storedIndexOffset uint64 - storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, newSegDocCount, cr) - if err != nil { - return err - } - - // FIXME temp until computed - //dictLocs := make([]uint64, len(fieldsInv)) + dictLocs := make([]uint64, len(fieldsInv)) + if newSegDocCount > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, newSegDocCount, cr) + if err != nil { + return nil, err + } - var dictLocs []uint64 - dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, - newDocNums, newSegDocCount, cr) - if err != nil { - return err + dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + newDocNums, newSegDocCount, cr) + if err != nil { + return nil, err + } } var fieldsIndexOffset uint64 fieldsIndexOffset, err = persistMergedFields(fieldsInv, cr, dictLocs) if err != nil { - return err + return nil, err } err = persistFooter(newSegDocCount, storedIndexOffset, fieldsIndexOffset, chunkFactor, cr) if err != nil { - return err + return nil, err } err = br.Flush() if err != nil 
{ - return err + return nil, err } err = f.Sync() if err != nil { - return err + return nil, err } err = f.Close() if err != nil { - return err + return nil, err } - return nil + return newDocNums, nil } func mapFields(fields []string) map[string]uint16 { @@ -161,11 +160,13 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } dicts = append(dicts, dict) - itr, err2 := dict.fst.Iterator(nil, nil) - if err2 != nil { - return nil, err2 + if dict != nil && dict.fst != nil { + itr, err2 := dict.fst.Iterator(nil, nil) + if err2 != nil { + return nil, err2 + } + itrs = append(itrs, itr) } - itrs = append(itrs, itr) } // create merging iterator @@ -187,6 +188,9 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, // now go back and get posting list for this term // but pass in the deleted docs for that segment for dictI, dict := range dicts { + if dict == nil { + continue + } postings, err2 := dict.postingsList(string(term), drops[dictI]) if err2 != nil { return nil, err2 diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 53bcde7fb..c8046efab 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -54,7 +54,7 @@ func TestMerge(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } @@ -109,7 +109,7 @@ func TestMergeAndDrop(t *testing.T) { docsToDrop[1] = roaring.NewBitmap() docsToDrop[1].AddInt(1) - err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024) + _, err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 14c49450b..ffd38cac5 100644 --- a/index/scorch/snapshot_segment.go +++ 
b/index/scorch/snapshot_segment.go @@ -49,6 +49,18 @@ type SegmentSnapshot struct { notify []chan error } +func (s *SegmentSnapshot) Id() uint64 { + return s.id +} + +func (s *SegmentSnapshot) FullSize() int64 { + return int64(s.segment.Count()) +} + +func (s SegmentSnapshot) LiveSize() int64 { + return int64(s.Count()) +} + func (s *SegmentSnapshot) Close() error { return s.segment.Close() } From cd45487cb3b5cee1b89dff8a0dd6f7b2d5be740a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 13:55:06 -0500 Subject: [PATCH 054/728] fsync rootBolt when persisting snapshot --- index/scorch/persister.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 78a1092ba..d54840864 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -109,6 +109,13 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { if err != nil { return err } + // defer fsync of the rootbolt + defer func() { + if err == nil { + err = s.rootBolt.Sync() + } + }() + // defer commit/rollback transaction defer func() { if err == nil { err = tx.Commit() From 1cd3fd7fbe661f0f29656155aa48fa864b0551e9 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 14:06:54 -0500 Subject: [PATCH 055/728] extrac common functionality between build/merge --- index/scorch/segment/zap/build.go | 47 +-------------- index/scorch/segment/zap/merge.go | 76 +------------------------ index/scorch/segment/zap/write.go | 95 +++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 121 deletions(-) create mode 100644 index/scorch/segment/zap/write.go diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 2bc520f63..2cad61a58 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -77,7 +77,7 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e } var fieldIndexStart uint64 - fieldIndexStart, err = 
persistFields(memSegment, cr, dictLocs) + fieldIndexStart, err = persistFields(memSegment.FieldsInv, cr, dictLocs) if err != nil { return err } @@ -573,51 +573,6 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs return rv, nil } -func persistFields(memSegment *mem.Segment, w *CountHashWriter, dictLocs []uint64) (uint64, error) { - var rv uint64 - - var fieldStarts []uint64 - for fieldID, fieldName := range memSegment.FieldsInv { - - // record start of this field - fieldStarts = append(fieldStarts, uint64(w.Count())) - - buf := make([]byte, binary.MaxVarintLen64) - // write out dict location for this field - n := binary.PutUvarint(buf, dictLocs[fieldID]) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - - // write out the length of the field name - n = binary.PutUvarint(buf, uint64(len(fieldName))) - _, err = w.Write(buf[:n]) - if err != nil { - return 0, err - } - - // write out the field name - _, err = w.Write([]byte(fieldName)) - if err != nil { - return 0, err - } - } - - // now write out the fields index - rv = uint64(w.Count()) - - // now write out the stored doc index - for fieldID := range memSegment.FieldsInv { - err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) - if err != nil { - return 0, err - } - } - - return rv, nil -} - // FooterSize is the size of the footer record in bytes // crc + ver + chunk + field offset + stored offset + num docs const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 7652c2210..972b1d167 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -5,7 +5,6 @@ import ( "bytes" "encoding/binary" "fmt" - "io" "math" "os" @@ -36,7 +35,6 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, fieldsInv := mergeFields(segments) fieldsMap := mapFields(fieldsInv) - newSegDocCount := computeNewDocCount(segments, drops) var newDocNums [][]uint64 @@ 
-57,7 +55,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, } var fieldsIndexOffset uint64 - fieldsIndexOffset, err = persistMergedFields(fieldsInv, cr, dictLocs) + fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) if err != nil { return nil, err } @@ -106,33 +104,6 @@ func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { return newSegDocCount } -func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { - var buffer bytes.Buffer - // write out postings list to memory so we know the len - postingsListLen, err := r.WriteTo(&buffer) - if err != nil { - return 0, err - } - var tw int - // write out the length of this postings list - buf := make([]byte, binary.MaxVarintLen64) - n := binary.PutUvarint(buf, uint64(postingsListLen)) - nw, err := w.Write(buf[:n]) - tw += nw - if err != nil { - return tw, err - } - - // write out the postings list itself - nw, err = w.Write(buffer.Bytes()) - tw += nw - if err != nil { - return tw, err - } - - return tw, nil -} - func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64, w *CountHashWriter) ([]uint64, error) { @@ -489,48 +460,3 @@ func mergeFields(segments []*Segment) []string { return rv } - -func persistMergedFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { - var rv uint64 - - var fieldStarts []uint64 - for fieldID, fieldName := range fieldsInv { - - // record start of this field - fieldStarts = append(fieldStarts, uint64(w.Count())) - - buf := make([]byte, binary.MaxVarintLen64) - // write out dict location for this field - n := binary.PutUvarint(buf, dictLocs[fieldID]) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - - // write out the length of the field name - n = binary.PutUvarint(buf, uint64(len(fieldName))) - _, err = w.Write(buf[:n]) - if err != nil { - return 0, err - } - - // 
write out the field name - _, err = w.Write([]byte(fieldName)) - if err != nil { - return 0, err - } - } - - // now write out the fields index - rv = uint64(w.Count()) - - // now write out the stored doc index - for fieldID := range fieldsInv { - err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) - if err != nil { - return 0, err - } - } - - return rv, nil -} diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go new file mode 100644 index 000000000..9772b3a64 --- /dev/null +++ b/index/scorch/segment/zap/write.go @@ -0,0 +1,95 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "io" + + "github.com/RoaringBitmap/roaring" +) + +// writes out the length of the roaring bitmap in bytes as varint +// then writs out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { + var buffer bytes.Buffer + // write out postings list to memory so we know the len + postingsListLen, err := r.WriteTo(&buffer) + if err != nil { + return 0, err + } + var tw int + // write out the length of this postings list + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(postingsListLen)) + nw, err := w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + + // write out the postings list itself + nw, err = w.Write(buffer.Bytes()) + tw += nw + if err != nil { + return tw, err + } + + return tw, nil +} + +func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { + var rv uint64 + + var fieldStarts []uint64 + for fieldID, fieldName := range fieldsInv { + + // record start of this field + fieldStarts = append(fieldStarts, uint64(w.Count())) + + buf := make([]byte, binary.MaxVarintLen64) + // write out dict location for this field + n := binary.PutUvarint(buf, dictLocs[fieldID]) + _, err := w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the length of the field name + n = binary.PutUvarint(buf, uint64(len(fieldName))) + _, err = w.Write(buf[:n]) + if err != nil { + return 0, err + } + + // write out the field name + _, err = w.Write([]byte(fieldName)) + if err != nil { + return 0, err + } + } + + // now write out the fields index + rv = uint64(w.Count()) + for fieldID := range fieldsInv { + err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) + if err != nil { + return 0, err + } + } + + return rv, nil +} From 289dc398bd88ea1bd9c952772e2323b64c0ec2a5 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 14:26:11 -0500 Subject: [PATCH 056/728] more 
refacotring of build/merge --- index/scorch/segment/zap/build.go | 39 ------------------------------- index/scorch/segment/zap/merge.go | 3 +++ index/scorch/segment/zap/write.go | 39 +++++++++++++++++++++++++++++++ 3 files changed, 42 insertions(+), 39 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 2cad61a58..56061a2e9 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -572,42 +572,3 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs return rv, nil } - -// FooterSize is the size of the footer record in bytes -// crc + ver + chunk + field offset + stored offset + num docs -const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 - -func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, - chunkFactor uint32, w *CountHashWriter) error { - // write out the number of docs - err := binary.Write(w, binary.BigEndian, numDocs) - if err != nil { - return err - } - // write out the stored field index location: - err = binary.Write(w, binary.BigEndian, storedIndexOffset) - if err != nil { - return err - } - // write out the field index location - err = binary.Write(w, binary.BigEndian, fieldIndexOffset) - if err != nil { - return err - } - // write out 32-bit chunk factor - err = binary.Write(w, binary.BigEndian, chunkFactor) - if err != nil { - return err - } - // write out 32-bit version - err = binary.Write(w, binary.BigEndian, version) - if err != nil { - return err - } - // write out CRC-32 of everything upto but not including this CRC - err = binary.Write(w, binary.BigEndian, w.Sum32()) - if err != nil { - return err - } - return nil -} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 972b1d167..670c8d795 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -84,6 +84,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return newDocNums, nil } +// 
mapFields takes the fieldsInv list and builds the map func mapFields(fields []string) map[string]uint16 { rv := make(map[string]uint16) for i, fieldName := range fields { @@ -92,6 +93,8 @@ func mapFields(fields []string) map[string]uint16 { return rv } +// computeNewDocCount determines how many documents will be in the newly +// merged segment when obsoleted docs are dropped func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { var newSegDocCount uint64 for segI, segment := range segments { diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index 9772b3a64..bd63e1472 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -93,3 +93,42 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u return rv, nil } + +// FooterSize is the size of the footer record in bytes +// crc + ver + chunk + field offset + stored offset + num docs +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + +func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, + chunkFactor uint32, w *CountHashWriter) error { + // write out the number of docs + err := binary.Write(w, binary.BigEndian, numDocs) + if err != nil { + return err + } + // write out the stored field index location: + err = binary.Write(w, binary.BigEndian, storedIndexOffset) + if err != nil { + return err + } + // write out the field index location + err = binary.Write(w, binary.BigEndian, fieldIndexOffset) + if err != nil { + return err + } + // write out 32-bit chunk factor + err = binary.Write(w, binary.BigEndian, chunkFactor) + if err != nil { + return err + } + // write out 32-bit version + err = binary.Write(w, binary.BigEndian, version) + if err != nil { + return err + } + // write out CRC-32 of everything upto but not including this CRC + err = binary.Write(w, binary.BigEndian, w.Sum32()) + if err != nil { + return err + } + return nil +} From 50441e5065699ca36156bf265974c7c6a025ba4b Mon Sep 17 
00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 14:41:20 -0500 Subject: [PATCH 057/728] refactor to reuse shared code --- index/scorch/segment/zap/build.go | 61 ++++--------------------------- 1 file changed, 7 insertions(+), 54 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 56061a2e9..31eacb1a9 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -426,60 +426,24 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac return freqOffsets, locOfffsets, nil } -func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) ([]uint64, error) { - var rv []uint64 - - var postingsBuf bytes.Buffer +func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { for postingID := range memSegment.PostingsLocs { - if postingID != 0 { - postingsBuf.Reset() - } - // record where we start this posting loc rv = append(rv, uint64(w.Count())) - - // write out postings locs to memory so we know the len - postingsLocLen, err := memSegment.PostingsLocs[postingID].WriteTo(&postingsBuf) - if err != nil { - return nil, err - } - - buf := make([]byte, binary.MaxVarintLen64) - // write out the length of this postings locs - n := binary.PutUvarint(buf, uint64(postingsLocLen)) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, err - } - - // write out the postings list itself - _, err = w.Write(postingsBuf.Bytes()) + // write out the length and bitmap + _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w) if err != nil { return nil, err } } - return rv, nil } -func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) ([]uint64, error) { - var rv []uint64 - - var postingsBuf bytes.Buffer +func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, + postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { for 
postingID := range memSegment.Postings { - if postingID != 0 { - postingsBuf.Reset() - } - // record where we start this posting list rv = append(rv, uint64(w.Count())) - - // write out postings list to memory so we know the len - postingsListLen, err := memSegment.Postings[postingID].WriteTo(&postingsBuf) - if err != nil { - return nil, err - } - // write out the start of the term info buf := make([]byte, binary.MaxVarintLen64) n := binary.PutUvarint(buf, freqOffsets[postingID]) @@ -487,35 +451,24 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsL if err != nil { return nil, err } - // write out the start of the loc info n = binary.PutUvarint(buf, locOffsets[postingID]) _, err = w.Write(buf[:n]) if err != nil { return nil, err } - // write out the start of the loc posting list n = binary.PutUvarint(buf, postingsListLocs[postingID]) _, err = w.Write(buf[:n]) if err != nil { return nil, err } - - // write out the length of this postings list - n = binary.PutUvarint(buf, uint64(postingsListLen)) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, err - } - - // write out the postings list itself - _, err = w.Write(postingsBuf.Bytes()) + // write out the length and bitmap + _, err = writeRoaringWithLen(memSegment.Postings[postingID], w) if err != nil { return nil, err } } - return rv, nil } From 6e2207c445fd00fc12824dcd2e31941fba3e3022 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 15:22:13 -0500 Subject: [PATCH 058/728] additional refactoring of build/merge --- index/scorch/segment/zap/build.go | 34 +++++++---------------------- index/scorch/segment/zap/merge.go | 13 +++-------- index/scorch/segment/zap/read.go | 31 ++++++++++++++++++++++++++ index/scorch/segment/zap/segment.go | 13 ++--------- index/scorch/segment/zap/write.go | 27 +++++++++++++---------- 5 files changed, 60 insertions(+), 58 deletions(-) create mode 100644 index/scorch/segment/zap/read.go diff --git a/index/scorch/segment/zap/build.go 
b/index/scorch/segment/zap/build.go index 31eacb1a9..f3f9658e4 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -182,19 +182,12 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) // record where we're about to start writing docNumOffsets[docNum] = uint64(w.Count()) - buf := make([]byte, binary.MaxVarintLen64) - // write out the meta length - n := binary.PutUvarint(buf, uint64(len(metaBytes))) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - // write out the compressed data length - n = binary.PutUvarint(buf, uint64(len(compressed))) - _, err = w.Write(buf[:n]) + // write out the meta len and compressed data len + _, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) if err != nil { return 0, err } + // now write the meta _, err = w.Write(metaBytes) if err != nil { @@ -444,25 +437,14 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, for postingID := range memSegment.Postings { // record where we start this posting list rv = append(rv, uint64(w.Count())) - // write out the start of the term info - buf := make([]byte, binary.MaxVarintLen64) - n := binary.PutUvarint(buf, freqOffsets[postingID]) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, err - } - // write out the start of the loc info - n = binary.PutUvarint(buf, locOffsets[postingID]) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, err - } - // write out the start of the loc posting list - n = binary.PutUvarint(buf, postingsListLocs[postingID]) - _, err = w.Write(buf[:n]) + + // write out the term info, loc info, and loc posting list offset + _, err = writeUvarints(w, freqOffsets[postingID], + locOffsets[postingID], postingsListLocs[postingID]) if err != nil { return nil, err } + // write out the length and bitmap _, err = writeRoaringWithLen(memSegment.Postings[postingID], w) if err != nil { diff --git a/index/scorch/segment/zap/merge.go 
b/index/scorch/segment/zap/merge.go index 670c8d795..8683eb648 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -399,16 +399,9 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // record where we're about to start writing docNumOffsets[newDocNum] = uint64(w.Count()) - buf := make([]byte, binary.MaxVarintLen64) - // write out the meta length - n := binary.PutUvarint(buf, uint64(len(metaBytes))) - _, err = w.Write(buf[:n]) - if err != nil { - return 0, nil, err - } - // write out the compressed data length - n = binary.PutUvarint(buf, uint64(len(compressed))) - _, err = w.Write(buf[:n]) + // write out the meta len and compressed data len + _, err = writeUvarints(w, + uint64(len(metaBytes)), uint64(len(compressed))) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/read.go b/index/scorch/segment/zap/read.go new file mode 100644 index 000000000..c9b3e7720 --- /dev/null +++ b/index/scorch/segment/zap/read.go @@ -0,0 +1,31 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import "encoding/binary" + +func (s *Segment) getStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { + docStoredStartAddr := s.storedIndexOffset + (8 * docNum) + docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8]) + var n uint64 + metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64]) + n += uint64(read) + var dataLen uint64 + dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) + n += uint64(read) + meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen] + data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] + return meta, data +} diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index fce80f933..9f80b7037 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -177,17 +177,8 @@ func (s *Segment) dictionary(field string) (*Dictionary, error) { func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { // first make sure this is a valid number in this segment if num < s.numDocs { - docStoredStartAddr := s.storedIndexOffset + (8 * num) - docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8]) - var n uint64 - metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64]) - n += uint64(read) - var dataLen uint64 - dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) - n += uint64(read) - meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen] - data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] - uncompressed, err := snappy.Decode(nil, data) + meta, compressed := s.getStoredMetaAndCompressed(num) + uncompressed, err := snappy.Decode(nil, compressed) if err != nil { panic(err) } diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go 
index bd63e1472..a831ef6ae 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -60,17 +60,8 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u // record start of this field fieldStarts = append(fieldStarts, uint64(w.Count())) - buf := make([]byte, binary.MaxVarintLen64) - // write out dict location for this field - n := binary.PutUvarint(buf, dictLocs[fieldID]) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - - // write out the length of the field name - n = binary.PutUvarint(buf, uint64(len(fieldName))) - _, err = w.Write(buf[:n]) + // write out the dict location and field name length + _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) if err != nil { return 0, err } @@ -132,3 +123,17 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, } return nil } + +func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { + buf := make([]byte, binary.MaxVarintLen64) + for _, val := range vals { + n := binary.PutUvarint(buf, val) + var nw int + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + return tw, err +} From 85e15628eed9e87801523f881afccffdb500ae24 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 16:10:06 -0500 Subject: [PATCH 059/728] major refactoring of posting details --- index/scorch/segment/zap/build.go | 192 ++++++++---------------------- index/scorch/segment/zap/merge.go | 13 +- 2 files changed, 60 insertions(+), 145 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index f3f9658e4..60906d334 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -47,33 +47,39 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e cr := NewCountHashWriter(br) var storedIndexOffset uint64 - storedIndexOffset, err = persistStored(memSegment, cr) - if err != nil { - return err - 
} + var dictLocs []uint64 + if len(memSegment.Stored) > 0 { - var freqOffsets, locOffsets []uint64 - freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor) - if err != nil { - return err - } + storedIndexOffset, err = persistStored(memSegment, cr) + if err != nil { + return err + } - var postingsListLocs []uint64 - postingsListLocs, err = persistPostingsLocs(memSegment, cr) - if err != nil { - return err - } + var freqOffsets, locOffsets []uint64 + freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor) + if err != nil { + return err + } - var postingsLocs []uint64 - postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) - if err != nil { - return err - } + var postingsListLocs []uint64 + postingsListLocs, err = persistPostingsLocs(memSegment, cr) + if err != nil { + return err + } - var dictLocs []uint64 - dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) - if err != nil { - return err + var postingsLocs []uint64 + postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) + if err != nil { + return err + } + + dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) + if err != nil { + return err + } + + } else { + dictLocs = make([]uint64, len(memSegment.FieldsInv)) } var fieldIndexStart uint64 @@ -215,40 +221,19 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { var freqOffsets, locOfffsets []uint64 + tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for postingID := range memSegment.Postings { + if postingID != 0 { + tfEncoder.Reset() + } postingsListItr := memSegment.Postings[postingID].Iterator() - - total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1 - - var freqNormBuf []byte var offset int - - var 
encodingBuf bytes.Buffer - encoder := govarint.NewU64Base128Encoder(&encodingBuf) - - chunkLens := make([]uint64, total) - var currChunk uint64 for postingsListItr.HasNext() { - docNum := postingsListItr.Next() - chunk := uint64(docNum) / uint64(chunkFactor) - - if chunk != currChunk { - // starting a new chunk - if encoder != nil { - // close out last - encoder.Close() - encodingBytes := encodingBuf.Bytes() - chunkLens[currChunk] = uint64(len(encodingBytes)) - freqNormBuf = append(freqNormBuf, encodingBytes...) - encodingBuf.Reset() - encoder = govarint.NewU64Base128Encoder(&encodingBuf) - } - currChunk = chunk - } + docNum := uint64(postingsListItr.Next()) // put freq - _, err := encoder.PutU64(memSegment.Freqs[postingID][offset]) + err := tfEncoder.Add(docNum, memSegment.Freqs[postingID][offset]) if err != nil { return nil, nil, err } @@ -256,7 +241,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac // put norm norm := memSegment.Norms[postingID][offset] normBits := math.Float32bits(norm) - _, err = encoder.PutU32(normBits) + err = tfEncoder.Add(docNum, uint64(normBits)) if err != nil { return nil, nil, err } @@ -264,35 +249,11 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac offset++ } - // close out last chunk - if encoder != nil { - // fix me write freq/norms - encoder.Close() - encodingBytes := encodingBuf.Bytes() - chunkLens[currChunk] = uint64(len(encodingBytes)) - freqNormBuf = append(freqNormBuf, encodingBytes...) 
- } - // record where this postings freq info starts freqOffsets = append(freqOffsets, uint64(w.Count())) - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(total)) - _, err := w.Write(buf[:n]) - if err != nil { - return nil, nil, err - } - // write out the chunk lens - for _, chunkLen := range chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, nil, err - } - } - // write out the data - _, err = w.Write(freqNormBuf) + tfEncoder.Close() + _, err := tfEncoder.Write(w) if err != nil { return nil, nil, err } @@ -300,61 +261,39 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // now do it again for the locations + locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for postingID := range memSegment.Postings { + if postingID != 0 { + locEncoder.Reset() + } postingsListItr := memSegment.Postings[postingID].Iterator() - - total := uint64(len(memSegment.Stored))/uint64(chunkFactor) + 1 - - var locBuf []byte var offset int var locOffset int - - var encodingBuf bytes.Buffer - encoder := govarint.NewU64Base128Encoder(&encodingBuf) - - chunkLens := make([]uint64, total) - var currChunk uint64 for postingsListItr.HasNext() { - docNum := postingsListItr.Next() - chunk := uint64(docNum) / uint64(chunkFactor) - - if chunk != currChunk { - // starting a new chunk - if encoder != nil { - // close out last - encoder.Close() - encodingBytes := encodingBuf.Bytes() - chunkLens[currChunk] = uint64(len(encodingBytes)) - locBuf = append(locBuf, encodingBytes...) 
- encodingBuf.Reset() - encoder = govarint.NewU64Base128Encoder(&encodingBuf) - } - currChunk = chunk - } - + docNum := uint64(postingsListItr.Next()) for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ { - if len(memSegment.Locfields[postingID]) > 0 { // put field - _, err := encoder.PutU64(uint64(memSegment.Locfields[postingID][locOffset])) + err := locEncoder.Add(docNum, uint64(memSegment.Locfields[postingID][locOffset])) if err != nil { return nil, nil, err } // put pos - _, err = encoder.PutU64(memSegment.Locpos[postingID][locOffset]) + + err = locEncoder.Add(docNum, memSegment.Locpos[postingID][locOffset]) if err != nil { return nil, nil, err } // put start - _, err = encoder.PutU64(memSegment.Locstarts[postingID][locOffset]) + err = locEncoder.Add(docNum, memSegment.Locstarts[postingID][locOffset]) if err != nil { return nil, nil, err } // put end - _, err = encoder.PutU64(memSegment.Locends[postingID][locOffset]) + err = locEncoder.Add(docNum, memSegment.Locends[postingID][locOffset]) if err != nil { return nil, nil, err } @@ -363,58 +302,31 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac num := len(memSegment.Locarraypos[postingID][locOffset]) // put the number of array positions to follow - _, err = encoder.PutU64(uint64(num)) + err = locEncoder.Add(docNum, uint64(num)) if err != nil { return nil, nil, err } // put each array position for j := 0; j < num; j++ { - _, err = encoder.PutU64(memSegment.Locarraypos[postingID][locOffset][j]) + err = locEncoder.Add(docNum, memSegment.Locarraypos[postingID][locOffset][j]) if err != nil { return nil, nil, err } } } - locOffset++ } offset++ } - // close out last chunk - if encoder != nil { - // fix me write freq/norms - encoder.Close() - encodingBytes := encodingBuf.Bytes() - chunkLens[currChunk] = uint64(len(encodingBytes)) - locBuf = append(locBuf, encodingBytes...) 
- } - // record where this postings loc info starts locOfffsets = append(locOfffsets, uint64(w.Count())) - - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(total)) - _, err := w.Write(buf[:n]) + locEncoder.Close() + _, err := locEncoder.Write(w) if err != nil { return nil, nil, err } - // write out the chunk lens - for _, chunkLen := range chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, nil, err - } - } - // write out the data - _, err = w.Write(locBuf) - if err != nil { - return nil, nil, err - } - } return freqOffsets, locOfffsets, nil } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 8683eb648..b1ea6c148 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -39,7 +39,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, var newDocNums [][]uint64 var storedIndexOffset uint64 - dictLocs := make([]uint64, len(fieldsInv)) + var dictLocs []uint64 if newSegDocCount > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, fieldsMap, fieldsInv, newSegDocCount, cr) @@ -48,10 +48,12 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, } dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, - newDocNums, newSegDocCount, cr) + newDocNums, newSegDocCount, chunkFactor, cr) if err != nil { return nil, err } + } else { + dictLocs = make([]uint64, len(fieldsInv)) } var fieldsIndexOffset uint64 @@ -108,7 +110,8 @@ func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { } func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64, + fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, + newSegDocCount uint64, chunkFactor uint32, w 
*CountHashWriter) ([]uint64, error) { rv := make([]uint64, len(fieldsInv)) @@ -149,8 +152,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, return 0 }) - tfEncoder := newChunkedIntCoder(1024, newSegDocCount-1) - locEncoder := newChunkedIntCoder(1024, newSegDocCount-1) + tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) for err == nil { term, _ := mergeItr.Current() From a0e12b264080e720febad16e269d8ebb803123ea Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 16:12:29 -0500 Subject: [PATCH 060/728] add license to a few files missing it --- index/scorch/segment/zap/intcoder.go | 14 ++++++++++++++ index/scorch/segment/zap/intcoder_test.go | 14 ++++++++++++++ index/scorch/segment/zap/merge.go | 14 ++++++++++++++ index/scorch/segment/zap/merge_test.go | 14 ++++++++++++++ 4 files changed, 56 insertions(+) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index a682740f1..7e268bcf3 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package zap import ( diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go index f2623a548..85d2c5a76 100644 --- a/index/scorch/segment/zap/intcoder_test.go +++ b/index/scorch/segment/zap/intcoder_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package zap import ( diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index b1ea6c148..6d635bdfb 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ package zap import ( diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index c8046efab..1e0110418 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -1,3 +1,17 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + package zap import ( From 50471003dc874b68f2e1c0d9f7d54fa62992f956 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 16:30:39 -0500 Subject: [PATCH 061/728] basic refactoring of introducer to make it more readable --- index/scorch/introducer.go | 292 +++++++++++++++++++------------------ 1 file changed, 150 insertions(+), 142 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 7998eae3b..76c145517 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -41,162 +41,170 @@ OUTER: break OUTER case notify = <-s.introducerNotifier: + continue case nextMerge := <-s.merges: - // acquire lock - s.rootLock.Lock() - - // prepare new index snapshot - currSize := len(s.root.segment) - newSize := currSize + 1 - len(nextMerge.old) - newSnapshot := &IndexSnapshot{ - segment: make([]*SegmentSnapshot, 0, newSize), - offsets: make([]uint64, 0, newSize), - internal: make(map[string][]byte, len(s.root.segment)), - epoch: s.nextSnapshotEpoch, - } - s.nextSnapshotEpoch++ - - // iterate through current segments - newSegmentDeleted := roaring.NewBitmap() - var running 
uint64 - for i := range s.root.segment { - segmentID := s.root.segment[i].id - if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { - // this segment is going away, see if anything else was deleted since we started the merge - if s.root.segment[i].deleted != nil { - // assume all these deletes are new - deletedSince := s.root.segment[i].deleted - // if we already knew about some of them, remove - if segSnapAtMerge.deleted != nil { - deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) - } - deletedSinceItr := deletedSince.Iterator() - for deletedSinceItr.HasNext() { - oldDocNum := deletedSinceItr.Next() - newDocNum := nextMerge.oldNewDocNums[segmentID][oldDocNum] - newSegmentDeleted.Add(uint32(newDocNum)) - } - } - } else { - // this segment is staying - newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, - deleted: s.root.segment[i].deleted, - }) - newSnapshot.offsets = append(newSnapshot.offsets, running) - running += s.root.segment[i].Count() - } - } + s.introduceMerge(nextMerge) - // put new segment at end - newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: nextMerge.id, - segment: nextMerge.new, - deleted: newSegmentDeleted, - }) - newSnapshot.offsets = append(newSnapshot.offsets, running) - - // copy old values - for key, oldVal := range s.root.internal { - newSnapshot.internal[key] = oldVal + case next := <-s.introductions: + err := s.introduceSegment(next) + if err != nil { + continue OUTER } + } + // notify persister + if notify != nil { + close(notify) + notify = nil + } + } - // swap in new segment - s.root = newSnapshot - // release lock - s.rootLock.Unlock() + s.asyncTasks.Done() +} - // notify merger we incorporated this - close(nextMerge.notify) +func (s *Scorch) introduceSegment(next *segmentIntroduction) error { + // acquire lock + s.rootLock.Lock() - // notify persister - if 
notify != nil { - close(notify) - notify = nil + // prepare new index snapshot, with curr size + 1 + newSnapshot := &IndexSnapshot{ + segment: make([]*SegmentSnapshot, len(s.root.segment)+1), + offsets: make([]uint64, len(s.root.segment)+1), + internal: make(map[string][]byte, len(s.root.segment)), + epoch: s.nextSnapshotEpoch, + } + s.nextSnapshotEpoch++ + + // iterate through current segments + var running uint64 + for i := range s.root.segment { + // see if optimistic work included this segment + delta, ok := next.obsoletes[s.root.segment[i].id] + if !ok { + var err error + delta, err = s.root.segment[i].segment.DocNumbers(next.ids) + if err != nil { + next.applied <- fmt.Errorf("error computing doc numbers: %v", err) + close(next.applied) + return err } + } + newSnapshot.segment[i] = &SegmentSnapshot{ + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, + } + // apply new obsoletions + if s.root.segment[i].deleted == nil { + newSnapshot.segment[i].deleted = delta + } else { + newSnapshot.segment[i].deleted = roaring.Or(s.root.segment[i].deleted, delta) + } - case next := <-s.introductions: - // acquire lock - s.rootLock.Lock() - - // prepare new index snapshot, with curr size + 1 - newSnapshot := &IndexSnapshot{ - segment: make([]*SegmentSnapshot, len(s.root.segment)+1), - offsets: make([]uint64, len(s.root.segment)+1), - internal: make(map[string][]byte, len(s.root.segment)), - epoch: s.nextSnapshotEpoch, - } - s.nextSnapshotEpoch++ - - // iterate through current segments - var running uint64 - for i := range s.root.segment { - // see if optimistic work included this segment - delta, ok := next.obsoletes[s.root.segment[i].id] - if !ok { - var err error - delta, err = s.root.segment[i].segment.DocNumbers(next.ids) - if err != nil { - next.applied <- fmt.Errorf("error computing doc numbers: %v", err) - close(next.applied) - continue OUTER - } - } - newSnapshot.segment[i] = &SegmentSnapshot{ - id: 
s.root.segment[i].id, - segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, - } - // apply new obsoletions - if s.root.segment[i].deleted == nil { - newSnapshot.segment[i].deleted = delta - } else { - newSnapshot.segment[i].deleted = roaring.Or(s.root.segment[i].deleted, delta) - } + newSnapshot.offsets[i] = running + running += s.root.segment[i].Count() - newSnapshot.offsets[i] = running - running += s.root.segment[i].Count() + } + // put new segment at end + newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ + id: next.id, + segment: next.data, + } + newSnapshot.offsets[len(s.root.segment)] = running + if !s.unsafeBatch { + newSnapshot.segment[len(s.root.segment)].notify = append( + newSnapshot.segment[len(s.root.segment)].notify, + next.persisted, + ) + } + // copy old values + for key, oldVal := range s.root.internal { + newSnapshot.internal[key] = oldVal + } + // set new values and apply deletes + for key, newVal := range next.internal { + if newVal != nil { + newSnapshot.internal[key] = newVal + } else { + delete(newSnapshot.internal, key) + } + } + // swap in new segment + s.root = newSnapshot + // release lock + s.rootLock.Unlock() + close(next.applied) - } - // put new segment at end - newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ - id: next.id, - segment: next.data, - } - newSnapshot.offsets[len(s.root.segment)] = running - if !s.unsafeBatch { - newSnapshot.segment[len(s.root.segment)].notify = append( - newSnapshot.segment[len(s.root.segment)].notify, - next.persisted, - ) - } - // copy old values - for key, oldVal := range s.root.internal { - newSnapshot.internal[key] = oldVal - } - // set new values and apply deletes - for key, newVal := range next.internal { - if newVal != nil { - newSnapshot.internal[key] = newVal - } else { - delete(newSnapshot.internal, key) + return nil +} + +func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { + // acquire lock + s.rootLock.Lock() + + // prepare new index 
snapshot + currSize := len(s.root.segment) + newSize := currSize + 1 - len(nextMerge.old) + newSnapshot := &IndexSnapshot{ + segment: make([]*SegmentSnapshot, 0, newSize), + offsets: make([]uint64, 0, newSize), + internal: make(map[string][]byte, len(s.root.segment)), + epoch: s.nextSnapshotEpoch, + } + s.nextSnapshotEpoch++ + + // iterate through current segments + newSegmentDeleted := roaring.NewBitmap() + var running uint64 + for i := range s.root.segment { + segmentID := s.root.segment[i].id + if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { + // this segment is going away, see if anything else was deleted since we started the merge + if s.root.segment[i].deleted != nil { + // assume all these deletes are new + deletedSince := s.root.segment[i].deleted + // if we already knew about some of them, remove + if segSnapAtMerge.deleted != nil { + deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) + } + deletedSinceItr := deletedSince.Iterator() + for deletedSinceItr.HasNext() { + oldDocNum := deletedSinceItr.Next() + newDocNum := nextMerge.oldNewDocNums[segmentID][oldDocNum] + newSegmentDeleted.Add(uint32(newDocNum)) } } - // swap in new segment - s.root = newSnapshot - // release lock - s.rootLock.Unlock() - close(next.applied) - - if notify != nil { - close(notify) - notify = nil - } + } else { + // this segment is staying + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, + deleted: s.root.segment[i].deleted, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + running += s.root.segment[i].Count() } } - s.asyncTasks.Done() + // put new segment at end + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: nextMerge.id, + segment: nextMerge.new, + deleted: newSegmentDeleted, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + + // copy old values + for key, 
oldVal := range s.root.internal { + newSnapshot.internal[key] = oldVal + } + + // swap in new segment + s.root = newSnapshot + // release lock + s.rootLock.Unlock() + + // notify merger we incorporated this + close(nextMerge.notify) } From c13ff85aaf682079e549674d54eff7d59d16eb3c Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 13 Dec 2017 13:10:44 -0800 Subject: [PATCH 062/728] scorch ref-counting Future commits will provide actual cleanup when ref-counts reach 0. --- index/scorch/introducer.go | 20 ++++++++++++++-- index/scorch/merge.go | 6 +++++ index/scorch/persister.go | 30 ++++++++++++++++++++---- index/scorch/reader.go | 4 ++-- index/scorch/reader_test.go | 5 ++++ index/scorch/scorch.go | 36 +++++++++++++++-------------- index/scorch/segment/empty.go | 7 ++++++ index/scorch/segment/mem/segment.go | 7 ++++++ index/scorch/segment/segment.go | 3 +++ index/scorch/segment/zap/segment.go | 25 ++++++++++++++++++++ index/scorch/snapshot_index.go | 27 ++++++++++++++++++++++ 11 files changed, 145 insertions(+), 25 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 76c145517..9402e5a48 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -72,6 +72,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { offsets: make([]uint64, len(s.root.segment)+1), internal: make(map[string][]byte, len(s.root.segment)), epoch: s.nextSnapshotEpoch, + refs: 1, } s.nextSnapshotEpoch++ @@ -86,6 +87,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if err != nil { next.applied <- fmt.Errorf("error computing doc numbers: %v", err) close(next.applied) + _ = newSnapshot.DecRef() return err } } @@ -94,6 +96,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { segment: s.root.segment[i].segment, notify: s.root.segment[i].notify, } + s.root.segment[i].segment.AddRef() // apply new obsoletions if s.root.segment[i].deleted == nil { newSnapshot.segment[i].deleted = delta @@ 
-108,7 +111,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // put new segment at end newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ id: next.id, - segment: next.data, + segment: next.data, // Take ownership of next.data's ref-count. } newSnapshot.offsets[len(s.root.segment)] = running if !s.unsafeBatch { @@ -130,9 +133,15 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } } // swap in new segment + rootPrev := s.root s.root = newSnapshot // release lock s.rootLock.Unlock() + + if rootPrev != nil { + _ = rootPrev.DecRef() + } + close(next.applied) return nil @@ -150,6 +159,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { offsets: make([]uint64, 0, newSize), internal: make(map[string][]byte, len(s.root.segment)), epoch: s.nextSnapshotEpoch, + refs: 1, } s.nextSnapshotEpoch++ @@ -182,6 +192,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { notify: s.root.segment[i].notify, deleted: s.root.segment[i].deleted, }) + s.root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) running += s.root.segment[i].Count() } @@ -190,7 +201,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // put new segment at end newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: nextMerge.id, - segment: nextMerge.new, + segment: nextMerge.new, // Take ownership for nextMerge.new's ref-count. 
deleted: newSegmentDeleted, }) newSnapshot.offsets = append(newSnapshot.offsets, running) @@ -201,10 +212,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } // swap in new segment + rootPrev := s.root s.root = newSnapshot // release lock s.rootLock.Unlock() + if rootPrev != nil { + _ = rootPrev.DecRef() + } + // notify merger we incorporated this close(nextMerge.notify) } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index aef216646..818f3dbd1 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -38,6 +38,7 @@ OUTER: // check to see if there is a new snapshot to persist s.rootLock.RLock() ourSnapshot := s.root + ourSnapshot.AddRef() s.rootLock.RUnlock() if ourSnapshot.epoch != lastEpochMergePlanned { @@ -45,10 +46,12 @@ OUTER: err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { log.Printf("merging err: %v", err) + _ = ourSnapshot.DecRef() continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch } + _ = ourSnapshot.DecRef() // tell the persister we're waiting for changes // first make a notification chan @@ -64,16 +67,19 @@ OUTER: // check again s.rootLock.RLock() ourSnapshot = s.root + ourSnapshot.AddRef() s.rootLock.RUnlock() if ourSnapshot.epoch != lastEpochMergePlanned { // lets get started err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { + _ = ourSnapshot.DecRef() continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch } + _ = ourSnapshot.DecRef() // now wait for it (but also detect close) select { diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d54840864..964e1db93 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -44,6 +44,7 @@ OUTER: // check to see if there is a new snapshot to persist s.rootLock.RLock() ourSnapshot := s.root + ourSnapshot.AddRef() s.rootLock.RUnlock() //for ourSnapshot.epoch != lastPersistedEpoch { @@ -52,6 +53,7 @@ OUTER: err := s.persistSnapshot(ourSnapshot) if err != nil { log.Printf("got err persisting snapshot: %v", err) + _ 
= ourSnapshot.DecRef() continue OUTER } lastPersistedEpoch = ourSnapshot.epoch @@ -60,6 +62,7 @@ OUTER: notify = nil } } + _ = ourSnapshot.DecRef() // tell the introducer we're waiting for changes // first make a notification chan @@ -75,13 +78,15 @@ OUTER: // check again s.rootLock.RLock() ourSnapshot = s.root + ourSnapshot.AddRef() s.rootLock.RUnlock() - if ourSnapshot.epoch != lastPersistedEpoch { + if ourSnapshot.epoch != lastPersistedEpoch { // lets get started err := s.persistSnapshot(ourSnapshot) if err != nil { log.Printf("got err persisting snapshot: %v", err) + _ = ourSnapshot.DecRef() continue OUTER } lastPersistedEpoch = ourSnapshot.epoch @@ -90,6 +95,7 @@ OUTER: notify = nil } } + _ = ourSnapshot.DecRef() // now wait for it (but also detect close) select { @@ -199,6 +205,9 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { for segmentID, path := range newSegmentPaths { newSegments[segmentID], err = zap.Open(path) if err != nil { + for _, s := range newSegments { + _ = s.Close() // cleanup segments that were successfully opened + } return fmt.Errorf("error opening new segment at %s, %v", path, err) } } @@ -212,6 +221,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { segment: make([]*SegmentSnapshot, len(s.root.segment)), offsets: make([]uint64, len(s.root.offsets)), internal: make(map[string][]byte, len(s.root.internal)), + refs: 1, } for i, segmentSnapshot := range s.root.segment { // see if this segment has been replaced @@ -228,15 +238,21 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } } else { newIndexSnapshot.segment[i] = s.root.segment[i] + newIndexSnapshot.segment[i].segment.AddRef() } newIndexSnapshot.offsets[i] = s.root.offsets[i] } for k, v := range s.root.internal { newIndexSnapshot.internal[k] = v } + rootPrev := s.root s.root = newIndexSnapshot s.rootLock.Unlock() + if rootPrev != nil { + _ = rootPrev.DecRef() + } + // now that we've given up the lock, notify everyone that we've 
safely // persisted their data for _, notification := range notifications { @@ -263,17 +279,17 @@ func (s *Scorch) loadFromBolt() error { for k, _ := c.Last(); k != nil; k, _ = c.Prev() { _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) if err != nil { - log.Printf("unable to parse segment epoch % x, contiuing", k) + log.Printf("unable to parse segment epoch %x, continuing", k) continue } snapshot := snapshots.Bucket(k) if snapshot == nil { - log.Printf("snapshot key, but bucket missing % x, continuing", k) + log.Printf("snapshot key, but bucket missing %x, continuing", k) continue } indexSnapshot, err := s.loadSnapshot(snapshot) if err != nil { - log.Printf("unable to load snapshot, %v continuing", err) + log.Printf("unable to load snapshot, %v, continuing", err) continue } indexSnapshot.epoch = snapshotEpoch @@ -296,6 +312,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { rv := &IndexSnapshot{ internal: make(map[string][]byte), + refs: 1, } var running uint64 c := snapshot.Cursor() @@ -308,19 +325,23 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { return nil }) if err != nil { + _ = rv.DecRef() return nil, err } } else { segmentBucket := snapshot.Bucket(k) if segmentBucket == nil { + _ = rv.DecRef() return nil, fmt.Errorf("segment key, but bucket missing % x", k) } segmentSnapshot, err := s.loadSegment(segmentBucket) if err != nil { + _ = rv.DecRef() return nil, fmt.Errorf("failed to load segment: %v", err) } _, segmentSnapshot.id, err = segment.DecodeUvarintAscending(k) if err != nil { + _ = rv.DecRef() return nil, fmt.Errorf("failed to decode segment id: %v", err) } rv.segment = append(rv.segment, segmentSnapshot) @@ -351,6 +372,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro r := bytes.NewReader(deletedBytes) _, err := deletedBitmap.ReadFrom(r) if err != nil { + _ = segment.Close() return nil, fmt.Errorf("error reading deleted bytes: %v", err) } 
rv.deleted = deletedBitmap diff --git a/index/scorch/reader.go b/index/scorch/reader.go index 9a20aa013..365ecb670 100644 --- a/index/scorch/reader.go +++ b/index/scorch/reader.go @@ -20,7 +20,7 @@ import ( ) type Reader struct { - root *IndexSnapshot + root *IndexSnapshot // Owns 1 ref-count on the index snapshot. } func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, @@ -106,5 +106,5 @@ func (r *Reader) DumpFields() chan interface{} { } func (r *Reader) Close() error { - return nil + return r.root.DecRef() } diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index 2cd42fe47..4eb9b5fb9 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -646,6 +646,7 @@ func TestSegmentIndexAndLocalDocNumFromGlobal(t *testing.T) { for _, test := range tests { i := &IndexSnapshot{ offsets: test.offsets, + refs: 1, } gotSegmentIndex, gotLocalDocNum := i.segmentIndexAndLocalDocNumFromGlobal(test.globalDocNum) if gotSegmentIndex != test.segmentIndex { @@ -654,5 +655,9 @@ func TestSegmentIndexAndLocalDocNumFromGlobal(t *testing.T) { if gotLocalDocNum != test.localDocNum { t.Errorf("got localDocNum %d expected %d for offsets %v globalDocNum %d", gotLocalDocNum, test.localDocNum, test.offsets, test.globalDocNum) } + err := i.DecRef() + if err != nil { + t.Errorf("expected no err, got: %v", err) + } } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 7373e6b43..d59563be2 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -50,7 +50,7 @@ type Scorch struct { unsafeBatch bool rootLock sync.RWMutex - root *IndexSnapshot + root *IndexSnapshot // holds 1 ref-count on the root closeCh chan struct{} introductions chan *segmentIntroduction @@ -67,7 +67,7 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i config: config, analysisQueue: analysisQueue, stats: &Stats{}, - root: &IndexSnapshot{}, + root: &IndexSnapshot{refs: 1}, nextSnapshotEpoch: 1, } ro, ok := 
config["read_only"].(bool) @@ -140,16 +140,12 @@ func (s *Scorch) Close() (err error) { // wait for them to close s.asyncTasks.Wait() // now close the root bolt - if s.rootBolt != nil { err = s.rootBolt.Close() s.rootLock.Lock() - for _, segment := range s.root.segment { - cerr := segment.Close() - if err == nil { - err = cerr - } - } + _ = s.root.DecRef() + s.root = nil + s.rootLock.Unlock() } return @@ -218,7 +214,12 @@ func (s *Scorch) Batch(batch *index.Batch) error { } else { newSegment = mem.New() } - return s.prepareSegment(newSegment, ids, batch.InternalOps) + + err := s.prepareSegment(newSegment, ids, batch.InternalOps) + if err != nil { + _ = newSegment.Close() + } + return err } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, @@ -240,12 +241,13 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, // get read lock, to optimistically prepare obsoleted info s.rootLock.RLock() - for i := range s.root.segment { - delta, err := s.root.segment[i].segment.DocNumbers(ids) + for _, seg := range s.root.segment { + delta, err := seg.segment.DocNumbers(ids) if err != nil { + s.rootLock.RUnlock() return err } - introduction.obsoletes[s.root.segment[i].id] = delta + introduction.obsoletes[seg.id] = delta } s.rootLock.RUnlock() @@ -280,10 +282,10 @@ func (s *Scorch) DeleteInternal(key []byte) error { // release associated resources. 
func (s *Scorch) Reader() (index.IndexReader, error) { s.rootLock.RLock() - defer s.rootLock.RUnlock() - return &Reader{ - root: s.root, - }, nil + rv := &Reader{root: s.root} + rv.root.AddRef() + s.rootLock.RUnlock() + return rv, nil } func (s *Scorch) Stats() json.Marshaler { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 724195007..83454644d 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -46,6 +46,13 @@ func (e *EmptySegment) Close() error { return nil } +func (e *EmptySegment) AddRef() { +} + +func (e *EmptySegment) DecRef() error { + return nil +} + type EmptyDictionary struct{} func (e *EmptyDictionary) PostingsList(term string, diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index cdbff5839..75ff50cc0 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -96,6 +96,13 @@ func New() *Segment { } } +func (s *Segment) AddRef() { +} + +func (s *Segment) DecRef() error { + return nil +} + // Fields returns the field names used in this segment func (s *Segment) Fields() []string { return s.FieldsInv diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 6a9d70730..14b97ec80 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -35,6 +35,9 @@ type Segment interface { Fields() []string Close() error + + AddRef() + DecRef() error } type TermDictionary interface { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 9f80b7037..498699072 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -20,6 +20,7 @@ import ( "fmt" "io" "os" + "sync" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -47,6 +48,7 @@ func Open(path string) (segment.Segment, error) { mm: mm, path: path, fieldsMap: make(map[string]uint16), + refs: 1, } err = rv.loadConfig() @@ -79,6 +81,25 @@ type 
Segment struct { fieldsMap map[string]uint16 fieldsInv []string fieldsOffsets []uint64 + + m sync.Mutex // Protects the fields that follow. + refs int64 +} + +func (s *Segment) AddRef() { + s.m.Lock() + s.refs++ + s.m.Unlock() +} + +func (s *Segment) DecRef() (err error) { + s.m.Lock() + s.refs-- + if s.refs == 0 { + err = s.closeActual() + } + s.m.Unlock() + return err } func (s *Segment) loadConfig() error { @@ -272,6 +293,10 @@ func (s *Segment) Path() string { // Close releases all resources associated with this segment func (s *Segment) Close() (err error) { + return s.DecRef() +} + +func (s *Segment) closeActual() (err error) { if s.mm != nil { err = s.mm.Unmap() } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 10d208efd..19581f755 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -20,6 +20,7 @@ import ( "encoding/binary" "fmt" "sort" + "sync" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/document" @@ -43,6 +44,32 @@ type IndexSnapshot struct { offsets []uint64 internal map[string][]byte epoch uint64 + + m sync.Mutex // Protects the fields that follow. 
+ refs int64 +} + +func (i *IndexSnapshot) AddRef() { + i.m.Lock() + i.refs++ + i.m.Unlock() +} + +func (i *IndexSnapshot) DecRef() (err error) { + i.m.Lock() + i.refs-- + if i.refs == 0 { + for _, s := range i.segment { + if s != nil { + err2 := s.segment.DecRef() + if err == nil { + err = err2 + } + } + } + } + i.m.Unlock() + return err } func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { From 74b882640e422f7e3db0205dcbb3e4d007a974e7 Mon Sep 17 00:00:00 2001 From: Damien Tournoud Date: Wed, 13 Dec 2017 13:45:58 -0800 Subject: [PATCH 063/728] mapping: Fix closestDocMapping selecting wrong mapping --- mapping/document.go | 1 + mapping/mapping_test.go | 23 +++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/mapping/document.go b/mapping/document.go index d62675e52..d4c9a8f9d 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -179,6 +179,7 @@ OUTER: continue OUTER } } + break } return current } diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 5cd86015f..735aef057 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -991,3 +991,26 @@ func TestMappingForNilTextMarshaler(t *testing.T) { } } + +func TestClosestDocDynamicMapping(t *testing.T) { + mapping := NewIndexMapping() + mapping.IndexDynamic = false + mapping.DefaultMapping = NewDocumentStaticMapping() + mapping.DefaultMapping.AddFieldMappingsAt("foo", NewTextFieldMapping()) + + doc := document.NewDocument("x") + err := mapping.MapDocument(doc, map[string]interface{}{ + "foo": "value", + "bar": map[string]string{ + "foo": "value2", + "baz": "value3", + }, + }) + if err != nil { + t.Fatal(err) + } + + if len(doc.Fields) != 1 { + t.Fatalf("expected 1 field, got: %d", len(doc.Fields)) + } +} From c0cc46a2be80bdf4538d1de27fb8683892f85abf Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 13 Dec 2017 14:54:58 -0800 Subject: [PATCH 064/728] 
scorch cleanup of the rootBolt of old snapshots A new global variable, NumSnapshotsToKeep, represents the default number of old snapshots that each scorch instance should maintain -- 0 is the default. Apps that need rollback'ability may want to increase this value in early initialization. The Scorch.eligibleForRemoval field tracks epoches which are safe to delete from the rootBolt. The eligibleForRemoval is appended to whenever the ref-count on an IndexSnapshot drops to 0. On startup, eligibleForRemoval is also initialized with any older epoch's found in the rootBolt. The newly introduced Scorch.removeOldSnapshots() method is called on every cycle of the persisterLoop(), where it maintains the eligibleForRemoval slice to under a size defined by the NumSnapshotsToKeep. A future commit will remove actual storage files in order to match the "source of truth" information found in the rootBolt. --- index/scorch/introducer.go | 2 + index/scorch/persister.go | 79 +++++++++++++++++++++++++++++++++- index/scorch/scorch.go | 12 +++++- index/scorch/snapshot_index.go | 4 ++ 4 files changed, 94 insertions(+), 3 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 9402e5a48..a80e5ac15 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -68,6 +68,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // prepare new index snapshot, with curr size + 1 newSnapshot := &IndexSnapshot{ + parent: s, segment: make([]*SegmentSnapshot, len(s.root.segment)+1), offsets: make([]uint64, len(s.root.segment)+1), internal: make(map[string][]byte, len(s.root.segment)), @@ -155,6 +156,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { currSize := len(s.root.segment) newSize := currSize + 1 - len(nextMerge.old) newSnapshot := &IndexSnapshot{ + parent: s, segment: make([]*SegmentSnapshot, 0, newSize), offsets: make([]uint64, 0, newSize), internal: make(map[string][]byte, len(s.root.segment)), diff --git 
a/index/scorch/persister.go b/index/scorch/persister.go index 964e1db93..3a6ce85b5 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -19,6 +19,7 @@ import ( "fmt" "log" "os" + "sort" "strings" "github.com/RoaringBitmap/roaring" @@ -35,6 +36,11 @@ func (s *Scorch) persisterLoop() { var lastPersistedEpoch uint64 OUTER: for { + err := s.removeOldSnapshots() + if err != nil { + log.Printf("got err removing old snapshots: %v", err) + } + select { case <-s.closeCh: break OUTER @@ -50,7 +56,7 @@ OUTER: //for ourSnapshot.epoch != lastPersistedEpoch { if ourSnapshot.epoch != lastPersistedEpoch { // lets get started - err := s.persistSnapshot(ourSnapshot) + err = s.persistSnapshot(ourSnapshot) if err != nil { log.Printf("got err persisting snapshot: %v", err) _ = ourSnapshot.DecRef() @@ -217,6 +223,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { s.rootLock.Lock() newIndexSnapshot := &IndexSnapshot{ + parent: s, epoch: s.root.epoch, segment: make([]*SegmentSnapshot, len(s.root.segment)), offsets: make([]uint64, len(s.root.offsets)), @@ -275,6 +282,7 @@ func (s *Scorch) loadFromBolt() error { if snapshots == nil { return nil } + foundRoot := false c := snapshots.Cursor() for k, _ := c.Last(); k != nil; k, _ = c.Prev() { _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) @@ -282,14 +290,20 @@ func (s *Scorch) loadFromBolt() error { log.Printf("unable to parse segment epoch %x, continuing", k) continue } + if foundRoot { + s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + continue + } snapshot := snapshots.Bucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) + s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) continue } indexSnapshot, err := s.loadSnapshot(snapshot) if err != nil { log.Printf("unable to load snapshot, %v, continuing", err) + s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) continue } indexSnapshot.epoch = 
snapshotEpoch @@ -301,8 +315,11 @@ func (s *Scorch) loadFromBolt() error { } s.nextSegmentID++ s.nextSnapshotEpoch = snapshotEpoch + 1 + if s.root != nil { + _ = s.root.DecRef() + } s.root = indexSnapshot - break + foundRoot = true } return nil }) @@ -311,6 +328,7 @@ func (s *Scorch) loadFromBolt() error { func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { rv := &IndexSnapshot{ + parent: s, internal: make(map[string][]byte), refs: 1, } @@ -380,3 +398,60 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } + +type uint64Descending []uint64 + +func (p uint64Descending) Len() int { return len(p) } +func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] } +func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +// NumSnapshotsToKeep represents how many recent, old snapshots to +// keep around per Scorch instance. Useful for apps that require +// rollback'ability. +var NumSnapshotsToKeep int + +// Removes enough snapshots from the rootBolt so that the +// s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. +func (s *Scorch) removeOldSnapshots() error { + var epochsToRemove []uint64 + + s.rootLock.Lock() + if len(s.eligibleForRemoval) > NumSnapshotsToKeep { + sort.Sort(uint64Descending(s.eligibleForRemoval)) + epochsToRemove = append([]uint64(nil), s.eligibleForRemoval[NumSnapshotsToKeep:]...) // Copy. 
+ s.eligibleForRemoval = s.eligibleForRemoval[0:NumSnapshotsToKeep] + } + s.rootLock.Unlock() + + if len(epochsToRemove) <= 0 { + return nil + } + + tx, err := s.rootBolt.Begin(true) + if err != nil { + return err + } + defer func() { + if err == nil { + err = s.rootBolt.Sync() + } + }() + defer func() { + if err == nil { + err = tx.Commit() + } else { + _ = tx.Rollback() + } + }() + + for _, epochToRemove := range epochsToRemove { + k := segment.EncodeUvarintAscending(nil, epochToRemove) + err = tx.DeleteBucket(k) + if err == bolt.ErrBucketNotFound { + err = nil + } + } + + return err +} + diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d59563be2..06e788ca2 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -59,6 +59,8 @@ type Scorch struct { persisterNotifier chan notificationChan rootBolt *bolt.DB asyncTasks sync.WaitGroup + + eligibleForRemoval []uint64 // Index snapshot epoch's that are safe to GC. } func NewScorch(storeName string, config map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { @@ -67,9 +69,9 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i config: config, analysisQueue: analysisQueue, stats: &Stats{}, - root: &IndexSnapshot{refs: 1}, nextSnapshotEpoch: 1, } + rv.root = &IndexSnapshot{parent: rv, refs: 1} ro, ok := config["read_only"].(bool) if ok { rv.readOnly = ro @@ -324,6 +326,14 @@ func (s *Scorch) Advanced() (store.KVStore, error) { return nil, nil } +func (s *Scorch) AddEligibleForRemoval(epoch uint64) { + s.rootLock.Lock() + if s.root == nil || s.root.epoch != epoch { + s.eligibleForRemoval = append(s.eligibleForRemoval, epoch) + } + s.rootLock.Unlock() +} + func init() { registry.RegisterIndexType(Name, NewScorch) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 19581f755..6dd77ff4a 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -40,6 +40,7 @@ type 
asynchSegmentResult struct { } type IndexSnapshot struct { + parent *Scorch segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte @@ -67,6 +68,9 @@ func (i *IndexSnapshot) DecRef() (err error) { } } } + if i.parent != nil { + go i.parent.AddEligibleForRemoval(i.epoch) + } } i.m.Unlock() return err From b7dff6669f0770eb5b4115f19d1f973f5f08a58a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 13 Dec 2017 16:58:36 -0800 Subject: [PATCH 065/728] scorch cleanup of *.zap files not listed in the rootBolt --- index/scorch/persister.go | 99 +++++++++++++++++++++++++++++++++++---- 1 file changed, 89 insertions(+), 10 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 3a6ce85b5..d66bdea0d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -17,8 +17,10 @@ package scorch import ( "bytes" "fmt" + "io/ioutil" "log" "os" + "path/filepath" "sort" "strings" @@ -32,15 +34,12 @@ import ( type notificationChan chan struct{} func (s *Scorch) persisterLoop() { + s.removeOldData(true) + var notify notificationChan var lastPersistedEpoch uint64 OUTER: for { - err := s.removeOldSnapshots() - if err != nil { - log.Printf("got err removing old snapshots: %v", err) - } - select { case <-s.closeCh: break OUTER @@ -56,7 +55,7 @@ OUTER: //for ourSnapshot.epoch != lastPersistedEpoch { if ourSnapshot.epoch != lastPersistedEpoch { // lets get started - err = s.persistSnapshot(ourSnapshot) + err := s.persistSnapshot(ourSnapshot) if err != nil { log.Printf("got err persisting snapshot: %v", err) _ = ourSnapshot.DecRef() @@ -111,6 +110,7 @@ OUTER: // woken up, next loop should pick up work } } + s.removeOldData(false) } s.asyncTasks.Done() } @@ -405,6 +405,20 @@ func (p uint64Descending) Len() int { return len(p) } func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] } func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } +func (s *Scorch) removeOldData(force bool) { + removed, err := 
s.removeOldBoltSnapshots() + if err != nil { + log.Printf("got err removing old bolt snapshots: %v", err) + } + + if force || removed > 0 { + err = s.removeOldZapFiles() + if err != nil { + log.Printf("go err removing old zap files: %v", err) + } + } +} + // NumSnapshotsToKeep represents how many recent, old snapshots to // keep around per Scorch instance. Useful for apps that require // rollback'ability. @@ -412,7 +426,7 @@ var NumSnapshotsToKeep int // Removes enough snapshots from the rootBolt so that the // s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. -func (s *Scorch) removeOldSnapshots() error { +func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { var epochsToRemove []uint64 s.rootLock.Lock() @@ -424,12 +438,12 @@ func (s *Scorch) removeOldSnapshots() error { s.rootLock.Unlock() if len(epochsToRemove) <= 0 { - return nil + return 0, nil } tx, err := s.rootBolt.Begin(true) if err != nil { - return err + return 0, err } defer func() { if err == nil { @@ -450,8 +464,73 @@ func (s *Scorch) removeOldSnapshots() error { if err == bolt.ErrBucketNotFound { err = nil } + if err == nil { + numRemoved++ + } + } + + return numRemoved, err +} + +// Removes any *.zap files which aren't listed in the rootBolt. +func (s *Scorch) removeOldZapFiles() error { + liveFileNames, err := s.loadZapFileNames() + if err != nil { + return err + } + + currFileInfos, err := ioutil.ReadDir(s.path) + if err != nil { + return err + } + + for _, finfo := range currFileInfos { + fname := finfo.Name() + if filepath.Ext(fname) == ".zap" { + if _, exists := liveFileNames[fname]; !exists { + err := os.Remove(s.path + string(os.PathSeparator) + fname) + if err != nil { + log.Printf("got err removing file: %s, err: %v", fname, err) + } + } + } } - return err + return nil } +// Returns the *.zap file names that are listed in the rootBolt. 
+func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { + rv := map[string]struct{}{} + err := s.rootBolt.View(func(tx *bolt.Tx) error { + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil + } + sc := snapshots.Cursor() + for sk, _ := sc.First(); sk != nil; sk, _ = sc.Next() { + snapshot := snapshots.Bucket(sk) + if snapshot == nil { + continue + } + segc := snapshot.Cursor() + for segk, _ := segc.First(); segk != nil; segk, _ = segc.Next() { + if segk[0] == boltInternalKey[0] { + continue + } + segmentBucket := snapshot.Bucket(segk) + if segmentBucket == nil { + continue + } + pathBytes := segmentBucket.Get(boltPathKey) + if pathBytes == nil { + continue + } + rv[string(pathBytes)] = struct{}{} + } + } + return nil + }) + + return rv, err +} From e1b0c61e2a529ddd703ce0a66e9f311cbec88597 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Dec 2017 22:07:37 -0500 Subject: [PATCH 066/728] fix bug in handling iterator-done --- index/scorch/segment/zap/merge.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 6d635bdfb..481d9272a 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -153,10 +153,12 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if dict != nil && dict.fst != nil { itr, err2 := dict.fst.Iterator(nil, nil) - if err2 != nil { + if err2 != nil && err2 != vellum.ErrIteratorDone { return nil, err2 } - itrs = append(itrs, itr) + if itr != nil { + itrs = append(itrs, itr) + } } } From 1066ee7d221cc6f5b6e57f2fdc99d4fb1a2a26a1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 14 Dec 2017 12:38:29 +0530 Subject: [PATCH 067/728] DocumentVisitFieldTerms Scorch implementation level1 --- index/scorch/introducer.go | 2 + index/scorch/scorch_test.go | 152 +++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 69 +++++++++++--- 
index/scorch/snapshot_segment.go | 98 +++++++++++++++++++- 4 files changed, 305 insertions(+), 16 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index a80e5ac15..56c95143e 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -96,6 +96,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { id: s.root.segment[i].id, segment: s.root.segment[i].segment, notify: s.root.segment[i].notify, + cachedDocs: s.root.segment[i].cachedDocs, } s.root.segment[i].segment.AddRef() // apply new obsoletions @@ -113,6 +114,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ id: next.id, segment: next.data, // Take ownership of next.data's ref-count. + cachedDocs: &cachedDocs{cache: nil}, } newSnapshot.offsets[len(s.root.segment)] = running if !s.unsafeBatch { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index b46a5ffd9..8d2db4d24 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1377,3 +1377,155 @@ Mechanism[edit] This section needs additional citations for verification. Please help improve this article by adding citations to reliable sources. Unsourced material may be challenged and removed. 
(July 2013) There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`) + +func TestIndexDocumentVisitFieldTermsWithMultipleDocs(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := document.NewDocument("1") + doc.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + + fieldTerms := make(index.FieldTerms) + docNumber, err := indexReader.InternalID("1") + if err != nil { + t.Fatal(err) + } + err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name", "title"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms := index.FieldTerms{ + "name": []string{"test"}, + "title": []string{"mister"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + + doc2 := document.NewDocument("2") + doc2.AddField(document.NewTextFieldWithIndexingOptions("name", []uint64{}, []byte("test2"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + 
doc2.AddField(document.NewTextFieldWithIndexingOptions("title", []uint64{}, []byte("mister2"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + err = idx.Update(doc2) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + indexReader, err = idx.Reader() + if err != nil { + t.Error(err) + } + + fieldTerms = make(index.FieldTerms) + docNumber, err = indexReader.InternalID("2") + if err != nil { + t.Fatal(err) + } + err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name", "title"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms = index.FieldTerms{ + "name": []string{"test2"}, + "title": []string{"mister2"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + + doc3 := document.NewDocument("3") + doc3.AddField(document.NewTextFieldWithIndexingOptions("name3", []uint64{}, []byte("test3"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + doc3.AddField(document.NewTextFieldWithIndexingOptions("title3", []uint64{}, []byte("mister3"), document.IndexField|document.StoreField|document.IncludeTermVectors)) + err = idx.Update(doc3) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + indexReader, err = idx.Reader() + if err != nil { + t.Error(err) + } + + fieldTerms = make(index.FieldTerms) + docNumber, err = indexReader.InternalID("3") + if err != nil { + t.Fatal(err) + } + err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name3", "title3"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms = index.FieldTerms{ + "name3": []string{"test3"}, + "title3": []string{"mister3"}, + } + if 
!reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + + fieldTerms = make(index.FieldTerms) + docNumber, err = indexReader.InternalID("1") + if err != nil { + t.Fatal(err) + } + err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name", "title"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms = index.FieldTerms{ + "name": []string{"test"}, + "title": []string{"mister"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6dd77ff4a..cb25efc65 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -248,7 +248,10 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { return nil, nil } - docNum := docInternalToNumber(next.ID) + docNum, err := docInternalToNumber(next.ID) + if err != nil { + return nil, err + } segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) rv = document.NewDocument(id) @@ -286,12 +289,15 @@ func (i *IndexSnapshot) segmentIndexAndLocalDocNumFromGlobal(docNum uint64) (int } func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { - docNum := docInternalToNumber(id) + docNum, err := docInternalToNumber(id) + if err != nil { + return "", err + } segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) var found bool var rv string - err := i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { + err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { if field 
== "_id" { found = true rv = string(value) @@ -377,15 +383,6 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return rv, nil } -func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, - visitor index.DocumentFieldTermVisitor) error { - - docNum := docInternalToNumber(id) - segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) - - return i.segment[segmentIndex].DocumentVisitFieldTerms(localDocNum, fields, visitor) -} - func docNumberToBytes(in uint64) []byte { buf := new(bytes.Buffer) @@ -393,8 +390,50 @@ func docNumberToBytes(in uint64) []byte { return buf.Bytes() } -func docInternalToNumber(in index.IndexInternalID) uint64 { +func docInternalToNumber(in index.IndexInternalID) (uint64, error) { var res uint64 - _ = binary.Read(bytes.NewReader(in), binary.BigEndian, &res) - return res + err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) + if err != nil { + return res, err + } + return res, nil +} + +func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, + fields []string, visitor index.DocumentFieldTermVisitor) error { + docNum, err := docInternalToNumber(id) + if err != nil { + return err + } + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) + if segmentIndex >= len(i.segment) { + return nil + } + + i.m.Lock() + ss := i.segment[segmentIndex] + if ss.cachedDocs == nil { + ss.cachedDocs = &cachedDocs{cache: nil} + } + i.m.Unlock() + + err = ss.cachedDocs.prepareFields(localDocNum, fields, ss) + if err != nil { + return err + } + + for _, field := range fields { + if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { + terms := bytes.SplitN(tlist, TermSeparatorSplitSlice, -1) + for _, term := range terms { + if len(term) > 0 { + visitor(field, term) + } + } + } + } + } + + return nil } diff --git a/index/scorch/snapshot_segment.go 
b/index/scorch/snapshot_segment.go index ffd38cac5..836ec6fa7 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -15,11 +15,17 @@ package scorch import ( + "sync" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" ) +var TermSeparator byte = 0xff + +var TermSeparatorSplitSlice = []byte{TermSeparator} + type SegmentDictionarySnapshot struct { s *SegmentSnapshot d segment.TermDictionary @@ -46,7 +52,8 @@ type SegmentSnapshot struct { segment segment.Segment deleted *roaring.Bitmap - notify []chan error + notify []chan error + cachedDocs *cachedDocs } func (s *SegmentSnapshot) Id() uint64 { @@ -157,3 +164,92 @@ func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { func (s *SegmentSnapshot) Fields() []string { return s.segment.Fields() } + +type cachedFieldDocs struct { + readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. + err error // Non-nil if there was an error when preparing this cachedFieldDocs. + docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. +} + +func (cfd *cachedFieldDocs) prepareFields(docNum uint64, field string, + ss *SegmentSnapshot) { + defer close(cfd.readyCh) + + dict, err := ss.segment.Dictionary(field) + if err != nil { + cfd.err = err + return + } + + dictItr := dict.Iterator() + next, err := dictItr.Next() + for next != nil && err == nil { + postings, err1 := dict.PostingsList(next.Term, nil) + if err1 != nil { + cfd.err = err1 + return + } + + postingsItr := postings.Iterator() + nextPosting, err2 := postingsItr.Next() + for err2 == nil && nextPosting != nil && nextPosting.Number() <= docNum { + if nextPosting.Number() == docNum { + // got what we're looking for + cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) 
+ cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) + } + nextPosting, err2 = postingsItr.Next() + } + + if err2 != nil { + cfd.err = err2 + return + } + + next, err = dictItr.Next() + } + + if err != nil { + cfd.err = err + return + } +} + +type cachedDocs struct { + m sync.Mutex // As the cache is asynchronously prepared, need a lock + cache map[string]*cachedFieldDocs // Keyed by field +} + +func (c *cachedDocs) prepareFields(docNum uint64, wantedFields []string, + ss *SegmentSnapshot) error { + c.m.Lock() + if c.cache == nil { + c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) + } + + for _, field := range wantedFields { + _, exists := c.cache[field] + if !exists { + c.cache[field] = &cachedFieldDocs{ + readyCh: make(chan struct{}), + docs: make(map[uint64][]byte), + } + + go c.cache[field].prepareFields(docNum, field, ss) + } + } + + for _, field := range wantedFields { + cachedFieldDocs := c.cache[field] + c.m.Unlock() + <-cachedFieldDocs.readyCh + + if cachedFieldDocs.err != nil { + return cachedFieldDocs.err + } + c.m.Lock() + } + + c.m.Unlock() + return nil +} From 95b65ade3e266db161f900aa847f36f912f8abd1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 14 Dec 2017 17:16:47 +0530 Subject: [PATCH 068/728] getting right internalID for doc in UT --- index/scorch/scorch_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 8d2db4d24..6feabb306 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1240,7 +1240,7 @@ func TestIndexDocumentVisitFieldTerms(t *testing.T) { fieldTerms := make(index.FieldTerms) - internalID, err := indexReader.GetInternal([]byte("1")) + internalID, err := indexReader.InternalID("1") if err != nil { t.Fatal(err) } From a4acd53c54b2de8927340707637a7ec9d3c5b692 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Dec 2017 07:52:38 -0500 Subject: [PATCH 069/728] try to make racey test safe 
--- index_test.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/index_test.go b/index_test.go index c4e99ce1e..03ef71929 100644 --- a/index_test.go +++ b/index_test.go @@ -674,16 +674,19 @@ func TestIndexMetadataRaceBug198(t *testing.T) { } }() + wg := sync.WaitGroup{} + wg.Add(1) done := make(chan struct{}) go func() { for { select { case <-done: + wg.Done() return default: - _, err := index.DocCount() - if err != nil { - t.Fatal(err) + _, err2 := index.DocCount() + if err2 != nil { + t.Fatal(err2) } } } @@ -701,6 +704,7 @@ func TestIndexMetadataRaceBug198(t *testing.T) { } } close(done) + wg.Wait() } func TestSortMatchSearch(t *testing.T) { From 149a26b5c159e1367fdb6d1e6b5fe32495005dcd Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Dec 2017 10:27:39 -0500 Subject: [PATCH 070/728] merge deletion and cacheddocs fixes discussed in meeting --- index/scorch/introducer.go | 26 ++++++++++++++------------ index/scorch/merge.go | 2 +- index/scorch/persister.go | 27 +++++++++++++++++---------- index/scorch/snapshot_index.go | 6 +----- 4 files changed, 33 insertions(+), 28 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 56c95143e..a3a197b08 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -93,9 +93,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } } newSnapshot.segment[i] = &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, cachedDocs: s.root.segment[i].cachedDocs, } s.root.segment[i].segment.AddRef() @@ -112,8 +112,8 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } // put new segment at end newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ - id: next.id, - segment: next.data, // Take ownership of next.data's ref-count. 
+ id: next.id, + segment: next.data, // Take ownership of next.data's ref-count. cachedDocs: &cachedDocs{cache: nil}, } newSnapshot.offsets[len(s.root.segment)] = running @@ -191,10 +191,11 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } else { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, - deleted: s.root.segment[i].deleted, + id: s.root.segment[i].id, + segment: s.root.segment[i].segment, + notify: s.root.segment[i].notify, + deleted: s.root.segment[i].deleted, + cachedDocs: s.root.segment[i].cachedDocs, }) s.root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) @@ -204,9 +205,10 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // put new segment at end newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: nextMerge.id, - segment: nextMerge.new, // Take ownership for nextMerge.new's ref-count. - deleted: newSegmentDeleted, + id: nextMerge.id, + segment: nextMerge.new, // Take ownership for nextMerge.new's ref-count. 
+ deleted: newSegmentDeleted, + cachedDocs: &cachedDocs{cache: nil}, }) newSnapshot.offsets = append(newSnapshot.offsets, running) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 818f3dbd1..c077a1c7e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -129,7 +129,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { } } - filename := fmt.Sprintf("%x.zap", newSegmentID) + filename := fmt.Sprintf("%08x.zap", newSegmentID) path := s.path + string(os.PathSeparator) + filename newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) if err != nil { diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d66bdea0d..1a1ea2833 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -171,7 +171,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { switch seg := segmentSnapshot.segment.(type) { case *mem.Segment: // need to persist this to disk - filename := fmt.Sprintf("%x.zap", segmentSnapshot.id) + filename := fmt.Sprintf("%08x.zap", segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename err2 := zap.PersistSegment(seg, path, 1024) if err2 != nil { @@ -234,9 +234,10 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { // see if this segment has been replaced if replacement, ok := newSegments[segmentSnapshot.id]; ok { newSegmentSnapshot := &SegmentSnapshot{ - segment: replacement, - deleted: segmentSnapshot.deleted, - id: segmentSnapshot.id, + segment: replacement, + deleted: segmentSnapshot.deleted, + id: segmentSnapshot.id, + cachedDocs: segmentSnapshot.cachedDocs, } newIndexSnapshot.segment[i] = newSegmentSnapshot // add the old segment snapshots notifications to the list @@ -382,7 +383,8 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro } rv := &SegmentSnapshot{ - segment: segment, + segment: segment, + cachedDocs: &cachedDocs{cache: nil}, } deletedBytes := 
segmentBucket.Get(boltDeletedKey) if deletedBytes != nil { @@ -474,7 +476,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { // Removes any *.zap files which aren't listed in the rootBolt. func (s *Scorch) removeOldZapFiles() error { - liveFileNames, err := s.loadZapFileNames() + liveFileNames, highestName, err := s.loadZapFileNames() if err != nil { return err } @@ -487,7 +489,7 @@ func (s *Scorch) removeOldZapFiles() error { for _, finfo := range currFileInfos { fname := finfo.Name() if filepath.Ext(fname) == ".zap" { - if _, exists := liveFileNames[fname]; !exists { + if _, exists := liveFileNames[fname]; !exists && fname < highestName { err := os.Remove(s.path + string(os.PathSeparator) + fname) if err != nil { log.Printf("got err removing file: %s, err: %v", fname, err) @@ -500,8 +502,9 @@ func (s *Scorch) removeOldZapFiles() error { } // Returns the *.zap file names that are listed in the rootBolt. -func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { +func (s *Scorch) loadZapFileNames() (map[string]struct{}, string, error) { rv := map[string]struct{}{} + var highest string err := s.rootBolt.View(func(tx *bolt.Tx) error { snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { @@ -526,11 +529,15 @@ func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { if pathBytes == nil { continue } - rv[string(pathBytes)] = struct{}{} + pathString := string(pathBytes) + if pathString > highest { + highest = pathString + } + rv[string(pathString)] = struct{}{} } } return nil }) - return rv, err + return rv, highest, err } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index cb25efc65..201c1b24e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -401,6 +401,7 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) { func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, visitor 
index.DocumentFieldTermVisitor) error { + docNum, err := docInternalToNumber(id) if err != nil { return err @@ -410,12 +411,7 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, return nil } - i.m.Lock() ss := i.segment[segmentIndex] - if ss.cachedDocs == nil { - ss.cachedDocs = &cachedDocs{cache: nil} - } - i.m.Unlock() err = ss.cachedDocs.prepareFields(localDocNum, fields, ss) if err != nil { From bd742caf6525d7e704ba5b79e028da96644eeeb9 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Dec 2017 10:29:19 -0500 Subject: [PATCH 071/728] don't try to close a nil segment if err opening --- index/scorch/persister.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 1a1ea2833..2ef55b69c 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -212,7 +212,9 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { newSegments[segmentID], err = zap.Open(path) if err != nil { for _, s := range newSegments { - _ = s.Close() // cleanup segments that were successfully opened + if s != nil { + _ = s.Close() // cleanup segments that were successfully opened + } } return fmt.Errorf("error opening new segment at %s, %v", path, err) } From 2be5eb4427a6c3a26812c8a3d567f8933db5bcf4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 14 Dec 2017 10:49:33 -0800 Subject: [PATCH 072/728] scorch tracks zap files that can't be removed yet A race & solution found by Marty Schoch... consider a case when the merger might grab a nextSegmentID, like 4, but takes awhile to complete. Meanwhile, the persister grabs the nextSegmentID of 5, but finishes its persistence work fast, and then loops to cleanup any old files. The simple approach of checking a "highest segment ID" of 5 is wrong now, because the deleter now thinks that segment 4's zap file is (incorrectly) ok to delete. 
The solution in this commit is to track an ephemeral map of filenames which are ineligibleForRemoval, because they're still being written (by the merger) and haven't been fully incorporated into the rootBolt yet. The merger adds to that ineligibleForRemoval map as it starts a merged zap file, the persister cleans up entries from that map when it persists zap filenames into the rootBolt, and the deleter (part of the persister's loop) consults the map before performing any actual zap file deletions. --- index/scorch/merge.go | 5 ++++- index/scorch/persister.go | 39 ++++++++++++++++++++++--------------- index/scorch/scorch.go | 41 +++++++++++++++++++++++++++++++-------- 3 files changed, 60 insertions(+), 25 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index c077a1c7e..cc3af774a 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -129,14 +129,17 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { } } - filename := fmt.Sprintf("%08x.zap", newSegmentID) + filename := zapFileName(newSegmentID) + s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) if err != nil { + s.unmarkIneligibleForRemoval(filename) return fmt.Errorf("merging failed: %v", err) } segment, err := zap.Open(path) if err != nil { + s.unmarkIneligibleForRemoval(filename) return err } sm := &segmentMerge{ diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 2ef55b69c..f12d86f52 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -34,8 +34,6 @@ import ( type notificationChan chan struct{} func (s *Scorch) persisterLoop() { - s.removeOldData(true) - var notify notificationChan var lastPersistedEpoch uint64 OUTER: @@ -52,7 +50,6 @@ OUTER: ourSnapshot.AddRef() s.rootLock.RUnlock() - //for ourSnapshot.epoch != lastPersistedEpoch { if ourSnapshot.epoch != lastPersistedEpoch { // lets get started err := 
s.persistSnapshot(ourSnapshot) @@ -110,7 +107,7 @@ OUTER: // woken up, next loop should pick up work } } - s.removeOldData(false) + s.removeOldData() } s.asyncTasks.Done() } @@ -159,6 +156,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } } + var filenames []string newSegmentPaths := make(map[uint64]string) // first ensure that each segment in this snapshot has been persisted @@ -171,7 +169,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { switch seg := segmentSnapshot.segment.(type) { case *mem.Segment: // need to persist this to disk - filename := fmt.Sprintf("%08x.zap", segmentSnapshot.id) + filename := zapFileName(segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename err2 := zap.PersistSegment(seg, path, 1024) if err2 != nil { @@ -182,6 +180,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { if err != nil { return err } + filenames = append(filenames, filename) case *zap.Segment: path := seg.Path() filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) @@ -189,6 +188,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { if err != nil { return err } + filenames = append(filenames, filename) default: return fmt.Errorf("unknown segment type: %T", seg) } @@ -255,6 +255,9 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { for k, v := range s.root.internal { newIndexSnapshot.internal[k] = v } + for _, filename := range filenames { + delete(s.ineligibleForRemoval, filename) + } rootPrev := s.root s.root = newIndexSnapshot s.rootLock.Unlock() @@ -272,6 +275,10 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { return nil } +func zapFileName(epoch uint64) string { + return fmt.Sprintf("%012x.zap", epoch) +} + // bolt snapshot code var boltSnapshotsBucket = []byte{'s'} @@ -409,16 +416,16 @@ func (p uint64Descending) Len() int { return len(p) } func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] } func (p 
uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } -func (s *Scorch) removeOldData(force bool) { +func (s *Scorch) removeOldData() { removed, err := s.removeOldBoltSnapshots() if err != nil { log.Printf("got err removing old bolt snapshots: %v", err) } - if force || removed > 0 { + if removed > 0 { err = s.removeOldZapFiles() if err != nil { - log.Printf("go err removing old zap files: %v", err) + log.Printf("got err removing old zap files: %v", err) } } } @@ -478,7 +485,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { // Removes any *.zap files which aren't listed in the rootBolt. func (s *Scorch) removeOldZapFiles() error { - liveFileNames, highestName, err := s.loadZapFileNames() + liveFileNames, err := s.loadZapFileNames() if err != nil { return err } @@ -488,10 +495,12 @@ func (s *Scorch) removeOldZapFiles() error { return err } + s.rootLock.RLock() + for _, finfo := range currFileInfos { fname := finfo.Name() if filepath.Ext(fname) == ".zap" { - if _, exists := liveFileNames[fname]; !exists && fname < highestName { + if _, exists := liveFileNames[fname]; !exists && !s.ineligibleForRemoval[fname] { err := os.Remove(s.path + string(os.PathSeparator) + fname) if err != nil { log.Printf("got err removing file: %s, err: %v", fname, err) @@ -500,13 +509,14 @@ func (s *Scorch) removeOldZapFiles() error { } } + s.rootLock.RUnlock() + return nil } // Returns the *.zap file names that are listed in the rootBolt. 
-func (s *Scorch) loadZapFileNames() (map[string]struct{}, string, error) { +func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { rv := map[string]struct{}{} - var highest string err := s.rootBolt.View(func(tx *bolt.Tx) error { snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { @@ -532,14 +542,11 @@ func (s *Scorch) loadZapFileNames() (map[string]struct{}, string, error) { continue } pathString := string(pathBytes) - if pathString > highest { - highest = pathString - } rv[string(pathString)] = struct{}{} } } return nil }) - return rv, highest, err + return rv, err } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 06e788ca2..2e54bebce 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -60,16 +60,19 @@ type Scorch struct { rootBolt *bolt.DB asyncTasks sync.WaitGroup - eligibleForRemoval []uint64 // Index snapshot epoch's that are safe to GC. + eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. + ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. 
} func NewScorch(storeName string, config map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { rv := &Scorch{ - version: Version, - config: config, - analysisQueue: analysisQueue, - stats: &Stats{}, - nextSnapshotEpoch: 1, + version: Version, + config: config, + analysisQueue: analysisQueue, + stats: &Stats{}, + nextSnapshotEpoch: 1, + closeCh: make(chan struct{}), + ineligibleForRemoval: map[string]bool{}, } rv.root = &IndexSnapshot{parent: rv, refs: 1} ro, ok := config["read_only"].(bool) @@ -113,16 +116,24 @@ func (s *Scorch) Open() error { // now see if there is any existing state to load err = s.loadFromBolt() if err != nil { + _ = s.Close() return err } } - s.closeCh = make(chan struct{}) s.introductions = make(chan *segmentIntroduction) s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan notificationChan) s.persisterNotifier = make(chan notificationChan) + if !s.readOnly && s.path != "" { + err := s.removeOldZapFiles() // Before persister or merger create any new files. 
+ if err != nil { + _ = s.Close() + return err + } + } + s.asyncTasks.Add(1) go s.mainLoop() @@ -145,7 +156,9 @@ func (s *Scorch) Close() (err error) { if s.rootBolt != nil { err = s.rootBolt.Close() s.rootLock.Lock() - _ = s.root.DecRef() + if s.root != nil { + _ = s.root.DecRef() + } s.root = nil s.rootLock.Unlock() } @@ -334,6 +347,18 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { s.rootLock.Unlock() } +func (s *Scorch) markIneligibleForRemoval(filename string) { + s.rootLock.Lock() + s.ineligibleForRemoval[filename] = true + s.rootLock.Unlock() +} + +func (s *Scorch) unmarkIneligibleForRemoval(filename string) { + s.rootLock.Lock() + delete(s.ineligibleForRemoval, filename) + s.rootLock.Unlock() +} + func init() { registry.RegisterIndexType(Name, NewScorch) } From a8884e101136a9abb11d4c610001c3b0cd6ed58d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 14 Dec 2017 13:16:06 -0800 Subject: [PATCH 073/728] scorch fix for TestSortMatchSearch The cachedDocs preparation has to happen for all docs in the field, not just on the currently requested docNum. Also, as part of this commit, there's a loop optimization where we no longer use bytes.Split() on the terms buffer, thus avoiding garbage creation. 
--- index/scorch/snapshot_index.go | 12 +++++++----- index/scorch/snapshot_segment.go | 20 ++++++++------------ 2 files changed, 15 insertions(+), 17 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 201c1b24e..86998bbdd 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -413,7 +413,7 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] - err = ss.cachedDocs.prepareFields(localDocNum, fields, ss) + err = ss.cachedDocs.prepareFields(fields, ss) if err != nil { return err } @@ -421,11 +421,13 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, for _, field := range fields { if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { - terms := bytes.SplitN(tlist, TermSeparatorSplitSlice, -1) - for _, term := range terms { - if len(term) > 0 { - visitor(field, term) + for { + i := bytes.Index(tlist, TermSeparatorSplitSlice) + if i < 0 { + break } + visitor(field, tlist[0:i]) + tlist = tlist[i+1:] } } } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 836ec6fa7..9b22eaedd 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -171,8 +171,7 @@ type cachedFieldDocs struct { docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. 
} -func (cfd *cachedFieldDocs) prepareFields(docNum uint64, field string, - ss *SegmentSnapshot) { +func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { defer close(cfd.readyCh) dict, err := ss.segment.Dictionary(field) @@ -183,7 +182,7 @@ func (cfd *cachedFieldDocs) prepareFields(docNum uint64, field string, dictItr := dict.Iterator() next, err := dictItr.Next() - for next != nil && err == nil { + for err == nil && next != nil { postings, err1 := dict.PostingsList(next.Term, nil) if err1 != nil { cfd.err = err1 @@ -192,12 +191,10 @@ func (cfd *cachedFieldDocs) prepareFields(docNum uint64, field string, postingsItr := postings.Iterator() nextPosting, err2 := postingsItr.Next() - for err2 == nil && nextPosting != nil && nextPosting.Number() <= docNum { - if nextPosting.Number() == docNum { - // got what we're looking for - cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) - cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) - } + for err2 == nil && nextPosting != nil { + docNum := nextPosting.Number() + cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) 
+ cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) nextPosting, err2 = postingsItr.Next() } @@ -220,8 +217,7 @@ type cachedDocs struct { cache map[string]*cachedFieldDocs // Keyed by field } -func (c *cachedDocs) prepareFields(docNum uint64, wantedFields []string, - ss *SegmentSnapshot) error { +func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { c.m.Lock() if c.cache == nil { c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) @@ -235,7 +231,7 @@ func (c *cachedDocs) prepareFields(docNum uint64, wantedFields []string, docs: make(map[uint64][]byte), } - go c.cache[field].prepareFields(docNum, field, ss) + go c.cache[field].prepareFields(field, ss) } } From eb2f541d4f9f5756d524cb52cc9b9621f7c8eef6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 14 Dec 2017 13:52:28 -0800 Subject: [PATCH 074/728] scorch filters _id from Reader.Document() results --- index/scorch/scorch_test.go | 29 ++++++++++++++--------------- index/scorch/snapshot_index.go | 3 +++ 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 6feabb306..d347ecec0 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -540,7 +540,7 @@ func TestIndexInsertWithStore(t *testing.T) { t.Error(err) } - if len(storedDoc.Fields) != 2 { + if len(storedDoc.Fields) != 1 { t.Errorf("expected 1 stored field, got %d", len(storedDoc.Fields)) } for _, field := range storedDoc.Fields { @@ -553,13 +553,7 @@ func TestIndexInsertWithStore(t *testing.T) { t.Errorf("expected field content 'test', got '%s'", string(textField.Value())) } } else if field.Name() == "_id" { - textField, ok := field.(*document.TextField) - if !ok { - t.Errorf("expected text field") - } - if string(textField.Value()) != "1" { - t.Errorf("expected field content '1', got '%s'", string(textField.Value())) - } + t.Errorf("not expecting _id field") } } } @@ -857,8 +851,8 @@ func 
TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { t.Error(err) } - if len(storedDoc.Fields) != 4 { - t.Errorf("expected 4 stored field, got %d", len(storedDoc.Fields)) + if len(storedDoc.Fields) != 3 { + t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields)) } for _, field := range storedDoc.Fields { @@ -896,8 +890,9 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { t.Errorf("expected date value unix epoch, got %v", dateFieldDate) } } + } else if field.Name() == "_id" { + t.Errorf("not expecting _id field") } - } // now update the document, but omit one of the fields @@ -934,8 +929,8 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { t.Error(err) } - if len(storedDoc.Fields) != 3 { - t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields)) + if len(storedDoc.Fields) != 2 { + t.Errorf("expected 2 stored field, got %d", len(storedDoc.Fields)) } for _, field := range storedDoc.Fields { @@ -961,6 +956,8 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { t.Errorf("expeted numeric value 36.99, got %f", numFieldNumer) } } + } else if field.Name() == "_id" { + t.Errorf("not expecting _id field") } } @@ -1114,8 +1111,8 @@ func TestIndexUpdateComposites(t *testing.T) { if err != nil { t.Error(err) } - if len(storedDoc.Fields) != 3 { - t.Errorf("expected 3 stored field, got %d", len(storedDoc.Fields)) + if len(storedDoc.Fields) != 2 { + t.Errorf("expected 2 stored field, got %d", len(storedDoc.Fields)) } for _, field := range storedDoc.Fields { if field.Name() == "name" { @@ -1126,6 +1123,8 @@ func TestIndexUpdateComposites(t *testing.T) { if string(textField.Value()) != "testupdated" { t.Errorf("expected field content 'test', got '%s'", string(textField.Value())) } + } else if field.Name() == "_id" { + t.Errorf("not expecting _id field") } } } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 86998bbdd..5b54669b9 100644 --- 
a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -256,6 +256,9 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { rv = document.NewDocument(id) err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { + if name == "_id" { + return true + } switch typ { case 't': rv.AddField(document.NewTextField(name, pos, value)) From 6ab27e4afa51f56b2de03b5e62f5ab45edeec701 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Dec 2017 17:19:50 -0500 Subject: [PATCH 075/728] quick hack to disable safe batches in fts --- index/scorch/scorch.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 2e54bebce..585904979 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -79,6 +79,10 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i if ok { rv.readOnly = ro } + // hack for now to disable safe batches in FTS + if storeName == "moss" { + rv.unsafeBatch = true + } return rv, nil } From 506aa1c325005fb1d955e0c8b5a9bf013011fe96 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 14 Dec 2017 14:40:33 -0800 Subject: [PATCH 076/728] scorch fix data race w/ AddEligibleForRemoval Found from "go test -race ./..." 
WARNING: DATA RACE Read at 0x00c420088060 by goroutine 48: github.com/blevesearch/bleve/index/scorch.(*Scorch).AddEligibleForRemoval() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:348 +0x6d Previous write at 0x00c420088060 by goroutine 31: github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt.func1() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:332 +0x87b github.com/boltdb/bolt.(*DB).View() /Users/steveyen/go/src/github.com/boltdb/bolt/db.go:629 +0xc1 github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:290 +0xa1 github.com/blevesearch/bleve/index/scorch.(*Scorch).Open() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:121 +0x77f github.com/blevesearch/bleve/index/scorch.TestIndexOpenReopen() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/scorch_test.go:115 +0x1351 testing.tRunner() /usr/local/Cellar/go/1.9/libexec/src/testing/testing.go:746 +0x16c Goroutine 48 (running) created at: github.com/blevesearch/bleve/index/scorch.(*IndexSnapshot).DecRef() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_index.go:72 +0x23e github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt.func1() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:330 +0x8f4 github.com/boltdb/bolt.(*DB).View() /Users/steveyen/go/src/github.com/boltdb/bolt/db.go:629 +0xc1 github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:290 +0xa1 github.com/blevesearch/bleve/index/scorch.(*Scorch).Open() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:121 +0x77f github.com/blevesearch/bleve/index/scorch.TestIndexOpenReopen() /Users/steveyen/go/src/github.com/blevesearch/bleve/index/scorch/scorch_test.go:115 +0x1351 
testing.tRunner() /usr/local/Cellar/go/1.9/libexec/src/testing/testing.go:746 +0x16c --- index/scorch/persister.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index f12d86f52..d0f010e48 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -325,10 +325,12 @@ func (s *Scorch) loadFromBolt() error { } s.nextSegmentID++ s.nextSnapshotEpoch = snapshotEpoch + 1 + s.rootLock.Lock() if s.root != nil { _ = s.root.DecRef() } s.root = indexSnapshot + s.rootLock.Unlock() foundRoot = true } return nil From b5aa4ed22b6d73d1dd3ff589c7e21db536616df1 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Dec 2017 17:41:02 -0500 Subject: [PATCH 077/728] return err not panic --- index/scorch/segment/zap/segment.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 498699072..65c62ac9c 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -201,7 +201,7 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi meta, compressed := s.getStoredMetaAndCompressed(num) uncompressed, err := snappy.Decode(nil, compressed) if err != nil { - panic(err) + return err } // now decode meta and process reader := bytes.NewReader(meta) From f05794c6aace58cf791ee8f0d2108c5ecf9ea484 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 15 Dec 2017 11:11:18 -0800 Subject: [PATCH 078/728] scorch removed worker goroutines from TermFieldReader() On a couple of micro benchmarks on a dev macbook using bleve-query on an index of 50K wikipedia docs, scorch is now in more the same neighborhood of upsidedown/moss... high-freq term search "text:date"... 400 qps - upsidedown/moss 360 qps - scorch before 404 qps - scorch after zero-freq term search "text:mschoch"... 
100K qps - upsidedown/moss 55K qps - scorch before 99K qps - scorch after Of note, the scorch index had ~150 *.zap files in it, which likely made made the worker goroutine overhead more costly than for a case with few segments, where goroutine and channel related work appeared relatively prominently in the pprof SVG's. --- index/scorch/snapshot_index.go | 41 +++++++++------------------------- 1 file changed, 10 insertions(+), 31 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 5b54669b9..c0b50a25e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -341,27 +341,6 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { - results := make(chan *asynchSegmentResult) - for index, segment := range i.segment { - go func(index int, segment *SegmentSnapshot) { - dict, err := segment.Dictionary(field) - if err != nil { - results <- &asynchSegmentResult{err: err} - } else { - pl, err := dict.PostingsList(string(term), nil) - if err != nil { - results <- &asynchSegmentResult{err: err} - } else { - results <- &asynchSegmentResult{ - index: index, - postings: pl, - } - } - } - }(index, segment) - } - - var err error rv := &IndexSnapshotTermFieldReader{ term: term, snapshot: i, @@ -371,17 +350,17 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm: includeNorm, includeTermVectors: includeTermVectors, } - for count := 0; count < len(i.segment); count++ { - asr := <-results - if asr.err != nil && err == nil { - err = asr.err - } else { - rv.postings[asr.index] = asr.postings - rv.iterators[asr.index] = asr.postings.Iterator() + for i, segment := range i.segment { + dict, err := segment.Dictionary(field) + if err != nil { + return nil, err } - } - if err != nil { - return nil, err + pl, 
err := dict.PostingsList(string(term), nil) + if err != nil { + return nil, err + } + rv.postings[i] = pl + rv.iterators[i] = pl.Iterator() } return rv, nil } From 620dcdb6f87dcc78c5c1baec713d141f7c7e604d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 15 Dec 2017 11:54:52 -0800 Subject: [PATCH 079/728] scorch uses prealloc'ed buffer for docNumberToBytes() On a couple of micro benchmarks on a dev macbook using bleve-query on an index of 50K wikipedia docs, scorch is now faster than upsidedown/moss on high-freq term search "text:date"... 400 qps - upsidedown/moss 404 qps - scorch before 565 qps - scorch after --- index/scorch/snapshot_index.go | 11 ++++++----- index/scorch/snapshot_index_doc.go | 2 +- index/scorch/snapshot_index_tfr.go | 3 +-- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index c0b50a25e..7f5f41032 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -365,11 +365,12 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return rv, nil } -func docNumberToBytes(in uint64) []byte { - - buf := new(bytes.Buffer) - _ = binary.Write(buf, binary.BigEndian, in) - return buf.Bytes() +func docNumberToBytes(buf []byte, in uint64) []byte { + if len(buf) != 8 { + buf = make([]byte, 8) + } + binary.BigEndian.PutUint64(buf, in) + return buf } func docInternalToNumber(in index.IndexInternalID) (uint64, error) { diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index 4656079b0..d1205ff8e 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -36,7 +36,7 @@ func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { next := i.iterators[i.segmentOffset].Next() // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] - return docNumberToBytes(uint64(next) + globalOffset), nil + return 
docNumberToBytes(nil, uint64(next)+globalOffset), nil } return nil, nil } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 1fbabdfbb..d6c8dcd13 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -49,8 +49,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in // make segment number into global number by adding offset globalOffset := i.snapshot.offsets[i.segmentOffset] nnum := next.Number() - rv.ID = docNumberToBytes(nnum + globalOffset) - + rv.ID = docNumberToBytes(rv.ID, nnum+globalOffset) i.postingToTermFieldDoc(next, rv) i.currID = rv.ID From 45c212a0c227f1923a74fa16d5f115658ac13020 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 15 Dec 2017 13:25:37 -0800 Subject: [PATCH 080/728] scorch mergeplan package comments tweak Moving the package comment for mergeplan to the right place. --- index/scorch/mergeplan/merge_plan.go | 3 +++ index/scorch/mergeplan/merge_plan_test.go | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index f3f7b9e3b..61510c1c9 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -12,6 +12,9 @@ // See the License for the specific language governing permissions and // limitations under the License. +// Package mergeplan provides a segment merge planning approach that's +// inspired by Lucene's TieredMergePolicy.java and descriptions like +// http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html package mergeplan import ( diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 05dcaaf9f..bf2f24be1 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -12,9 +12,6 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-// The segment merge planning approach was inspired by Lucene's -// TieredMergePolicy.java and descriptions like -// http://blog.mikemccandless.com/2011/02/visualizing-lucenes-segment-merges.html package mergeplan import ( From a575be4d56f89f805b4a4904834932095df5c977 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 15 Dec 2017 19:26:23 -0500 Subject: [PATCH 081/728] fix issue where we incorrectly seed the nextSegmentID on Open() --- index/scorch/persister.go | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d0f010e48..73a0bd9f4 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -22,6 +22,7 @@ import ( "os" "path/filepath" "sort" + "strconv" "strings" "github.com/RoaringBitmap/roaring" @@ -318,10 +319,9 @@ func (s *Scorch) loadFromBolt() error { } indexSnapshot.epoch = snapshotEpoch // set the nextSegmentID - for _, segment := range indexSnapshot.segment { - if segment.id > s.nextSegmentID { - s.nextSegmentID = segment.id - } + s.nextSegmentID, err = s.maxSegmentIDOnDisk() + if err != nil { + return err } s.nextSegmentID++ s.nextSnapshotEpoch = snapshotEpoch + 1 @@ -485,6 +485,29 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { return numRemoved, err } +func (s *Scorch) maxSegmentIDOnDisk() (uint64, error) { + currFileInfos, err := ioutil.ReadDir(s.path) + if err != nil { + return 0, err + } + + var rv uint64 + for _, finfo := range currFileInfos { + fname := finfo.Name() + if filepath.Ext(fname) == ".zap" { + prefix := strings.TrimSuffix(fname, ".zap") + id, err2 := strconv.ParseUint(prefix, 16, 64) + if err2 != nil { + return 0, err2 + } + if id > rv { + rv = id + } + } + } + return rv, err +} + // Removes any *.zap files which aren't listed in the rootBolt. 
func (s *Scorch) removeOldZapFiles() error { liveFileNames, err := s.loadZapFileNames() From 0539744e901cc74f2ba7d1dd89d6f8bdc893d79b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 16 Dec 2017 08:39:10 -0800 Subject: [PATCH 082/728] scorch mergeplan.ToBarChart() refactored to callable API Refactored out API so it's usable from other places. --- index/scorch/mergeplan/merge_plan.go | 58 +++++++++++++++++++++++ index/scorch/mergeplan/merge_plan_test.go | 47 +----------------- 2 files changed, 59 insertions(+), 46 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index 61510c1c9..c77fe37f8 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -18,8 +18,10 @@ package mergeplan import ( + "fmt" "math" "sort" + "strings" ) // A Segment represents the information that the planner needs to @@ -292,3 +294,59 @@ func ScoreSegments(segments []Segment, o *MergePlanOptions) float64 { return score } + +// ------------------------------------------ + +// ToBarChart returns an ASCII rendering of the segments and the plan. +// The barMax is the max width of the bars in the bar chart. +func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) string { + rv := make([]string, 0, len(segments)) + + var maxFullSize int64 + for _, segment := range segments { + if maxFullSize < segment.FullSize() { + maxFullSize = segment.FullSize() + } + } + if maxFullSize < 0 { + maxFullSize = 1 + } + + for _, segment := range segments { + barFull := int(segment.FullSize()) + barLive := int(segment.LiveSize()) + + if maxFullSize > int64(barMax) { + barFull = int(float64(barMax) * float64(barFull) / float64(maxFullSize)) + barLive = int(float64(barMax) * float64(barLive) / float64(maxFullSize)) + } + + barKind := " " + barChar := "." 
+ + if plan != nil { + TASK_LOOP: + for taski, task := range plan.Tasks { + for _, taskSegment := range task.Segments { + if taskSegment == segment { + barKind = "*" + barChar = fmt.Sprintf("%d", taski) + break TASK_LOOP + } + } + } + } + + bar := + strings.Repeat(barChar, barLive)[0:barLive] + + strings.Repeat("x", barFull-barLive)[0:barFull-barLive] + + rv = append(rv, fmt.Sprintf("%s %5d: %5d /%5d - %s %s", prefix, + segment.Id(), + segment.LiveSize(), + segment.FullSize(), + barKind, bar)) + } + + return strings.Join(rv, "\n") +} diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index bf2f24be1..ca4d80bd1 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -20,7 +20,6 @@ import ( "os" "reflect" "sort" - "strings" "testing" ) @@ -451,49 +450,5 @@ func emit(descrip string, cycle int, step int, segments []Segment, plan *MergePl } fmt.Printf("%s %d.%d ---------- %s\n", descrip, cycle, step, suffix) - - var maxFullSize int64 - for _, segment := range segments { - if maxFullSize < segment.FullSize() { - maxFullSize = segment.FullSize() - } - } - - barMax := 100 - - for _, segment := range segments { - barFull := int(segment.FullSize()) - barLive := int(segment.LiveSize()) - - if maxFullSize > int64(barMax) { - barFull = int(float64(barMax) * float64(barFull) / float64(maxFullSize)) - barLive = int(float64(barMax) * float64(barLive) / float64(maxFullSize)) - } - - barKind := " " - barChar := "." 
- - if plan != nil { - TASK_LOOP: - for taski, task := range plan.Tasks { - for _, taskSegment := range task.Segments { - if taskSegment == segment { - barKind = "*" - barChar = fmt.Sprintf("%d", taski) - break TASK_LOOP - } - } - } - } - - bar := - strings.Repeat(barChar, barLive)[0:barLive] + - strings.Repeat("x", barFull-barLive)[0:barFull-barLive] - - fmt.Printf("%s %5d: %5d /%5d - %s %s\n", descrip, - segment.Id(), - segment.LiveSize(), - segment.FullSize(), - barKind, bar) - } + fmt.Printf("%s\n", ToBarChart(descrip, 100, segments, plan)) } From e98602600d3067a3e073380875727b5deeef310c Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 16 Dec 2017 14:22:15 -0800 Subject: [PATCH 083/728] scorch mergeplan added TierGrowth option Previously, CalcBudget() was treating MergePlanOptions.SegmentsPerMergeTask as the growth factor while computing the idealized staircase of segments. This change introduces a TierGrowth option to MergePlanOptions for more control and so that SegmentsPerMergeTask can be tweaked independently of the tier growth factor. --- index/scorch/mergeplan/merge_plan.go | 16 +++++++++++----- index/scorch/mergeplan/merge_plan_test.go | 11 ++++++++++- 2 files changed, 21 insertions(+), 6 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index c77fe37f8..41ccae0c4 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -75,6 +75,10 @@ type MergePlanOptions struct { // planner’s predicted sizes. MaxSegmentSize int64 + // The growth factor for each tier in a staircase of idealized + // segments computed by CalcBudget(). + TierGrowth float64 + // The number of segments in any resulting MergeTask. e.g., // len(result.Tasks[ * ].Segments) == SegmentsPerMergeTask. 
SegmentsPerMergeTask int @@ -115,6 +119,7 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { var DefaultMergePlanOptions = MergePlanOptions{ MaxSegmentsPerTier: 10, MaxSegmentSize: 5000000, + TierGrowth: 10.0, SegmentsPerMergeTask: 10, FloorSegmentSize: 2000, ReclaimDeletesWeight: 2.0, @@ -212,7 +217,8 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { } // Compute the number of segments that would be needed to cover the -// totalSize, by climbing up a logarithmic staircase of segment tiers. +// totalSize, by climbing up a logarithmically growing staircase of +// segment tiers. func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) ( budgetNumSegments int) { tierSize := firstTierSize @@ -225,9 +231,9 @@ func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) ( maxSegmentsPerTier = 1 } - segmentsPerMergeTask := int64(o.SegmentsPerMergeTask) - if segmentsPerMergeTask < 2 { - segmentsPerMergeTask = 2 + tierGrowth := o.TierGrowth + if tierGrowth < 1.0 { + tierGrowth = 1.0 } for totalSize > 0 { @@ -239,7 +245,7 @@ func CalcBudget(totalSize int64, firstTierSize int64, o *MergePlanOptions) ( budgetNumSegments += maxSegmentsPerTier totalSize -= int64(maxSegmentsPerTier) * tierSize - tierSize *= segmentsPerMergeTask + tierSize = int64(float64(tierSize) * tierGrowth) } return budgetNumSegments diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index ca4d80bd1..4db8eb1e8 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -98,6 +98,7 @@ func TestSimplePlan(t *testing.T) { &MergePlanOptions{ MaxSegmentsPerTier: 1, MaxSegmentSize: 1000, + TierGrowth: 2.0, SegmentsPerMergeTask: 2, FloorSegmentSize: 1, }, @@ -156,11 +157,12 @@ func TestCalcBudget(t *testing.T) { }{ {0, 0, MergePlanOptions{}, 0}, {1, 0, MergePlanOptions{}, 1}, - {9, 0, MergePlanOptions{}, 4}, + {9, 0, MergePlanOptions{}, 9}, {1, 
1, MergePlanOptions{ MaxSegmentsPerTier: 1, MaxSegmentSize: 1000, + TierGrowth: 2.0, SegmentsPerMergeTask: 2, FloorSegmentSize: 1, }, @@ -170,6 +172,7 @@ func TestCalcBudget(t *testing.T) { MergePlanOptions{ MaxSegmentsPerTier: 1, MaxSegmentSize: 1000, + TierGrowth: 2.0, SegmentsPerMergeTask: 2, FloorSegmentSize: 1, }, @@ -179,6 +182,7 @@ func TestCalcBudget(t *testing.T) { MergePlanOptions{ MaxSegmentsPerTier: 2, MaxSegmentSize: 1000, + TierGrowth: 2.0, SegmentsPerMergeTask: 2, FloorSegmentSize: 1, }, @@ -201,6 +205,7 @@ func TestInsert1SameSizedSegmentBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, MaxSegmentsPerTier: 3, + TierGrowth: 3.0, SegmentsPerMergeTask: 3, } @@ -226,6 +231,7 @@ func TestInsertManySameSizedSegmentsBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, MaxSegmentsPerTier: 3, + TierGrowth: 3.0, SegmentsPerMergeTask: 3, } @@ -253,6 +259,7 @@ func TestInsertManySameSizedSegmentsWithDeletionsBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, MaxSegmentsPerTier: 3, + TierGrowth: 3.0, SegmentsPerMergeTask: 3, } @@ -290,6 +297,7 @@ func TestInsertManyDifferentSizedSegmentsBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, MaxSegmentsPerTier: 3, + TierGrowth: 3.0, SegmentsPerMergeTask: 3, } @@ -317,6 +325,7 @@ func TestManySameSizedSegmentsWithDeletesBetweenMerges(t *testing.T) { o := &MergePlanOptions{ MaxSegmentSize: 1000, MaxSegmentsPerTier: 3, + TierGrowth: 3.0, SegmentsPerMergeTask: 3, } From ecbb3d2df4a573988e20255ef64651a1bf602ad6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 16 Dec 2017 21:21:43 -0800 Subject: [PATCH 084/728] scorch handles non-updating batches better This commit improves handling when an incoming batch has internal-data updates only and no doc updates. In this case, a nil segment instead of an empty segment instance is used in the segmentIntroduction. 
The segmentIntroduction, that is, might now hold only internal-data updates only. To handle synchronous persistence, a new field that's a slice of persisted notification channels is added to the IndexSnapshot struct, which the persister goroutine will close as each IndexSnapshot is persisted. Also, as part of this change, instead of checking the unsafeBatch flag in several places, we instead check for non-nil'ness of these persisted chan's. --- index/scorch/introducer.go | 51 ++++++++++++++++---------------- index/scorch/persister.go | 7 +++-- index/scorch/scorch.go | 8 ++--- index/scorch/snapshot_index.go | 2 ++ index/scorch/snapshot_segment.go | 3 +- 5 files changed, 37 insertions(+), 34 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index a3a197b08..ef7eaab68 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -66,12 +66,14 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // acquire lock s.rootLock.Lock() - // prepare new index snapshot, with curr size + 1 + nsegs := len(s.root.segment) + + // prepare new index snapshot newSnapshot := &IndexSnapshot{ parent: s, - segment: make([]*SegmentSnapshot, len(s.root.segment)+1), - offsets: make([]uint64, len(s.root.segment)+1), - internal: make(map[string][]byte, len(s.root.segment)), + segment: make([]*SegmentSnapshot, nsegs, nsegs+1), + offsets: make([]uint64, nsegs, nsegs+1), + internal: make(map[string][]byte, len(s.root.internal)), epoch: s.nextSnapshotEpoch, refs: 1, } @@ -95,7 +97,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newSnapshot.segment[i] = &SegmentSnapshot{ id: s.root.segment[i].id, segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, + persisted: s.root.segment[i].persisted, cachedDocs: s.root.segment[i].cachedDocs, } s.root.segment[i].segment.AddRef() @@ -110,18 +112,22 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { running += 
s.root.segment[i].Count() } - // put new segment at end - newSnapshot.segment[len(s.root.segment)] = &SegmentSnapshot{ - id: next.id, - segment: next.data, // Take ownership of next.data's ref-count. - cachedDocs: &cachedDocs{cache: nil}, - } - newSnapshot.offsets[len(s.root.segment)] = running - if !s.unsafeBatch { - newSnapshot.segment[len(s.root.segment)].notify = append( - newSnapshot.segment[len(s.root.segment)].notify, - next.persisted, - ) + // append new segment, if any, to end of the new index snapshot + if next.data != nil { + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: next.id, + segment: next.data, // take ownership of next.data's ref-count + cachedDocs: &cachedDocs{cache: nil}, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + if next.persisted != nil { + newSnapshot.segment[nsegs].persisted = + append(newSnapshot.segment[nsegs].persisted, next.persisted) + } + } else { // new segment might be nil when it's an internal data update only + if next.persisted != nil { + newSnapshot.persisted = append(newSnapshot.persisted, next.persisted) + } } // copy old values for key, oldVal := range s.root.internal { @@ -161,7 +167,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { parent: s, segment: make([]*SegmentSnapshot, 0, newSize), offsets: make([]uint64, 0, newSize), - internal: make(map[string][]byte, len(s.root.segment)), + internal: s.root.internal, epoch: s.nextSnapshotEpoch, refs: 1, } @@ -193,8 +199,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: s.root.segment[i].id, segment: s.root.segment[i].segment, - notify: s.root.segment[i].notify, deleted: s.root.segment[i].deleted, + persisted: s.root.segment[i].persisted, cachedDocs: s.root.segment[i].cachedDocs, }) s.root.segment[i].segment.AddRef() @@ -206,17 +212,12 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // put new segment at end 
newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: nextMerge.id, - segment: nextMerge.new, // Take ownership for nextMerge.new's ref-count. + segment: nextMerge.new, // take ownership for nextMerge.new's ref-count deleted: newSegmentDeleted, cachedDocs: &cachedDocs{cache: nil}, }) newSnapshot.offsets = append(newSnapshot.offsets, running) - // copy old values - for key, oldVal := range s.root.internal { - newSnapshot.internal[key] = oldVal - } - // swap in new segment rootPrev := s.root s.root = newSnapshot diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 73a0bd9f4..05f336642 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -222,7 +222,10 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } // get write lock and update the current snapshot with disk-based versions - var notifications []chan error + snapshot.m.Lock() + notifications := snapshot.persisted + snapshot.persisted = nil + snapshot.m.Unlock() s.rootLock.Lock() newIndexSnapshot := &IndexSnapshot{ @@ -244,7 +247,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } newIndexSnapshot.segment[i] = newSegmentSnapshot // add the old segment snapshots notifications to the list - for _, notification := range segmentSnapshot.notify { + for _, notification := range segmentSnapshot.persisted { notifications = append(notifications, notification) } } else { diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 585904979..13dd32d33 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -215,11 +215,9 @@ func (s *Scorch) Batch(batch *index.Batch) error { // wait for analysis result analysisResults := make([]*index.AnalysisResult, int(numUpdates)) - // newRowsMap := make(map[string][]index.IndexRow) var itemsDeQueued uint64 for itemsDeQueued < numUpdates { result := <-resultChan - //newRowsMap[result.DocID] = result.Rows analysisResults[itemsDeQueued] = result itemsDeQueued++ } @@ -230,12 
+228,10 @@ func (s *Scorch) Batch(batch *index.Batch) error { var newSegment segment.Segment if len(analysisResults) > 0 { newSegment = mem.NewFromAnalyzedDocs(analysisResults) - } else { - newSegment = mem.New() } err := s.prepareSegment(newSegment, ids, batch.InternalOps) - if err != nil { + if err != nil && newSegment != nil { _ = newSegment.Close() } return err @@ -278,7 +274,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, return err } - if !s.unsafeBatch { + if introduction.persisted != nil { err = <-introduction.persisted } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7f5f41032..9b03a0e73 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -48,6 +48,8 @@ type IndexSnapshot struct { m sync.Mutex // Protects the fields that follow. refs int64 + + persisted []chan error } func (i *IndexSnapshot) AddRef() { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 9b22eaedd..fd15ed7fb 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -52,7 +52,8 @@ type SegmentSnapshot struct { segment segment.Segment deleted *roaring.Bitmap - notify []chan error + persisted []chan error // closed when the segment is persisted + cachedDocs *cachedDocs } From 34f5e2175fcd7a1692c6e483614f25d97a0f3199 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 17 Dec 2017 08:23:00 -0800 Subject: [PATCH 085/728] scorch fix persister for lost notifications on no-data batches With the previous commit, there can be a scenario where batches that had internal-updates-only can be rapidly introduced by the app, but the persisted notifications on only the very last IndexSnapshot would be fired. The persisted notifications on the in-between batches might be missed. 
The solution was to track the persisted notification channels at a higher Scorch struct level, instead of tracking the persisted channels at the IndexSnapshot and SegmentSnapshot levels. Also, the persister double-check looping was simplified, which avoids a race where an introducer might incorrectly not notify the persister. --- index/scorch/introducer.go | 46 ++++++----- index/scorch/persister.go | 133 ++++++++++++++----------------- index/scorch/scorch.go | 11 +-- index/scorch/snapshot_index.go | 2 - index/scorch/snapshot_segment.go | 2 - 5 files changed, 94 insertions(+), 100 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ef7eaab68..5cb43b9c0 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -32,16 +32,21 @@ type segmentIntroduction struct { persisted chan error } +type epochWatcher struct { + epoch uint64 + notifyCh notificationChan +} + func (s *Scorch) mainLoop() { - var notify notificationChan + var epochWatchers []*epochWatcher OUTER: for { select { case <-s.closeCh: break OUTER - case notify = <-s.introducerNotifier: - continue + case epochWatcher := <-s.introducerNotifier: + epochWatchers = append(epochWatchers, epochWatcher) case nextMerge := <-s.merges: s.introduceMerge(nextMerge) @@ -52,11 +57,22 @@ OUTER: continue OUTER } } - // notify persister - if notify != nil { - close(notify) - notify = nil + + var epochCurr uint64 + s.rootLock.RLock() + if s.root != nil { + epochCurr = s.root.epoch + } + s.rootLock.RUnlock() + var epochWatchersNext []*epochWatcher + for _, w := range epochWatchers { + if w.epoch < epochCurr { + close(w.notifyCh) + } else { + epochWatchersNext = append(epochWatchersNext, w) + } } + epochWatchers = epochWatchersNext } s.asyncTasks.Done() @@ -97,10 +113,10 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newSnapshot.segment[i] = &SegmentSnapshot{ id: s.root.segment[i].id, segment: s.root.segment[i].segment, - persisted: 
s.root.segment[i].persisted, cachedDocs: s.root.segment[i].cachedDocs, } s.root.segment[i].segment.AddRef() + // apply new obsoletions if s.root.segment[i].deleted == nil { newSnapshot.segment[i].deleted = delta @@ -120,14 +136,6 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { cachedDocs: &cachedDocs{cache: nil}, }) newSnapshot.offsets = append(newSnapshot.offsets, running) - if next.persisted != nil { - newSnapshot.segment[nsegs].persisted = - append(newSnapshot.segment[nsegs].persisted, next.persisted) - } - } else { // new segment might be nil when it's an internal data update only - if next.persisted != nil { - newSnapshot.persisted = append(newSnapshot.persisted, next.persisted) - } } // copy old values for key, oldVal := range s.root.internal { @@ -141,7 +149,10 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { delete(newSnapshot.internal, key) } } - // swap in new segment + if next.persisted != nil { + s.rootPersisted = append(s.rootPersisted, next.persisted) + } + // swap in new index snapshot rootPrev := s.root s.root = newSnapshot // release lock @@ -200,7 +211,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { id: s.root.segment[i].id, segment: s.root.segment[i].segment, deleted: s.root.segment[i].deleted, - persisted: s.root.segment[i].persisted, cachedDocs: s.root.segment[i].cachedDocs, }) s.root.segment[i].segment.AddRef() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 05f336642..969b449ce 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -35,82 +35,85 @@ import ( type notificationChan chan struct{} func (s *Scorch) persisterLoop() { - var notify notificationChan + defer s.asyncTasks.Done() + + var notifyChs []notificationChan var lastPersistedEpoch uint64 OUTER: for { select { case <-s.closeCh: break OUTER - case notify = <-s.persisterNotifier: - + case notifyCh := <-s.persisterNotifier: + notifyChs = append(notifyChs, notifyCh) default: - // 
check to see if there is a new snapshot to persist - s.rootLock.RLock() - ourSnapshot := s.root - ourSnapshot.AddRef() - s.rootLock.RUnlock() - - if ourSnapshot.epoch != lastPersistedEpoch { - // lets get started - err := s.persistSnapshot(ourSnapshot) - if err != nil { - log.Printf("got err persisting snapshot: %v", err) - _ = ourSnapshot.DecRef() - continue OUTER - } - lastPersistedEpoch = ourSnapshot.epoch - if notify != nil { - close(notify) - notify = nil - } - } - _ = ourSnapshot.DecRef() - - // tell the introducer we're waiting for changes - // first make a notification chan - notifyUs := make(notificationChan) + } - // give it to the introducer - select { - case <-s.closeCh: - break OUTER - case s.introducerNotifier <- notifyUs: - } + var ourSnapshot *IndexSnapshot + var ourPersisted []chan error - // check again - s.rootLock.RLock() + // check to see if there is a new snapshot to persist + s.rootLock.Lock() + if s.root != nil && s.root.epoch > lastPersistedEpoch { ourSnapshot = s.root ourSnapshot.AddRef() - s.rootLock.RUnlock() + ourPersisted = s.rootPersisted + s.rootPersisted = nil + } + s.rootLock.Unlock() - if ourSnapshot.epoch != lastPersistedEpoch { - // lets get started - err := s.persistSnapshot(ourSnapshot) + if ourSnapshot != nil { + err := s.persistSnapshot(ourSnapshot) + for _, ch := range ourPersisted { if err != nil { - log.Printf("got err persisting snapshot: %v", err) - _ = ourSnapshot.DecRef() - continue OUTER - } - lastPersistedEpoch = ourSnapshot.epoch - if notify != nil { - close(notify) - notify = nil + ch <- err } + close(ch) + } + if err != nil { + log.Printf("got err persisting snapshot: %v", err) + _ = ourSnapshot.DecRef() + continue OUTER } + lastPersistedEpoch = ourSnapshot.epoch + for _, notifyCh := range notifyChs { + close(notifyCh) + } + notifyChs = nil _ = ourSnapshot.DecRef() - // now wait for it (but also detect close) - select { - case <-s.closeCh: - break OUTER - case <-notifyUs: - // woken up, next loop should pick up 
work + changed := false + s.rootLock.RLock() + if s.root != nil && s.root.epoch != lastPersistedEpoch { + changed = true } + s.rootLock.RUnlock() + if changed { + continue OUTER + } + } + + // tell the introducer we're waiting for changes + w := &epochWatcher{ + epoch: lastPersistedEpoch, + notifyCh: make(notificationChan, 1), + } + + select { + case <-s.closeCh: + break OUTER + case s.introducerNotifier <- w: + } + + s.removeOldData() // might as well cleanup while waiting + + select { + case <-s.closeCh: + break OUTER + case <-w.notifyCh: + // woken up, next loop should pick up work } - s.removeOldData() } - s.asyncTasks.Done() } func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { @@ -221,12 +224,6 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } } - // get write lock and update the current snapshot with disk-based versions - snapshot.m.Lock() - notifications := snapshot.persisted - snapshot.persisted = nil - snapshot.m.Unlock() - s.rootLock.Lock() newIndexSnapshot := &IndexSnapshot{ parent: s, @@ -240,16 +237,12 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { // see if this segment has been replaced if replacement, ok := newSegments[segmentSnapshot.id]; ok { newSegmentSnapshot := &SegmentSnapshot{ + id: segmentSnapshot.id, segment: replacement, deleted: segmentSnapshot.deleted, - id: segmentSnapshot.id, cachedDocs: segmentSnapshot.cachedDocs, } newIndexSnapshot.segment[i] = newSegmentSnapshot - // add the old segment snapshots notifications to the list - for _, notification := range segmentSnapshot.persisted { - notifications = append(notifications, notification) - } } else { newIndexSnapshot.segment[i] = s.root.segment[i] newIndexSnapshot.segment[i].segment.AddRef() @@ -270,12 +263,6 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { _ = rootPrev.DecRef() } - // now that we've given up the lock, notify everyone that we've safely - // persisted their data - for _, notification := range 
notifications { - close(notification) - } - return nil } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 13dd32d33..5bba9bf98 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -49,13 +49,14 @@ type Scorch struct { unsafeBatch bool - rootLock sync.RWMutex - root *IndexSnapshot // holds 1 ref-count on the root + rootLock sync.RWMutex + root *IndexSnapshot // holds 1 ref-count on the root + rootPersisted []chan error // closed when root is persisted closeCh chan struct{} introductions chan *segmentIntroduction merges chan *segmentMerge - introducerNotifier chan notificationChan + introducerNotifier chan *epochWatcher persisterNotifier chan notificationChan rootBolt *bolt.DB asyncTasks sync.WaitGroup @@ -127,7 +128,7 @@ func (s *Scorch) Open() error { s.introductions = make(chan *segmentIntroduction) s.merges = make(chan *segmentMerge) - s.introducerNotifier = make(chan notificationChan) + s.introducerNotifier = make(chan *epochWatcher, 1) s.persisterNotifier = make(chan notificationChan) if !s.readOnly && s.path != "" { @@ -251,7 +252,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, } if !s.unsafeBatch { - introduction.persisted = make(chan error) + introduction.persisted = make(chan error, 1) } // get read lock, to optimistically prepare obsoleted info diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 9b03a0e73..7f5f41032 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -48,8 +48,6 @@ type IndexSnapshot struct { m sync.Mutex // Protects the fields that follow. 
refs int64 - - persisted []chan error } func (i *IndexSnapshot) AddRef() { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index fd15ed7fb..c543af826 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -52,8 +52,6 @@ type SegmentSnapshot struct { segment segment.Segment deleted *roaring.Bitmap - persisted []chan error // closed when the segment is persisted - cachedDocs *cachedDocs } From 20fe70770a9a43119c64319b7b59ddbc0d0bf7fd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 17 Dec 2017 12:39:15 -0800 Subject: [PATCH 086/728] scorch added some tests on # of expected segments --- index/scorch/scorch_test.go | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index d347ecec0..42e8df50a 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -587,6 +587,10 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } + if len(indexReader.(*Reader).root.segment) != 0 { + t.Errorf("expected 0 segments") + } + // get something that doesn't exist yet val, err := indexReader.GetInternal([]byte("key")) if err != nil { @@ -612,6 +616,10 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } + if len(indexReader2.(*Reader).root.segment) != 0 { + t.Errorf("expected 0 segments") + } + // get val, err = indexReader2.GetInternal([]byte("key")) if err != nil { @@ -637,6 +645,10 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } + if len(indexReader3.(*Reader).root.segment) != 0 { + t.Errorf("expected 0 segments") + } + // get again val, err = indexReader3.GetInternal([]byte("key")) if err != nil { @@ -726,6 +738,11 @@ func TestIndexBatch(t *testing.T) { } }() + numSegments := len(indexReader.(*Reader).root.segment) + if numSegments <= 0 { + t.Errorf("expected some segments, got: %d", numSegments) + } + docCount, err := indexReader.DocCount() if err != nil { t.Fatal(err) From 
867bb2c0314205d922dba91c452bb1e7fe61456d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 18 Dec 2017 11:33:19 -0800 Subject: [PATCH 087/728] scorch mergeplan explicitly weeds out empty segments Rather than waiting on scoring to weed out empty segments, this commit does the weeding out of empty segments explicitly and up front. --- index/scorch/mergeplan/merge_plan.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index 41ccae0c4..0afc3ce5c 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -173,6 +173,17 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { rv := &MergePlan{} + var empties []Segment + for _, eligible := range eligibles { + if eligible.LiveSize() <= 0 { + empties = append(empties, eligible) + } + } + if len(empties) > 0 { + rv.Tasks = append(rv.Tasks, &MergeTask{Segments: empties}) + eligibles = removeSegments(eligibles, empties) + } + // While we’re over budget, keep looping, which might produce // another MergeTask. for len(eligibles) > budgetNumSegments { From 730d906a50487535e90b566eeb78b93b6f74ee68 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 15 Dec 2017 12:32:37 -0800 Subject: [PATCH 088/728] scorch reuses Posting instance in PostingsIterator.Next() With this change, there are no more memory allocations in the calls to PostingsIterator.Next() in the micro benchmarks of bleve-query. On a dev macbook, on an index of 50K wikipedia docs, using high frequency search of "text:date"... 
400 qps - upsidedown/moss 565 qps - scorch before 680 qps - scorch after --- index/scorch/segment/segment.go | 4 ++++ index/scorch/segment/zap/posting.go | 10 ++++++---- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 14b97ec80..2c91c0ef8 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -64,6 +64,10 @@ type PostingsList interface { } type PostingsIterator interface { + // The caller is responsible for copying whatever it needs from + // the returned Posting instance before calling Next(), as some + // implementations may return a shared instance to reduce memory + // allocations. Next() (Posting, error) } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 68db04299..e6b76bf90 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -115,6 +115,8 @@ type PostingsIterator struct { locChunkStart uint64 locBitmap *roaring.Bitmap + + next Posting } func (i *PostingsIterator) loadChunk(chunk int) error { @@ -266,10 +268,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { } } - rv := &Posting{ - iterator: i, - docNum: uint64(n), - } + i.next = Posting{} // clear the struct. + rv := &i.next + rv.iterator = i + rv.docNum = uint64(n) var err error var normBits uint64 From f6b506134b0b2a62f9b12a1ea2200791284ea06d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 10:49:57 -0800 Subject: [PATCH 089/728] import couchbase/vellum instead of couchbaselabs/vellum Also, scrubbed an old couchbaselabs/moss reference in comments. Also, go fmt. 
--- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/cmd/zap/cmd/dict.go | 2 +- index/scorch/segment/zap/cmd/zap/cmd/explore.go | 2 +- index/scorch/segment/zap/dict.go | 4 ++-- index/scorch/segment/zap/merge.go | 2 +- index/scorch/segment/zap/segment.go | 2 +- index/store/moss/lower.go | 2 +- index/store/moss/store.go | 2 +- search/search.go | 10 +++++----- 9 files changed, 14 insertions(+), 14 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 60906d334..8bd00601e 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -23,7 +23,7 @@ import ( "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment/mem" - "github.com/couchbaselabs/vellum" + "github.com/couchbase/vellum" "github.com/golang/snappy" ) diff --git a/index/scorch/segment/zap/cmd/zap/cmd/dict.go b/index/scorch/segment/zap/cmd/zap/cmd/dict.go index 74e59e902..fa8b3277e 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/dict.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/dict.go @@ -18,7 +18,7 @@ import ( "encoding/binary" "fmt" - "github.com/couchbaselabs/vellum" + "github.com/couchbase/vellum" "github.com/spf13/cobra" ) diff --git a/index/scorch/segment/zap/cmd/zap/cmd/explore.go b/index/scorch/segment/zap/cmd/zap/cmd/explore.go index 42ab82732..012a829fe 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/explore.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/explore.go @@ -19,7 +19,7 @@ import ( "fmt" "log" - "github.com/couchbaselabs/vellum" + "github.com/couchbase/vellum" "github.com/spf13/cobra" ) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 5d3c160ba..3221d0616 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -21,8 +21,8 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbaselabs/vellum" - 
"github.com/couchbaselabs/vellum/regexp" + "github.com/couchbase/vellum" + "github.com/couchbase/vellum/regexp" ) // Dictionary is the zap representation of the term dictionary diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 481d9272a..842371e5f 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -24,7 +24,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" - "github.com/couchbaselabs/vellum" + "github.com/couchbase/vellum" "github.com/golang/snappy" ) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 65c62ac9c..62c147a2c 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -25,7 +25,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbaselabs/vellum" + "github.com/couchbase/vellum" mmap "github.com/edsrzf/mmap-go" "github.com/golang/snappy" ) diff --git a/index/store/moss/lower.go b/index/store/moss/lower.go index 2aff2aea3..1133f95f6 100644 --- a/index/store/moss/lower.go +++ b/index/store/moss/lower.go @@ -13,7 +13,7 @@ // limitations under the License. // Package moss provides a KVStore implementation based on the -// github.com/couchbaselabs/moss library. +// github.com/couchbase/moss library. package moss diff --git a/index/store/moss/store.go b/index/store/moss/store.go index a7aa4d417..89ea553cf 100644 --- a/index/store/moss/store.go +++ b/index/store/moss/store.go @@ -13,7 +13,7 @@ // limitations under the License. // Package moss provides a KVStore implementation based on the -// github.com/couchbaselabs/moss library. +// github.com/couchbase/moss library. 
package moss diff --git a/search/search.go b/search/search.go index cbbcfbfd6..f9a92783b 100644 --- a/search/search.go +++ b/search/search.go @@ -37,12 +37,12 @@ func (ap ArrayPositions) Equals(other ArrayPositions) bool { type Location struct { // Pos is the position of the term within the field, starting at 1 - Pos uint64 `json:"pos"` - + Pos uint64 `json:"pos"` + // Start and End are the byte offsets of the term in the field - Start uint64 `json:"start"` - End uint64 `json:"end"` - + Start uint64 `json:"start"` + End uint64 `json:"end"` + // ArrayPositions contains the positions of the term within any elements. ArrayPositions ArrayPositions `json:"array_positions"` } From 679f1ce9c3cb0f9ccd95bcb11e3253fa0e66a983 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 15 Dec 2017 14:49:33 -0800 Subject: [PATCH 090/728] scorch APIs to support rollback - PreviousPersistedSnapshot - SnapshotRevert + unit test --- index/scorch/introducer.go | 58 +++++++++++++ index/scorch/scorch.go | 2 + index/scorch/snapshot_rollback.go | 102 ++++++++++++++++++++++ index/scorch/snapshot_rollback_test.go | 113 +++++++++++++++++++++++++ 4 files changed, 275 insertions(+) create mode 100644 index/scorch/snapshot_rollback.go create mode 100644 index/scorch/snapshot_rollback_test.go diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 5cb43b9c0..41ce4ca1f 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -37,6 +37,11 @@ type epochWatcher struct { notifyCh notificationChan } +type snapshotReversion struct { + snapshot *IndexSnapshot + applied chan error +} + func (s *Scorch) mainLoop() { var epochWatchers []*epochWatcher OUTER: @@ -56,6 +61,12 @@ OUTER: if err != nil { continue OUTER } + + case revertTo := <-s.revertToSnapshots: + err := s.revertToSnapshot(revertTo) + if err != nil { + continue OUTER + } } var epochCurr uint64 @@ -241,3 +252,50 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // notify merger we incorporated 
this close(nextMerge.notify) } + +func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { + if revertTo.snapshot == nil { + err := fmt.Errorf("Cannot revert to a nil snapshot") + revertTo.applied <- err + return err + } + + // acquire lock + s.rootLock.Lock() + + // prepare a new index snapshot, based on next snapshot + newSnapshot := &IndexSnapshot{ + parent: s, + segment: make([]*SegmentSnapshot, len(revertTo.snapshot.segment)), + offsets: revertTo.snapshot.offsets, + internal: revertTo.snapshot.internal, + epoch: s.nextSnapshotEpoch, + refs: 1, + } + s.nextSnapshotEpoch++ + + // iterate through segments + for i, segmentSnapshot := range revertTo.snapshot.segment { + newSnapshot.segment[i] = &SegmentSnapshot{ + id: segmentSnapshot.id, + segment: segmentSnapshot.segment, + deleted: segmentSnapshot.deleted, + cachedDocs: segmentSnapshot.cachedDocs, + } + segmentSnapshot.segment.AddRef() + } + + // swap in new snapshot + rootPrev := s.root + s.root = newSnapshot + // release lock + s.rootLock.Unlock() + + if rootPrev != nil { + _ = rootPrev.DecRef() + } + + close(revertTo.applied) + + return nil +} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5bba9bf98..9484920f3 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -57,6 +57,7 @@ type Scorch struct { introductions chan *segmentIntroduction merges chan *segmentMerge introducerNotifier chan *epochWatcher + revertToSnapshots chan *snapshotReversion persisterNotifier chan notificationChan rootBolt *bolt.DB asyncTasks sync.WaitGroup @@ -129,6 +130,7 @@ func (s *Scorch) Open() error { s.introductions = make(chan *segmentIntroduction) s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan *epochWatcher, 1) + s.revertToSnapshots = make(chan *snapshotReversion) s.persisterNotifier = make(chan notificationChan) if !s.readOnly && s.path != "" { diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go new file mode 100644 index 
000000000..e31308a20 --- /dev/null +++ b/index/scorch/snapshot_rollback.go @@ -0,0 +1,102 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "bytes" + "log" + + "github.com/blevesearch/bleve/index/scorch/segment" +) + +// PreviousPersistedSnapshot returns the next older, previous +// IndexSnapshot based on the provided IndexSnapshot. If the provided +// argument is nil, the most recently persisted IndexSnapshot is returned. +// This API allows the application to walk backwards into the history +// of a store to previous points in time. A nil return value indicates +// that no previous snapshots are available. +func (s *Scorch) PreviousPersistedSnapshot(is *IndexSnapshot) (*IndexSnapshot, error) { + if s.rootBolt == nil { + return nil, nil + } + + // start a read-only transaction + tx, err := s.rootBolt.Begin(false) + if err != nil { + return nil, err + } + + // Read-only bolt transactions to be rolled back. 
+ defer func() { + _ = tx.Rollback() + }() + + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil, nil + } + + pos := []byte(nil) + + if is != nil { + pos = segment.EncodeUvarintAscending(nil, is.epoch) + } + + c := snapshots.Cursor() + for k, _ := c.Last(); k != nil; k, _ = c.Prev() { + if pos == nil || bytes.Compare(k, pos) < 0 { + _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) + if err != nil { + log.Printf("PreviousPersistedSnapshot:"+ + " unable to parse segment epoch %x, continuing", k) + continue + } + + snapshot := snapshots.Bucket(k) + if snapshot == nil { + log.Printf("PreviousPersistedSnapshot:"+ + " snapshot key, but bucket missing %x, continuing", k) + continue + } + + indexSnapshot, err := s.loadSnapshot(snapshot) + if err != nil { + log.Printf("PreviousPersistedSnapshot:"+ + " unable to load snapshot, %v, continuing", err) + continue + } + + indexSnapshot.epoch = snapshotEpoch + return indexSnapshot, nil + } + } + + return nil, nil +} + +// SnapshotRevert atomically brings the store back to the point in time +// as represented by the revertTo IndexSnapshot. SnapshotRevert() should +// only be passed an IndexSnapshot that came from the same store. +func (s *Scorch) SnapshotRevert(revertTo *IndexSnapshot) error { + revert := &snapshotReversion{ + snapshot: revertTo, + applied: make(chan error), + } + + s.revertToSnapshots <- revert + + // block until this IndexSnapshot is applied + return <-revert.applied +} diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go new file mode 100644 index 000000000..3054c9bb8 --- /dev/null +++ b/index/scorch/snapshot_rollback_test.go @@ -0,0 +1,113 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestIndexRollback(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // create 2 docs + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test1"))) + err = idx.Update(doc) + if err != nil { + t.Error(err) + } + + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2"))) + err = idx.Update(doc) + if err != nil { + t.Error(err) + } + + // create a batch, insert new doc, update existing doc, delete existing doc + batch := index.NewBatch() + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2updated"))) + batch.Update(doc) + batch.Delete("1") + + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + sh, ok := idx.(*Scorch) + if !ok { + t.Errorf("Not a scorch index?") + } + + // Get Last persisted snapshot + ss, err := sh.PreviousPersistedSnapshot(nil) + if err != nil { + t.Error(err) + 
} + + // Retrieve the snapshot earlier + prev, err := sh.PreviousPersistedSnapshot(ss) + if err != nil { + t.Error(err) + } + + err = sh.SnapshotRevert(prev) + if err != nil { + t.Error(err) + } + + newRoot := sh.root + if newRoot != nil && prev != nil { + if newRoot.epoch <= prev.epoch { + t.Errorf("Unexpected epoch, %v <= %v", newRoot.epoch, prev.epoch) + } + } else { + if prev == nil { + t.Errorf("The last persisted snapshot before the revert was nil!") + } + if newRoot == nil { + t.Errorf("The new root has been set to nil?") + } + } +} From d0e4f850268ac69894a1e16d45512b2fa7fd2ea0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 13:37:04 -0800 Subject: [PATCH 091/728] scorch avoid extra clone by using roaring.AndNot(x, y) --- index/scorch/segment/zap/posting.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index e6b76bf90..1ef85eef2 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -72,8 +72,7 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { rv.all = p.postings.Iterator() if p.except != nil { - allExcept := p.postings.Clone() - allExcept.AndNot(p.except) + allExcept := roaring.AndNot(p.postings, p.except) rv.actual = allExcept.Iterator() } else { rv.actual = p.postings.Iterator() From 142ccdfaecc3631e4ff51ed013bf7d5671264d85 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 13:46:13 -0800 Subject: [PATCH 092/728] scorch remove leftover doc comment I'm suspecting that Marty's editor is more exciting than mine. 
:-) --- index/scorch/snapshot_index_tfr.go | 1 - 1 file changed, 1 deletion(-) diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index d6c8dcd13..25cc0bd07 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -83,7 +83,6 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } } -// Advance go fuck yourself editor func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // first make sure we aren't already pointing at the right thing, (due to way searchers work) if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { From a0556ad65b939cad24da881043f71f534411bee2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 16:41:56 -0800 Subject: [PATCH 093/728] scorch added more cases to TestIndexInsertThenDelete --- index/scorch/scorch_test.go | 93 +++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 42e8df50a..a9df3e9e2 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -275,6 +275,18 @@ func TestIndexInsertThenDelete(t *testing.T) { if docCount != expectedCount { t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) } + iid, err := reader.InternalID("1") + if err != nil || iid == nil { + t.Errorf("unexpected on doc id 1") + } + iid, err = reader.InternalID("2") + if err != nil || iid == nil { + t.Errorf("unexpected on doc id 2") + } + iid, err = reader.InternalID("3") + if err != nil || iid != nil { + t.Errorf("unexpected on doc id 3") + } err = reader.Close() if err != nil { t.Fatal(err) @@ -297,6 +309,73 @@ func TestIndexInsertThenDelete(t *testing.T) { if docCount != expectedCount { t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) } + storedDoc, err := reader.Document("1") + if err != nil { + 
t.Error(err) + } + if storedDoc != nil { + t.Errorf("expected nil for deleted stored doc #1, got %v", storedDoc) + } + storedDoc, err = reader.Document("2") + if err != nil { + t.Error(err) + } + if storedDoc == nil { + t.Errorf("expected stored doc for #2, got nil") + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + idx, err = NewScorch(Name, testConfig, analysisQueue) // reopen + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error reopening index: %v", err) + } + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + storedDoc, err = reader.Document("1") + if err != nil { + t.Error(err) + } + if storedDoc != nil { + t.Errorf("expected nil for deleted stored doc #1, got %v", storedDoc) + } + storedDoc, err = reader.Document("2") + if err != nil { + t.Error(err) + } + if storedDoc == nil { + t.Errorf("expected stored doc for #2, got nil") + } + iid, err = reader.InternalID("1") + if err != nil || iid != nil { + t.Errorf("unexpected on doc id 1") + } + iid, err = reader.InternalID("2") + if err != nil || iid == nil { + t.Errorf("unexpected on doc id 2, should exist") + } err = reader.Close() if err != nil { t.Fatal(err) @@ -319,6 +398,20 @@ func TestIndexInsertThenDelete(t *testing.T) { if docCount != expectedCount { t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) } + storedDoc, err = reader.Document("1") + if err != nil { + t.Error(err) + } + if storedDoc != nil { + t.Errorf("expected nil for deleted stored doc #1, got %v", storedDoc) + } + storedDoc, err = reader.Document("2") + if err != nil { + t.Error(err) + } + if storedDoc != nil { + t.Errorf("expected nil for deleted stored doc #2, got nil") + } err = 
reader.Close() if err != nil { t.Fatal(err) From 8f8333e01bb1345790fe54d2f2889397d4acf0b2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 17:44:25 -0800 Subject: [PATCH 094/728] scorch optimize zap Count() This proposed approach avoids building a temporary AndNot() bitmap, following the same kind of optimization used by mem segments. --- index/scorch/segment/zap/posting.go | 9 +++++++-- index/scorch/snapshot_segment.go | 1 + 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 1ef85eef2..1b7a0a587 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -85,10 +85,15 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { if p.postings != nil { + n := p.postings.GetCardinality() if p.except != nil { - return roaring.AndNot(p.postings, p.except).GetCardinality() + e := p.except.GetCardinality() + if e > n { + e = n + } + return n - e } - return p.postings.GetCardinality() + return n } return 0 } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index c543af826..f2bcfb065 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -32,6 +32,7 @@ type SegmentDictionarySnapshot struct { } func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { + // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? 
return s.d.PostingsList(term, s.s.deleted) } From dbc88cf6b3990a905ced2641d3f7e60fb7ea25d3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 19:15:19 -0800 Subject: [PATCH 095/728] scorch docNumberToBytes() checks cap(buf) before allocating With more pprof focusing (zooming in on a particular func), there were still some memory allocations showing up with docNumberToBytes() in micro benchmarks of bleve-query. On a dev macbook, on an index of 50K wikipedia docs, using search of relatively common "text:date"... 400 qps - upsidedown/moss 680 qps - scorch before 775 qps - scorch after --- index/scorch/snapshot_index.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7f5f41032..bb6a8be6b 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -367,7 +367,11 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, func docNumberToBytes(buf []byte, in uint64) []byte { if len(buf) != 8 { - buf = make([]byte, 8) + if cap(buf) >= 8 { + buf = buf[0:8] + } else { + buf = make([]byte, 8) + } } binary.BigEndian.PutUint64(buf, in) return buf From 1abbfadf0dca26a1dcfc0721fb1bdc57e0538af5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 19 Dec 2017 22:26:17 -0800 Subject: [PATCH 096/728] scorch simplify err check after vellum load --- index/scorch/segment/zap/segment.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 62c147a2c..e78ac392f 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -159,8 +159,8 @@ func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { return dict, err } -func (s *Segment) dictionary(field string) (*Dictionary, error) { - rv := &Dictionary{ +func (s *Segment) dictionary(field string) (rv *Dictionary, err error) { + rv = 
&Dictionary{ segment: s, field: field, } @@ -170,19 +170,15 @@ func (s *Segment) dictionary(field string) (*Dictionary, error) { rv.fieldID = rv.fieldID - 1 dictStart := s.fieldsOffsets[rv.fieldID] - if dictStart > 0 { // read the length of the vellum data vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64]) fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] if fstBytes != nil { - fst, err := vellum.Load(fstBytes) + rv.fst, err = vellum.Load(fstBytes) if err != nil { return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } - if err == nil { - rv.fst = fst - } } } From df6c8f4074f45a4c5375383cf0c2e444e8fba0b6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 20 Dec 2017 10:06:50 -0800 Subject: [PATCH 097/728] scorch added kvconfig unsafe_batch option Added an option to the kvconfig JSON, called "unsafe_batch" (bool). Default is false, so Batch() calls are synchronously persisted by default. Advanced users may want to unsafe, asynchronous persistence to tradeoff performance (mutations are queryable sooner) over safety. { "index_type": "scorch", "kvconfig": { "unsafe_batch": true } } This change replaces the previous kvstore=="moss" workaround. 
--- index/scorch/scorch.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 9484920f3..d72a8f886 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -81,9 +81,9 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i if ok { rv.readOnly = ro } - // hack for now to disable safe batches in FTS - if storeName == "moss" { - rv.unsafeBatch = true + ub, ok := config["unsafe_batch"].(bool) + if ok { + rv.unsafeBatch = ub } return rv, nil } From 04ac9d5b1ff31d7c16ead43c2909f0535ec72824 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 20 Dec 2017 14:43:08 -0800 Subject: [PATCH 098/728] scorch removeOldBoltSnapshots() deletes from correct bucket --- index/scorch/persister.go | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 969b449ce..4ad3df80d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -448,22 +448,25 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { if err != nil { return 0, err } - defer func() { - if err == nil { - err = s.rootBolt.Sync() - } - }() defer func() { if err == nil { err = tx.Commit() } else { _ = tx.Rollback() } + if err == nil { + err = s.rootBolt.Sync() + } }() + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return 0, nil + } + for _, epochToRemove := range epochsToRemove { k := segment.EncodeUvarintAscending(nil, epochToRemove) - err = tx.DeleteBucket(k) + err = snapshots.DeleteBucket(k) if err == bolt.ErrBucketNotFound { err = nil } From ea4eb7301bb237773a9ae2d7b8c5c6cd570499a0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 20 Dec 2017 14:51:10 -0800 Subject: [PATCH 099/728] scorch merger checks closeCh --- index/scorch/merge.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 
cc3af774a..35469d591 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -162,7 +162,11 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { } } for _, notification := range notifications { - <-notification + select { + case <-s.closeCh: + return nil + case <-notification: + } } return nil } From c1552555063bf69e8a1025dd5e7997ce4843303e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 20 Dec 2017 14:55:31 -0800 Subject: [PATCH 100/728] scorch optimize zap.Merge() to reuse some buffers --- index/scorch/segment/zap/merge.go | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 842371e5f..500830d03 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -41,7 +41,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return nil, err } - // bufer the output + // buffer the output br := bufio.NewWriter(f) // wrap it for counting (tracking offsets) @@ -128,6 +128,9 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, error) { + var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 + rv := make([]uint64, len(fieldsInv)) var vellumBuf bytes.Buffer @@ -208,7 +211,10 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) for _, loc := range locs { - args := make([]uint64, 0, 5+len(loc.ArrayPositions())) + if cap(bufLoc) < 5+len(loc.ArrayPositions()) { + bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) + } + args := bufLoc[0:0] args = append(args, uint64(fieldsMap[loc.Field()])) args = append(args, loc.Pos()) args = append(args, loc.Start()) @@ -250,7 +256,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } postingOffset := uint64(w.Count()) // write 
out the start of the term info - buf := make([]byte, binary.MaxVarintLen64) + buf := bufMaxVarintLen64 n := binary.PutUvarint(buf, freqOffset) _, err = w.Write(buf[:n]) if err != nil { @@ -295,7 +301,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, vellumData := vellumBuf.Bytes() // write out the length of the vellum data - buf := make([]byte, binary.MaxVarintLen64) + buf := bufMaxVarintLen64 // write out the number of chunks n := binary.PutUvarint(buf, uint64(len(vellumData))) _, err = w.Write(buf[:n]) From 67e0e5973b8cba53e7150adf29bfe54ed71db258 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 20 Dec 2017 15:18:22 -0800 Subject: [PATCH 101/728] scorch mergeStoredAndRemap() memory reuse In mergeStoredAndRemap(), instead of allocating new hashmaps for each document, this commit reuses some arrays that are indexed by fieldId. --- index/scorch/segment/zap/merge.go | 90 ++++++++++++++++--------------- 1 file changed, 47 insertions(+), 43 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 500830d03..5d845bbd1 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -333,6 +333,10 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, var metaBuf bytes.Buffer var data, compressed []byte + vals := make([][][]byte, len(fieldsInv)) + typs := make([][]byte, len(fieldsInv)) + poss := make([][][]uint64, len(fieldsInv)) + docNumOffsets := make([]uint64, newSegDocCount) // for each segment @@ -353,11 +357,13 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, } else { segNewDocNums = append(segNewDocNums, uint64(newDocNum)) // collect all the data - vals := make(map[uint16][][]byte) - typs := make(map[uint16][]byte) - poss := make(map[uint16][][]uint64) + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = typs[i][:0] + poss[i] = poss[i][:0] + } err := segment.VisitDocument(docNum, func(field string, typ 
byte, value []byte, pos []uint64) bool { - fieldID := fieldsMap[field] + fieldID := int(fieldsMap[field]) vals[fieldID] = append(vals[fieldID], value) typs[fieldID] = append(typs[fieldID], typ) poss[fieldID] = append(poss[fieldID], pos) @@ -370,51 +376,49 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // now walk the fields in order for fieldID := range fieldsInv { - if storedFieldValues, ok := vals[uint16(fieldID)]; ok { + storedFieldValues := vals[int(fieldID)] - // has stored values for this field - num := len(storedFieldValues) + // has stored values for this field + num := len(storedFieldValues) - // process each value - for i := 0; i < num; i++ { - // encode field - _, err2 := metaEncoder.PutU64(uint64(fieldID)) - if err2 != nil { - return 0, nil, err2 - } - // encode type - _, err2 = metaEncoder.PutU64(uint64(typs[uint16(fieldID)][i])) - if err2 != nil { - return 0, nil, err2 - } - // encode start offset - _, err2 = metaEncoder.PutU64(uint64(curr)) - if err2 != nil { - return 0, nil, err2 - } - // end len - _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) - if err2 != nil { - return 0, nil, err2 - } - // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(poss[uint16(fieldID)][i]))) + // process each value + for i := 0; i < num; i++ { + // encode field + _, err2 := metaEncoder.PutU64(uint64(fieldID)) + if err2 != nil { + return 0, nil, err2 + } + // encode type + _, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i])) + if err2 != nil { + return 0, nil, err2 + } + // encode start offset + _, err2 = metaEncoder.PutU64(uint64(curr)) + if err2 != nil { + return 0, nil, err2 + } + // end len + _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if err2 != nil { + return 0, nil, err2 + } + // encode number of array pos + _, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i]))) + if err2 != nil { + return 0, nil, err2 + } + // encode all array positions + for j := 0; j < 
len(poss[int(fieldID)][i]); j++ { + _, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j]) if err2 != nil { return 0, nil, err2 } - // encode all array positions - for j := 0; j < len(poss[uint16(fieldID)][i]); j++ { - _, err2 = metaEncoder.PutU64(poss[uint16(fieldID)][i][j]) - if err2 != nil { - return 0, nil, err2 - } - } - // append data - data = append(data, storedFieldValues[i]...) - // update curr - curr += len(storedFieldValues[i]) - } + // append data + data = append(data, storedFieldValues[i]...) + // update curr + curr += len(storedFieldValues[i]) } } From b3e41335e15c0327063313ea4229dc9fe134d175 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 21 Dec 2017 07:36:03 -0800 Subject: [PATCH 102/728] scorch compared to upsidedown/bolt using templated, generated searches This is somewhat like a simple, unit-test'ish version of testrunner's random query generator, where this does not have a dependency on an external elasticsearch server, and instead depends on functional correctness when comparing to upsidedown/bolt. --- test/versus_test.go | 435 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 435 insertions(+) create mode 100644 test/versus_test.go diff --git a/test/versus_test.go b/test/versus_test.go new file mode 100644 index 000000000..16abce078 --- /dev/null +++ b/test/versus_test.go @@ -0,0 +1,435 @@ +// Copyright (c) 2014 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package test + +import ( + "bytes" + "encoding/json" + "fmt" + "math" + "math/rand" + "os" + "reflect" + "strconv" + "strings" + "testing" + "text/template" + + "github.com/blevesearch/bleve" + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/index/store/boltdb" + "github.com/blevesearch/bleve/index/upsidedown" + "github.com/blevesearch/bleve/mapping" + "github.com/blevesearch/bleve/search" +) + +// Tests scorch indexer versus upsidedown/bolt indexer against various +// templated queries. Example usage from the bleve top-level directory... +// +// go test -v -run TestScorchVersusUpsideDownBolt ./test +// VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test +// +func TestScorchVersusUpsideDownBolt(t *testing.T) { + (&VersusTest{ + t: t, + NumDocs: 10000, + MaxWordsPerDoc: 20, + NumWords: 100, + BatchSize: 100, + NumAttemptsPerSearch: 1000, + }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil) +} + +func TestScorchVersusUpsideDownBoltSmallMNSAM(t *testing.T) { + (&VersusTest{ + t: t, + Focus: "must-not-same-as-must", + NumDocs: 5, + MaxWordsPerDoc: 2, + NumWords: 1, + BatchSize: 1, + NumAttemptsPerSearch: 1, + }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil) +} + +// ------------------------------------------------------- + +// Templates used to compare search results in the "versus" tests. 
+var searchTemplates = []string{ + `{ + "about": "expected to return zero hits", + "query": { + "query": "title:notARealTitle" + } + }`, + `{ + "about": "try straight word()'s", + "query": { + "query": "body:{{word}}" + } + }`, + `{ + "about": "conjuncts on same term", + "query": { + "conjuncts": [ + { "field": "body", "term": "{{word}}", "boost": 1.0 }, + { "field": "body", "term": "{{word}}", "boost": 1.0 } + ] + } + }`, + `{ + "about": "disjuncts on same term", + "query": { + "disjuncts": [ + { "field": "body", "term": "{{word}}", "boost": 1.0 }, + { "field": "body", "term": "{{word}}", "boost": 1.0 } + ] + } + }`, + `{ + "about": "never-matching-title-conjuncts", + "query": { + "conjuncts": [ + {"field": "body", "match": "{{word}}"}, + {"field": "body", "match": "{{word}}"}, + {"field": "title", "match": "notAnActualTitle"} + ] + } + }`, + `{ + "about": "never-matching-title-disjuncts", + "query": { + "disjuncts": [ + {"field": "body", "match": "{{word}}"}, + {"field": "body", "match": "{{word}}"}, + {"field": "title", "match": "notAnActualTitle"} + ] + } + }`, + `{ + "about": "must-not-never-matches", + "query": { + "must_not": {"disjuncts": [ + {"field": "title", "match": "notAnActualTitle"} + ]}, + "should": {"disjuncts": [ + {"field": "body", "match": "{{word}}"} + ]} + } + }`, + `{ + "about": "must-not-only -- FAILS!!!", + "query": { + "must_not": {"disjuncts": [ + {"field": "body", "term": "{{word}}"} + ]} + } + }`, + `{ + "about": "must-not-same-as-must -- FAILS!!!", + "query": { + "must_not": {"disjuncts": [ + {"field": "body", "match": "{{word}}"} + ]}, + "must": {"conjuncts": [ + {"field": "body", "match": "{{word}}"} + ]} + } + }`, + `{ + "about": "must-not-same-as-should -- FAILS!!!", + "query": { + "must_not": {"disjuncts": [ + {"field": "body", "match": "{{word}}"} + ]}, + "should": {"disjuncts": [ + {"field": "body", "match": "{{word}}"} + ]} + } + }`, + `{ + "about": "inspired by testrunner RQG issue -- FAILS!!!", + "query": { + "must_not": 
{"disjuncts": [ + {"field": "title", "match": "Trista Allen"}, + {"field": "body", "match": "{{word}}"} + ]}, + "should": {"disjuncts": [ + {"field": "title", "match": "Kallie Safiya Amara"}, + {"field": "body", "match": "{{word}}"} + ]} + } + }`, +} + +// ------------------------------------------------------- + +type VersusTest struct { + t *testing.T + + // Use environment variable VERBOSE= that's > 0 for more + // verbose output. + Verbose int + + // Allow user to focus on particular search templates, where + // where the search template must contain the Focus string. + Focus string + + NumDocs int // Number of docs to insert. + MaxWordsPerDoc int // Max number words in each doc's Body field. + NumWords int // Total number of words in the dictionary. + BatchSize int // Batch size when inserting docs. + NumAttemptsPerSearch int // For each search template, number of searches to try. + + // The Bodies is an array with length NumDocs, where each entry + // is the words in a doc's Body field. + Bodies [][]string + + CurAttempt int + TotAttempts int +} + +// ------------------------------------------------------- + +func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) { + t := vt.t + + funcMap := template.FuncMap{ + "word": func() string { + return vt.genWord(vt.CurAttempt % vt.NumWords) + }, + } + + // Optionally allow call to focus on a particular search templates, + // where the search template must contain the vt.Focus string. 
+ if vt.Focus == "" { + vt.Focus = os.Getenv("FOCUS") + } + + for i, searchTemplate := range searchTemplates { + if vt.Focus != "" && !strings.Contains(searchTemplate, vt.Focus) { + continue + } + + tmpl, err := template.New("search").Funcs(funcMap).Parse(searchTemplate) + if err != nil { + t.Fatalf("could not parse search template: %s, err: %v", searchTemplate, err) + } + + for j := 0; j < vt.NumAttemptsPerSearch; j++ { + vt.CurAttempt = j + + var buf bytes.Buffer + err = tmpl.Execute(&buf, vt) + if err != nil { + t.Fatalf("could not execute search template: %s, err: %v", searchTemplate, err) + } + + bufBytes := buf.Bytes() + + if vt.Verbose > 0 { + fmt.Printf(" %s\n", bufBytes) + } + + var search bleve.SearchRequest + err = json.Unmarshal(bufBytes, &search) + if err != nil { + t.Fatalf("could not unmarshal search: %s, err: %v", bufBytes, err) + } + + search.Size = vt.NumDocs * 10 // Crank up limit to get all results. + + searchA := search + searchB := search + + resA, errA := idxA.Search(&searchA) + resB, errB := idxB.Search(&searchB) + if errA != errB { + t.Errorf("search: (%d) %s,\n err mismatch, errA: %v, errB: %v", + i, bufBytes, errA, errB) + } + + // Scores might have float64 vs float32 wobbles, so truncate precision. + resA.MaxScore = math.Trunc(resA.MaxScore*1000.0) / 1000.0 + resB.MaxScore = math.Trunc(resB.MaxScore*1000.0) / 1000.0 + + // Timings may be different between A & B, so force equality. + resA.Took = resB.Took + + // Hits might have different ordering since some indexers + // (like upsidedown) have a natural secondary sort on id + // while others (like scorch) don't. So, we compare by + // putting the hits from A & B into maps. 
+ hitsA := hitsById(resA) + hitsB := hitsById(resB) + if !reflect.DeepEqual(hitsA, hitsB) { + t.Errorf("search: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d", + i, bufBytes, len(hitsA), len(hitsB)) + t.Errorf("\n hitsA: %#v,\n hitsB: %#v", + hitsA, hitsB) + for id, hitA := range hitsA { + hitB := hitsB[id] + if !reflect.DeepEqual(hitA, hitB) { + t.Errorf("\n hitA: %#v,\n hitB: %#v", hitA, hitB) + idx, _ := strconv.Atoi(id) + t.Errorf("\n body: %s", strings.Join(vt.Bodies[idx], " ")) + } + } + } + + resA.Hits = nil + resB.Hits = nil + + if !reflect.DeepEqual(resA, resB) { + resAj, _ := json.Marshal(resA) + resBj, _ := json.Marshal(resB) + t.Errorf("search: (%d) %s,\n res mismatch,\n resA: %s,\n resB: %s", + i, bufBytes, resAj, resBj) + } + + if vt.Verbose > 0 { + fmt.Printf(" Total: (%t) %d\n", resA.Total == resB.Total, resA.Total) + } + + vt.TotAttempts++ + } + } +} + +// Organizes the hits into a map keyed by id. +func hitsById(res *bleve.SearchResult) map[string]*search.DocumentMatch { + rv := make(map[string]*search.DocumentMatch, len(res.Hits)) + + for _, hit := range res.Hits { + // Clear out or truncate precision of hit fields that might be + // different across different indexer implementations. 
+ hit.Index = "" + hit.Score = math.Trunc(hit.Score*1000.0) / 1000.0 + hit.IndexInternalID = nil + hit.HitNumber = 0 + + rv[hit.ID] = hit + } + + return rv +} + +// ------------------------------------------------------- + +func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string, + cb func(versusTest *VersusTest, idxA, idxB bleve.Index)) { + if cb == nil { + cb = testVersusSearches + } + + if vt.Verbose <= 0 { + vt.Verbose, _ = strconv.Atoi(os.Getenv("VERBOSE")) + } + + dirA := "/tmp/bleve-versus-test-a" + dirB := "/tmp/bleve-versus-test-b" + + defer os.RemoveAll(dirA) + defer os.RemoveAll(dirB) + + os.RemoveAll(dirA) + os.RemoveAll(dirB) + + imA := vt.makeIndexMapping() + imB := vt.makeIndexMapping() + + kvConfigA := map[string]interface{}{} + kvConfigB := map[string]interface{}{} + + idxA, err := bleve.NewUsing(dirA, imA, indexTypeA, kvStoreA, kvConfigA) + if err != nil || idxA == nil { + vt.t.Fatalf("new using err: %v", err) + } + defer idxA.Close() + + idxB, err := bleve.NewUsing(dirB, imB, indexTypeB, kvStoreB, kvConfigB) + if err != nil || idxB == nil { + vt.t.Fatalf("new using err: %v", err) + } + defer idxB.Close() + + rand.Seed(0) + + vt.Bodies = vt.genBodies() + + vt.insertBodies(idxA) + vt.insertBodies(idxB) + + cb(vt, idxA, idxB) +} + +// ------------------------------------------------------- + +func (vt *VersusTest) makeIndexMapping() mapping.IndexMapping { + standardFM := bleve.NewTextFieldMapping() + standardFM.Store = false + standardFM.IncludeInAll = false + standardFM.IncludeTermVectors = true + standardFM.Analyzer = "standard" + + dm := bleve.NewDocumentMapping() + dm.AddFieldMappingsAt("title", standardFM) + dm.AddFieldMappingsAt("body", standardFM) + + im := bleve.NewIndexMapping() + im.DefaultMapping = dm + im.DefaultAnalyzer = "standard" + + return im +} + +func (vt *VersusTest) insertBodies(idx bleve.Index) { + batch := idx.NewBatch() + for i, bodyWords := range vt.Bodies { + title := fmt.Sprintf("%d", i) + body := 
strings.Join(bodyWords, " ") + batch.Index(title, map[string]interface{}{"title": title, "body": body}) + if i%vt.BatchSize == 0 { + err := idx.Batch(batch) + if err != nil { + vt.t.Fatalf("batch err: %v", err) + } + batch.Reset() + } + } + err := idx.Batch(batch) + if err != nil { + vt.t.Fatalf("last batch err: %v", err) + } +} + +func (vt *VersusTest) genBodies() (rv [][]string) { + for i := 0; i < vt.NumDocs; i++ { + rv = append(rv, vt.genBody()) + } + return rv +} + +func (vt *VersusTest) genBody() (rv []string) { + m := rand.Intn(vt.MaxWordsPerDoc) + for j := 0; j < m; j++ { + rv = append(rv, vt.genWord(rand.Intn(vt.NumWords))) + } + return rv +} + +func (vt *VersusTest) genWord(i int) string { + return fmt.Sprintf("%x", i) +} From a884f38bf6993b6fb49897bfe8521d4624cf72a3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 21 Dec 2017 16:44:31 -0800 Subject: [PATCH 103/728] scorch docInternalToNumber returns 0 on error --- index/scorch/snapshot_index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index bb6a8be6b..6089a771d 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -381,7 +381,7 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) { var res uint64 err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) if err != nil { - return res, err + return 0, err } return res, nil } From 33687260ca978ba506f0c92a83e669465d6e5950 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 21 Dec 2017 16:45:36 -0800 Subject: [PATCH 104/728] children of conjunct/disjunct's are not necessarily termSearchers Rename termSearcher loop variable to searcher, as the child searchers of a conjunction/disjunction searcher aren't necessarily termSearchers. 
--- search/searcher/search_conjunction.go | 16 ++++++++-------- search/searcher/search_disjunction.go | 16 ++++++++-------- 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index 9ab0e7fa4..d7a873ffb 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -57,25 +57,25 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S func (s *ConjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 - for _, termSearcher := range s.searchers { - sumOfSquaredWeights += termSearcher.Weight() + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() } // now compute query norm from this s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) // finally tell all the downstream searchers the norm - for _, termSearcher := range s.searchers { - termSearcher.SetQueryNorm(s.queryNorm) + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) } } func (s *ConjunctionSearcher) initSearchers(ctx *search.SearchContext) error { var err error // get all searchers pointing at their first match - for i, termSearcher := range s.searchers { + for i, searcher := range s.searchers { if s.currs[i] != nil { ctx.DocumentMatchPool.Put(s.currs[i]) } - s.currs[i], err = termSearcher.Next(ctx) + s.currs[i], err = searcher.Next(ctx) if err != nil { return err } @@ -160,11 +160,11 @@ OUTER: // we know all the searchers are pointing at the same thing // so they all need to be bumped - for i, termSearcher := range s.searchers { + for i, searcher := range s.searchers { if s.currs[i] != rv { ctx.DocumentMatchPool.Put(s.currs[i]) } - s.currs[i], err = termSearcher.Next(ctx) + s.currs[i], err = searcher.Next(ctx) if err != nil { return nil, err } diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 
96bd54474..07df9a326 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -93,25 +93,25 @@ func newDisjunctionSearcher(indexReader index.IndexReader, func (s *DisjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 - for _, termSearcher := range s.searchers { - sumOfSquaredWeights += termSearcher.Weight() + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() } // now compute query norm from this s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) // finally tell all the downstream searchers the norm - for _, termSearcher := range s.searchers { - termSearcher.SetQueryNorm(s.queryNorm) + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) } } func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error { var err error // get all searchers pointing at their first match - for i, termSearcher := range s.searchers { + for i, searcher := range s.searchers { if s.currs[i] != nil { ctx.DocumentMatchPool.Put(s.currs[i]) } - s.currs[i], err = termSearcher.Next(ctx) + s.currs[i], err = searcher.Next(ctx) if err != nil { return err } @@ -221,11 +221,11 @@ func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, } // get all searchers pointing at their first match var err error - for i, termSearcher := range s.searchers { + for i, searcher := range s.searchers { if s.currs[i] != nil { ctx.DocumentMatchPool.Put(s.currs[i]) } - s.currs[i], err = termSearcher.Advance(ctx, ID) + s.currs[i], err = searcher.Advance(ctx, ID) if err != nil { return nil, err } From 93c787ca099657dbb727d28d9ca2b04ae430e1e4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 21 Dec 2017 16:49:12 -0800 Subject: [PATCH 105/728] scorch versus_test.go passes errcheck --- test/versus_test.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/test/versus_test.go b/test/versus_test.go index 
16abce078..9b6325c78 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -341,11 +341,13 @@ func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string, dirA := "/tmp/bleve-versus-test-a" dirB := "/tmp/bleve-versus-test-b" - defer os.RemoveAll(dirA) - defer os.RemoveAll(dirB) + defer func() { + _ = os.RemoveAll(dirA) + _ = os.RemoveAll(dirB) + }() - os.RemoveAll(dirA) - os.RemoveAll(dirB) + _ = os.RemoveAll(dirA) + _ = os.RemoveAll(dirB) imA := vt.makeIndexMapping() imB := vt.makeIndexMapping() @@ -357,13 +359,13 @@ func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string, if err != nil || idxA == nil { vt.t.Fatalf("new using err: %v", err) } - defer idxA.Close() + defer func() { _ = idxA.Close() }() idxB, err := bleve.NewUsing(dirB, imB, indexTypeB, kvStoreB, kvConfigB) if err != nil || idxB == nil { vt.t.Fatalf("new using err: %v", err) } - defer idxB.Close() + defer func() { _ = idxB.Close() }() rand.Seed(0) @@ -400,9 +402,12 @@ func (vt *VersusTest) insertBodies(idx bleve.Index) { for i, bodyWords := range vt.Bodies { title := fmt.Sprintf("%d", i) body := strings.Join(bodyWords, " ") - batch.Index(title, map[string]interface{}{"title": title, "body": body}) + err := batch.Index(title, map[string]interface{}{"title": title, "body": body}) + if err != nil { + vt.t.Fatalf("batch.Index err: %v", err) + } if i%vt.BatchSize == 0 { - err := idx.Batch(batch) + err = idx.Batch(batch) if err != nil { vt.t.Fatalf("batch err: %v", err) } From d425a3be8693778ca3d8cc6bd3f810bc7a7603bb Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 21 Dec 2017 17:49:55 -0800 Subject: [PATCH 106/728] scorch fix disjunction searcher Advance() Found with "versus" test (TestScorchVersusUpsideDownBoltSmallMNSAM), which had a boolean query with a MustNot that was the same as the Must parameters. This replicates a situation found by Aruna/Mihir/testrunner/RQG (MB-27291). 
Example: "query": { "must_not": {"disjuncts": [ {"field": "body", "match": "hello"} ]}, "must": {"conjuncts": [ {"field": "body", "match": "hello"} ]} } The nested searchers along the MustNot pathway would end up looking roughly like... booleanSearcher MustNot => disjunctionSearcher => disjunctionSearcher => termSearcher On the first Next() call by the collector, the two disjunction searchers would run through their respective Next() method processing, which includes their initSearcher() processing on the first time. This has the effect of driving the leaf termSearcher through two Next() invocations. That is, if there were 3 docs (doc-1, doc-2, doc-3), the leaf termSearcher would at this point have moved to point to doc-3, while the topmost MustNot would have received doc-1. Next, the booleanSearcher's Must searcher would produce doc-2, so the booleanSearcher would try to Advance() the MustNot searcher to doc-2. But, in scorch, the leafmost termSearcher had already gotten past doc-2 and would return its doc-3. In upsidedown, in contrast, the leaf termSearcher would then drive the KVStore iterator with a Seek(doc-2), and the KVStore iterator would perform a backwards seek to reach doc-2. In scorch, however, backwards iteration seeking isn't supported. So, this fix checks the state of the disjunction searcher to see if we already have the necessary state so that we don't have to perform actual Advance()'es on the underlying searchers. This not only fixes the behavior w.r.t. scorch, but also can have an effect of potentially making upsidedown slightly faster as we're avoiding some backwards KVStore iterator seeks. 
--- search/searcher/search_disjunction.go | 3 +++ test/versus_test.go | 14 +++++++------- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 07df9a326..b6910ddb6 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -223,6 +223,9 @@ func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, var err error for i, searcher := range s.searchers { if s.currs[i] != nil { + if s.currs[i].IndexInternalID.Compare(ID) >= 0 { + continue + } ctx.DocumentMatchPool.Put(s.currs[i]) } s.currs[i], err = searcher.Advance(ctx, ID) diff --git a/test/versus_test.go b/test/versus_test.go index 9b6325c78..de4123ca1 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -44,11 +44,11 @@ import ( func TestScorchVersusUpsideDownBolt(t *testing.T) { (&VersusTest{ t: t, - NumDocs: 10000, + NumDocs: 1000, MaxWordsPerDoc: 20, - NumWords: 100, - BatchSize: 100, - NumAttemptsPerSearch: 1000, + NumWords: 10, + BatchSize: 10, + NumAttemptsPerSearch: 100, }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil) } @@ -138,7 +138,7 @@ var searchTemplates = []string{ } }`, `{ - "about": "must-not-same-as-must -- FAILS!!!", + "about": "must-not-same-as-must -- see: MB-27291", "query": { "must_not": {"disjuncts": [ {"field": "body", "match": "{{word}}"} @@ -149,7 +149,7 @@ var searchTemplates = []string{ } }`, `{ - "about": "must-not-same-as-should -- FAILS!!!", + "about": "must-not-same-as-should", "query": { "must_not": {"disjuncts": [ {"field": "body", "match": "{{word}}"} @@ -160,7 +160,7 @@ var searchTemplates = []string{ } }`, `{ - "about": "inspired by testrunner RQG issue -- FAILS!!!", + "about": "inspired by testrunner RQG issue -- see: MB-27291", "query": { "must_not": {"disjuncts": [ {"field": "title", "match": "Trista Allen"}, From c7a342bc7d69414ea651f3b4ea7a0ec1d1586b49 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 22 Dec 
2017 10:28:26 -0800 Subject: [PATCH 107/728] scorch conjuncts match phrase test passes The conjunction searcher Advance() method now checks if its curr doc-matches suffices before advancing them. --- index/scorch/snapshot_index.go | 1 + index/scorch/snapshot_index_tfr.go | 15 ++--- search/searcher/search_conjunction.go | 3 + search/searcher/search_phrase.go | 6 ++ test/versus_test.go | 83 ++++++++++++++++++++++----- 5 files changed, 88 insertions(+), 20 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6089a771d..5f08a496f 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -343,6 +343,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, rv := &IndexSnapshotTermFieldReader{ term: term, + field: field, snapshot: i, postings: make([]segment.PostingsList, len(i.segment)), iterators: make([]segment.PostingsIterator, len(i.segment)), diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 25cc0bd07..497b83dd7 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -23,6 +23,7 @@ import ( type IndexSnapshotTermFieldReader struct { term []byte + field string snapshot *IndexSnapshot postings []segment.PostingsList iterators []segment.PostingsIterator @@ -84,15 +85,15 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { - // first make sure we aren't already pointing at the right thing, (due to way searchers work) + // FIXME do something better + // for now, if we need to seek backwards, then restart from the beginning if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { - rv := preAlloced - if rv == nil { - rv = &index.TermFieldDoc{} + i2, err := i.snapshot.TermFieldReader(i.term, i.field, + i.includeFreq, i.includeNorm, 
i.includeTermVectors) + if err != nil { + return nil, err } - rv.ID = i.currID - i.postingToTermFieldDoc(i.currPosting, rv) - return rv, nil + *i = *(i2.(*IndexSnapshotTermFieldReader)) } // FIXME do something better next, err := i.Next(preAlloced) diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index d7a873ffb..73fba19cd 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -184,6 +184,9 @@ func (s *ConjunctionSearcher) Advance(ctx *search.SearchContext, ID index.IndexI } } for i := range s.searchers { + if s.currs[i] != nil && s.currs[i].IndexInternalID.Compare(ID) >= 0 { + continue + } err := s.advanceChild(ctx, i, ID) if err != nil { return nil, err diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 6ff592ef5..552dfabef 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -313,6 +313,12 @@ func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexIntern return nil, err } } + if s.currMust != nil { + if s.currMust.IndexInternalID.Compare(ID) >= 0 { + return s.Next(ctx) + } + ctx.DocumentMatchPool.Put(s.currMust) + } var err error s.currMust, err = s.mustSearcher.Advance(ctx, ID) if err != nil { diff --git a/test/versus_test.go b/test/versus_test.go index de4123ca1..70463a93c 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -41,7 +41,7 @@ import ( // go test -v -run TestScorchVersusUpsideDownBolt ./test // VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test // -func TestScorchVersusUpsideDownBolt(t *testing.T) { +func TestScorchVersusUpsideDownBoltAll(t *testing.T) { (&VersusTest{ t: t, NumDocs: 1000, @@ -49,7 +49,7 @@ func TestScorchVersusUpsideDownBolt(t *testing.T) { NumWords: 10, BatchSize: 10, NumAttemptsPerSearch: 100, - }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil) + }).run(scorch.Name, boltdb.Name, upsidedown.Name, 
boltdb.Name, nil, nil) } func TestScorchVersusUpsideDownBoltSmallMNSAM(t *testing.T) { @@ -61,13 +61,25 @@ func TestScorchVersusUpsideDownBoltSmallMNSAM(t *testing.T) { NumWords: 1, BatchSize: 1, NumAttemptsPerSearch: 1, - }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil) + }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil, nil) +} + +func TestScorchVersusUpsideDownBoltSmallCMP11(t *testing.T) { + (&VersusTest{ + t: t, + Focus: "conjuncts-match-phrase-1-1", + NumDocs: 30, + MaxWordsPerDoc: 8, + NumWords: 2, + BatchSize: 1, + NumAttemptsPerSearch: 1, + }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil, nil) } // ------------------------------------------------------- // Templates used to compare search results in the "versus" tests. -var searchTemplates = []string{ +var testVersusSearchTemplates = []string{ `{ "about": "expected to return zero hits", "query": { @@ -130,7 +142,7 @@ var searchTemplates = []string{ } }`, `{ - "about": "must-not-only -- FAILS!!!", + "about": "must-not-only", "query": { "must_not": {"disjuncts": [ {"field": "body", "term": "{{word}}"} @@ -172,6 +184,24 @@ var searchTemplates = []string{ ]} } }`, + `{ + "about": "conjuncts-match-phrase-1-1 inspired by testrunner RQG issue -- see: MB-27291", + "query": { + "conjuncts": [ + {"field": "body", "match": "{{bodyWord 0}}"}, + {"field": "body", "match_phrase": "{{bodyWord 1}} {{bodyWord 1}}"} + ] + } + }`, + `{ + "about": "conjuncts-match-phrase-1-2 inspired by testrunner RQG issue -- see: MB-27291 -- FAILS!!", + "query": { + "conjuncts": [ + {"field": "body", "match": "{{bodyWord 0}}"}, + {"field": "body", "match_phrase": "{{bodyWord 1}} {{bodyWord 2}}"} + ] + } + }`, } // ------------------------------------------------------- @@ -203,13 +233,25 @@ type VersusTest struct { // ------------------------------------------------------- -func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) { +func testVersusSearches(vt *VersusTest, 
searchTemplates []string, idxA, idxB bleve.Index) { t := vt.t funcMap := template.FuncMap{ + // Returns a word. The word may or may not be in any + // document's body. "word": func() string { return vt.genWord(vt.CurAttempt % vt.NumWords) }, + // Picks a document and returns the i'th word in that + // document's body. You can use this in searches to + // definitely find at least one document. + "bodyWord": func(i int) string { + body := vt.Bodies[vt.CurAttempt%len(vt.Bodies)] + if len(body) <= 0 { + return "" + } + return body[i%len(body)] + }, } // Optionally allow call to focus on a particular search templates, @@ -275,16 +317,24 @@ func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) { hitsA := hitsById(resA) hitsB := hitsById(resB) if !reflect.DeepEqual(hitsA, hitsB) { - t.Errorf("search: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d", + t.Errorf("=========\nsearch: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d", i, bufBytes, len(hitsA), len(hitsB)) t.Errorf("\n hitsA: %#v,\n hitsB: %#v", hitsA, hitsB) for id, hitA := range hitsA { hitB := hitsB[id] if !reflect.DeepEqual(hitA, hitB) { - t.Errorf("\n hitA: %#v,\n hitB: %#v", hitA, hitB) + t.Errorf("\n driving from hitsA\n hitA: %#v,\n hitB: %#v", hitA, hitB) idx, _ := strconv.Atoi(id) - t.Errorf("\n body: %s", strings.Join(vt.Bodies[idx], " ")) + t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) + } + } + for id, hitB := range hitsB { + hitA := hitsA[id] + if !reflect.DeepEqual(hitA, hitB) { + t.Errorf("\n driving from hitsB\n hitA: %#v,\n hitB: %#v", hitA, hitB) + idx, _ := strconv.Atoi(id) + t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) } } } @@ -295,7 +345,7 @@ func testVersusSearches(vt *VersusTest, idxA, idxB bleve.Index) { if !reflect.DeepEqual(resA, resB) { resAj, _ := json.Marshal(resA) resBj, _ := json.Marshal(resB) - t.Errorf("search: (%d) %s,\n res mismatch,\n resA: %s,\n resB: %s", + t.Errorf("search: 
(%d) %s,\n res mismatch,\n resA: %s,\n resB: %s", i, bufBytes, resAj, resBj) } @@ -329,11 +379,16 @@ func hitsById(res *bleve.SearchResult) map[string]*search.DocumentMatch { // ------------------------------------------------------- func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string, - cb func(versusTest *VersusTest, idxA, idxB bleve.Index)) { + cb func(versusTest *VersusTest, searchTemplates []string, idxA, idxB bleve.Index), + searchTemplates []string) { if cb == nil { cb = testVersusSearches } + if searchTemplates == nil { + searchTemplates = testVersusSearchTemplates + } + if vt.Verbose <= 0 { vt.Verbose, _ = strconv.Atoi(os.Getenv("VERBOSE")) } @@ -369,12 +424,14 @@ func (vt *VersusTest) run(indexTypeA, kvStoreA, indexTypeB, kvStoreB string, rand.Seed(0) - vt.Bodies = vt.genBodies() + if vt.Bodies == nil { + vt.Bodies = vt.genBodies() + } vt.insertBodies(idxA) vt.insertBodies(idxB) - cb(vt, idxA, idxB) + cb(vt, searchTemplates, idxA, idxB) } // ------------------------------------------------------- From dcabc267a06339af6827792fe18858e0b95bdbfb Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 26 Dec 2017 10:37:42 -0700 Subject: [PATCH 108/728] Wait for rollback'ed snapshot to persist --- index/scorch/introducer.go | 9 +++++++-- index/scorch/snapshot_rollback.go | 15 +++++++++++++- index/scorch/snapshot_rollback_test.go | 27 +++++++++++++------------- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 41ce4ca1f..c1f9321d5 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -38,8 +38,9 @@ type epochWatcher struct { } type snapshotReversion struct { - snapshot *IndexSnapshot - applied chan error + snapshot *IndexSnapshot + applied chan error + persisted chan error } func (s *Scorch) mainLoop() { @@ -285,6 +286,10 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { segmentSnapshot.segment.AddRef() } + if 
revertTo.persisted != nil { + s.rootPersisted = append(s.rootPersisted, revertTo.persisted) + } + // swap in new snapshot rootPrev := s.root s.root = newSnapshot diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index e31308a20..d4b1f2eb8 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -95,8 +95,21 @@ func (s *Scorch) SnapshotRevert(revertTo *IndexSnapshot) error { applied: make(chan error), } + if !s.unsafeBatch { + revert.persisted = make(chan error) + } + s.revertToSnapshots <- revert // block until this IndexSnapshot is applied - return <-revert.applied + err := <-revert.applied + if err != nil { + return err + } + + if revert.persisted != nil { + err = <-revert.persisted + } + + return err } diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 3054c9bb8..879d01685 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -92,22 +92,23 @@ func TestIndexRollback(t *testing.T) { t.Error(err) } - err = sh.SnapshotRevert(prev) - if err != nil { - t.Error(err) - } - - newRoot := sh.root - if newRoot != nil && prev != nil { - if newRoot.epoch <= prev.epoch { - t.Errorf("Unexpected epoch, %v <= %v", newRoot.epoch, prev.epoch) + if prev != nil { + err = sh.SnapshotRevert(prev) + if err != nil { + t.Error(err) } - } else { - if prev == nil { - t.Errorf("The last persisted snapshot before the revert was nil!") + + newRoot, err := sh.PreviousPersistedSnapshot(nil) + if err != nil { + t.Error(err) } + if newRoot == nil { - t.Errorf("The new root has been set to nil?") + t.Errorf("Failed to retrieve latest persisted snapshot") + } + + if newRoot.epoch <= prev.epoch { + t.Errorf("Unexpected epoch, %v <= %v", newRoot.epoch, prev.epoch) } } } From 272da43c165e780945708785738167e093a91bb6 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 27 Dec 2017 10:20:45 -0800 Subject: [PATCH 109/728] phrase searcher don't 
allow advance after end --- search/searcher/search_phrase.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 552dfabef..6237cecfd 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -319,6 +319,9 @@ func (s *PhraseSearcher) Advance(ctx *search.SearchContext, ID index.IndexIntern } ctx.DocumentMatchPool.Put(s.currMust) } + if s.currMust == nil { + return nil, nil + } var err error s.currMust, err = s.mustSearcher.Advance(ctx, ID) if err != nil { From 76f827f469197577d8c6b79f2238fce6608b5da4 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 28 Dec 2017 12:05:33 +0530 Subject: [PATCH 110/728] docValue persist changes docValues are persisted along with the index, in a columnar fashion per field with variable sized chunking for quick look up. -naive chunk level caching is added per field -data part inside a chunk is snappy compressed -metaHeader inside the chunk index the dv values inside the uncompressed data part -all the fields are docValue persisted in this iteration --- index/scorch/scorch.go | 8 + index/scorch/segment/mem/build.go | 4 + index/scorch/segment/mem/segment.go | 7 +- index/scorch/segment/zap/build.go | 110 ++++++++++- index/scorch/segment/zap/contentcoder.go | 180 ++++++++++++++++++ index/scorch/segment/zap/contentcoder_test.go | 75 ++++++++ index/scorch/segment/zap/docvalues.go | 176 +++++++++++++++++ index/scorch/segment/zap/merge.go | 96 +++++++--- index/scorch/segment/zap/segment.go | 44 ++++- index/scorch/segment/zap/write.go | 11 +- index/scorch/snapshot_index.go | 40 ++-- 11 files changed, 699 insertions(+), 52 deletions(-) create mode 100644 index/scorch/segment/zap/contentcoder.go create mode 100644 index/scorch/segment/zap/contentcoder_test.go create mode 100644 index/scorch/segment/zap/docvalues.go diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d72a8f886..61381dd57 100644 --- 
a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -37,6 +37,14 @@ const Name = "scorch" const Version uint8 = 1 +// UnInvertIndex is implemented by various scorch index implementations +// to provide the un inverting of the postings or other indexed values. +type UnInvertIndex interface { + // apparently need better namings here.. + VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor) error +} + type Scorch struct { readOnly bool version uint8 diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index e111ce4f7..2709d2f06 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -116,6 +116,10 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { if field.Options().IsStored() { storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) } + // TODO with mapping changes for dv + //if field.Options().IncludeDocValues() { + s.DocValueFields[fieldID] = true + //} } // now that its been rolled up into docMap, walk that diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 75ff50cc0..cd22616e8 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -87,12 +87,17 @@ type Segment struct { // stored field array positions // docNum -> field id -> slice of array positions (each is []uint64) StoredPos []map[uint16][][]uint64 + + // for marking the docValue override status + // field id -> status + DocValueFields map[uint16]bool } // New builds a new empty Segment func New() *Segment { return &Segment{ - FieldsMap: map[string]uint16{}, + FieldsMap: map[string]uint16{}, + DocValueFields: map[uint16]bool{}, } } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 8bd00601e..5047acc0e 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -20,6 +20,7 @@ import ( 
"encoding/binary" "math" "os" + "sort" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment/mem" @@ -48,6 +49,7 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e var storedIndexOffset uint64 var dictLocs []uint64 + var docValueOffset uint64 if len(memSegment.Stored) > 0 { storedIndexOffset, err = persistStored(memSegment, cr) @@ -78,6 +80,11 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e return err } + docValueOffset, err = persistFieldDocValues(cr, chunkFactor, memSegment) + if err != nil { + return err + } + } else { dictLocs = make([]uint64, len(memSegment.FieldsInv)) } @@ -89,7 +96,7 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e } err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset, - fieldIndexStart, chunkFactor, cr) + fieldIndexStart, docValueOffset, chunkFactor, cr) if err != nil { return err } @@ -419,3 +426,104 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs return rv, nil } + +type docIDRange []uint64 + +func (a docIDRange) Len() int { return len(a) } +func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] } +func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } + +func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, + chunkFactor uint32) (map[uint16]uint64, error) { + fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.DocValueFields)) + fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) + + for fieldID := range memSegment.DocValueFields { + field := memSegment.FieldsInv[fieldID] + docTermMap := make(map[uint64][]byte, 0) + dict, err := memSegment.Dictionary(field) + if err != nil { + return fieldChunkOffsets, err + } + + dictItr := dict.Iterator() + next, err := dictItr.Next() + for err == nil && next != nil { + postings, err1 := dict.PostingsList(next.Term, nil) + if err1 != nil { + 
return fieldChunkOffsets, err + } + + postingsItr := postings.Iterator() + nextPosting, err2 := postingsItr.Next() + for err2 == nil && nextPosting != nil { + docNum := nextPosting.Number() + docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) + docTermMap[docNum] = append(docTermMap[docNum], termSeparator) + nextPosting, err2 = postingsItr.Next() + } + if err2 != nil { + return fieldChunkOffsets, err2 + } + + next, err = dictItr.Next() + } + + if err != nil { + return fieldChunkOffsets, err + } + // sort wrt to docIDs + var docNumbers docIDRange + for k := range docTermMap { + docNumbers = append(docNumbers, k) + } + sort.Sort(docNumbers) + + for _, docNum := range docNumbers { + err = fdvEncoder.Add(docNum, docTermMap[docNum]) + if err != nil { + return fieldChunkOffsets, err + } + } + + fieldChunkOffsets[fieldID] = uint64(w.Count()) + fdvEncoder.Close() + // persist the doc value details for this field + _, err = fdvEncoder.Write(w) + if err != nil { + return fieldChunkOffsets, err + } + // resetting encoder for the next field + fdvEncoder.Reset() + } + + return fieldChunkOffsets, nil +} + +func persistFieldDocValues(w *CountHashWriter, chunkFactor uint32, + memSegment *mem.Segment) (uint64, error) { + + fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) + if err != nil { + return math.MaxUint64, err + } + + fieldDocValuesOffset := uint64(w.Count()) + buf := make([]byte, binary.MaxVarintLen64) + offset := uint64(math.MaxUint64) + ok := true + for fieldID := range memSegment.FieldsInv { + // if the field isn't configured for docValue, then mark + // the offset accordingly + if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { + offset = math.MaxUint64 + } + n := binary.PutUvarint(buf, uint64(offset)) + _, err := w.Write(buf[:n]) + if err != nil { + return math.MaxUint64, err + } + } + + return fieldDocValuesOffset, nil +} diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go new file 
mode 100644 index 000000000..cf05b6759 --- /dev/null +++ b/index/scorch/segment/zap/contentcoder.go @@ -0,0 +1,180 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "io" + + "github.com/golang/snappy" +) + +var termSeparator byte = 0xff +var termSeparatorSplitSlice = []byte{termSeparator} + +type chunkedContentCoder struct { + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + chunkMetaBuf bytes.Buffer + chunkBuf bytes.Buffer + + chunkMeta []metaData +} + +// metaData represents the data information inside a +// chunk. +type metaData struct { + docID uint64 // docid of the data inside the chunk + docDvLoc uint64 // starting offset for a given docid + docDvLen uint64 // length of data inside the chunk for the given docid +} + +// newChunkedContentCoder returns a new chunk content coder which +// packs data into chunks based on the provided chunkSize +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64) *chunkedContentCoder { + total := maxDocNum/chunkSize + 1 + rv := &chunkedContentCoder{ + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: []metaData{}, + } + + return rv +} + +// Reset lets you reuse this chunked content coder. Buffers are reset +// and re used. You cannot change the chunk size. 
+func (c *chunkedContentCoder) Reset() { + c.currChunk = 0 + c.final = c.final[:0] + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + for i := range c.chunkLens { + c.chunkLens[i] = 0 + } + c.chunkMeta = []metaData{} +} + +// Close indicates you are done calling Add() this allows +// the final chunk to be encoded. +func (c *chunkedContentCoder) Close() { + c.flushContents() +} + +func (c *chunkedContentCoder) flushContents() error { + // flush the contents, with meta information at first + buf := make([]byte, binary.MaxVarintLen64) + n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) + _, err := c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + // write out the metaData slice + for _, meta := range c.chunkMeta { + n := binary.PutUvarint(buf, meta.docID) + _, err = c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + n = binary.PutUvarint(buf, meta.docDvLoc) + _, err = c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + n = binary.PutUvarint(buf, meta.docDvLen) + _, err = c.chunkMetaBuf.Write(buf[:n]) + if err != nil { + return err + } + + } + + // write the metadata to final data + metaData := c.chunkMetaBuf.Bytes() + c.final = append(c.final, c.chunkMetaBuf.Bytes()...) + // write the compressed data to the final data + compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) + c.final = append(c.final, compressedData...) + //c.chunkLens = append(c.chunkLens, uint64(len(compressedData)+len(metaData))) + c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) + return nil +} + +// Add encodes the provided byte slice into the correct chunk for the provided +// doc num. You MUST call Add() with increasing docNums. 
+func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // flush out the previous chunk details + err := c.flushContents() + if err != nil { + return err + } + // clearing the chunk specific meta for next chunk + c.chunkBuf.Reset() + c.chunkMetaBuf.Reset() + c.chunkMeta = []metaData{} + c.currChunk = chunk + } + + // mark the starting offset for this doc + dvOffset := c.chunkBuf.Len() + dvSize, err := c.chunkBuf.Write(vals) + if err != nil { + return err + } + + c.chunkMeta = append(c.chunkMeta, metaData{ + docID: docNum, + docDvLoc: uint64(dvOffset), + docDvLen: uint64(dvSize), + }) + return nil +} + +// Write commits all the encoded chunked contents to the provided writer. +func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { + var tw int + buf := make([]byte, binary.MaxVarintLen64) + // write out the number of chunks + n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) + nw, err := w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + // write out the chunk lens + for _, chunkLen := range c.chunkLens { + n := binary.PutUvarint(buf, uint64(chunkLen)) + nw, err = w.Write(buf[:n]) + tw += nw + if err != nil { + return tw, err + } + } + // write out the data + nw, err = w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + return tw, nil +} diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go new file mode 100644 index 000000000..4ae4d8121 --- /dev/null +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -0,0 +1,75 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "reflect" + "testing" +) + +func TestChunkContentCoder(t *testing.T) { + + tests := []struct { + maxDocNum uint64 + chunkSize uint64 + docNums []uint64 + vals [][]byte + expected string + }{ + { + maxDocNum: 0, + chunkSize: 1, + docNums: []uint64{0}, + vals: [][]byte{[]byte("bleve")}, + // 1 chunk, chunk-0 length 11(b), value + expected: string([]byte{0x1, 0xb, 0x1, 0x0, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}), + }, + { + maxDocNum: 1, + chunkSize: 1, + docNums: []uint64{0, 1}, + vals: [][]byte{ + []byte("upside"), + []byte("scorch"), + }, + + expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, + 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06, + 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}), + }, + } + + for _, test := range tests { + + cic := newChunkedContentCoder(test.chunkSize, test.maxDocNum) + for i, docNum := range test.docNums { + err := cic.Add(docNum, test.vals[i]) + if err != nil { + t.Fatalf("error adding to intcoder: %v", err) + } + } + cic.Close() + var actual bytes.Buffer + _, err := cic.Write(&actual) + if err != nil { + t.Fatalf("error writing: %v", err) + } + + if !reflect.DeepEqual(test.expected, string(actual.Bytes())) { + t.Errorf("got % s, expected % s", string(actual.Bytes()), test.expected) + } + } +} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go new file mode 100644 index 000000000..c8dad8fdc --- /dev/null +++ b/index/scorch/segment/zap/docvalues.go @@ -0,0 +1,176 @@ +// Copyright (c) 
2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + "encoding/binary" + "fmt" + "math" + "sort" + + "github.com/blevesearch/bleve/index" + "github.com/golang/snappy" +) + +type docValueIterator struct { + field string + curChunkNum uint64 + numChunks uint64 + chunkLens []uint64 + dvDataLoc uint64 + curChunkHeader []metaData + curChunkData []byte // compressed data cache +} + +func (di *docValueIterator) fieldName() string { + return di.field +} + +func (di *docValueIterator) curChunkNumber() uint64 { + return di.curChunkNum +} + +func (s *Segment) loadFieldDocValueIterator(field string, + fieldDvLoc uint64) (*docValueIterator, error) { + // get the docValue offset for the given fields + if fieldDvLoc == math.MaxUint64 { + return nil, fmt.Errorf("loadFieldDocValueConfigs: "+ + "no docValues found for field: %s", field) + } + + // read the number of chunks, chunk lengths + var offset uint64 + numChunks, read := binary.Uvarint(s.mm[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("failed to read the field "+ + "doc values for field %s", field) + } + offset += uint64(read) + + fdvIter := &docValueIterator{ + curChunkNum: math.MaxUint64, + field: field, + chunkLens: make([]uint64, int(numChunks)), + } + for i := 0; i < int(numChunks); i++ { + fdvIter.chunkLens[i], read = binary.Uvarint(s.mm[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + offset += 
uint64(read) + } + + fdvIter.dvDataLoc = fieldDvLoc + offset + return fdvIter, nil +} + +func (di *docValueIterator) loadDvChunk(chunkNumber, + localDocNum uint64, s *Segment) error { + // advance to the chunk where the docValues + // reside for the given docID + destChunkDataLoc := di.dvDataLoc + for i := 0; i < int(chunkNumber); i++ { + destChunkDataLoc += di.chunkLens[i] + } + + curChunkSize := di.chunkLens[chunkNumber] + // read the number of docs reside in the chunk + numDocs, read := binary.Uvarint(s.mm[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("failed to read the chunk") + } + chunkMetaLoc := destChunkDataLoc + uint64(read) + + offset := uint64(0) + di.curChunkHeader = make([]metaData, int(numDocs)) + for i := 0; i < int(numDocs); i++ { + di.curChunkHeader[i].docID, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + di.curChunkHeader[i].docDvLoc, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + di.curChunkHeader[i].docDvLen, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(read) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc + di.curChunkData = s.mm[compressedDataLoc : compressedDataLoc+dataLength] + di.curChunkNum = chunkNumber + return nil +} + +func (di *docValueIterator) visitDocValues(docID uint64, + visitor index.DocumentFieldTermVisitor) error { + // binary search the term locations for the docID + start, length := di.getDocValueLocs(docID) + if start == math.MaxUint64 || length == math.MaxUint64 { + return nil + } + // uncompress the already loaded data + uncompressed, err := snappy.Decode(nil, di.curChunkData) + if err != nil { + return err + } + + // pick the terms for the given docID + uncompressed = uncompressed[start : 
start+length] + for { + i := bytes.Index(uncompressed, termSeparatorSplitSlice) + if i < 0 { + break + } + + visitor(di.field, uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + + return nil +} + +func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) { + i := sort.Search(len(di.curChunkHeader), func(i int) bool { + return di.curChunkHeader[i].docID >= docID + }) + if i < len(di.curChunkHeader) && di.curChunkHeader[i].docID == docID { + return di.curChunkHeader[i].docDvLoc, di.curChunkHeader[i].docDvLen + } + return math.MaxUint64, math.MaxUint64 +} + +// VisitDocumentFieldTerms is an implementation of the UnInvertIndex interface +func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor) error { + fieldID := uint16(0) + ok := true + for _, field := range fields { + if fieldID, ok = s.fieldsMap[field]; !ok { + continue + } + // find the chunkNumber where the docValues are stored + docInChunk := localDocNum / uint64(s.chunkFactor) + + if dvIter, exists := s.fieldDvIterMap[fieldID-1]; exists && + dvIter != nil { + // check if the chunk is already loaded + if docInChunk != dvIter.curChunkNumber() { + err := dvIter.loadDvChunk(docInChunk, localDocNum, s) + if err != nil { + continue + } + } + + dvIter.visitDocValues(localDocNum, visitor) + } + } + return nil +} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 5d845bbd1..36959c62e 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "os" + "sort" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -52,7 +53,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, newSegDocCount := computeNewDocCount(segments, drops) var newDocNums [][]uint64 - var storedIndexOffset uint64 + var storedIndexOffset, fieldDvLocsOffset uint64 var dictLocs []uint64 if newSegDocCount > 0 { storedIndexOffset, 
newDocNums, err = mergeStoredAndRemap(segments, drops, @@ -61,7 +62,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return nil, err } - dictLocs, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + dictLocs, fieldDvLocsOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, newDocNums, newSegDocCount, chunkFactor, cr) if err != nil { return nil, err @@ -77,7 +78,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, } err = persistFooter(newSegDocCount, storedIndexOffset, - fieldsIndexOffset, chunkFactor, cr) + fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr) if err != nil { return nil, err } @@ -126,12 +127,14 @@ func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64, chunkFactor uint32, - w *CountHashWriter) ([]uint64, error) { + w *CountHashWriter) ([]uint64, uint64, error) { var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 - rv := make([]uint64, len(fieldsInv)) + rv1 := make([]uint64, len(fieldsInv)) + fieldDvLocs := make([]uint64, len(fieldsInv)) + fieldDvLocsOffset := uint64(math.MaxUint64) var vellumBuf bytes.Buffer // for each field @@ -141,23 +144,23 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } newVellum, err := vellum.New(&vellumBuf, nil) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } - // collect FTS iterators from all segments for this field + // collect FST iterators from all segments for this field var dicts []*Dictionary var itrs []vellum.Iterator for _, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { - return nil, err2 + return nil, fieldDvLocsOffset, err2 } dicts = append(dicts, dict) if dict != nil && dict.fst != nil { itr, err2 := 
dict.fst.Iterator(nil, nil) if err2 != nil && err2 != vellum.ErrIteratorDone { - return nil, err2 + return nil, fieldDvLocsOffset, err2 } if itr != nil { itrs = append(itrs, itr) @@ -173,6 +176,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) + docTermMap := make(map[uint64][]byte, 0) for err == nil { term, _ := mergeItr.Current() @@ -189,7 +194,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } postings, err2 := dict.postingsList(string(term), drops[dictI]) if err2 != nil { - return nil, err2 + return nil, fieldDvLocsOffset, err2 } postItr := postings.Iterator() @@ -197,7 +202,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, for next != nil && err2 == nil { hitNewDocNum := newDocNums[dictI][next.Number()] if hitNewDocNum == docDropped { - return nil, fmt.Errorf("see hit with dropped doc num") + return nil, fieldDvLocsOffset, fmt.Errorf("see hit with dropped doc num") } newRoaring.Add(uint32(hitNewDocNum)) // encode norm bits @@ -205,7 +210,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, normBits := math.Float32bits(float32(norm)) err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) if err3 != nil { - return nil, err3 + return nil, fieldDvLocsOffset, err3 } locs := next.Locations() if len(locs) > 0 { @@ -223,14 +228,17 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, args = append(args, loc.ArrayPositions()...) err = locEncoder.Add(hitNewDocNum, args...) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } } } + + docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], []byte(term)...) 
+ docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], termSeparator) next, err2 = postItr.Next() } if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } } @@ -242,17 +250,17 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, freqOffset := uint64(w.Count()) _, err = tfEncoder.Write(w) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } locOffset := uint64(w.Count()) _, err = locEncoder.Write(w) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } postingLocOffset := uint64(w.Count()) _, err = writeRoaringWithLen(newRoaringLocs, w) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } postingOffset := uint64(w.Count()) // write out the start of the term info @@ -260,43 +268,43 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, n := binary.PutUvarint(buf, freqOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } // write out the start of the loc info n = binary.PutUvarint(buf, locOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } // write out the start of the loc posting list n = binary.PutUvarint(buf, postingLocOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } _, err = writeRoaringWithLen(newRoaring, w) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } err = newVellum.Insert(term, postingOffset) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } } err = mergeItr.Next() } if err != nil && err != vellum.ErrIteratorDone { - return nil, err + return nil, fieldDvLocsOffset, err } dictOffset := uint64(w.Count()) err = newVellum.Close() if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } vellumData := vellumBuf.Bytes() @@ -306,19 +314,51 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, n := 
binary.PutUvarint(buf, uint64(len(vellumData))) _, err = w.Write(buf[:n]) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err } // write this vellum to disk _, err = w.Write(vellumData) if err != nil { - return nil, err + return nil, fieldDvLocsOffset, err + } + + rv1[fieldID] = dictOffset + + // update teh doc value + var docNumbers docIDRange + for k := range docTermMap { + docNumbers = append(docNumbers, k) } + sort.Sort(docNumbers) - rv[fieldID] = dictOffset + for _, docNum := range docNumbers { + err = fdvEncoder.Add(docNum, docTermMap[docNum]) + if err != nil { + return nil, fieldDvLocsOffset, err + } + } + // get the field doc value offset + fieldDvLocs[fieldID] = uint64(w.Count()) + fdvEncoder.Close() + // persist the doc value details for this field + _, err = fdvEncoder.Write(w) + if err != nil { + return nil, fieldDvLocsOffset, err + } + } + + fieldDvLocsOffset = uint64(w.Count()) + buf := make([]byte, binary.MaxVarintLen64) + for _, offset := range fieldDvLocs { + n := binary.PutUvarint(buf, uint64(offset)) + _, err := w.Write(buf[:n]) + if err != nil { + return nil, math.MaxUint64, err + } } - return rv, nil + return rv1, fieldDvLocsOffset, nil } const docDropped = math.MaxUint64 diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index e78ac392f..c5d5aed22 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "fmt" "io" + "math" "os" "sync" @@ -44,11 +45,12 @@ func Open(path string) (segment.Segment, error) { } rv := &Segment{ - f: f, - mm: mm, - path: path, - fieldsMap: make(map[string]uint16), - refs: 1, + f: f, + mm: mm, + path: path, + fieldsMap: make(map[string]uint16), + fieldDvIterMap: make(map[uint16]*docValueIterator), + refs: 1, } err = rv.loadConfig() @@ -63,6 +65,12 @@ func Open(path string) (segment.Segment, error) { return nil, err } + err = rv.loadDvIterators() + if err != nil { + _ = rv.Close() + 
return nil, err + } + return rv, nil } @@ -82,6 +90,9 @@ type Segment struct { fieldsInv []string fieldsOffsets []uint64 + docValueOffset uint64 + fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field + m sync.Mutex // Protects the fields that follow. refs int64 } @@ -112,7 +123,11 @@ func (s *Segment) loadConfig() error { } chunkOffset := verOffset - 4 s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) - fieldsOffset := chunkOffset - 8 + + docValueOffset := chunkOffset - 8 + s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) + fieldsOffset := docValueOffset - 8 + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8]) storedOffset := fieldsOffset - 8 s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8]) @@ -355,3 +370,20 @@ func (s *Segment) DictAddr(field string) (uint64, error) { return s.fieldsOffsets[fieldID-1], nil } + +func (s *Segment) loadDvIterators() error { + if s.docValueOffset == math.MaxUint64 { + return nil + } + + var read uint64 + for fieldID, field := range s.fieldsInv { + fieldLoc, n := binary.Uvarint(s.mm[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) + } + s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc) + read += uint64(n) + } + return nil +} diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index a831ef6ae..cfb7e46e9 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -86,10 +86,10 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u } // FooterSize is the size of the footer record in bytes -// crc + ver + chunk + field offset + stored offset + num docs -const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 +// crc + ver + chunk + field offset + stored 
offset + num docs + docValueOffset +const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 -func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, +func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset uint64, chunkFactor uint32, w *CountHashWriter) error { // write out the number of docs err := binary.Write(w, binary.BigEndian, numDocs) @@ -106,6 +106,11 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset uint64, if err != nil { return err } + // write out the fieldDocValue location + err = binary.Write(w, binary.BigEndian, docValueOffset) + if err != nil { + return err + } // write out 32-bit chunk factor err = binary.Write(w, binary.BigEndian, chunkFactor) if err != nil { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 5f08a496f..2b2654b58 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -22,6 +22,9 @@ import ( "sort" "sync" + "github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/blevesearch/bleve/index/scorch/segment/zap" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" @@ -401,24 +404,35 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] - err = ss.cachedDocs.prepareFields(fields, ss) - if err != nil { - return err - } + switch seg := ss.segment.(type) { + case *mem.Segment: + err = ss.cachedDocs.prepareFields(fields, ss) + if err != nil { + return err + } - for _, field := range fields { - if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { - if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { - for { - i := bytes.Index(tlist, TermSeparatorSplitSlice) - if i < 0 { - break + for _, field := range fields { + if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { + for { + i := bytes.Index(tlist, 
TermSeparatorSplitSlice) + if i < 0 { + break + } + visitor(field, tlist[0:i]) + tlist = tlist[i+1:] } - visitor(field, tlist[0:i]) - tlist = tlist[i+1:] } } } + + case *zap.Segment: + if zaps, ok := ss.segment.(UnInvertIndex); ok { + return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + } + + default: + return fmt.Errorf("snapshot_index: DocumentVisitFieldTerms, unknown segment type: %T", seg) } return nil From 0272451093c3013be4a0df701e69d4f1a83936db Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 28 Dec 2017 13:05:25 +0530 Subject: [PATCH 111/728] adding checks for robustness --- index/scorch/segment/zap/contentcoder.go | 5 +++-- index/scorch/segment/zap/docvalues.go | 8 ++++++-- index/scorch/segment/zap/segment.go | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index cf05b6759..e13e108e6 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -46,7 +46,8 @@ type metaData struct { // newChunkedContentCoder returns a new chunk content coder which // packs data into chunks based on the provided chunkSize -func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64) *chunkedContentCoder { +func newChunkedContentCoder(chunkSize uint64, + maxDocNum uint64) *chunkedContentCoder { total := maxDocNum/chunkSize + 1 rv := &chunkedContentCoder{ chunkSize: chunkSize, @@ -113,7 +114,7 @@ func (c *chunkedContentCoder) flushContents() error { // write the compressed data to the final data compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) c.final = append(c.final, compressedData...) 
- //c.chunkLens = append(c.chunkLens, uint64(len(compressedData)+len(metaData))) + c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) return nil } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index c8dad8fdc..353cbb650 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -52,7 +52,7 @@ func (s *Segment) loadFieldDocValueIterator(field string, } // read the number of chunks, chunk lengths - var offset uint64 + var offset, clen uint64 numChunks, read := binary.Uvarint(s.mm[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("failed to read the field "+ @@ -66,7 +66,11 @@ func (s *Segment) loadFieldDocValueIterator(field string, chunkLens: make([]uint64, int(numChunks)), } for i := 0; i < int(numChunks); i++ { - fdvIter.chunkLens[i], read = binary.Uvarint(s.mm[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + clen, read = binary.Uvarint(s.mm[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + if read <= 0 { + return nil, fmt.Errorf("corrupted chunk length during segment load") + } + fdvIter.chunkLens[i] = clen offset += uint64(read) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index c5d5aed22..5a39f48fb 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -372,7 +372,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) { } func (s *Segment) loadDvIterators() error { - if s.docValueOffset == math.MaxUint64 { + if s.docValueOffset == math.MaxUint64 || s.docValueOffset == 0 { return nil } From 8abac42796f95b8e9f83d2963b540bf5fac4050a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 28 Dec 2017 13:23:57 +0530 Subject: [PATCH 112/728] errCheck fixes --- index/scorch/segment/zap/contentcoder.go | 2 +- index/scorch/segment/zap/docvalues.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff 
--git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index e13e108e6..978500e92 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -74,7 +74,7 @@ func (c *chunkedContentCoder) Reset() { // Close indicates you are done calling Add() this allows // the final chunk to be encoded. func (c *chunkedContentCoder) Close() { - c.flushContents() + _ = c.flushContents() } func (c *chunkedContentCoder) flushContents() error { diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 353cbb650..9f1e9942f 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -173,7 +173,7 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, } } - dvIter.visitDocValues(localDocNum, visitor) + _ = dvIter.visitDocValues(localDocNum, visitor) } } return nil From becd4677cd60e510dccdc13240e4d87575a3e220 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 27 Dec 2017 17:06:31 -0700 Subject: [PATCH 113/728] Adding num_items_introduced, num_items_persisted stats + Adding new entries to the stats struct of scorch. + These stats are atomically incremented upon every segment introduction, and upon successful persistence. 
--- index/scorch/introducer.go | 10 ++++++++-- index/scorch/persister.go | 4 ++++ index/scorch/stats.go | 4 ++++ 3 files changed, 16 insertions(+), 2 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index c1f9321d5..715325626 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -16,6 +16,7 @@ package scorch import ( "fmt" + "sync/atomic" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -142,12 +143,17 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } // append new segment, if any, to end of the new index snapshot if next.data != nil { - newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + newSegmentSnapshot := &SegmentSnapshot{ id: next.id, segment: next.data, // take ownership of next.data's ref-count cachedDocs: &cachedDocs{cache: nil}, - }) + } + newSnapshot.segment = append(newSnapshot.segment, newSegmentSnapshot) newSnapshot.offsets = append(newSnapshot.offsets, running) + + // increment numItemsIntroduced which tracks the number of items + // queued for persistence. 
+ atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) } // copy old values for key, oldVal := range s.root.internal { diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 4ad3df80d..52df1ff7a 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -24,6 +24,7 @@ import ( "sort" "strconv" "strings" + "sync/atomic" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -75,6 +76,7 @@ OUTER: _ = ourSnapshot.DecRef() continue OUTER } + lastPersistedEpoch = ourSnapshot.epoch for _, notifyCh := range notifyChs { close(notifyCh) @@ -243,6 +245,8 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { cachedDocs: segmentSnapshot.cachedDocs, } newIndexSnapshot.segment[i] = newSegmentSnapshot + // update items persisted incase of a new segment snapshot + atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) } else { newIndexSnapshot.segment[i] = s.root.segment[i] newIndexSnapshot.segment[i].segment.AddRef() diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 13668480d..807e02107 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -22,6 +22,8 @@ import ( // Stats tracks statistics about the index type Stats struct { analysisTime, indexTime uint64 + numItemsIntroduced uint64 + numItemsPersisted uint64 } // FIXME wire up these other stats again @@ -36,6 +38,8 @@ func (s *Stats) statsMap() map[string]interface{} { // m["term_searchers_started"] = atomic.LoadUint64(&i.termSearchersStarted) // m["term_searchers_finished"] = atomic.LoadUint64(&i.termSearchersFinished) // m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&i.numPlainTextBytesIndexed) + m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) + m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) return m } From 4bede84fd0fba8750232e8be403a56d08348905c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 20 Dec 2017 17:25:21 -0800 
Subject: [PATCH 114/728] Wiring up missing stats for scorch - updates, deletes, batches, errors - term_searchers_started, term_searchers_finished - num_plain_test_bytes_indexed --- index/scorch/scorch.go | 15 +++++++++++++-- index/scorch/snapshot_index.go | 2 ++ index/scorch/snapshot_index_tfr.go | 4 ++++ index/scorch/stats.go | 25 ++++++++++++++----------- 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d72a8f886..9d2b9049b 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -192,6 +192,7 @@ func (s *Scorch) Batch(batch *index.Batch) error { resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps)) var numUpdates uint64 + var numDeletes uint64 var numPlainTextBytes uint64 var ids []string for docID, doc := range batch.IndexOps { @@ -200,6 +201,8 @@ func (s *Scorch) Batch(batch *index.Batch) error { doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil)) numUpdates++ numPlainTextBytes += doc.NumPlainTextBytes() + } else { + numDeletes++ } ids = append(ids, docID) } @@ -234,8 +237,16 @@ func (s *Scorch) Batch(batch *index.Batch) error { } err := s.prepareSegment(newSegment, ids, batch.InternalOps) - if err != nil && newSegment != nil { - _ = newSegment.Close() + if err != nil { + if newSegment != nil { + _ = newSegment.Close() + } + atomic.AddUint64(&s.stats.errors, 1) + } else { + atomic.AddUint64(&s.stats.updates, numUpdates) + atomic.AddUint64(&s.stats.deletes, numDeletes) + atomic.AddUint64(&s.stats.batches, 1) + atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) } return err } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 5f08a496f..9da84278c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -21,6 +21,7 @@ import ( "fmt" "sort" "sync" + "sync/atomic" "github.com/RoaringBitmap/roaring" 
"github.com/blevesearch/bleve/document" @@ -363,6 +364,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, rv.postings[i] = pl rv.iterators[i] = pl.Iterator() } + atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) return rv, nil } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 497b83dd7..87fd0d14f 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -16,6 +16,7 @@ package scorch import ( "bytes" + "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" @@ -124,5 +125,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { } func (i *IndexSnapshotTermFieldReader) Close() error { + if i.snapshot != nil { + atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) + } return nil } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 807e02107..abd054c81 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -21,23 +21,26 @@ import ( // Stats tracks statistics about the index type Stats struct { - analysisTime, indexTime uint64 - numItemsIntroduced uint64 - numItemsPersisted uint64 + updates, deletes, batches, errors uint64 + analysisTime, indexTime uint64 + termSearchersStarted uint64 + termSearchersFinished uint64 + numPlainTextBytesIndexed uint64 + numItemsIntroduced uint64 + numItemsPersisted uint64 } -// FIXME wire up these other stats again func (s *Stats) statsMap() map[string]interface{} { m := map[string]interface{}{} - // m["updates"] = atomic.LoadUint64(&i.updates) - // m["deletes"] = atomic.LoadUint64(&i.deletes) - // m["batches"] = atomic.LoadUint64(&i.batches) - // m["errors"] = atomic.LoadUint64(&i.errors) + m["updates"] = atomic.LoadUint64(&s.updates) + m["deletes"] = atomic.LoadUint64(&s.deletes) + m["batches"] = atomic.LoadUint64(&s.batches) + m["errors"] = atomic.LoadUint64(&s.errors) m["analysis_time"] = 
atomic.LoadUint64(&s.analysisTime) m["index_time"] = atomic.LoadUint64(&s.indexTime) - // m["term_searchers_started"] = atomic.LoadUint64(&i.termSearchersStarted) - // m["term_searchers_finished"] = atomic.LoadUint64(&i.termSearchersFinished) - // m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&i.numPlainTextBytesIndexed) + m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted) + m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished) + m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed) m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) From c8df014c0ca1b5296968029ddad39e2a905437a0 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 29 Dec 2017 21:39:29 +0530 Subject: [PATCH 115/728] Updated readme, zap version, added new docvalue cmd, fixed the footer and fields cmd, interface name updated --- index/scorch/segment/mem/build.go | 2 +- index/scorch/segment/mem/segment.go | 9 +- index/scorch/segment/segment.go | 8 + index/scorch/segment/zap/README.md | 18 +- index/scorch/segment/zap/build.go | 33 +-- .../segment/zap/cmd/zap/cmd/docvalue.go | 224 ++++++++++++++++++ .../scorch/segment/zap/cmd/zap/cmd/fields.go | 2 +- .../scorch/segment/zap/cmd/zap/cmd/footer.go | 1 + index/scorch/segment/zap/contentcoder.go | 52 ++-- index/scorch/segment/zap/contentcoder_test.go | 2 +- index/scorch/segment/zap/docvalues.go | 16 +- index/scorch/segment/zap/merge.go | 55 +++-- index/scorch/segment/zap/segment.go | 10 +- index/scorch/snapshot_index.go | 45 ++-- 14 files changed, 360 insertions(+), 117 deletions(-) create mode 100644 index/scorch/segment/zap/cmd/zap/cmd/docvalue.go diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 2709d2f06..246c35680 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -118,7 +118,7 @@ func (s *Segment) 
processDocument(result *index.AnalysisResult) { } // TODO with mapping changes for dv //if field.Options().IncludeDocValues() { - s.DocValueFields[fieldID] = true + s.DocValueFields = append(s.DocValueFields, fieldID) //} } diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index cd22616e8..705447c22 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -88,16 +88,15 @@ type Segment struct { // docNum -> field id -> slice of array positions (each is []uint64) StoredPos []map[uint16][][]uint64 - // for marking the docValue override status - // field id -> status - DocValueFields map[uint16]bool + // for storing the docValue persisted fields + // field id + DocValueFields []uint16 } // New builds a new empty Segment func New() *Segment { return &Segment{ - FieldsMap: map[string]uint16{}, - DocValueFields: map[uint16]bool{}, + FieldsMap: map[string]uint16{}, } } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 2c91c0ef8..73225c70a 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -87,3 +87,11 @@ type Location interface { Pos() uint64 ArrayPositions() []uint64 } + +// DocumentFieldTermVisitable is implemented by various scorch segment +// implementations to provide the un inverting of the postings +// or other indexed values. 
+type DocumentFieldTermVisitable interface { + VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor index.DocumentFieldTermVisitor) error +} diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index b7a1b9e67..179adceaf 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -8,7 +8,7 @@ Current usage: - crc-32 bytes and version are in fixed position at end of the file - reading remainder of footer could be version specific - remainder of footer gives us: - - 2 important offsets (fields index and stored data index) + - 3 important offsets (docValue , fields index and stored data index) - 2 important values (number of docs and chunk factor) - field data is processed once and memoized onto the heap so that we never have to go back to disk for it - access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. @@ -140,12 +140,28 @@ If you know the doc number you're interested in, this format lets you jump to th NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. 
+## fields DocValue + +- for each field + - preparation phase: + - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data + - produce a slice remembering the length of each chunk + - file writing phase: + - remember the start position of this first field DocValue offset in the footer + - write out number of chunks that follow (varint uint64) + - write out length of each chunk (each a varint uint64) + - write out the byte slice containing all the chunk data + +NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any +read operation leverage that meta information to extract the document specific data from the file. + ## footer - file writing phase - write number of docs (big endian uint64) - write stored field index location (big endian uint64) - write field index location (big endian uint64) + - write field docValue location (big endian uint64) - write out chunk factor (big endian uint32) - write out version (big endian uint32) - write out file CRC of everything preceding this (big endian uint32) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 5047acc0e..96d536964 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -28,7 +28,9 @@ import ( "github.com/golang/snappy" ) -const version uint32 = 1 +const version uint32 = 2 + +const fieldNotUninverted = math.MaxUint64 // PersistSegment takes the in-memory segment and persists it to the specified // path in the zap file format. 
@@ -435,15 +437,15 @@ func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) (map[uint16]uint64, error) { - fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.DocValueFields)) + fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for fieldID := range memSegment.DocValueFields { + for _, fieldID := range memSegment.DocValueFields { field := memSegment.FieldsInv[fieldID] docTermMap := make(map[uint64][]byte, 0) dict, err := memSegment.Dictionary(field) if err != nil { - return fieldChunkOffsets, err + return nil, err } dictItr := dict.Iterator() @@ -451,7 +453,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, for err == nil && next != nil { postings, err1 := dict.PostingsList(next.Term, nil) if err1 != nil { - return fieldChunkOffsets, err + return nil, err } postingsItr := postings.Iterator() @@ -463,14 +465,14 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, nextPosting, err2 = postingsItr.Next() } if err2 != nil { - return fieldChunkOffsets, err2 + return nil, err2 } next, err = dictItr.Next() } if err != nil { - return fieldChunkOffsets, err + return nil, err } // sort wrt to docIDs var docNumbers docIDRange @@ -482,16 +484,19 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, for _, docNum := range docNumbers { err = fdvEncoder.Add(docNum, docTermMap[docNum]) if err != nil { - return fieldChunkOffsets, err + return nil, err } } fieldChunkOffsets[fieldID] = uint64(w.Count()) - fdvEncoder.Close() + err = fdvEncoder.Close() + if err != nil { + return nil, err + } // persist the doc value details for this field _, err = fdvEncoder.Write(w) if err != nil { - return fieldChunkOffsets, err + return nil, err } // resetting encoder for the next field fdvEncoder.Reset() @@ -505,23 +510,23 @@ func 
persistFieldDocValues(w *CountHashWriter, chunkFactor uint32, fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) if err != nil { - return math.MaxUint64, err + return 0, err } fieldDocValuesOffset := uint64(w.Count()) buf := make([]byte, binary.MaxVarintLen64) - offset := uint64(math.MaxUint64) + offset := uint64(0) ok := true for fieldID := range memSegment.FieldsInv { // if the field isn't configured for docValue, then mark // the offset accordingly if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { - offset = math.MaxUint64 + offset = fieldNotUninverted } n := binary.PutUvarint(buf, uint64(offset)) _, err := w.Write(buf[:n]) if err != nil { - return math.MaxUint64, err + return 0, err } } diff --git a/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go b/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go new file mode 100644 index 000000000..f20243ee2 --- /dev/null +++ b/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go @@ -0,0 +1,224 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cmd + +import ( + "bytes" + "encoding/binary" + "fmt" + "log" + "math" + "sort" + "strconv" + + "github.com/blevesearch/bleve/index/scorch/segment/zap" + "github.com/golang/snappy" + "github.com/spf13/cobra" +) + +// docvalueCmd represents the docvalue command +var docvalueCmd = &cobra.Command{ + Use: "docvalue [path] optional optional", + Short: "docvalue prints the docvalue details by field, and docNum", + Long: `The docvalue command lets you explore the docValues in order of field and by doc number.`, + RunE: func(cmd *cobra.Command, args []string) error { + if len(args) < 1 { + return fmt.Errorf("must specify index file path") + } + + data := segment.Data() + crcOffset := len(data) - 4 + verOffset := crcOffset - 4 + chunkOffset := verOffset - 4 + fieldsOffset := chunkOffset - 16 + fieldsIndexOffset := binary.BigEndian.Uint64(data[fieldsOffset : fieldsOffset+8]) + fieldsIndexEnd := uint64(len(data) - zap.FooterSize) + + // iterate through fields index + var fieldInv []string + var id, read, fieldLoc uint64 + var nread int + for fieldsIndexOffset+(8*id) < fieldsIndexEnd { + addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*id) : fieldsIndexOffset+(8*id)+8]) + var n uint64 + _, read := binary.Uvarint(data[addr+n : fieldsIndexEnd]) + n += uint64(read) + + var nameLen uint64 + nameLen, read = binary.Uvarint(data[addr+n : fieldsIndexEnd]) + n += uint64(read) + + name := string(data[addr+n : addr+n+nameLen]) + + id++ + fieldInv = append(fieldInv, name) + } + + dvLoc := segment.DocValueOffset() + fieldDvLoc := uint64(0) + var fieldName string + var fieldID uint16 + + // if no fields are specified then print the docValue offsets for all fields set + for id, field := range fieldInv { + fieldLoc, nread = binary.Uvarint(data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) + } + read += uint64(nread) + if len(args) == 1 { + // if no field 
args are given, then print out the dv locations for all fields + fmt.Printf("fieldID: %d '%s' docvalue at %d (%x)\n", id, field, fieldLoc, fieldLoc) + continue + } + + if field != args[1] { + continue + } else { + fieldDvLoc = fieldLoc + fieldName = field + fieldID = uint16(id) + } + + } + + // done with the fields dv locs printing for the given zap file + if len(args) == 1 { + return nil + } + + if fieldName == "" || fieldDvLoc == 0 { + return fmt.Errorf("no field found for given field arg: %s", args[1]) + } + + // read the number of chunks + var offset, clen, numChunks uint64 + numChunks, nread = binary.Uvarint(data[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("failed to read the field "+ + "doc values for field %s", fieldName) + } + offset += uint64(nread) + + if len(args) == 2 { + fmt.Printf("number of chunks: %d\n", numChunks) + } + + // read the length of chunks + chunkLens := make([]uint64, numChunks) + for i := 0; i < int(numChunks); i++ { + clen, nread = binary.Uvarint(data[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("corrupted chunk length for chunk number: %d", i) + } + + chunkLens[i] = clen + offset += uint64(nread) + if len(args) == 2 { + fmt.Printf("chunk: %d size: %d \n", i, clen) + } + } + + if len(args) == 2 { + return nil + } + + localDocNum, err := strconv.Atoi(args[2]) + if err != nil { + return fmt.Errorf("unable to parse doc number: %v", err) + } + + if localDocNum >= int(segment.NumDocs()) { + return fmt.Errorf("invalid doc number %d (valid 0 - %d)", localDocNum, segment.NumDocs()-1) + } + + // find the chunkNumber where the docValues are stored + docInChunk := uint64(localDocNum) / uint64(segment.ChunkFactor()) + + if numChunks < docInChunk { + return fmt.Errorf("no chunk exists for chunk number: %d for docID: %d", docInChunk, localDocNum) + } + + destChunkDataLoc := fieldDvLoc + offset + for i := 0; i < int(docInChunk); i++ { + destChunkDataLoc 
+= chunkLens[i] + } + curChunkSize := chunkLens[docInChunk] + + // read the number of docs reside in the chunk + numDocs := uint64(0) + numDocs, nread = binary.Uvarint(data[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("failed to read the target chunk: %d", docInChunk) + } + chunkMetaLoc := destChunkDataLoc + uint64(nread) + + offset = uint64(0) + curChunkHeader := make([]zap.MetaData, int(numDocs)) + for i := 0; i < int(numDocs); i++ { + curChunkHeader[i].DocID, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(nread) + curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(nread) + curChunkHeader[i].DocDvLen, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(nread) + } + + compressedDataLoc := chunkMetaLoc + offset + dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc + curChunkData := data[compressedDataLoc : compressedDataLoc+dataLength] + + start, length := getDocValueLocs(uint64(localDocNum), curChunkHeader) + if start == math.MaxUint64 || length == math.MaxUint64 { + return nil + } + // uncompress the already loaded data + uncompressed, err := snappy.Decode(nil, curChunkData) + if err != nil { + log.Printf("snappy err %+v ", err) + return err + } + + var termSeparator byte = 0xff + var termSeparatorSplitSlice = []byte{termSeparator} + // pick the terms for the given docID + uncompressed = uncompressed[start : start+length] + for { + i := bytes.Index(uncompressed, termSeparatorSplitSlice) + if i < 0 { + break + } + + fmt.Printf(" %s ", uncompressed[0:i]) + uncompressed = uncompressed[i+1:] + } + fmt.Printf(" \n ") + return nil + }, +} + +func getDocValueLocs(docID uint64, metaHeader []zap.MetaData) (uint64, uint64) { + i := sort.Search(len(metaHeader), func(i int) bool { + return 
metaHeader[i].DocID >= docID + }) + if i < len(metaHeader) && metaHeader[i].DocID == docID { + return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen + } + return math.MaxUint64, math.MaxUint64 +} + +func init() { + RootCmd.AddCommand(docvalueCmd) +} diff --git a/index/scorch/segment/zap/cmd/zap/cmd/fields.go b/index/scorch/segment/zap/cmd/zap/cmd/fields.go index 98cdf9d73..cfc40974b 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/fields.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/fields.go @@ -34,7 +34,7 @@ var fieldsCmd = &cobra.Command{ crcOffset := len(data) - 4 verOffset := crcOffset - 4 chunkOffset := verOffset - 4 - fieldsOffset := chunkOffset - 8 + fieldsOffset := chunkOffset - 16 fieldsIndexOffset := binary.BigEndian.Uint64(data[fieldsOffset : fieldsOffset+8]) fieldsIndexEnd := uint64(len(data) - zap.FooterSize) diff --git a/index/scorch/segment/zap/cmd/zap/cmd/footer.go b/index/scorch/segment/zap/cmd/zap/cmd/footer.go index 177f4e71b..0460360fc 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/footer.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/footer.go @@ -33,6 +33,7 @@ var footerCmd = &cobra.Command{ fmt.Printf("Chunk Factor: %d\n", segment.ChunkFactor()) fmt.Printf("Fields Idx: %d (%#x)\n", segment.FieldsIndexOffset(), segment.FieldsIndexOffset()) fmt.Printf("Stored Idx: %d (%#x)\n", segment.StoredIndexOffset(), segment.StoredIndexOffset()) + fmt.Printf("DocValue Idx: %d (%#x)\n", segment.DocValueOffset(), segment.DocValueOffset()) fmt.Printf("Num Docs: %d\n", segment.NumDocs()) return nil }, diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 978500e92..6887076ac 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -15,6 +15,7 @@ package zap import ( + "bufio" "bytes" "encoding/binary" "io" @@ -33,15 +34,15 @@ type chunkedContentCoder struct { chunkMetaBuf bytes.Buffer chunkBuf bytes.Buffer - chunkMeta []metaData + chunkMeta []MetaData } -// 
metaData represents the data information inside a +// MetaData represents the data information inside a // chunk. -type metaData struct { - docID uint64 // docid of the data inside the chunk - docDvLoc uint64 // starting offset for a given docid - docDvLen uint64 // length of data inside the chunk for the given docid +type MetaData struct { + DocID uint64 // docid of the data inside the chunk + DocDvLoc uint64 // starting offset for a given docid + DocDvLen uint64 // length of data inside the chunk for the given docid } // newChunkedContentCoder returns a new chunk content coder which @@ -52,7 +53,7 @@ func newChunkedContentCoder(chunkSize uint64, rv := &chunkedContentCoder{ chunkSize: chunkSize, chunkLens: make([]uint64, total), - chunkMeta: []metaData{}, + chunkMeta: []MetaData{}, } return rv @@ -68,13 +69,13 @@ func (c *chunkedContentCoder) Reset() { for i := range c.chunkLens { c.chunkLens[i] = 0 } - c.chunkMeta = []metaData{} + c.chunkMeta = []MetaData{} } // Close indicates you are done calling Add() this allows // the final chunk to be encoded. 
-func (c *chunkedContentCoder) Close() { - _ = c.flushContents() +func (c *chunkedContentCoder) Close() error { + return c.flushContents() } func (c *chunkedContentCoder) flushContents() error { @@ -86,26 +87,17 @@ func (c *chunkedContentCoder) flushContents() error { return err } + w := bufio.NewWriter(&c.chunkMetaBuf) // write out the metaData slice for _, meta := range c.chunkMeta { - n := binary.PutUvarint(buf, meta.docID) - _, err = c.chunkMetaBuf.Write(buf[:n]) + _, err := writeUvarints(w, meta.DocID, meta.DocDvLoc, meta.DocDvLen) if err != nil { return err } - - n = binary.PutUvarint(buf, meta.docDvLoc) - _, err = c.chunkMetaBuf.Write(buf[:n]) - if err != nil { - return err - } - - n = binary.PutUvarint(buf, meta.docDvLen) - _, err = c.chunkMetaBuf.Write(buf[:n]) - if err != nil { - return err - } - + } + err = w.Flush() + if err != nil { + return err } // write the metadata to final data @@ -132,7 +124,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { // clearing the chunk specific meta for next chunk c.chunkBuf.Reset() c.chunkMetaBuf.Reset() - c.chunkMeta = []metaData{} + c.chunkMeta = []MetaData{} c.currChunk = chunk } @@ -143,10 +135,10 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { return err } - c.chunkMeta = append(c.chunkMeta, metaData{ - docID: docNum, - docDvLoc: uint64(dvOffset), - docDvLen: uint64(dvSize), + c.chunkMeta = append(c.chunkMeta, MetaData{ + DocID: docNum, + DocDvLoc: uint64(dvOffset), + DocDvLen: uint64(dvSize), }) return nil } diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index 4ae4d8121..0e45b783e 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -61,7 +61,7 @@ func TestChunkContentCoder(t *testing.T) { t.Fatalf("error adding to intcoder: %v", err) } } - cic.Close() + _ = cic.Close() var actual bytes.Buffer _, err := cic.Write(&actual) if err != nil { diff 
--git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 9f1e9942f..d77c63233 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -31,7 +31,7 @@ type docValueIterator struct { numChunks uint64 chunkLens []uint64 dvDataLoc uint64 - curChunkHeader []metaData + curChunkHeader []MetaData curChunkData []byte // compressed data cache } @@ -96,13 +96,13 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, chunkMetaLoc := destChunkDataLoc + uint64(read) offset := uint64(0) - di.curChunkHeader = make([]metaData, int(numDocs)) + di.curChunkHeader = make([]MetaData, int(numDocs)) for i := 0; i < int(numDocs); i++ { - di.curChunkHeader[i].docID, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].docDvLoc, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].docDvLen, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) } @@ -143,10 +143,10 @@ func (di *docValueIterator) visitDocValues(docID uint64, func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) { i := sort.Search(len(di.curChunkHeader), func(i int) bool { - return di.curChunkHeader[i].docID >= docID + return di.curChunkHeader[i].DocID >= docID }) - if i < len(di.curChunkHeader) && di.curChunkHeader[i].docID == docID { - return di.curChunkHeader[i].docDvLoc, di.curChunkHeader[i].docDvLen + if i < 
len(di.curChunkHeader) && di.curChunkHeader[i].DocID == docID { + return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen } return math.MaxUint64, math.MaxUint64 } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 36959c62e..aeb81a6e2 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -134,7 +134,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, rv1 := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) - fieldDvLocsOffset := uint64(math.MaxUint64) + fieldDvLocsOffset := uint64(fieldNotUninverted) var vellumBuf bytes.Buffer // for each field @@ -144,7 +144,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } newVellum, err := vellum.New(&vellumBuf, nil) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } // collect FST iterators from all segments for this field @@ -153,14 +153,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, for _, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { - return nil, fieldDvLocsOffset, err2 + return nil, 0, err2 } dicts = append(dicts, dict) if dict != nil && dict.fst != nil { itr, err2 := dict.fst.Iterator(nil, nil) if err2 != nil && err2 != vellum.ErrIteratorDone { - return nil, fieldDvLocsOffset, err2 + return nil, 0, err2 } if itr != nil { itrs = append(itrs, itr) @@ -194,7 +194,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } postings, err2 := dict.postingsList(string(term), drops[dictI]) if err2 != nil { - return nil, fieldDvLocsOffset, err2 + return nil, 0, err2 } postItr := postings.Iterator() @@ -202,7 +202,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, for next != nil && err2 == nil { hitNewDocNum := newDocNums[dictI][next.Number()] if hitNewDocNum == docDropped { - return nil, fieldDvLocsOffset, fmt.Errorf("see hit with dropped 
doc num") + return nil, 0, fmt.Errorf("see hit with dropped doc num") } newRoaring.Add(uint32(hitNewDocNum)) // encode norm bits @@ -210,7 +210,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, normBits := math.Float32bits(float32(norm)) err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) if err3 != nil { - return nil, fieldDvLocsOffset, err3 + return nil, 0, err3 } locs := next.Locations() if len(locs) > 0 { @@ -228,7 +228,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, args = append(args, loc.ArrayPositions()...) err = locEncoder.Add(hitNewDocNum, args...) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } } } @@ -238,7 +238,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, next, err2 = postItr.Next() } if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } } @@ -250,17 +250,17 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, freqOffset := uint64(w.Count()) _, err = tfEncoder.Write(w) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } locOffset := uint64(w.Count()) _, err = locEncoder.Write(w) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } postingLocOffset := uint64(w.Count()) _, err = writeRoaringWithLen(newRoaringLocs, w) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } postingOffset := uint64(w.Count()) // write out the start of the term info @@ -268,43 +268,43 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, n := binary.PutUvarint(buf, freqOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } // write out the start of the loc info n = binary.PutUvarint(buf, locOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } // write out the start of the loc posting list n = binary.PutUvarint(buf, 
postingLocOffset) _, err = w.Write(buf[:n]) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } _, err = writeRoaringWithLen(newRoaring, w) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } err = newVellum.Insert(term, postingOffset) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } } err = mergeItr.Next() } if err != nil && err != vellum.ErrIteratorDone { - return nil, fieldDvLocsOffset, err + return nil, 0, err } dictOffset := uint64(w.Count()) err = newVellum.Close() if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } vellumData := vellumBuf.Bytes() @@ -314,18 +314,18 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, n := binary.PutUvarint(buf, uint64(len(vellumData))) _, err = w.Write(buf[:n]) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } // write this vellum to disk _, err = w.Write(vellumData) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } rv1[fieldID] = dictOffset - // update teh doc value + // update the doc value var docNumbers docIDRange for k := range docTermMap { docNumbers = append(docNumbers, k) @@ -335,16 +335,19 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, for _, docNum := range docNumbers { err = fdvEncoder.Add(docNum, docTermMap[docNum]) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } } // get the field doc value offset fieldDvLocs[fieldID] = uint64(w.Count()) - fdvEncoder.Close() + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } // persist the doc value details for this field _, err = fdvEncoder.Write(w) if err != nil { - return nil, fieldDvLocsOffset, err + return nil, 0, err } } @@ -354,7 +357,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, n := binary.PutUvarint(buf, uint64(offset)) _, err := w.Write(buf[:n]) if err != nil { - return nil, math.MaxUint64, err + return 
nil, 0, err } } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 5a39f48fb..4e1da0acd 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -19,7 +19,6 @@ import ( "encoding/binary" "fmt" "io" - "math" "os" "sync" @@ -349,11 +348,16 @@ func (s *Segment) FieldsIndexOffset() uint64 { return s.fieldsIndexOffset } -// StoredIndexOffset returns the stored value index offset in the file foooter +// StoredIndexOffset returns the stored value index offset in the file footer func (s *Segment) StoredIndexOffset() uint64 { return s.storedIndexOffset } +// DocValueOffset returns the docValue offset in the file footer +func (s *Segment) DocValueOffset() uint64 { + return s.docValueOffset +} + // NumDocs returns the number of documents in the file footer func (s *Segment) NumDocs() uint64 { return s.numDocs @@ -372,7 +376,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) { } func (s *Segment) loadDvIterators() error { - if s.docValueOffset == math.MaxUint64 || s.docValueOffset == 0 { + if s.docValueOffset == fieldNotUninverted || s.docValueOffset == 0 { return nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2b2654b58..530bb66f7 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -22,9 +22,6 @@ import ( "sort" "sync" - "github.com/blevesearch/bleve/index/scorch/segment/mem" - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" @@ -404,35 +401,29 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] - switch seg := ss.segment.(type) { - case *mem.Segment: - err = ss.cachedDocs.prepareFields(fields, ss) - if err != nil { - return err - } + if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { + return 
zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + } - for _, field := range fields { - if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { - if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { - for { - i := bytes.Index(tlist, TermSeparatorSplitSlice) - if i < 0 { - break - } - visitor(field, tlist[0:i]) - tlist = tlist[i+1:] + // else fallback to the in memory fieldCache + err = ss.cachedDocs.prepareFields(fields, ss) + if err != nil { + return err + } + + for _, field := range fields { + if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { + for { + i := bytes.Index(tlist, TermSeparatorSplitSlice) + if i < 0 { + break } + visitor(field, tlist[0:i]) + tlist = tlist[i+1:] } } } - - case *zap.Segment: - if zaps, ok := ss.segment.(UnInvertIndex); ok { - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) - } - - default: - return fmt.Errorf("snapshot_index: DocumentVisitFieldTerms, unknown segment type: %T", seg) } return nil From 055d3e12df33f44dbc0c9c1ab77eb705486326bf Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 26 Dec 2017 19:11:14 -0700 Subject: [PATCH 116/728] Adding onEvent callback support for scorch Event types: - EventKindCloseStart - EventKindClose - EventKindMergerProgress - EventKindPersisterProgress - EventKindBatchIntroductionStart - EventKindBatchIntroduction --- index/scorch/merge.go | 10 ++++++ index/scorch/persister.go | 6 ++++ index/scorch/scorch.go | 65 +++++++++++++++++++++++++++++++++++++-- 3 files changed, 78 insertions(+), 3 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 35469d591..0c166df7a 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -19,6 +19,7 @@ import ( "log" "os" "sync/atomic" + "time" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/mergeplan" @@ -42,6 +43,8 @@ OUTER: s.rootLock.RUnlock() if ourSnapshot.epoch != 
lastEpochMergePlanned { + startTime := time.Now() + // lets get started err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { @@ -50,6 +53,9 @@ OUTER: continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch + + s.fireEvent(EventKindMergerProgress, time.Since(startTime)) + } _ = ourSnapshot.DecRef() @@ -71,6 +77,8 @@ OUTER: s.rootLock.RUnlock() if ourSnapshot.epoch != lastEpochMergePlanned { + startTime := time.Now() + // lets get started err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { @@ -78,6 +86,8 @@ OUTER: continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch + + s.fireEvent(EventKindMergerProgress, time.Since(startTime)) } _ = ourSnapshot.DecRef() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 52df1ff7a..b54b20134 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -25,6 +25,7 @@ import ( "strconv" "strings" "sync/atomic" + "time" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -64,6 +65,8 @@ OUTER: s.rootLock.Unlock() if ourSnapshot != nil { + startTime := time.Now() + err := s.persistSnapshot(ourSnapshot) for _, ch := range ourPersisted { if err != nil { @@ -90,6 +93,9 @@ OUTER: changed = true } s.rootLock.RUnlock() + + s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) + if changed { continue OUTER } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 9d2b9049b..2745bda74 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -64,9 +64,44 @@ type Scorch struct { eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. + + onEvent func(event Event) +} + +// Event represents the information provided in an OnEvent() callback. 
+type Event struct { + Kind EventKind + Scorch *Scorch + Duration time.Duration } -func NewScorch(storeName string, config map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { +// EventKind represents an event code for OnEvent() callbacks. +type EventKind int + +// EventKindCloseStart is fired when a Scorch.Close() has begun. +var EventKindCloseStart = EventKind(1) + +// EventKindClose is fired when a scorch index has been fully closed. +var EventKindClose = EventKind(2) + +// EventKindMergerProgress is fired when the merger has completed a +// round of merge processing. +var EventKindMergerProgress = EventKind(3) + +// EventKindPersisterProgress is fired when the persister has completed +// a round of persistence processing. +var EventKindPersisterProgress = EventKind(4) + +// EventKindBatchIntroductionStart is fired when Batch() is invoked which +// introduces a new segment. +var EventKindBatchIntroductionStart = EventKind(5) + +// EventKindBatchIntroduction is fired when Batch() completes. 
+var EventKindBatchIntroduction = EventKind(6) + +func NewScorch(storeName string, + config map[string]interface{}, + analysisQueue *index.AnalysisQueue) (index.Index, error) { rv := &Scorch{ version: Version, config: config, @@ -88,6 +123,16 @@ func NewScorch(storeName string, config map[string]interface{}, analysisQueue *i return rv, nil } +func (s *Scorch) SetEventCallback(f func(Event)) { + s.onEvent = f +} + +func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { + if s.onEvent != nil { + s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) + } +} + func (s *Scorch) Open() error { var ok bool s.path, ok = s.config["path"].(string) @@ -155,6 +200,13 @@ func (s *Scorch) Open() error { } func (s *Scorch) Close() (err error) { + startTime := time.Now() + defer func() { + s.fireEvent(EventKindClose, time.Since(startTime)) + }() + + s.fireEvent(EventKindCloseStart, 0) + // signal to async tasks we want to close close(s.closeCh) // wait for them to close @@ -187,7 +239,11 @@ func (s *Scorch) Delete(id string) error { // Batch applices a batch of changes to the index atomically func (s *Scorch) Batch(batch *index.Batch) error { - analysisStart := time.Now() + start := time.Now() + + defer func() { + s.fireEvent(EventKindBatchIntroduction, time.Since(start)) + }() resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps)) @@ -229,7 +285,10 @@ func (s *Scorch) Batch(batch *index.Batch) error { } close(resultChan) - atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(analysisStart))) + atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) + + // notify handlers that we're about to introduce a segment + s.fireEvent(EventKindBatchIntroductionStart, 0) var newSegment segment.Segment if len(analysisResults) > 0 { From 5c26f5a86d39619fc54dd4fc64378a229d1272a5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 28 Dec 2017 18:48:38 -0700 Subject: [PATCH 117/728] Tracking memory consumption for a scorch index + Track memory usage at a 
segment level + Add a new scorch API: MemoryUsed() - Aggregate the memory consumption across segments when API is invoked. + TODO: - Revisit the second iteration if it can be gotten rid off, and the size accounted for during the first run while building an in-mem segment. - Accounting for pointer and slice overhead. --- index/scorch/scorch.go | 15 ++++++ index/scorch/segment/mem/build.go | 3 ++ index/scorch/segment/mem/segment.go | 68 ++++++++++++++++++++++++ index/scorch/segment/mem/segment_test.go | 4 ++ index/scorch/segment/segment.go | 2 + index/scorch/segment/zap/segment.go | 25 +++++++++ index/scorch/snapshot_segment.go | 15 ++++++ 7 files changed, 132 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 9d2b9049b..926fa0b64 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -361,6 +361,21 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { s.rootLock.Unlock() } +func (s *Scorch) MemoryUsed() uint64 { + var memUsed uint64 + s.rootLock.RLock() + for _, segmentSnapshot := range s.root.segment { + memUsed += 8 /* size of id -> uint64 */ + + segmentSnapshot.segment.SizeInBytes() + if segmentSnapshot.deleted != nil { + memUsed += segmentSnapshot.deleted.GetSizeInBytes() + } + memUsed += segmentSnapshot.cachedDocs.sizeInBytes() + } + s.rootLock.RUnlock() + return memUsed +} + func (s *Scorch) markIneligibleForRemoval(filename string) { s.rootLock.Lock() s.ineligibleForRemoval[filename] = true diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index e111ce4f7..bfcbe2fd0 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -41,6 +41,9 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { sort.Strings(dict) } + // compute memory usage of segment + s.updateSizeInBytes() + // professional debugging // // log.Printf("fields: %v\n", s.FieldsMap) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 
75ff50cc0..94ab137b3 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -87,6 +87,10 @@ type Segment struct { // stored field array positions // docNum -> field id -> slice of array positions (each is []uint64) StoredPos []map[uint16][][]uint64 + + // footprint of the segment, updated when analyzed document mutations + // are added into the segment + sizeInBytes uint64 } // New builds a new empty Segment @@ -96,6 +100,70 @@ func New() *Segment { } } +func (s *Segment) updateSizeInBytes() { + var sizeInBytes uint64 + + for k, _ := range s.FieldsMap { + sizeInBytes += uint64(len(k)*2 /* FieldsMap + FieldsInv */ + + 2 /* size of uint16 */) + } + + for _, entry := range s.Dicts { + for k, _ := range entry { + sizeInBytes += uint64(len(k)*2 /* Dicts + DictKeys */ + + 8 /* size of uint64 */) + } + } + + for i := 0; i < len(s.Postings); i++ { + sizeInBytes += s.Postings[i].GetSizeInBytes() + s.PostingsLocs[i].GetSizeInBytes() + } + + for i := 0; i < len(s.Freqs); i++ { + sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + + len(s.Norms[i])*4 /* size of float32 */) + } + + for i := 0; i < len(s.Locfields); i++ { + sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + + len(s.Locstarts[i])*8 /* size of uint64 */ + + len(s.Locends[i])*8 /* size of uint64 */ + + len(s.Locpos[i])*8 /* size of uint64 */) + + for j := 0; j < len(s.Locarraypos[i]); j++ { + sizeInBytes += uint64(len(s.Locarraypos[i][j]) * 8 /* size of uint64 */) + } + } + + for i := 0; i < len(s.Stored); i++ { + for _, v := range s.Stored[i] { + sizeInBytes += uint64(2 /* size of uint16 */) + for _, arr := range v { + sizeInBytes += uint64(len(arr)) + } + } + + for _, v := range s.StoredTypes[i] { + sizeInBytes += uint64(2 /* size of uint16 */ + len(v)) + } + + for _, v := range s.StoredPos[i] { + sizeInBytes += uint64(2 /* size of uint16 */) + for _, arr := range v { + sizeInBytes += uint64(len(arr) * 8 /* size of uint64 */) + } + } + } + + 
sizeInBytes += uint64(8 /* size of sizeInBytes -> uint64*/) + + s.sizeInBytes = sizeInBytes +} + +func (s *Segment) SizeInBytes() uint64 { + return s.sizeInBytes +} + func (s *Segment) AddRef() { } diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 7eb691476..5e3818c24 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -169,6 +169,10 @@ func TestSingle(t *testing.T) { t.Fatalf("segment nil, not expected") } + if segment.SizeInBytes() <= 0 { + t.Fatalf("segment size not updated") + } + expectFields := map[string]struct{}{ "_id": struct{}{}, "_all": struct{}{}, diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 2c91c0ef8..cad9ae8f3 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -36,6 +36,8 @@ type Segment interface { Close() error + SizeInBytes() uint64 + AddRef() DecRef() error } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index e78ac392f..d38b81987 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -86,6 +86,31 @@ type Segment struct { refs int64 } +func (s *Segment) SizeInBytes() uint64 { + // 4 /* size of crc -> uint32 */ + + // 4 /* size of version -> uint32 */ + + // 4 /* size of chunkFactor -> uint32 */ + + // 8 /* size of numDocs -> uint64 */ + + // 8 /* size of storedIndexOffset -> uint64 */ + + // 8 /* size of fieldsIndexOffset -> uint64 */ + sizeOfUints := 36 + + sizeInBytes := len(s.mm) + len(s.path) + sizeOfUints + + for k, _ := range s.fieldsMap { + sizeInBytes += len(k) + 2 /* size of uint16 */ + } + + for _, entry := range s.fieldsInv { + sizeInBytes += len(entry) + } + + sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */ + sizeInBytes += 8 /* size of refs -> int64 */ + + return uint64(sizeInBytes) +} + func (s *Segment) AddRef() { s.m.Lock() s.refs++ diff --git 
a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index f2bcfb065..b3b8d8284 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -249,3 +249,18 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e c.m.Unlock() return nil } + +func (c *cachedDocs) sizeInBytes() uint64 { + sizeInBytes := 0 + c.m.Lock() + for k, v := range c.cache { // cachedFieldDocs + sizeInBytes += len(k) + if v != nil { + for _, entry := range v.docs { // docs + sizeInBytes += 8 /* size of uint64 */ + len(entry) + } + } + } + c.m.Unlock() + return uint64(sizeInBytes) +} From 448201243aed7202bb4a95efc70ad101fb7b99ca Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 30 Dec 2017 16:54:06 +0530 Subject: [PATCH 118/728] removed redundant buf writer, and checks --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/contentcoder.go | 8 +------- index/scorch/segment/zap/docvalues.go | 2 +- index/scorch/segment/zap/merge.go | 3 ++- index/scorch/segment/zap/segment.go | 2 +- 5 files changed, 6 insertions(+), 11 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 96d536964..f5a92562d 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -51,7 +51,7 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e var storedIndexOffset uint64 var dictLocs []uint64 - var docValueOffset uint64 + docValueOffset := uint64(fieldNotUninverted) if len(memSegment.Stored) > 0 { storedIndexOffset, err = persistStored(memSegment, cr) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 6887076ac..b03940497 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -15,7 +15,6 @@ package zap import ( - "bufio" "bytes" "encoding/binary" "io" @@ -87,18 +86,13 @@ func (c *chunkedContentCoder) flushContents() error 
{ return err } - w := bufio.NewWriter(&c.chunkMetaBuf) // write out the metaData slice for _, meta := range c.chunkMeta { - _, err := writeUvarints(w, meta.DocID, meta.DocDvLoc, meta.DocDvLen) + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocID, meta.DocDvLoc, meta.DocDvLen) if err != nil { return err } } - err = w.Flush() - if err != nil { - return err - } // write the metadata to final data metaData := c.chunkMetaBuf.Bytes() diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index d77c63233..3a75f29d2 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -46,7 +46,7 @@ func (di *docValueIterator) curChunkNumber() uint64 { func (s *Segment) loadFieldDocValueIterator(field string, fieldDvLoc uint64) (*docValueIterator, error) { // get the docValue offset for the given fields - if fieldDvLoc == math.MaxUint64 { + if fieldDvLoc == fieldNotUninverted { return nil, fmt.Errorf("loadFieldDocValueConfigs: "+ "no docValues found for field: %s", field) } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index aeb81a6e2..16ec848b2 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -53,7 +53,8 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, newSegDocCount := computeNewDocCount(segments, drops) var newDocNums [][]uint64 - var storedIndexOffset, fieldDvLocsOffset uint64 + var storedIndexOffset uint64 + fieldDvLocsOffset := uint64(fieldNotUninverted) var dictLocs []uint64 if newSegDocCount > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index a0963d31e..93b7466c8 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -401,7 +401,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) { } func (s *Segment) loadDvIterators() error { - if 
s.docValueOffset == fieldNotUninverted || s.docValueOffset == 0 { + if s.docValueOffset == fieldNotUninverted { return nil } From 1a59a1bb99adce5ff36a4f02c78a15e53499b273 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 2 Jan 2018 16:09:55 -0500 Subject: [PATCH 119/728] attempt to fix core reference counting issues Observed problem: Persisted index state (in root bolt) would contain index snapshots which pointed to index files that did not exist. Debugging this uncovered two main problems: 1. At the end of persisting a snapshot, the persister creates a new index snapshot with the SAME epoch as the current root, only it replaces in-memory segments with the new disk based ones. This is problematic because reference counting an index segment triggers "eligible for deletion". And eligible for deletion is keyed by epoch. So having two separate instances going by the same epoch is problematic. Specifically, one of them gets to 0 before the other, and we wrongly conclude it's eligible for deletion, when in fact the "other" instance with same epoch is actually still in use. To address this problem, we have modified the behavior of the persister. Now, upon completion of persistence, ONLY if new files were actually created do we proceed to introduce a new snapshot. AND, this new snapshot now gets it's own brand new epoch. BOTH of these are important because since the persister now also introduces a new epoch, it will see this epoch again in the future AND be expected to persist it. That is OK (mostly harmless), but we cannot allow it to form a loop. Checking that new files were actually introduced is what short-circuits the potential loop. The new epoch introduced by the persister, if seen again will not have any new segments that actually need persisting to disk, and the cycle is stopped. 2. The implementation of NumSnapshotsToKeep, and related code to deleted old snapshots from the root bolt also contains problems. 
Specifically, the determination of which snapshots to keep vs delete did not consider which ones were actually persisted. So, let's say you had set NumSnapshotsToKeep to 3, if the introducer gets 3 snapshots ahead of the persister, what can happen is that the three snapshots we choose to keep are all in memory. We now wrongly delete all of the snapshots from the root bolt. But it gets worse, in this instant of time, we now have files on disk that nothing in the root bolt points to, so we also go ahead and delete those files. Those files were still being referenced by the in-memory snapshots. But, now even if they get persisted to disk, they simply have references to non-existent files. Opening up one of these indexes results in lost data (often everything). To address this problem, we made a large change to the way this section of code operates. First, we now start with a list of all epochs actually persisted in the root bolt. Second, we set aside NumSnapshotsToKeep of these snapshots to keep. Third, anything else in the eligibleForRemoval list will be deleted. I suspect this code is slower and less elegant, but I think it is more correct. Also, previously NumSnapshotsToKeep defaulted to 0, I have now defaulted it to 1, which feels like saner out-of-the-box behavior (though it's debatable if the original intent was perhaps instead for "extra" snapshots to keep, but with the variable named as it is, 1 makes more sense to me) Other minor changes included in this change: - Location of 'nextSnapshotEpoch', 'eligibleForRemoval', and 'ineligibleForRemoval' members of Scorch struct were moved into the paragraph with 'rootLock' to clarify that you must hold the lock to access it. - TestBatchRaceBug260 was updated to properly Close() the index, which otherwise leads to occasional test failures.
--- index/scorch/persister.go | 151 ++++++++++++++++++++++++-------------- index/scorch/scorch.go | 27 ++++--- index_test.go | 6 ++ 3 files changed, 116 insertions(+), 68 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index b54b20134..acf241ebf 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -21,7 +21,6 @@ import ( "log" "os" "path/filepath" - "sort" "strconv" "strings" "sync/atomic" @@ -218,59 +217,63 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } } - // now try to open all the new snapshots - newSegments := make(map[uint64]segment.Segment) - for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = zap.Open(path) - if err != nil { - for _, s := range newSegments { - if s != nil { - _ = s.Close() // cleanup segments that were successfully opened + // only alter the root if we actually persisted a segment + // (sometimes its just a new snapshot, possibly with new internal values) + if len(newSegmentPaths) > 0 { + // now try to open all the new snapshots + newSegments := make(map[uint64]segment.Segment) + for segmentID, path := range newSegmentPaths { + newSegments[segmentID], err = zap.Open(path) + if err != nil { + for _, s := range newSegments { + if s != nil { + _ = s.Close() // cleanup segments that were successfully opened + } } + return fmt.Errorf("error opening new segment at %s, %v", path, err) } - return fmt.Errorf("error opening new segment at %s, %v", path, err) } - } - s.rootLock.Lock() - newIndexSnapshot := &IndexSnapshot{ - parent: s, - epoch: s.root.epoch, - segment: make([]*SegmentSnapshot, len(s.root.segment)), - offsets: make([]uint64, len(s.root.offsets)), - internal: make(map[string][]byte, len(s.root.internal)), - refs: 1, - } - for i, segmentSnapshot := range s.root.segment { - // see if this segment has been replaced - if replacement, ok := newSegments[segmentSnapshot.id]; ok { - newSegmentSnapshot := &SegmentSnapshot{ - id: 
segmentSnapshot.id, - segment: replacement, - deleted: segmentSnapshot.deleted, - cachedDocs: segmentSnapshot.cachedDocs, + s.rootLock.Lock() + newIndexSnapshot := &IndexSnapshot{ + parent: s, + epoch: s.nextSnapshotEpoch, + segment: make([]*SegmentSnapshot, len(s.root.segment)), + offsets: make([]uint64, len(s.root.offsets)), + internal: make(map[string][]byte, len(s.root.internal)), + refs: 1, + } + s.nextSnapshotEpoch++ + for i, segmentSnapshot := range s.root.segment { + // see if this segment has been replaced + if replacement, ok := newSegments[segmentSnapshot.id]; ok { + newSegmentSnapshot := &SegmentSnapshot{ + id: segmentSnapshot.id, + segment: replacement, + deleted: segmentSnapshot.deleted, + cachedDocs: segmentSnapshot.cachedDocs, + } + newIndexSnapshot.segment[i] = newSegmentSnapshot + // update items persisted incase of a new segment snapshot + atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) + } else { + newIndexSnapshot.segment[i] = s.root.segment[i] + newIndexSnapshot.segment[i].segment.AddRef() } - newIndexSnapshot.segment[i] = newSegmentSnapshot - // update items persisted incase of a new segment snapshot - atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) - } else { - newIndexSnapshot.segment[i] = s.root.segment[i] - newIndexSnapshot.segment[i].segment.AddRef() + newIndexSnapshot.offsets[i] = s.root.offsets[i] + } + for k, v := range s.root.internal { + newIndexSnapshot.internal[k] = v + } + for _, filename := range filenames { + delete(s.ineligibleForRemoval, filename) + } + rootPrev := s.root + s.root = newIndexSnapshot + s.rootLock.Unlock() + if rootPrev != nil { + _ = rootPrev.DecRef() } - newIndexSnapshot.offsets[i] = s.root.offsets[i] - } - for k, v := range s.root.internal { - newIndexSnapshot.internal[k] = v - } - for _, filename := range filenames { - delete(s.ineligibleForRemoval, filename) - } - rootPrev := s.root - s.root = newIndexSnapshot - s.rootLock.Unlock() - - if rootPrev != nil { 
- _ = rootPrev.DecRef() } return nil @@ -435,19 +438,39 @@ func (s *Scorch) removeOldData() { // NumSnapshotsToKeep represents how many recent, old snapshots to // keep around per Scorch instance. Useful for apps that require // rollback'ability. -var NumSnapshotsToKeep int +var NumSnapshotsToKeep = 1 // Removes enough snapshots from the rootBolt so that the // s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { - var epochsToRemove []uint64 + persistedEpochs, err := s.rootBoltSnapshotEpochs() + if err != nil { + return 0, err + } + if len(persistedEpochs) <= NumSnapshotsToKeep { + // we need to keep everything + return 0, nil + } + + // make a map of epochs to protect from deletion + protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) + for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { + protectedEpochs[epoch] = struct{}{} + } + + var epochsToRemove []uint64 + var newEligible []uint64 s.rootLock.Lock() - if len(s.eligibleForRemoval) > NumSnapshotsToKeep { - sort.Sort(uint64Descending(s.eligibleForRemoval)) - epochsToRemove = append([]uint64(nil), s.eligibleForRemoval[NumSnapshotsToKeep:]...) // Copy. 
- s.eligibleForRemoval = s.eligibleForRemoval[0:NumSnapshotsToKeep] + for _, epoch := range s.eligibleForRemoval { + if _, ok := protectedEpochs[epoch]; ok { + // protected + newEligible = append(newEligible, epoch) + } else { + epochsToRemove = append(epochsToRemove, epoch) + } } + s.eligibleForRemoval = newEligible s.rootLock.Unlock() if len(epochsToRemove) <= 0 { @@ -542,6 +565,26 @@ func (s *Scorch) removeOldZapFiles() error { return nil } +func (s *Scorch) rootBoltSnapshotEpochs() ([]uint64, error) { + var rv []uint64 + err := s.rootBolt.View(func(tx *bolt.Tx) error { + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil + } + sc := snapshots.Cursor() + for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() { + _, snapshotEpoch, err := segment.DecodeUvarintAscending(sk) + if err != nil { + continue + } + rv = append(rv, snapshotEpoch) + } + return nil + }) + return rv, err +} + // Returns the *.zap file names that are listed in the rootBolt. func (s *Scorch) loadZapFileNames() (map[string]struct{}, error) { rv := map[string]struct{}{} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 94cbb3e6e..e03cdeee3 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -38,20 +38,22 @@ const Name = "scorch" const Version uint8 = 1 type Scorch struct { - readOnly bool - version uint8 - config map[string]interface{} - analysisQueue *index.AnalysisQueue - stats *Stats - nextSegmentID uint64 - nextSnapshotEpoch uint64 - path string + readOnly bool + version uint8 + config map[string]interface{} + analysisQueue *index.AnalysisQueue + stats *Stats + nextSegmentID uint64 + path string unsafeBatch bool - rootLock sync.RWMutex - root *IndexSnapshot // holds 1 ref-count on the root - rootPersisted []chan error // closed when root is persisted + rootLock sync.RWMutex + root *IndexSnapshot // holds 1 ref-count on the root + rootPersisted []chan error // closed when root is persisted + nextSnapshotEpoch uint64 + 
eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. + ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. closeCh chan struct{} introductions chan *segmentIntroduction @@ -62,9 +64,6 @@ type Scorch struct { rootBolt *bolt.DB asyncTasks sync.WaitGroup - eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. - ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. - onEvent func(event Event) } diff --git a/index_test.go b/index_test.go index 03ef71929..762e3838e 100644 --- a/index_test.go +++ b/index_test.go @@ -1565,6 +1565,12 @@ func TestBatchRaceBug260(t *testing.T) { if err != nil { t.Fatal(err) } + defer func() { + err := i.Close() + if err != nil { + t.Fatal(err) + } + }() b := i.NewBatch() err = b.Index("1", 1) if err != nil { From f42ecb0ac727393c59d0fec264be1d7feb996a48 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 4 Jan 2018 13:58:51 +0530 Subject: [PATCH 120/728] docvalue "zap-path" cmd to print out the dv disk sizes --- .../segment/zap/cmd/zap/cmd/docvalue.go | 60 +++++++++++++++++-- 1 file changed, 55 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go b/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go index f20243ee2..ee15bac35 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go +++ b/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go @@ -67,20 +67,50 @@ var docvalueCmd = &cobra.Command{ } dvLoc := segment.DocValueOffset() - fieldDvLoc := uint64(0) + fieldDvLoc, total, fdvread := uint64(0), uint64(0), int(0) + var fieldName string var fieldID uint16 // if no fields are specified then print the docValue offsets for all fields set for id, field := range fieldInv { - fieldLoc, nread = binary.Uvarint(data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) - if nread <= 0 { + fieldLoc, fdvread = binary.Uvarint(data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) + if fdvread <= 0 { return 
fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) } - read += uint64(nread) + read += uint64(fdvread) + if fieldLoc == math.MaxUint64 { + fmt.Printf("fieldID: %d '%s' docvalue at %d (%x) not persisted \n", id, field, fieldLoc, fieldLoc) + continue + } + + var offset, clen, numChunks uint64 + numChunks, nread = binary.Uvarint(data[fieldLoc : fieldLoc+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("failed to read the field "+ + "doc values for field %s", fieldName) + } + offset += uint64(nread) + + // read the length of chunks + totalSize := uint64(0) + chunkLens := make([]uint64, numChunks) + for i := 0; i < int(numChunks); i++ { + clen, nread = binary.Uvarint(data[fieldLoc+offset : fieldLoc+offset+binary.MaxVarintLen64]) + if nread <= 0 { + return fmt.Errorf("corrupted chunk length for chunk number: %d", i) + } + + chunkLens[i] = clen + totalSize += clen + offset += uint64(nread) + } + + total += totalSize if len(args) == 1 { // if no field args are given, then print out the dv locations for all fields - fmt.Printf("fieldID: %d '%s' docvalue at %d (%x)\n", id, field, fieldLoc, fieldLoc) + mbsize := float64(totalSize) / (1024 * 1024) + fmt.Printf("fieldID: %d '%s' docvalue at %d (%x) numChunks %d diskSize %.3f MB\n", id, field, fieldLoc, fieldLoc, numChunks, mbsize) continue } @@ -94,6 +124,9 @@ var docvalueCmd = &cobra.Command{ } + mbsize := float64(total) / (1024 * 1024) + fmt.Printf("Total Doc Values Size on Disk: %.3f MB\n", mbsize) + // done with the fields dv locs printing for the given zap file if len(args) == 1 { return nil @@ -129,6 +162,12 @@ var docvalueCmd = &cobra.Command{ if len(args) == 2 { fmt.Printf("chunk: %d size: %d \n", i, clen) } + /* + TODO => dump all chunk headers?? 
+ if len(args) == 3 && args[2] == ">" { + dumpChunkDocIDs(data, ) + + }*/ } if len(args) == 2 { @@ -182,6 +221,8 @@ var docvalueCmd = &cobra.Command{ start, length := getDocValueLocs(uint64(localDocNum), curChunkHeader) if start == math.MaxUint64 || length == math.MaxUint64 { + fmt.Printf("no field values found for docID %d\n", localDocNum) + fmt.Printf("Try docIDs present in chunk: %s\n", assortDocID(curChunkHeader)) return nil } // uncompress the already loaded data @@ -219,6 +260,15 @@ func getDocValueLocs(docID uint64, metaHeader []zap.MetaData) (uint64, uint64) { return math.MaxUint64, math.MaxUint64 } +func assortDocID(metaHeader []zap.MetaData) string { + docIDs := "" + for _, meta := range metaHeader { + id := fmt.Sprintf("%d", meta.DocID) + docIDs += id + ", " + } + return docIDs +} + func init() { RootCmd.AddCommand(docvalueCmd) } From 71a726bbf6862f156022fb895565ceaac252c277 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 4 Jan 2018 15:34:55 +0530 Subject: [PATCH 121/728] perf issue was due to duplicate fieldIDs getting inserted to the list of dv enabled fields list - DocValueFields in mem segment. Moved back to the original type `DocValueFields map[uint16]bool` for easy look up to check whether the fieldID is configured for dv storage. 
--- index/scorch/segment/mem/build.go | 2 +- index/scorch/segment/mem/segment.go | 8 ++++---- index/scorch/segment/zap/build.go | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 27a297d2f..72ea08eb8 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -121,7 +121,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } // TODO with mapping changes for dv //if field.Options().IncludeDocValues() { - s.DocValueFields = append(s.DocValueFields, fieldID) + s.DocValueFields[fieldID] = true //} } diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 3ba7df7e9..5ef3e1f34 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -89,9 +89,8 @@ type Segment struct { StoredPos []map[uint16][][]uint64 // for storing the docValue persisted fields - // field id - DocValueFields []uint16 - + DocValueFields map[uint16]bool + // footprint of the segment, updated when analyzed document mutations // are added into the segment sizeInBytes uint64 @@ -100,7 +99,8 @@ type Segment struct { // New builds a new empty Segment func New() *Segment { return &Segment{ - FieldsMap: map[string]uint16{}, + FieldsMap: map[string]uint16{}, + DocValueFields: map[uint16]bool{}, } } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index f5a92562d..c7f73769e 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -440,7 +440,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for _, fieldID := range memSegment.DocValueFields { + for fieldID := range memSegment.DocValueFields { field := memSegment.FieldsInv[fieldID] docTermMap := 
make(map[uint64][]byte, 0) dict, err := memSegment.Dictionary(field) From 111f0d07214d7d6bd773893f1ce535ea83f25ff7 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 3 Jan 2018 15:26:21 -0800 Subject: [PATCH 122/728] Updated Rollback APIs New APIs: + RollbackPoints() - Retrieves the available list of rollback points: epoch+meta. - The application will need to check with the meta to decide on the rollback point. + Rollback() - API requires a rollback point identified by the first API. - Atomically & Durably rolls back the index to specified point, provided the specified rollback point is still available. + Unit test: TestIndexRollback - Writes a batch. - Sets the rollback point. - Writes second batch. - Rollback to previously decided point. - Ensure that data is as is before the second batch. --- index/scorch/introducer.go | 6 +- index/scorch/snapshot_rollback.go | 172 +++++++++++++++++-------- index/scorch/snapshot_rollback_test.go | 128 ++++++++++++------ 3 files changed, 215 insertions(+), 91 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 715325626..0b9c48537 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -289,7 +289,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { deleted: segmentSnapshot.deleted, cachedDocs: segmentSnapshot.cachedDocs, } - segmentSnapshot.segment.AddRef() + newSnapshot.segment[i].segment.AddRef() + + // remove segment from ineligibleForRemoval map + filename := zapFileName(segmentSnapshot.id) + delete(s.ineligibleForRemoval, filename) } if revertTo.persisted != nil { diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index d4b1f2eb8..43c3ba9f1 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -15,98 +15,164 @@ package scorch import ( - "bytes" + "fmt" "log" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/boltdb/bolt" ) -// PreviousPersistedSnapshot 
returns the next older, previous -// IndexSnapshot based on the provided IndexSnapshot. If the provided -// argument is nil, the most recently persisted IndexSnapshot is returned. -// This API allows the application to walk backwards into the history -// of a store to previous points in time. A nil return value indicates -// that no previous snapshots are available. -func (s *Scorch) PreviousPersistedSnapshot(is *IndexSnapshot) (*IndexSnapshot, error) { +type RollbackPoint struct { + epoch uint64 + meta map[string][]byte +} + +func (r *RollbackPoint) GetInternal(key []byte) []byte { + return r.meta[string(key)] +} + +// RollbackPoints returns an array of rollback points available +// for the application to make a decision on where to rollback +// to. A nil return value indicates that there are no available +// rollback points. +func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { if s.rootBolt == nil { - return nil, nil + return nil, fmt.Errorf("RollbackPoints: root is nil") } - // start a read-only transaction + // start a read-only bolt transaction tx, err := s.rootBolt.Begin(false) if err != nil { - return nil, err + return nil, fmt.Errorf("RollbackPoints: failed to start" + + " read-only transaction") } - // Read-only bolt transactions to be rolled back. 
+ // read-only bolt transactions to be rolled back defer func() { _ = tx.Rollback() }() snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { - return nil, nil + return nil, fmt.Errorf("RollbackPoints: no snapshots available") } - pos := []byte(nil) + rollbackPoints := []*RollbackPoint{} - if is != nil { - pos = segment.EncodeUvarintAscending(nil, is.epoch) - } - - c := snapshots.Cursor() - for k, _ := c.Last(); k != nil; k, _ = c.Prev() { - if pos == nil || bytes.Compare(k, pos) < 0 { - _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) - if err != nil { - log.Printf("PreviousPersistedSnapshot:"+ - " unable to parse segment epoch %x, continuing", k) - continue - } + c1 := snapshots.Cursor() + for k, _ := c1.Last(); k != nil; k, _ = c1.Prev() { + _, snapshotEpoch, err := segment.DecodeUvarintAscending(k) + if err != nil { + log.Printf("RollbackPoints:"+ + " unable to parse segment epoch %x, continuing", k) + continue + } - snapshot := snapshots.Bucket(k) - if snapshot == nil { - log.Printf("PreviousPersistedSnapshot:"+ - " snapshot key, but bucket missing %x, continuing", k) - continue - } + snapshot := snapshots.Bucket(k) + if snapshot == nil { + log.Printf("RollbackPoints:"+ + " snapshot key, but bucket missing %x, continuing", k) + continue + } - indexSnapshot, err := s.loadSnapshot(snapshot) - if err != nil { - log.Printf("PreviousPersistedSnapshot:"+ - " unable to load snapshot, %v, continuing", err) - continue + meta := map[string][]byte{} + c2 := snapshot.Cursor() + for j, _ := c2.First(); j != nil; j, _ = c2.Next() { + if j[0] == boltInternalKey[0] { + internalBucket := snapshot.Bucket(j) + err = internalBucket.ForEach(func(key []byte, val []byte) error { + copiedVal := append([]byte(nil), val...) 
+ meta[string(key)] = copiedVal + return nil + }) + if err != nil { + break + } } + } - indexSnapshot.epoch = snapshotEpoch - return indexSnapshot, nil + if err != nil { + log.Printf("RollbackPoints:"+ + " failed in fetching internal data: %v", err) + continue } + + rollbackPoints = append(rollbackPoints, &RollbackPoint{ + epoch: snapshotEpoch, + meta: meta, + }) } - return nil, nil + return rollbackPoints, nil } -// SnapshotRevert atomically brings the store back to the point in time -// as represented by the revertTo IndexSnapshot. SnapshotRevert() should -// only be passed an IndexSnapshot that came from the same store. -func (s *Scorch) SnapshotRevert(revertTo *IndexSnapshot) error { - revert := &snapshotReversion{ - snapshot: revertTo, - applied: make(chan error), +// Rollback atomically and durably (if unsafeBatch is unset) brings +// the store back to the point in time as represented by the +// RollbackPoint. Rollback() should only be passed a RollbackPoint +// that came from the same store using the RollbackPoints() API. 
+func (s *Scorch) Rollback(to *RollbackPoint) error { + if to == nil { + return fmt.Errorf("Rollback: RollbackPoint is nil") } - if !s.unsafeBatch { - revert.persisted = make(chan error) + if s.rootBolt == nil { + return fmt.Errorf("Rollback: root is nil") } - s.revertToSnapshots <- revert + revert := &snapshotReversion{} + + s.rootLock.Lock() + + err := s.rootBolt.View(func(tx *bolt.Tx) error { + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return fmt.Errorf("Rollback: no snapshots available") + } + + pos := segment.EncodeUvarintAscending(nil, to.epoch) + + snapshot := snapshots.Bucket(pos) + if snapshot == nil { + return fmt.Errorf("Rollback: snapshot not found") + } + + indexSnapshot, err := s.loadSnapshot(snapshot) + if err != nil { + return fmt.Errorf("Rollback: unable to load snapshot: %v", err) + } + + // add segments referenced by loaded index snapshot to the + // ineligibleForRemoval map + for _, segSnap := range indexSnapshot.segment { + filename := zapFileName(segSnap.id) + s.ineligibleForRemoval[filename] = true + } + + revert.snapshot = indexSnapshot + revert.applied = make(chan error) + + if !s.unsafeBatch { + revert.persisted = make(chan error) + } + + return nil + }) + + s.rootLock.Unlock() - // block until this IndexSnapshot is applied - err := <-revert.applied if err != nil { return err } + // introduce the reversion + s.revertToSnapshots <- revert + + // block until this snapshot is applied + err = <-revert.applied + if err != nil { + return fmt.Errorf("Rollback: failed with err: %v", err) + } + if revert.persisted != nil { err = <-revert.persisted } diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 879d01685..9816a51e6 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -45,70 +45,124 @@ func TestIndexRollback(t *testing.T) { } }() - // create 2 docs + // create a batch, insert 2 new documents + batch := index.NewBatch() doc 
:= document.NewDocument("1") doc.AddField(document.NewTextField("name", []uint64{}, []byte("test1"))) - err = idx.Update(doc) - if err != nil { - t.Error(err) - } - + batch.Update(doc) doc = document.NewDocument("2") doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2"))) - err = idx.Update(doc) + batch.Update(doc) + + err = idx.Batch(batch) if err != nil { - t.Error(err) + t.Fatal(err) } - // create a batch, insert new doc, update existing doc, delete existing doc - batch := index.NewBatch() + sh, ok := idx.(*Scorch) + if !ok { + t.Fatalf("Not a scorch index?") + } + + // fetch rollback points available as of here + rollbackPoints, err := sh.RollbackPoints() + if err != nil || len(rollbackPoints) == 0 { + t.Fatal(err, len(rollbackPoints)) + } + + // set this as a rollback point for the future + rollbackPoint := rollbackPoints[0] + + // create another batch, insert 2 new documents, and delete an existing one + batch = index.NewBatch() doc = document.NewDocument("3") doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) batch.Update(doc) - doc = document.NewDocument("2") - doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2updated"))) + doc = document.NewDocument("4") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test4"))) batch.Update(doc) batch.Delete("1") err = idx.Batch(batch) if err != nil { - t.Error(err) + t.Fatal(err) } - sh, ok := idx.(*Scorch) - if !ok { - t.Errorf("Not a scorch index?") + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + + docCount, err := reader.DocCount() + if err != nil { + t.Fatal(err) } - // Get Last persisted snapshot - ss, err := sh.PreviousPersistedSnapshot(nil) + // expect docs 2, 3, 4 + if docCount != 3 { + t.Fatalf("unexpected doc count: %v", docCount) + } + ret, err := reader.Document("1") + if err != nil || ret != nil { + t.Fatal(ret, err) + } + ret, err = reader.Document("2") + if err != nil || ret == nil { + t.Fatal(ret, err) + } + ret, 
err = reader.Document("3") + if err != nil || ret == nil { + t.Fatal(ret, err) + } + ret, err = reader.Document("4") + if err != nil || ret == nil { + t.Fatal(ret, err) + } + + err = reader.Close() if err != nil { - t.Error(err) + t.Fatal(err) } - // Retrieve the snapshot earlier - prev, err := sh.PreviousPersistedSnapshot(ss) + // rollback to the selected rollback point + err = sh.Rollback(rollbackPoint) if err != nil { - t.Error(err) + t.Fatal(err) } - if prev != nil { - err = sh.SnapshotRevert(prev) - if err != nil { - t.Error(err) - } + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } - newRoot, err := sh.PreviousPersistedSnapshot(nil) - if err != nil { - t.Error(err) - } + docCount, err = reader.DocCount() + if err != nil { + t.Fatal(err) + } - if newRoot == nil { - t.Errorf("Failed to retrieve latest persisted snapshot") - } + // expect only docs 1, 2 + if docCount != 2 { + t.Fatalf("unexpected doc count: %v", docCount) + } + ret, err = reader.Document("1") + if err != nil || ret == nil { + t.Fatal(ret, err) + } + ret, err = reader.Document("2") + if err != nil || ret == nil { + t.Fatal(ret, err) + } + ret, err = reader.Document("3") + if err != nil || ret != nil { + t.Fatal(ret, err) + } + ret, err = reader.Document("4") + if err != nil || ret != nil { + t.Fatal(ret, err) + } - if newRoot.epoch <= prev.epoch { - t.Errorf("Unexpected epoch, %v <= %v", newRoot.epoch, prev.epoch) - } + err = reader.Close() + if err != nil { + t.Fatal(err) } } From c691cd2bb55c81fdd3634a9b23fa0b895fc7b3ce Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 5 Jan 2018 10:17:18 -0500 Subject: [PATCH 123/728] refactor scorch/zap command-line tools under bleve zap command-line tool added to main bleve command-line tool this required physical relocation due to the vendoring used only on the bleve command-line tool (unforseen limitation) a new scorch command-line tool has also been introduced and for the same reasons it is physically store under the top-level bleve 
command-line tool as well --- cmd/bleve/cmd/scorch.go | 25 +++++++ cmd/bleve/cmd/scorch/info.go | 59 ++++++++++++++++ cmd/bleve/cmd/scorch/root.go | 70 +++++++++++++++++++ cmd/bleve/cmd/scorch/snapshot.go | 46 ++++++++++++ .../cmd/zap/main.go => cmd/bleve/cmd/zap.go | 10 +-- .../cmd/zap/cmd => cmd/bleve/cmd/zap}/dict.go | 2 +- .../zap/cmd => cmd/bleve/cmd/zap}/docvalue.go | 2 +- .../zap/cmd => cmd/bleve/cmd/zap}/explore.go | 2 +- .../zap/cmd => cmd/bleve/cmd/zap}/fields.go | 2 +- .../zap/cmd => cmd/bleve/cmd/zap}/footer.go | 2 +- .../cmd/zap/cmd => cmd/bleve/cmd/zap}/root.go | 2 +- .../zap/cmd => cmd/bleve/cmd/zap}/stored.go | 2 +- index/scorch/persister.go | 4 +- index/scorch/segment/zap/cmd/zap/README.md | 3 - 14 files changed, 215 insertions(+), 16 deletions(-) create mode 100644 cmd/bleve/cmd/scorch.go create mode 100644 cmd/bleve/cmd/scorch/info.go create mode 100644 cmd/bleve/cmd/scorch/root.go create mode 100644 cmd/bleve/cmd/scorch/snapshot.go rename index/scorch/segment/zap/cmd/zap/main.go => cmd/bleve/cmd/zap.go (79%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/dict.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/docvalue.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/explore.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/fields.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/footer.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/root.go (99%) rename {index/scorch/segment/zap/cmd/zap/cmd => cmd/bleve/cmd/zap}/stored.go (99%) delete mode 100644 index/scorch/segment/zap/cmd/zap/README.md diff --git a/cmd/bleve/cmd/scorch.go b/cmd/bleve/cmd/scorch.go new file mode 100644 index 000000000..781db7031 --- /dev/null +++ b/cmd/bleve/cmd/scorch.go @@ -0,0 +1,25 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cmd + +import ( + "github.com/blevesearch/bleve/cmd/bleve/cmd/scorch" +) + +// make scorch command-line tool a bleve sub-command + +func init() { + RootCmd.AddCommand(scorch.RootCmd) +} diff --git a/cmd/bleve/cmd/scorch/info.go b/cmd/bleve/cmd/scorch/info.go new file mode 100644 index 000000000..2b4674f06 --- /dev/null +++ b/cmd/bleve/cmd/scorch/info.go @@ -0,0 +1,59 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +// dictCmd represents the dict command +var infoCmd = &cobra.Command{ + Use: "info", + Short: "info prints basic info about the index", + Long: `The info command prints basic info about the index.`, + RunE: func(cmd *cobra.Command, args []string) error { + + reader, err := index.Reader() + if err != nil { + return err + } + + count, err := reader.DocCount() + if err != nil { + return err + } + + fmt.Printf("count: %d\n", count) + + // var numSnapshots int + // var rootSnapshot uint64 + // index.VisitBoltSnapshots(func(snapshotEpoch uint64) error { + // if rootSnapshot == 0 { + // rootSnapshot = snapshotEpoch + // } + // numSnapshots++ + // return nil + // }) + // fmt.Printf("has %d snapshot(s), root: %d\n", numSnapshots, rootSnapshot) + + return nil + }, +} + +func init() { + RootCmd.AddCommand(infoCmd) +} diff --git a/cmd/bleve/cmd/scorch/root.go b/cmd/bleve/cmd/scorch/root.go new file mode 100644 index 000000000..b27992edc --- /dev/null +++ b/cmd/bleve/cmd/scorch/root.go @@ -0,0 +1,70 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "fmt" + "os" + + "github.com/blevesearch/bleve/index/scorch" + "github.com/spf13/cobra" +) + +var index *scorch.Scorch + +// RootCmd represents the base command when called without any subcommands +var RootCmd = &cobra.Command{ + Use: "scorch", + Short: "command-line tool to interact with a scorch index", + Long: `Scorch is a command-line tool to interact with a scorch index.`, + PersistentPreRunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 1 { + return fmt.Errorf("must specify path to scorch index") + } + + readOnly := true + config := map[string]interface{}{ + "read_only": readOnly, + "path": args[0], + } + + idx, err := scorch.NewScorch(scorch.Name, config, nil) + if err != nil { + return err + } + + err = idx.Open() + if err != nil { + return fmt.Errorf("error opening: %v", err) + } + + index = idx.(*scorch.Scorch) + + return nil + }, + PersistentPostRunE: func(cmd *cobra.Command, args []string) error { + return nil + }, +} + +// Execute adds all child commands to the root command sets flags appropriately. +// This is called by main.main(). It only needs to happen once to the rootCmd. +func Execute() { + if err := RootCmd.Execute(); err != nil { + fmt.Println(err) + os.Exit(-1) + } +} diff --git a/cmd/bleve/cmd/scorch/snapshot.go b/cmd/bleve/cmd/scorch/snapshot.go new file mode 100644 index 000000000..1c51c056c --- /dev/null +++ b/cmd/bleve/cmd/scorch/snapshot.go @@ -0,0 +1,46 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + + "github.com/spf13/cobra" +) + +// snapshotsCmd represents the snapshots command +var snapshotCmd = &cobra.Command{ + Use: "snapshot", + Short: "info prints details about the snapshots in the index", + Long: `The snapshot command prints details about the snapshots in the index.`, + RunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 2 { + snapshotEpochs, err := index.RootBoltSnapshotEpochs() + if err != nil { + return err + } + for _, snapshotEpoch := range snapshotEpochs { + fmt.Printf("%d\n", snapshotEpoch) + } + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(snapshotCmd) +} diff --git a/index/scorch/segment/zap/cmd/zap/main.go b/cmd/bleve/cmd/zap.go similarity index 79% rename from index/scorch/segment/zap/cmd/zap/main.go rename to cmd/bleve/cmd/zap.go index 23c500a33..b84d5f2b3 100644 --- a/index/scorch/segment/zap/cmd/zap/main.go +++ b/cmd/bleve/cmd/zap.go @@ -12,12 +12,14 @@ // See the License for the specific language governing permissions and // limitations under the License. -package main +package cmd import ( - "github.com/blevesearch/bleve/index/scorch/segment/zap/cmd/zap/cmd" + "github.com/blevesearch/bleve/cmd/bleve/cmd/zap" ) -func main() { - cmd.Execute() +// make zap command-line tool a bleve sub-command + +func init() { + RootCmd.AddCommand(zap.RootCmd) } diff --git a/index/scorch/segment/zap/cmd/zap/cmd/dict.go b/cmd/bleve/cmd/zap/dict.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/dict.go rename to cmd/bleve/cmd/zap/dict.go index fa8b3277e..3e2727195 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/dict.go +++ b/cmd/bleve/cmd/zap/dict.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package cmd +package zap import ( "encoding/binary" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/docvalue.go rename to cmd/bleve/cmd/zap/docvalue.go index ee15bac35..165829fdf 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package cmd +package zap import ( "bytes" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/explore.go b/cmd/bleve/cmd/zap/explore.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/explore.go rename to cmd/bleve/cmd/zap/explore.go index 012a829fe..de05c63e7 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package cmd +package zap import ( "encoding/binary" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/fields.go b/cmd/bleve/cmd/zap/fields.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/fields.go rename to cmd/bleve/cmd/zap/fields.go index cfc40974b..cf8cc3d86 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/fields.go +++ b/cmd/bleve/cmd/zap/fields.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package cmd +package zap import ( "encoding/binary" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/footer.go b/cmd/bleve/cmd/zap/footer.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/footer.go rename to cmd/bleve/cmd/zap/footer.go index 0460360fc..96078ded6 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/footer.go +++ b/cmd/bleve/cmd/zap/footer.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package cmd +package zap import ( "fmt" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/root.go b/cmd/bleve/cmd/zap/root.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/root.go rename to cmd/bleve/cmd/zap/root.go index f969bbf13..ee2b62602 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/root.go +++ b/cmd/bleve/cmd/zap/root.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package cmd +package zap import ( "fmt" diff --git a/index/scorch/segment/zap/cmd/zap/cmd/stored.go b/cmd/bleve/cmd/zap/stored.go similarity index 99% rename from index/scorch/segment/zap/cmd/zap/cmd/stored.go rename to cmd/bleve/cmd/zap/stored.go index 64e42c7e6..ba1143cb1 100644 --- a/index/scorch/segment/zap/cmd/zap/cmd/stored.go +++ b/cmd/bleve/cmd/zap/stored.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -package cmd +package zap import ( "encoding/binary" diff --git a/index/scorch/persister.go b/index/scorch/persister.go index acf241ebf..e0614151e 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -443,7 +443,7 @@ var NumSnapshotsToKeep = 1 // Removes enough snapshots from the rootBolt so that the // s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. 
func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { - persistedEpochs, err := s.rootBoltSnapshotEpochs() + persistedEpochs, err := s.RootBoltSnapshotEpochs() if err != nil { return 0, err } @@ -565,7 +565,7 @@ func (s *Scorch) removeOldZapFiles() error { return nil } -func (s *Scorch) rootBoltSnapshotEpochs() ([]uint64, error) { +func (s *Scorch) RootBoltSnapshotEpochs() ([]uint64, error) { var rv []uint64 err := s.rootBolt.View(func(tx *bolt.Tx) error { snapshots := tx.Bucket(boltSnapshotsBucket) diff --git a/index/scorch/segment/zap/cmd/zap/README.md b/index/scorch/segment/zap/cmd/zap/README.md deleted file mode 100644 index 99f55d365..000000000 --- a/index/scorch/segment/zap/cmd/zap/README.md +++ /dev/null @@ -1,3 +0,0 @@ -# zap command line utility - -Kind of a hack just put together quickly to let me debug some issues. From 57a075afdb5bdd2debe8b228512e50d99e1a352c Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 5 Jan 2018 11:50:07 -0500 Subject: [PATCH 124/728] improving command-line tool for scorch --- cmd/bleve/cmd/scorch/ascii.go | 59 ++++++++++++++++++++++++++++++ cmd/bleve/cmd/scorch/deleted.go | 55 ++++++++++++++++++++++++++++ cmd/bleve/cmd/scorch/internal.go | 61 ++++++++++++++++++++++++++++++++ cmd/bleve/cmd/scorch/snapshot.go | 20 ++++++++++- index/scorch/persister.go | 22 ++++++++++++ index/scorch/snapshot_index.go | 8 +++++ index/scorch/snapshot_segment.go | 8 +++++ 7 files changed, 232 insertions(+), 1 deletion(-) create mode 100644 cmd/bleve/cmd/scorch/ascii.go create mode 100644 cmd/bleve/cmd/scorch/deleted.go create mode 100644 cmd/bleve/cmd/scorch/internal.go diff --git a/cmd/bleve/cmd/scorch/ascii.go b/cmd/bleve/cmd/scorch/ascii.go new file mode 100644 index 000000000..7b36b5b9c --- /dev/null +++ b/cmd/bleve/cmd/scorch/ascii.go @@ -0,0 +1,59 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "strconv" + + "github.com/blevesearch/bleve/index/scorch/mergeplan" + "github.com/spf13/cobra" +) + +// asciiCmd represents the snapshots command +var asciiCmd = &cobra.Command{ + Use: "ascii", + Short: "ascii prints details an ascii representation of the snapshots in the index", + Long: `The ascii command prints an ascii representation of the snapshots in the index.`, + RunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 2 { + return fmt.Errorf("snapshot epoch required") + } else if len(args) < 3 { + snapshotEpoch, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + return err + } + snapshot, err := index.LoadSnapshot(snapshotEpoch) + if err != nil { + return err + } + segments := snapshot.Segments() + var mergePlanSegments []mergeplan.Segment + for _, v := range segments { + mergePlanSegments = append(mergePlanSegments, v) + } + + str := mergeplan.ToBarChart(args[1], 25, mergePlanSegments, nil) + fmt.Printf("%s\n", str) + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(asciiCmd) +} diff --git a/cmd/bleve/cmd/scorch/deleted.go b/cmd/bleve/cmd/scorch/deleted.go new file mode 100644 index 000000000..cb2a9245c --- /dev/null +++ b/cmd/bleve/cmd/scorch/deleted.go @@ -0,0 +1,55 @@ +// Copyright (c) 2017 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "strconv" + + "github.com/spf13/cobra" +) + +// deletedCmd represents the deleted command +var deletedCmd = &cobra.Command{ + Use: "deleted", + Short: "deleted prints the deleted bitmap for segments in the index snapshot", + Long: `The delete command prints the deleted bitmap for segments in the index snapshot.`, + RunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 2 { + return fmt.Errorf("snapshot epoch required") + } else if len(args) < 3 { + snapshotEpoch, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + return err + } + snapshot, err := index.LoadSnapshot(snapshotEpoch) + if err != nil { + return err + } + segments := snapshot.Segments() + for i, segmentSnap := range segments { + deleted := segmentSnap.Deleted() + fmt.Printf("%d %v\n", i, deleted) + } + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(deletedCmd) +} diff --git a/cmd/bleve/cmd/scorch/internal.go b/cmd/bleve/cmd/scorch/internal.go new file mode 100644 index 000000000..027e90282 --- /dev/null +++ b/cmd/bleve/cmd/scorch/internal.go @@ -0,0 +1,61 @@ +// Copyright (c) 2017 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "strconv" + + "github.com/spf13/cobra" +) + +var ascii bool + +// internalCmd represents the snapshots command +var internalCmd = &cobra.Command{ + Use: "internal", + Short: "internal prints the internal k/v pairs in a snapshot", + Long: `The internal command prints the internal k/v pairs in a snapshot.`, + RunE: func(cmd *cobra.Command, args []string) error { + + if len(args) < 2 { + return fmt.Errorf("snapshot epoch required") + } else if len(args) < 3 { + snapshotEpoch, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + return err + } + snapshot, err := index.LoadSnapshot(snapshotEpoch) + if err != nil { + return err + } + internal := snapshot.Internal() + for k, v := range internal { + if ascii { + fmt.Printf("%s %s\n", k, string(v)) + } else { + fmt.Printf("%x %x\n", k, v) + } + } + } + + return nil + }, +} + +func init() { + RootCmd.AddCommand(internalCmd) + internalCmd.Flags().BoolVarP(&ascii, "ascii", "a", false, "print key/value in ascii") +} diff --git a/cmd/bleve/cmd/scorch/snapshot.go b/cmd/bleve/cmd/scorch/snapshot.go index 1c51c056c..bb035ce59 100644 --- a/cmd/bleve/cmd/scorch/snapshot.go +++ b/cmd/bleve/cmd/scorch/snapshot.go @@ -16,11 +16,13 @@ package scorch import ( "fmt" + "strconv" + "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/spf13/cobra" ) -// snapshotsCmd represents the snapshots command +// snapshotCmd represents the snapshot command var snapshotCmd = &cobra.Command{ Use: "snapshot", Short: "info prints details about the snapshots in the index", @@ 
-35,6 +37,22 @@ var snapshotCmd = &cobra.Command{ for _, snapshotEpoch := range snapshotEpochs { fmt.Printf("%d\n", snapshotEpoch) } + } else if len(args) < 3 { + snapshotEpoch, err := strconv.ParseUint(args[1], 10, 64) + if err != nil { + return err + } + snapshot, err := index.LoadSnapshot(snapshotEpoch) + if err != nil { + return err + } + segments := snapshot.Segments() + for i, segmentSnap := range segments { + segment := segmentSnap.Segment() + if segment, ok := segment.(*zap.Segment); ok { + fmt.Printf("%d %s\n", i, segment.Path()) + } + } } return nil diff --git a/index/scorch/persister.go b/index/scorch/persister.go index e0614151e..401086543 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -340,6 +340,28 @@ func (s *Scorch) loadFromBolt() error { }) } +// LoadSnapshot loads the segment with the specified epoch +// NOTE: this is currently ONLY intended to be used by the command-line tool +func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { + err = s.rootBolt.View(func(tx *bolt.Tx) error { + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil + } + snapshotKey := segment.EncodeUvarintAscending(nil, epoch) + snapshot := snapshots.Bucket(snapshotKey) + if snapshot == nil { + return nil + } + rv, err = s.loadSnapshot(snapshot) + return err + }) + if err != nil { + return nil, err + } + return rv, nil +} + func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { rv := &IndexSnapshot{ diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 074d4c41f..a9b105a0d 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -51,6 +51,14 @@ type IndexSnapshot struct { refs int64 } +func (i *IndexSnapshot) Segments() []*SegmentSnapshot { + return i.segment +} + +func (i *IndexSnapshot) Internal() map[string][]byte { + return i.internal +} + func (i *IndexSnapshot) AddRef() { i.m.Lock() i.refs++ diff --git 
a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index b3b8d8284..0c111c505 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -56,6 +56,14 @@ type SegmentSnapshot struct { cachedDocs *cachedDocs } +func (s *SegmentSnapshot) Segment() segment.Segment { + return s.segment +} + +func (s *SegmentSnapshot) Deleted() *roaring.Bitmap { + return s.deleted +} + func (s *SegmentSnapshot) Id() uint64 { return s.id } From 62374796050991a1ae019c431128e46987b2a5ec Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 5 Jan 2018 13:32:47 -0500 Subject: [PATCH 125/728] fix race condition in setting up event callbacks previous approach used SetEventCallback method which allowed you to change the callback, unfotunately that also included times after the goroutines were started and potentially firing the callback. checking lock on this would be too expensive, so instead we go for an approach that allows callbacks to be registered by name during process init(), then upon opening up an index a string config key 'eventCallbackName' is used to look up the appropriate callback function. also, since this string config name is serializable, it fits into the existing bleve index metadata without any new issues. --- index/scorch/event.go | 52 +++++++++++++++++++++++++++ index/scorch/event_test.go | 73 ++++++++++++++++++++++++++++++++++++++ index/scorch/scorch.go | 41 +++------------------ 3 files changed, 130 insertions(+), 36 deletions(-) create mode 100644 index/scorch/event.go create mode 100644 index/scorch/event_test.go diff --git a/index/scorch/event.go b/index/scorch/event.go new file mode 100644 index 000000000..ef3439a8b --- /dev/null +++ b/index/scorch/event.go @@ -0,0 +1,52 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import "time" + +// RegistryEventCallbacks should be treated as read-only after +// process init()'ialization. +var RegistryEventCallbacks = map[string]func(Event){} + +// Event represents the information provided in an OnEvent() callback. +type Event struct { + Kind EventKind + Scorch *Scorch + Duration time.Duration +} + +// EventKind represents an event code for OnEvent() callbacks. +type EventKind int + +// EventKindCloseStart is fired when a Scorch.Close() has begun. +var EventKindCloseStart = EventKind(1) + +// EventKindClose is fired when a scorch index has been fully closed. +var EventKindClose = EventKind(2) + +// EventKindMergerProgress is fired when the merger has completed a +// round of merge processing. +var EventKindMergerProgress = EventKind(3) + +// EventKindPersisterProgress is fired when the persister has completed +// a round of persistence processing. +var EventKindPersisterProgress = EventKind(4) + +// EventKindBatchIntroductionStart is fired when Batch() is invoked which +// introduces a new segment. +var EventKindBatchIntroductionStart = EventKind(5) + +// EventKindBatchIntroduction is fired when Batch() completes. +var EventKindBatchIntroduction = EventKind(6) diff --git a/index/scorch/event_test.go b/index/scorch/event_test.go new file mode 100644 index 000000000..92b49d20d --- /dev/null +++ b/index/scorch/event_test.go @@ -0,0 +1,73 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestEventBatchIntroductionStart(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + var count int + RegistryEventCallbacks["test"] = func(e Event) { + if e.Kind == EventKindBatchIntroductionStart { + count++ + } + } + + ourConfig := make(map[string]interface{}, len(testConfig)) + for k, v := range testConfig { + ourConfig[k] = v + } + ourConfig["eventCallbackName"] = "test" + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, ourConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + if count != 1 { + t.Fatalf("expected to see 1 batch introduction event event, saw %d", count) + } +} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index c6b417a28..df4f5de5f 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -1,4 +1,4 @@ -// Copyright (c) 2017 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -75,37 +75,6 @@ type Scorch struct { onEvent func(event Event) } -// Event represents the information provided in an OnEvent() callback. -type Event struct { - Kind EventKind - Scorch *Scorch - Duration time.Duration -} - -// EventKind represents an event code for OnEvent() callbacks. -type EventKind int - -// EventKindCLoseStart is fired when a Scorch.Close() has begun. -var EventKindCloseStart = EventKind(1) - -// EventKindClose is fired when a scorch index has been fully closed. -var EventKindClose = EventKind(2) - -// EventKindMergerProgress is fired when the merger has completed a -// round of merge processing. -var EventKindMergerProgress = EventKind(3) - -// EventKindPersisterProgress is fired when the persister has completed -// a round of persistence processing. -var EventKindPersisterProgress = EventKind(4) - -// EventKindBatchIntroductionStart is fired when Batch() is invoked which -// introduces a new segment. -var EventKindBatchIntroductionStart = EventKind(5) - -// EventKindBatchIntroduction is fired when Batch() completes. 
-var EventKindBatchIntroduction = EventKind(6) - func NewScorch(storeName string, config map[string]interface{}, analysisQueue *index.AnalysisQueue) (index.Index, error) { @@ -127,13 +96,13 @@ func NewScorch(storeName string, if ok { rv.unsafeBatch = ub } + ecbName, ok := config["eventCallbackName"].(string) + if ok { + rv.onEvent = RegistryEventCallbacks[ecbName] + } return rv, nil } -func (s *Scorch) SetEventCallback(f func(Event)) { - s.onEvent = f -} - func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { if s.onEvent != nil { s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) From e756c7acf0caef33e3d2d2f9c1050de746ec8e23 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 5 Jan 2018 15:29:01 -0500 Subject: [PATCH 126/728] add initial support for async error callback --- index/scorch/event.go | 4 ++++ index/scorch/merge.go | 3 +-- index/scorch/persister.go | 6 +++--- index/scorch/scorch.go | 13 ++++++++++++- 4 files changed, 20 insertions(+), 6 deletions(-) diff --git a/index/scorch/event.go b/index/scorch/event.go index ef3439a8b..dd79d6d06 100644 --- a/index/scorch/event.go +++ b/index/scorch/event.go @@ -16,6 +16,10 @@ package scorch import "time" +// RegistryAsyncErrorCallbacks should be treated as read-only after +// process init()'ialization. +var RegistryAsyncErrorCallbacks = map[string]func(error){} + // RegistryEventCallbacks should be treated as read-only after // process init()'ialization. 
var RegistryEventCallbacks = map[string]func(Event){} diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 0c166df7a..78c27ddb1 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -16,7 +16,6 @@ package scorch import ( "fmt" - "log" "os" "sync/atomic" "time" @@ -48,7 +47,7 @@ OUTER: // lets get started err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { - log.Printf("merging err: %v", err) + s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() continue OUTER } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 401086543..658e57aee 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -74,7 +74,7 @@ OUTER: close(ch) } if err != nil { - log.Printf("got err persisting snapshot: %v", err) + s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() continue OUTER } @@ -446,13 +446,13 @@ func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } func (s *Scorch) removeOldData() { removed, err := s.removeOldBoltSnapshots() if err != nil { - log.Printf("got err removing old bolt snapshots: %v", err) + s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err)) } if removed > 0 { err = s.removeOldZapFiles() if err != nil { - log.Printf("got err removing old zap files: %v", err) + s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err)) } } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index df4f5de5f..5d0cfb7c9 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -72,7 +72,8 @@ type Scorch struct { rootBolt *bolt.DB asyncTasks sync.WaitGroup - onEvent func(event Event) + onEvent func(event Event) + onAsyncError func(err error) } func NewScorch(storeName string, @@ -100,6 +101,10 @@ func NewScorch(storeName string, if ok { rv.onEvent = RegistryEventCallbacks[ecbName] } + aecbName, ok := config["asyncErrorCallbackName"].(string) + if ok { + rv.onAsyncError = 
RegistryAsyncErrorCallbacks[aecbName] + } return rv, nil } @@ -109,6 +114,12 @@ func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { } } +func (s *Scorch) fireAsyncError(err error) { + if s.onAsyncError != nil { + s.onAsyncError(err) + } +} + func (s *Scorch) Open() error { var ok bool s.path, ok = s.config["path"].(string) From 94b0367e47fe54650ffbe56b51c073baa4b0e298 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 5 Jan 2018 16:53:16 -0500 Subject: [PATCH 127/728] switch back to upsidedown as default index before merge to master --- config.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/config.go b/config.go index c1475db74..482efb408 100644 --- a/config.go +++ b/config.go @@ -21,10 +21,13 @@ import ( "time" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch" "github.com/blevesearch/bleve/index/store/gtreap" + "github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search/highlight/highlighter/html" + + // force import of scorch so its accessible by default + _ "github.com/blevesearch/bleve/index/scorch" ) var bleveExpVar = expvar.NewMap("bleve") @@ -69,7 +72,7 @@ func init() { Config.DefaultMemKVStore = gtreap.Name // default index - Config.DefaultIndexType = scorch.Name + Config.DefaultIndexType = upsidedown.Name bootDuration := time.Since(bootStart) bleveExpVar.Add("bootDuration", int64(bootDuration)) From 637fad78a55a5d207d6a74d6e585330d15706451 Mon Sep 17 00:00:00 2001 From: Tom Kralidis Date: Sat, 6 Jan 2018 21:04:03 -0500 Subject: [PATCH 128/728] fix minor typo --- cmd/bleve/cmd/index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bleve/cmd/index.go b/cmd/bleve/cmd/index.go index 63ad5f3bf..c8925b49b 100644 --- a/cmd/bleve/cmd/index.go +++ b/cmd/bleve/cmd/index.go @@ -111,7 +111,7 @@ func getAllFiles(args []string, rv chan file) { func init() { RootCmd.AddCommand(indexCmd) - 
indexCmd.Flags().BoolVarP(&keepDir, "keepDir", "d", false, "Keep the directory in the dodcument id, defaults false.") + indexCmd.Flags().BoolVarP(&keepDir, "keepDir", "d", false, "Keep the directory in the document id, defaults false.") indexCmd.Flags().BoolVarP(&keepExt, "keepExt", "x", false, "Keep the extension in the document id, defaults false.") indexCmd.Flags().BoolVarP(&parseJSON, "json", "j", true, "Parse the contents as JSON, defaults true.") } From 1788a0380385d48665a82d5c215db48251697a45 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 6 Jan 2018 21:09:53 -0500 Subject: [PATCH 129/728] remove junk from end of scorch readme --- index/scorch/README.md | 143 ----------------------------------------- 1 file changed, 143 deletions(-) diff --git a/index/scorch/README.md b/index/scorch/README.md index 690e7d3d5..861335a1b 100644 --- a/index/scorch/README.md +++ b/index/scorch/README.md @@ -365,146 +365,3 @@ A few simple principles have been identified. - Segments with all items deleted/obsoleted can be dropped. Merging of a segment should be able to proceed even if that segment is held by an ongoing snapshot, it should only delay the removal of it. - - -## TODO - -- need reference counting on the segments, to know when we can safely remove? - -- how well will bitmaps perform when large and possibly mmap'd? 
- - ------ -thinking out loud on storage - -- fields - - field name - field id -- term dictionary - - field id - FST (values postings ids) -- postings - - posting id - postings list -- freqs - - posting id - freqs list -- norms - - posting id - norms list -- stored - - docNum - - field id - field values - - - ----- - -race dialog with steve: - -state: 2, 4, 8 - -- introducing new segment X - - deleted bitmasks, 2, 4, 8 - -- merger, merge 4 and 8 - new segment Y - - -- merger wins - - state: 2, 9 - - introducer: need to recompute bitmask for 9, could lose again and keep losing race - -- introducer wins - - state: 2, 4, 8, X - 2-X, 4-X, 8-X, nil - - merger finishes: new segment Y, is not valid, need to be recomputed - - -### Bolt Segment Proposal - -Bucket - -"f" field storage - - Key Val - field name field id (var uint16) - - // TODO field location bits - -"d" term dictionary storage - Key Val - field id (var uint16) Vellum FST (mapping term to posting id uint64) - - -"p" postings list storage - Key Val - posting id (var uint64) Roaring Bitmap Serialization (doc numbers) - see FromBuffer - - -"x" chunked data storage - Key Val - chunk id (var uint64) sub-bucket - - Key Val - posting id (var uint64) sub-bucket - - - ALL Compressed Integer Encoding []uint64 - Key Val - "f" freqs 1 value per hit - "n" norms 1 value per hit - "i" fields values per hit - "s" start values per hit - "e" end values per hit - "p" pos values per hit - "a" array pos - entries - each entry is count - followed by uint64 - -"s" stored field data - Key Val - doc num (var uint64) sub-bucket - - Key Val - "m" mossy-like meta packed - - 16 bits - field id - 8 bits - field type - 2? bits - array pos length - - X bits - offset - X bits - length - - "d" raw []byte data (possibly compressed, need segment level config?) 
- - "a" array position info, packed slice uint64 - - - - - -Notes: - -It is assumed that each IndexReader (snapshot) starts a new Bolt TX (read-only) immediately, and holds it up until it is no longer needed. This allows us to use (unsafely) the raw bytes coming out of BoltDB as return values. Bolt guarantees they will be safe for the duration of the transaction (which we arrange to be the life of the index snapshot). - -Only physically store the fields in one direction, even though at runtime we need both. Upon opening the index, we can read in all the k/v pairs in the "f" bucket. We use the unsafe package to create a []string inverted mapping pointing to the underlying []byte in the BoltDB values. - -The term dictionary is stored opaquely as Vellum FST for each field. When accessing these keys, the []byte return to us is mmap'd by bolt under the hood. We then pass this to vellum using its []byte API, which then operates on it without ever forcing whole thing into memory unless needed. - -We do not need to persist the dictkeys slice since it is only there to support the dictionary iterator prefix/range searches, which are supported directly by the FST. - -Theory of operation of chunked storage is as follows. The postings list iterators only allow starting at the beginning, and have no "advance" capability. In the memory version, this means we always know the Nth hit in the postings list is the Nth entry in some other densely packed slice. However, while OK when everything is in RAM, this is not as suitable for a structure on disk, where wading through detailed info of records you don't care about is too expensive. Instead, we assume some fixed chunking, say 1024. All detailed info for document number N can be found inside of chunk N/1024. Now, the Advance operation still has to Next it's way through the posting list. But, now when it reaches a hit, it knows the chunk index as well as the hit index inside that chunk. 
Further, we push the chunk offsets to the top of the bolt structure, under the theory that we're likely to access data inside a chunk at the same time. For example, you're likely to access the frequency and norm values for a document hit together, so by organizing by chunk first, we increase the likelihood that this info is nearby on disk. - -The "f" and "n" sub-buckets inside a posting have 1 entry for each hit. (you must next-next-next within the chunk) - -The "i", "s", "e", "p", sub-buckets have entries for each hit. (you must have read and know the freq) - -The "a" sub-bucket has groupings, where each grouping starts with a count, followed by entries. - -For example, lets say hit docNum 27 has freq of 2. The first location for the hit has array positions (0, 1) length 2, and the second location for the hit has array positions (1, 3, 2) length 3. The entries in the slice for this hit look like: - -2 0 1 3 1 3 2 -^ ^ -| next entry, number of ints to follow for it -number of ints to follow for this entry From 4c256f56692f10e903bc1582ff7c7db5dbe354d1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 8 Jan 2018 10:58:33 +0530 Subject: [PATCH 130/728] DocValue Config, new API Changes -VisitableDocValueFields API for persisted DV field list -making dv configs overridable at field level -enabling on the fly/runtime un inverting of doc values -few UT updates --- document/field_boolean.go | 2 +- document/field_datetime.go | 2 +- document/field_numeric.go | 2 +- document/field_text.go | 2 +- document/indexing_options.go | 11 +++ document/indexing_options_test.go | 24 +++++++ index/scorch/scorch.go | 8 --- index/scorch/scorch_test.go | 69 +++++++++++++++++++ index/scorch/segment/mem/build.go | 8 +-- index/scorch/segment/segment.go | 9 ++- index/scorch/segment/zap/build_test.go | 87 ++++++++++++++++++++++++ index/scorch/segment/zap/docvalues.go | 22 +++++- index/scorch/segment/zap/segment_test.go | 83 ++++++++++++++++++++++ index/scorch/snapshot_index.go | 71 
+++++++++++++++++-- index/scorch/snapshot_segment.go | 45 ------------ mapping/field.go | 22 ++++++ mapping/index.go | 8 +++ mapping/mapping_test.go | 3 +- 18 files changed, 409 insertions(+), 69 deletions(-) diff --git a/document/field_boolean.go b/document/field_boolean.go index 668b431a1..c226374c0 100644 --- a/document/field_boolean.go +++ b/document/field_boolean.go @@ -20,7 +20,7 @@ import ( "github.com/blevesearch/bleve/analysis" ) -const DefaultBooleanIndexingOptions = StoreField | IndexField +const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues type BooleanField struct { name string diff --git a/document/field_datetime.go b/document/field_datetime.go index 6783d53d0..1db068c87 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -23,7 +23,7 @@ import ( "github.com/blevesearch/bleve/numeric" ) -const DefaultDateTimeIndexingOptions = StoreField | IndexField +const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues const DefaultDateTimePrecisionStep uint = 4 var MinTimeRepresentable = time.Unix(0, math.MinInt64) diff --git a/document/field_numeric.go b/document/field_numeric.go index 7faae2bbb..e32993c88 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -21,7 +21,7 @@ import ( "github.com/blevesearch/bleve/numeric" ) -const DefaultNumericIndexingOptions = StoreField | IndexField +const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues const DefaultPrecisionStep uint = 4 diff --git a/document/field_text.go b/document/field_text.go index 37873d36e..5f7a3ab64 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -20,7 +20,7 @@ import ( "github.com/blevesearch/bleve/analysis" ) -const DefaultTextIndexingOptions = IndexField +const DefaultTextIndexingOptions = IndexField | DocValues type TextField struct { name string diff --git a/document/indexing_options.go b/document/indexing_options.go index 5d562c1de..44498a8e9 100644 --- 
a/document/indexing_options.go +++ b/document/indexing_options.go @@ -20,6 +20,7 @@ const ( IndexField IndexingOptions = 1 << iota StoreField IncludeTermVectors + DocValues ) func (o IndexingOptions) IsIndexed() bool { @@ -34,6 +35,10 @@ func (o IndexingOptions) IncludeTermVectors() bool { return o&IncludeTermVectors != 0 } +func (o IndexingOptions) IncludeDocValues() bool { + return o&DocValues != 0 +} + func (o IndexingOptions) String() string { rv := "" if o.IsIndexed() { @@ -51,5 +56,11 @@ func (o IndexingOptions) String() string { } rv += "TV" } + if o.IncludeDocValues() { + if rv != "" { + rv += ", " + } + rv += "DV" + } return rv } diff --git a/document/indexing_options_test.go b/document/indexing_options_test.go index f6c6c996f..d88c41070 100644 --- a/document/indexing_options_test.go +++ b/document/indexing_options_test.go @@ -24,36 +24,56 @@ func TestIndexingOptions(t *testing.T) { isIndexed bool isStored bool includeTermVectors bool + docValues bool }{ { options: IndexField | StoreField | IncludeTermVectors, isIndexed: true, isStored: true, includeTermVectors: true, + docValues: false, }, { options: IndexField | IncludeTermVectors, isIndexed: true, isStored: false, includeTermVectors: true, + docValues: false, }, { options: StoreField | IncludeTermVectors, isIndexed: false, isStored: true, includeTermVectors: true, + docValues: false, }, { options: IndexField, isIndexed: true, isStored: false, includeTermVectors: false, + docValues: false, }, { options: StoreField, isIndexed: false, isStored: true, includeTermVectors: false, + docValues: false, + }, + { + options: DocValues, + isIndexed: false, + isStored: false, + includeTermVectors: false, + docValues: true, + }, + { + options: IndexField | StoreField | IncludeTermVectors | DocValues, + isIndexed: true, + isStored: true, + includeTermVectors: true, + docValues: true, }, } @@ -70,5 +90,9 @@ func TestIndexingOptions(t *testing.T) { if actuallyIncludeTermVectors != test.includeTermVectors { 
t.Errorf("expected includeTermVectors to be %v, got %v for %d", test.includeTermVectors, actuallyIncludeTermVectors, test.options) } + actuallyDocValues := test.options.IncludeDocValues() + if actuallyDocValues != test.docValues { + t.Errorf("expected docValue to be %v, got %v for %d", test.docValues, actuallyDocValues, test.options) + } } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5d0cfb7c9..57a01d7cb 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -37,14 +37,6 @@ const Name = "scorch" const Version uint8 = 1 -// UnInvertIndex is implemented by various scorch index implementations -// to provide the un inverting of the postings or other indexed values. -type UnInvertIndex interface { - // apparently need better namings here.. - VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error -} - type Scorch struct { readOnly bool version uint8 diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index a9df3e9e2..6e8ecb0cf 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1638,3 +1638,72 @@ func TestIndexDocumentVisitFieldTermsWithMultipleDocs(t *testing.T) { } } + +func TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // mix of field options, this exercises the run time/ on the fly un inverting of + // doc values for custom options enabled field like designation, dept. 
+ options := document.IndexField | document.StoreField | document.IncludeTermVectors + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) // default doc value persisted + doc.AddField(document.NewTextField("title", []uint64{}, []byte("mister"))) // default doc value persisted + doc.AddField(document.NewTextFieldWithIndexingOptions("designation", []uint64{}, []byte("engineer"), options)) + doc.AddField(document.NewTextFieldWithIndexingOptions("dept", []uint64{}, []byte("bleve"), options)) + + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + + fieldTerms := make(index.FieldTerms) + docNumber, err := indexReader.InternalID("1") + if err != nil { + t.Fatal(err) + } + err = indexReader.DocumentVisitFieldTerms(docNumber, []string{"name", "designation", "dept"}, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + expectedFieldTerms := index.FieldTerms{ + "name": []string{"test"}, + "designation": []string{"engineer"}, + "dept": []string{"bleve"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + +} diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 72ea08eb8..cd11fb401 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -119,10 +119,10 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { if field.Options().IsStored() { storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) } - // TODO with mapping changes for dv - //if field.Options().IncludeDocValues() { - s.DocValueFields[fieldID] = true - //} + + if 
field.Options().IncludeDocValues() { + s.DocValueFields[fieldID] = true + } } // now that its been rolled up into docMap, walk that diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 9102e0f4e..858ac3590 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -91,9 +91,14 @@ type Location interface { } // DocumentFieldTermVisitable is implemented by various scorch segment -// implementations to provide the un inverting of the postings -// or other indexed values. +// implementations with persistence for the un inverting of the +// postings or other indexed values. type DocumentFieldTermVisitable interface { VisitDocumentFieldTerms(localDocNum uint64, fields []string, visitor index.DocumentFieldTermVisitor) error + + // VisitableDocValueFields implementation should return + // the list of fields which are document value persisted and + // therefore visitable by the above VisitDocumentFieldTerms method. + VisitableDocValueFields() ([]string, error) } diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index da0e12ba6..9063980b7 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -286,3 +286,90 @@ func buildMemSegmentMulti() *mem.Segment { return segment } + +func buildMemSegmentWithDefaultFieldMapping() (*mem.Segment, []string) { + + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextField("_id", nil, []byte("a")), + document.NewTextField("name", nil, []byte("wow")), + document.NewTextField("desc", nil, []byte("some thing")), + document.NewTextField("tag", []uint64{0}, []byte("cold")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + var fields []string + fields = append(fields, "_id") + fields = append(fields, "name") + fields = append(fields, "desc") + fields = append(fields, "tag") + + // forge 
analyzed docs + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("wow"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("some"), + }, + &analysis.Token{ + Start: 5, + End: 10, + Position: 2, + Term: []byte("thing"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 4, + Position: 1, + Term: []byte("cold"), + }, + }, []uint64{0}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + return mem.NewFromAnalyzedDocs(results), fields +} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 3a75f29d2..3774afefb 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -151,7 +151,8 @@ func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) { return math.MaxUint64, math.MaxUint64 } -// VisitDocumentFieldTerms is an implementation of the UnInvertIndex interface +// VisitDocumentFieldTerms is an implementation of the +// DocumentFieldTermVisitable interface func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, visitor index.DocumentFieldTermVisitor) error { fieldID := uint16(0) @@ -178,3 +179,22 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, } return nil } + +// VisitableDocValueFields returns the list 
of fields with +// persisted doc value terms ready to be visitable using the +// VisitDocumentFieldTerms method. +func (s *Segment) VisitableDocValueFields() ([]string, error) { + if len(s.fieldsInv) == 0 { + return nil, nil + } + + var rv []string + for fieldID, field := range s.fieldsInv { + if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && + dvIter != nil { + rv = append(rv, field) + } + } + + return rv, nil +} diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index d4241c1d9..fb49f72e5 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -19,6 +19,9 @@ import ( "os" "reflect" "testing" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" ) func TestOpen(t *testing.T) { @@ -515,3 +518,83 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Errorf("expected count to be 1, got %d", count) } } + +func TestSegmentVisitableDocValueFieldsList(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + seg, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + + cerr := seg.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + + if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok { + fields, err := zaps.VisitableDocValueFields() + if err != nil { + t.Fatalf("segment VisitableDocValueFields err: %v", err) + } + // no persisted doc value fields + if len(fields) != 0 { + t.Errorf("expected no persisted fields for doc values, got: %#v", fields) + } + } + + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment, expectedFields := buildMemSegmentWithDefaultFieldMapping() + err = PersistSegment(memSegment, "/tmp/scorch.zap", 1) + if err != nil { + t.Fatalf("error persisting segment: %v", err) + 
} + + seg, err = Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := seg.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok { + fields, err := zaps.VisitableDocValueFields() + if err != nil { + t.Fatalf("segment VisitableDocValueFields err: %v", err) + } + + if !reflect.DeepEqual(fields, expectedFields) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields) + } + + fieldTerms := make(index.FieldTerms) + err = zaps.VisitDocumentFieldTerms(0, fields, func(field string, term []byte) { + fieldTerms[field] = append(fieldTerms[field], string(term)) + }) + if err != nil { + t.Error(err) + } + + expectedFieldTerms := index.FieldTerms{ + "name": []string{"wow"}, + "desc": []string{"some", "thing"}, + "tag": []string{"cold"}, + "_id": []string{"a"}, + } + if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { + t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) + } + + } +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index a9b105a0d..bb9975768 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -412,15 +412,64 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + // get the list of doc value persisted fields + pFields, err := zaps.VisitableDocValueFields() + if err != nil { + return err + } + // assort the fields for which terms look up have to + // be performed runtime + dvPendingFields := extractDvPendingFields(fields, pFields) + if len(dvPendingFields) == 0 { + // all fields are doc value persisted + return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + } + + // concurrently trigger the runtime doc 
value preparations for + // pending fields as well as the visit of the persisted doc values + errCh := make(chan error, 1) + + go func() { + defer close(errCh) + err := ss.cachedDocs.prepareFields(fields, ss) + if err != nil { + errCh <- err + } + }() + + // visit the persisted dv while the cache preparation is in progress + err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + if err != nil { + return err + } + + // err out if fieldCache preparation failed + err = <-errCh + if err != nil { + return err + } + + visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) + return nil } - // else fallback to the in memory fieldCache - err = ss.cachedDocs.prepareFields(fields, ss) + return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) +} + +func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, + ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { + err := ss.cachedDocs.prepareFields(fields, ss) if err != nil { return err } + visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) + return nil +} + +func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, + ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { + for _, field := range fields { if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { @@ -436,5 +485,19 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, } } - return nil +} + +func extractDvPendingFields(requestedFields, persistedFields []string) []string { + removeMap := map[string]struct{}{} + for _, str := range persistedFields { + removeMap[str] = struct{}{} + } + + rv := make([]string, 0, len(requestedFields)) + for _, s := range requestedFields { + if _, ok := removeMap[s]; !ok { + rv = append(rv, s) + } + } + return rv } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 0c111c505..5e64cb1f2 100644 
--- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -18,7 +18,6 @@ import ( "sync" "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -84,50 +83,6 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel return s.segment.VisitDocument(num, visitor) } -func (s *SegmentSnapshot) DocumentVisitFieldTerms(num uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error { - collection := make(map[string][][]byte) - // collect field indexed values - for _, field := range fields { - dict, err := s.Dictionary(field) - if err != nil { - return err - } - dictItr := dict.Iterator() - var next *index.DictEntry - next, err = dictItr.Next() - for next != nil && err == nil { - postings, err2 := dict.PostingsList(next.Term, nil) - if err2 != nil { - return err2 - } - postingsItr := postings.Iterator() - nextPosting, err2 := postingsItr.Next() - for err2 == nil && nextPosting != nil && nextPosting.Number() <= num { - if nextPosting.Number() == num { - // got what we're looking for - collection[field] = append(collection[field], []byte(next.Term)) - } - nextPosting, err = postingsItr.Next() - } - if err2 != nil { - return err - } - next, err = dictItr.Next() - } - if err != nil { - return err - } - } - // invoke callback - for field, values := range collection { - for _, value := range values { - visitor(field, value) - } - } - return nil -} - func (s *SegmentSnapshot) Count() uint64 { rv := s.segment.Count() diff --git a/mapping/field.go b/mapping/field.go index 9f1928ca5..898ee9d79 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -28,6 +28,7 @@ import ( var ( IndexDynamic = true StoreDynamic = true + DocValues = true // TODO revisit default? 
) // A FieldMapping describes how a specific item @@ -54,6 +55,10 @@ type FieldMapping struct { IncludeTermVectors bool `json:"include_term_vectors,omitempty"` IncludeInAll bool `json:"include_in_all,omitempty"` DateFormat string `json:"date_format,omitempty"` + + // DocValues, if true makes the index uninverting possible for this field + // It is useful for faceting and sorting queries. + DocValues bool `json:"docvalues,omitempty"` } // NewTextFieldMapping returns a default field mapping for text @@ -64,6 +69,7 @@ func NewTextFieldMapping() *FieldMapping { Index: true, IncludeTermVectors: true, IncludeInAll: true, + DocValues: true, } } @@ -71,6 +77,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewTextFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic + rv.DocValues = im.DocValues return rv } @@ -81,6 +88,7 @@ func NewNumericFieldMapping() *FieldMapping { Store: true, Index: true, IncludeInAll: true, + DocValues: true, } } @@ -88,6 +96,7 @@ func newNumericFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewNumericFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic + rv.DocValues = im.DocValues return rv } @@ -98,6 +107,7 @@ func NewDateTimeFieldMapping() *FieldMapping { Store: true, Index: true, IncludeInAll: true, + DocValues: true, } } @@ -105,6 +115,7 @@ func newDateTimeFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewDateTimeFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic + rv.DocValues = im.DocValues return rv } @@ -115,6 +126,7 @@ func NewBooleanFieldMapping() *FieldMapping { Store: true, Index: true, IncludeInAll: true, + DocValues: true, } } @@ -122,6 +134,7 @@ func newBooleanFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewBooleanFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic + rv.DocValues = im.DocValues return rv } @@ -132,6 +145,7 @@ func NewGeoPointFieldMapping() *FieldMapping { Store: 
true, Index: true, IncludeInAll: true, + DocValues: true, } } @@ -147,6 +161,9 @@ func (fm *FieldMapping) Options() document.IndexingOptions { if fm.IncludeTermVectors { rv |= document.IncludeTermVectors } + if fm.DocValues { + rv |= document.DocValues + } return rv } @@ -308,6 +325,11 @@ func (fm *FieldMapping) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "docvalues": + err := json.Unmarshal(v, &fm.DocValues) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } diff --git a/mapping/index.go b/mapping/index.go index cefa59803..99ed6353e 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -50,6 +50,7 @@ type IndexMappingImpl struct { DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` + DocValues bool `json:"docvalues,omitempty"` CustomAnalysis *customAnalysis `json:"analysis,omitempty"` cache *registry.Cache } @@ -154,6 +155,7 @@ func NewIndexMapping() *IndexMappingImpl { DefaultField: defaultField, IndexDynamic: IndexDynamic, StoreDynamic: StoreDynamic, + DocValues: DocValues, CustomAnalysis: newCustomAnalysis(), cache: registry.NewCache(), } @@ -217,6 +219,7 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { im.TypeMapping = make(map[string]*DocumentMapping) im.StoreDynamic = StoreDynamic im.IndexDynamic = IndexDynamic + im.DocValues = DocValues var invalidKeys []string for k, v := range tmp { @@ -271,6 +274,11 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } + case "docvalues": + err := json.Unmarshal(v, &im.DocValues) + if err != nil { + return err + } default: invalidKeys = append(invalidKeys, k) } diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 735aef057..5d7527e0a 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -40,7 +40,8 @@ var mappingSource = []byte(`{ "store": true, "index": true, "include_term_vectors": true, - 
"include_in_all": true + "include_in_all": true, + "docvalues": true } ] } From 43bfcc00c9704785d66dbd3dd261e47d4bf27dbb Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 8 Jan 2018 16:54:01 -0800 Subject: [PATCH 131/728] Do not account mmap'ed part of zap segments in MemoryUsed This API is designed to only emit the dirty "unpersisted" bytes only. This does not included the mmap'ed part in the zap segments (disk). --- index/scorch/scorch.go | 14 ++++++++------ index/scorch/segment/zap/segment.go | 3 ++- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5d0cfb7c9..c2e4b48b4 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -410,13 +410,15 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { func (s *Scorch) MemoryUsed() uint64 { var memUsed uint64 s.rootLock.RLock() - for _, segmentSnapshot := range s.root.segment { - memUsed += 8 /* size of id -> uint64 */ + - segmentSnapshot.segment.SizeInBytes() - if segmentSnapshot.deleted != nil { - memUsed += segmentSnapshot.deleted.GetSizeInBytes() + if s.root != nil { + for _, segmentSnapshot := range s.root.segment { + memUsed += 8 /* size of id -> uint64 */ + + segmentSnapshot.segment.SizeInBytes() + if segmentSnapshot.deleted != nil { + memUsed += segmentSnapshot.deleted.GetSizeInBytes() + } + memUsed += segmentSnapshot.cachedDocs.sizeInBytes() } - memUsed += segmentSnapshot.cachedDocs.sizeInBytes() } s.rootLock.RUnlock() return memUsed diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 93b7466c8..9f9910366 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -105,7 +105,8 @@ func (s *Segment) SizeInBytes() uint64 { // 8 /* size of fieldsIndexOffset -> uint64 */ sizeOfUints := 36 - sizeInBytes := len(s.mm) + len(s.path) + sizeOfUints + // Do not include the mmap'ed part + sizeInBytes := len(s.path) + sizeOfUints for k, _ := range s.fieldsMap { 
sizeInBytes += len(k) + 2 /* size of uint16 */ From 53aef2104e8b1d91961d151e056d7ebc8b2f8bb2 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 10 Jan 2018 22:00:26 +0530 Subject: [PATCH 132/728] fixing err handling in UTs, name changes --- index/scorch/segment/zap/docvalues.go | 5 ----- index/scorch/segment/zap/segment_test.go | 18 +++++++++--------- mapping/field.go | 14 +++++++------- mapping/index.go | 10 +++++----- 4 files changed, 21 insertions(+), 26 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 3774afefb..cdb16ccb9 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -184,10 +184,6 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, // persisted doc value terms ready to be visitable using the // VisitDocumentFieldTerms method. func (s *Segment) VisitableDocValueFields() ([]string, error) { - if len(s.fieldsInv) == 0 { - return nil, nil - } - var rv []string for fieldID, field := range s.fieldsInv { if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && @@ -195,6 +191,5 @@ func (s *Segment) VisitableDocValueFields() ([]string, error) { rv = append(rv, field) } } - return rv, nil } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index fb49f72e5..704f9e72e 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -40,7 +40,7 @@ func TestOpen(t *testing.T) { defer func() { cerr := segment.Close() if cerr != nil { - t.Fatalf("error closing segment: %v", err) + t.Fatalf("error closing segment: %v", cerr) } }() @@ -340,7 +340,7 @@ func TestOpenMulti(t *testing.T) { defer func() { cerr := segment.Close() if cerr != nil { - t.Fatalf("error closing segment: %v", err) + t.Fatalf("error closing segment: %v", cerr) } }() @@ -440,7 +440,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { defer func() { cerr := 
segment.Close() if cerr != nil { - t.Fatalf("error closing segment: %v", err) + t.Fatalf("error closing segment: %v", cerr) } }() @@ -533,11 +533,6 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { t.Fatalf("error opening segment: %v", err) } - cerr := seg.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok { fields, err := zaps.VisitableDocValueFields() if err != nil { @@ -549,6 +544,10 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } } + err = seg.Close() + if err != nil { + t.Fatalf("error closing segment: %v", err) + } _ = os.RemoveAll("/tmp/scorch.zap") memSegment, expectedFields := buildMemSegmentWithDefaultFieldMapping() @@ -561,10 +560,11 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { if err != nil { t.Fatalf("error opening segment: %v", err) } + defer func() { cerr := seg.Close() if cerr != nil { - t.Fatalf("error closing segment: %v", err) + t.Fatalf("error closing segment: %v", cerr) } }() diff --git a/mapping/field.go b/mapping/field.go index 898ee9d79..278faa1a9 100644 --- a/mapping/field.go +++ b/mapping/field.go @@ -26,9 +26,9 @@ import ( // control the default behavior for dynamic fields (those not explicitly mapped) var ( - IndexDynamic = true - StoreDynamic = true - DocValues = true // TODO revisit default? + IndexDynamic = true + StoreDynamic = true + DocValuesDynamic = true // TODO revisit default? 
) // A FieldMapping describes how a specific item @@ -77,7 +77,7 @@ func newTextFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewTextFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic - rv.DocValues = im.DocValues + rv.DocValues = im.DocValuesDynamic return rv } @@ -96,7 +96,7 @@ func newNumericFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewNumericFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic - rv.DocValues = im.DocValues + rv.DocValues = im.DocValuesDynamic return rv } @@ -115,7 +115,7 @@ func newDateTimeFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewDateTimeFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic - rv.DocValues = im.DocValues + rv.DocValues = im.DocValuesDynamic return rv } @@ -134,7 +134,7 @@ func newBooleanFieldMappingDynamic(im *IndexMappingImpl) *FieldMapping { rv := NewBooleanFieldMapping() rv.Store = im.StoreDynamic rv.Index = im.IndexDynamic - rv.DocValues = im.DocValues + rv.DocValues = im.DocValuesDynamic return rv } diff --git a/mapping/index.go b/mapping/index.go index 99ed6353e..fc5d12a73 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -50,7 +50,7 @@ type IndexMappingImpl struct { DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` - DocValues bool `json:"docvalues,omitempty"` + DocValuesDynamic bool `json:"docvalues_dynamic,omitempty"` CustomAnalysis *customAnalysis `json:"analysis,omitempty"` cache *registry.Cache } @@ -155,7 +155,7 @@ func NewIndexMapping() *IndexMappingImpl { DefaultField: defaultField, IndexDynamic: IndexDynamic, StoreDynamic: StoreDynamic, - DocValues: DocValues, + DocValuesDynamic: DocValuesDynamic, CustomAnalysis: newCustomAnalysis(), cache: registry.NewCache(), } @@ -219,7 +219,7 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { im.TypeMapping = make(map[string]*DocumentMapping) im.StoreDynamic = 
StoreDynamic im.IndexDynamic = IndexDynamic - im.DocValues = DocValues + im.DocValuesDynamic = DocValuesDynamic var invalidKeys []string for k, v := range tmp { @@ -274,8 +274,8 @@ func (im *IndexMappingImpl) UnmarshalJSON(data []byte) error { if err != nil { return err } - case "docvalues": - err := json.Unmarshal(v, &im.DocValues) + case "docvalues_dynamic": + err := json.Unmarshal(v, &im.DocValuesDynamic) if err != nil { return err } From a9532e510ab7de665b7d131abb77eac400699cf3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 10 Jan 2018 15:15:31 -0500 Subject: [PATCH 133/728] refactor slightly to use our new hosted snowball stemmers rather than having each package include it directly inside of bleve, we have decide to host them all in one repo https://github.com/blevesearch/snowballstem this makes the easier for the rest of the community to use outside of bleve contexts --- analysis/lang/ru/analyzer_ru.go | 2 +- analysis/lang/ru/analyzer_ru_test.go | 54 +- analysis/lang/ru/snowball/stem_Unicode.go | 737 ---------------------- analysis/lang/ru/stemmer_ru.go | 11 +- analysis/lang/ru/stemmer_ru_test.go | 2 +- analysis/lang/ru/stop_filter_ru.go | 2 +- analysis/lang/ru/stop_words_ru.go | 469 +++++++------- vendor/manifest | 8 + 8 files changed, 309 insertions(+), 976 deletions(-) delete mode 100644 analysis/lang/ru/snowball/stem_Unicode.go diff --git a/analysis/lang/ru/analyzer_ru.go b/analysis/lang/ru/analyzer_ru.go index 3b3404037..d1b7688c0 100644 --- a/analysis/lang/ru/analyzer_ru.go +++ b/analysis/lang/ru/analyzer_ru.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. 
diff --git a/analysis/lang/ru/analyzer_ru_test.go b/analysis/lang/ru/analyzer_ru_test.go index a7ffef414..6cda4a586 100644 --- a/analysis/lang/ru/analyzer_ru_test.go +++ b/analysis/lang/ru/analyzer_ru_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -49,6 +49,58 @@ func TestRussianAnalyzer(t *testing.T) { input: []byte("как"), output: analysis.TokenStream{}, }, + // digits safe + { + input: []byte("text 1000"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("text"), + }, + &analysis.Token{ + Term: []byte("1000"), + }, + }, + }, + { + input: []byte("Вместе с тем о силе электромагнитной энергии имели представление еще"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("вмест"), + }, + &analysis.Token{ + Term: []byte("сил"), + }, + &analysis.Token{ + Term: []byte("электромагнитн"), + }, + &analysis.Token{ + Term: []byte("энерг"), + }, + &analysis.Token{ + Term: []byte("имел"), + }, + &analysis.Token{ + Term: []byte("представлен"), + }, + }, + }, + { + input: []byte("Но знание это хранилось в тайне"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("знан"), + }, + &analysis.Token{ + Term: []byte("эт"), + }, + &analysis.Token{ + Term: []byte("хран"), + }, + &analysis.Token{ + Term: []byte("тайн"), + }, + }, + }, } cache := registry.NewCache() diff --git a/analysis/lang/ru/snowball/stem_Unicode.go b/analysis/lang/ru/snowball/stem_Unicode.go deleted file mode 100644 index dfd2f2eb0..000000000 --- a/analysis/lang/ru/snowball/stem_Unicode.go +++ /dev/null @@ -1,737 +0,0 @@ -//! This file was generated automatically by the Snowball to Go compiler -//! 
http://snowballstem.org/ - -package snowball - -import ( - snowballRuntime "github.com/snowballstem/snowball/go" -) - -var A_0 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0432\u0448\u0438\u0441\u044C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438\u0441\u044C", A: 0, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0432", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0432", A: 3, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0432", A: 3, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0432\u0448\u0438", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0432\u0448\u0438", A: 6, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0432\u0448\u0438", A: 6, B: 2, F: nil}, -} - -var A_1 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0435\u043C\u0443", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u043C\u0443", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0445", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0445", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u044E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044E\u044E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u044E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u044E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044F\u044F", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0430\u044F", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043C\u0438", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043C\u0438", A: -1, B: 1, F: 
nil}, - &snowballRuntime.Among{Str: "\u044B\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0433\u043E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0433\u043E", A: -1, B: 1, F: nil}, -} - -var A_2 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0432\u0448", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0432\u0448", A: 0, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0432\u0448", A: 0, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0449", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044E\u0449", A: 3, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u044E\u0449", A: 4, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043D\u043D", A: -1, B: 1, F: nil}, -} - -var A_3 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0441\u044C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0441\u044F", A: -1, B: 1, F: nil}, -} - -var A_4 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u044B\u0442", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u044E\u0442", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u044E\u0442", A: 1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u044F\u0442", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0442", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u0435\u0442", A: 4, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0442", A: -1, B: 2, F: nil}, - 
&snowballRuntime.Among{Str: "\u043D\u044B", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043D\u044B", A: 7, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0442\u044C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u0442\u044C", A: 9, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0442\u044C", A: 9, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0448\u044C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0448\u044C", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u044E", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u044E", A: 14, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043B\u0430", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043B\u0430", A: 16, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043B\u0430", A: 16, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043D\u0430", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043D\u0430", A: 19, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0442\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0442\u0435", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0439\u0442\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u0439\u0442\u0435", A: 23, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0439\u0442\u0435", A: 23, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043B\u0438", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043B\u0438", A: 26, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043B\u0438", A: 26, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0443\u0439", A: 29, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0439", A: 29, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043B", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043B", A: 32, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043B", A: 32, B: 2, F: nil}, - 
&snowballRuntime.Among{Str: "\u044B\u043C", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043C", A: -1, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043D", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043D", A: 38, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043B\u043E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B\u043B\u043E", A: 40, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u043B\u043E", A: 40, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043D\u043E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043D\u043E", A: 43, B: 2, F: nil}, - &snowballRuntime.Among{Str: "\u043D\u043D\u043E", A: 43, B: 1, F: nil}, -} - -var A_5 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0443", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044F\u0445", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u044F\u0445", A: 1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0430\u0445", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044B", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044E", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044C\u044E", A: 6, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u044E", A: 6, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044F", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044C\u044F", A: 9, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u044F", A: 9, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0430", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0432", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0432", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044C\u0435", A: 15, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0435", A: 15, B: 1, F: nil}, - 
&snowballRuntime.Among{Str: "\u0438", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0438", A: 18, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0438", A: 18, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044F\u043C\u0438", A: 18, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u044F\u043C\u0438", A: 21, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0430\u043C\u0438", A: 18, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0439", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0439", A: 24, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0435\u0439", A: 25, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0439", A: 24, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0439", A: 24, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044F\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u044F\u043C", A: 29, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0430\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u0438\u0435\u043C", A: 32, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u043C", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E", A: -1, B: 1, F: nil}, -} - -var A_6 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u043E\u0441\u0442", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043E\u0441\u0442\u044C", A: -1, B: 1, F: nil}, -} - -var A_7 = []*snowballRuntime.Among{ - &snowballRuntime.Among{Str: "\u0435\u0439\u0448", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u044C", A: -1, B: 3, F: nil}, - &snowballRuntime.Among{Str: "\u0435\u0439\u0448\u0435", A: -1, B: 1, F: nil}, - &snowballRuntime.Among{Str: "\u043D", A: -1, B: 2, F: nil}, -} - -var G_v = []byte{33, 65, 8, 232} - -type Context struct { - i_p2 int - i_pV int -} - -func r_mark_regions(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - // (, line 57 - context.i_pV = 
env.Limit - context.i_p2 = env.Limit - // do, line 61 - var v_1 = env.Cursor -lab0: - for { - // (, line 61 - // gopast, line 62 - golab1: - for { - lab2: - for { - if !env.InGrouping(G_v, 1072, 1103) { - break lab2 - } - break golab1 - } - if env.Cursor >= env.Limit { - break lab0 - } - env.NextChar() - } - // setmark pV, line 62 - context.i_pV = env.Cursor - // gopast, line 62 - golab3: - for { - lab4: - for { - if !env.OutGrouping(G_v, 1072, 1103) { - break lab4 - } - break golab3 - } - if env.Cursor >= env.Limit { - break lab0 - } - env.NextChar() - } - // gopast, line 63 - golab5: - for { - lab6: - for { - if !env.InGrouping(G_v, 1072, 1103) { - break lab6 - } - break golab5 - } - if env.Cursor >= env.Limit { - break lab0 - } - env.NextChar() - } - // gopast, line 63 - golab7: - for { - lab8: - for { - if !env.OutGrouping(G_v, 1072, 1103) { - break lab8 - } - break golab7 - } - if env.Cursor >= env.Limit { - break lab0 - } - env.NextChar() - } - // setmark p2, line 63 - context.i_p2 = env.Cursor - break lab0 - } - env.Cursor = v_1 - return true -} - -func r_R2(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - if !(context.i_p2 <= env.Cursor) { - return false - } - return true -} - -func r_perfective_gerund(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 71 - // [, line 72 - env.Ket = env.Cursor - // substring, line 72 - among_var = env.FindAmongB(A_0, context) - if among_var == 0 { - return false - } - // ], line 72 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 76 - // or, line 76 - lab0: - for { - var v_1 = env.Limit - env.Cursor - lab1: - for { - // literal, line 76 - if !env.EqSB("\u0430") { - break lab1 - } - break lab0 - } - env.Cursor = env.Limit - v_1 - // literal, line 76 - if !env.EqSB("\u044F") { - return false - } - break lab0 - } - // delete, line 76 - if !env.SliceDel() { 
- return false - } - } else if among_var == 2 { - // (, line 83 - // delete, line 83 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_adjective(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 87 - // [, line 88 - env.Ket = env.Cursor - // substring, line 88 - among_var = env.FindAmongB(A_1, context) - if among_var == 0 { - return false - } - // ], line 88 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 97 - // delete, line 97 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_adjectival(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 101 - // call adjective, line 102 - if !r_adjective(env, context) { - return false - } - // try, line 109 - var v_1 = env.Limit - env.Cursor -lab0: - for { - // (, line 109 - // [, line 110 - env.Ket = env.Cursor - // substring, line 110 - among_var = env.FindAmongB(A_2, context) - if among_var == 0 { - env.Cursor = env.Limit - v_1 - break lab0 - } - // ], line 110 - env.Bra = env.Cursor - if among_var == 0 { - env.Cursor = env.Limit - v_1 - break lab0 - } else if among_var == 1 { - // (, line 115 - // or, line 115 - lab1: - for { - var v_2 = env.Limit - env.Cursor - lab2: - for { - // literal, line 115 - if !env.EqSB("\u0430") { - break lab2 - } - break lab1 - } - env.Cursor = env.Limit - v_2 - // literal, line 115 - if !env.EqSB("\u044F") { - env.Cursor = env.Limit - v_1 - break lab0 - } - break lab1 - } - // delete, line 115 - if !env.SliceDel() { - return false - } - } else if among_var == 2 { - // (, line 122 - // delete, line 122 - if !env.SliceDel() { - return false - } - } - break lab0 - } - return true -} - -func r_reflexive(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 128 - // [, line 129 - 
env.Ket = env.Cursor - // substring, line 129 - among_var = env.FindAmongB(A_3, context) - if among_var == 0 { - return false - } - // ], line 129 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 132 - // delete, line 132 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_verb(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 136 - // [, line 137 - env.Ket = env.Cursor - // substring, line 137 - among_var = env.FindAmongB(A_4, context) - if among_var == 0 { - return false - } - // ], line 137 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 143 - // or, line 143 - lab0: - for { - var v_1 = env.Limit - env.Cursor - lab1: - for { - // literal, line 143 - if !env.EqSB("\u0430") { - break lab1 - } - break lab0 - } - env.Cursor = env.Limit - v_1 - // literal, line 143 - if !env.EqSB("\u044F") { - return false - } - break lab0 - } - // delete, line 143 - if !env.SliceDel() { - return false - } - } else if among_var == 2 { - // (, line 151 - // delete, line 151 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_noun(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 159 - // [, line 160 - env.Ket = env.Cursor - // substring, line 160 - among_var = env.FindAmongB(A_5, context) - if among_var == 0 { - return false - } - // ], line 160 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 167 - // delete, line 167 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_derivational(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 175 - // [, line 176 - env.Ket = env.Cursor - // substring, line 176 - among_var = env.FindAmongB(A_6, 
context) - if among_var == 0 { - return false - } - // ], line 176 - env.Bra = env.Cursor - // call R2, line 176 - if !r_R2(env, context) { - return false - } - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 179 - // delete, line 179 - if !env.SliceDel() { - return false - } - } - return true -} - -func r_tidy_up(env *snowballRuntime.Env, ctx interface{}) bool { - context := ctx.(*Context) - _ = context - var among_var int32 - // (, line 183 - // [, line 184 - env.Ket = env.Cursor - // substring, line 184 - among_var = env.FindAmongB(A_7, context) - if among_var == 0 { - return false - } - // ], line 184 - env.Bra = env.Cursor - if among_var == 0 { - return false - } else if among_var == 1 { - // (, line 188 - // delete, line 188 - if !env.SliceDel() { - return false - } - // [, line 189 - env.Ket = env.Cursor - // literal, line 189 - if !env.EqSB("\u043D") { - return false - } - // ], line 189 - env.Bra = env.Cursor - // literal, line 189 - if !env.EqSB("\u043D") { - return false - } - // delete, line 189 - if !env.SliceDel() { - return false - } - } else if among_var == 2 { - // (, line 192 - // literal, line 192 - if !env.EqSB("\u043D") { - return false - } - // delete, line 192 - if !env.SliceDel() { - return false - } - } else if among_var == 3 { - // (, line 194 - // delete, line 194 - if !env.SliceDel() { - return false - } - } - return true -} - -func Stem(env *snowballRuntime.Env) bool { - var context = &Context{ - i_p2: 0, - i_pV: 0, - } - _ = context - // (, line 199 - // do, line 201 - var v_1 = env.Cursor -lab0: - for { - // call mark_regions, line 201 - if !r_mark_regions(env, context) { - break lab0 - } - break lab0 - } - env.Cursor = v_1 - // backwards, line 202 - env.LimitBackward = env.Cursor - env.Cursor = env.Limit - // setlimit, line 202 - var v_2 = env.Limit - env.Cursor - // tomark, line 202 - if env.Cursor < context.i_pV { - return false - } - env.Cursor = context.i_pV - var v_3 = env.LimitBackward - 
env.LimitBackward = env.Cursor - env.Cursor = env.Limit - v_2 - // (, line 202 - // do, line 203 - var v_4 = env.Limit - env.Cursor -lab1: - for { - // (, line 203 - // or, line 204 - lab2: - for { - var v_5 = env.Limit - env.Cursor - lab3: - for { - // call perfective_gerund, line 204 - if !r_perfective_gerund(env, context) { - break lab3 - } - break lab2 - } - env.Cursor = env.Limit - v_5 - // (, line 205 - // try, line 205 - var v_6 = env.Limit - env.Cursor - lab4: - for { - // call reflexive, line 205 - if !r_reflexive(env, context) { - env.Cursor = env.Limit - v_6 - break lab4 - } - break lab4 - } - // or, line 206 - lab5: - for { - var v_7 = env.Limit - env.Cursor - lab6: - for { - // call adjectival, line 206 - if !r_adjectival(env, context) { - break lab6 - } - break lab5 - } - env.Cursor = env.Limit - v_7 - lab7: - for { - // call verb, line 206 - if !r_verb(env, context) { - break lab7 - } - break lab5 - } - env.Cursor = env.Limit - v_7 - // call noun, line 206 - if !r_noun(env, context) { - break lab1 - } - break lab5 - } - break lab2 - } - break lab1 - } - env.Cursor = env.Limit - v_4 - // try, line 209 - var v_8 = env.Limit - env.Cursor -lab8: - for { - // (, line 209 - // [, line 209 - env.Ket = env.Cursor - // literal, line 209 - if !env.EqSB("\u0438") { - env.Cursor = env.Limit - v_8 - break lab8 - } - // ], line 209 - env.Bra = env.Cursor - // delete, line 209 - if !env.SliceDel() { - return false - } - break lab8 - } - // do, line 212 - var v_9 = env.Limit - env.Cursor -lab9: - for { - // call derivational, line 212 - if !r_derivational(env, context) { - break lab9 - } - break lab9 - } - env.Cursor = env.Limit - v_9 - // do, line 213 - var v_10 = env.Limit - env.Cursor -lab10: - for { - // call tidy_up, line 213 - if !r_tidy_up(env, context) { - break lab10 - } - break lab10 - } - env.Cursor = env.Limit - v_10 - env.LimitBackward = v_3 - env.Cursor = env.LimitBackward - return true -} diff --git a/analysis/lang/ru/stemmer_ru.go 
b/analysis/lang/ru/stemmer_ru.go index 6da9095b0..47a90456b 100644 --- a/analysis/lang/ru/stemmer_ru.go +++ b/analysis/lang/ru/stemmer_ru.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,10 +16,10 @@ package ru import ( "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/analysis/lang/ru/snowball" "github.com/blevesearch/bleve/registry" - snowballRuntime "github.com/snowballstem/snowball/go" + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/russian" ) const SnowballStemmerName = "stemmer_ru_snowball" @@ -33,9 +33,8 @@ func NewRussianStemmerFilter() *RussianStemmerFilter { func (s *RussianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { for _, token := range input { - - env := snowballRuntime.NewEnv(string(token.Term)) - snowball.Stem(env) + env := snowballstem.NewEnv(string(token.Term)) + russian.Stem(env) token.Term = []byte(env.Current()) } return input diff --git a/analysis/lang/ru/stemmer_ru_test.go b/analysis/lang/ru/stemmer_ru_test.go index 1795497ff..39949fcb0 100644 --- a/analysis/lang/ru/stemmer_ru_test.go +++ b/analysis/lang/ru/stemmer_ru_test.go @@ -1,4 +1,4 @@ -// Copyright (c) 2015 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/analysis/lang/ru/stop_filter_ru.go b/analysis/lang/ru/stop_filter_ru.go index 5679420a1..326fb9d56 100644 --- a/analysis/lang/ru/stop_filter_ru.go +++ b/analysis/lang/ru/stop_filter_ru.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. 
// // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. diff --git a/analysis/lang/ru/stop_words_ru.go b/analysis/lang/ru/stop_words_ru.go index 60bec0236..0129f48c4 100644 --- a/analysis/lang/ru/stop_words_ru.go +++ b/analysis/lang/ru/stop_words_ru.go @@ -7,242 +7,253 @@ import ( const StopName = "stop_ru" -var RussianStopWords = []byte(` | From http://snowball.tartarus.org/algorithms/russian/stop.txt +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string +var RussianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/russian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" - | a russian stop word list. comments begin with vertical bar. each stop - | word is at the start of a line. + | a russian stop word list. comments begin with vertical bar. each stop + | word is at the start of a line. - | this is a ranked list (commonest to rarest) of stopwords derived from - | a large text sample. + | this is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. - | letter 'ё' is translated to 'е'. + | letter 'ё' is translated to 'е'. 
- и | and - в | in/into - во | alternative form - не | not - что | what/that - он | he - на | on/onto - я | i - с | from - со | alternative form - как | how - а | milder form of 'no' (but) - то | conjunction and form of 'that' - все | all - она | she - так | so, thus - его | him - но | but - да | yes/and - ты | thou - к | towards, by - у | around, chez - же | intensifier particle - вы | you - за | beyond, behind - бы | conditional/subj. particle - по | up to, along - только | only - ее | her - мне | to me - было | it was - вот | here is/are, particle - от | away from - меня | me - еще | still, yet, more - нет | no, there isnt/arent - о | about - из | out of - ему | to him - теперь | now - когда | when - даже | even - ну | so, well - вдруг | suddenly - ли | interrogative particle - если | if - уже | already, but homonym of 'narrower' - или | or - ни | neither - быть | to be - был | he was - него | prepositional form of его - до | up to - вас | you accusative - нибудь | indef. suffix preceded by hyphen - опять | again - уж | already, but homonym of 'adder' - вам | to you - сказал | he said - ведь | particle 'after all' - там | there - потом | then - себя | oneself - ничего | nothing - ей | to her - может | usually with 'быть' as 'maybe' - они | they - тут | here - где | where - есть | there is/are - надо | got to, must - ней | prepositional form of ей - для | for - мы | we - тебя | thee - их | them, their - чем | than - была | she was - сам | self - чтоб | in order to - без | without - будто | as if - человек | man, person, one - чего | genitive form of 'what' - раз | once - тоже | also - себе | to oneself - под | beneath - жизнь | life - будет | will be - ж | short form of intensifer particle 'же' - тогда | then - кто | who - этот | this - говорил | was saying - того | genitive form of 'that' - потому | for that reason - этого | genitive form of 'this' - какой | which - совсем | altogether - ним | prepositional form of 'его', 'они' - здесь | here - этом | 
prepositional form of 'этот' - один | one - почти | almost - мой | my - тем | instrumental/dative plural of 'тот', 'то' - чтобы | full form of 'in order that' - нее | her (acc.) - кажется | it seems - сейчас | now - были | they were - куда | where to - зачем | why - сказать | to say - всех | all (acc., gen. preposn. plural) - никогда | never - сегодня | today - можно | possible, one can - при | by - наконец | finally - два | two - об | alternative form of 'о', about - другой | another - хоть | even - после | after - над | above - больше | more - тот | that one (masc.) - через | across, in - эти | these - нас | us - про | about - всего | in all, only, of all - них | prepositional form of 'они' (they) - какая | which, feminine - много | lots - разве | interrogative particle - сказала | she said - три | three - эту | this, acc. fem. sing. - моя | my, feminine - впрочем | moreover, besides - хорошо | good - свою | ones own, acc. fem. sing. - этой | oblique form of 'эта', fem. 'this' - перед | in front of - иногда | sometimes - лучше | better - чуть | a little - том | preposn. form of 'that one' - нельзя | one must not - такой | such a one - им | to them - более | more - всегда | always - конечно | of course - всю | acc. fem. sing of 'all' - между | between +и | and +в | in/into +во | alternative form +не | not +что | what/that +он | he +на | on/onto +я | i +с | from +со | alternative form +как | how +а | milder form of 'no' (but) +то | conjunction and form of 'that' +все | all +она | she +так | so, thus +его | him +но | but +да | yes/and +ты | thou +к | towards, by +у | around, chez +же | intensifier particle +вы | you +за | beyond, behind +бы | conditional/subj. 
particle +по | up to, along +только | only +ее | her +мне | to me +было | it was +вот | here is/are, particle +от | away from +меня | me +еще | still, yet, more +нет | no, there isnt/arent +о | about +из | out of +ему | to him +теперь | now +когда | when +даже | even +ну | so, well +вдруг | suddenly +ли | interrogative particle +если | if +уже | already, but homonym of 'narrower' +или | or +ни | neither +быть | to be +был | he was +него | prepositional form of его +до | up to +вас | you accusative +нибудь | indef. suffix preceded by hyphen +опять | again +уж | already, but homonym of 'adder' +вам | to you +сказал | he said +ведь | particle 'after all' +там | there +потом | then +себя | oneself +ничего | nothing +ей | to her +может | usually with 'быть' as 'maybe' +они | they +тут | here +где | where +есть | there is/are +надо | got to, must +ней | prepositional form of ей +для | for +мы | we +тебя | thee +их | them, their +чем | than +была | she was +сам | self +чтоб | in order to +без | without +будто | as if +человек | man, person, one +чего | genitive form of 'what' +раз | once +тоже | also +себе | to oneself +под | beneath +жизнь | life +будет | will be +ж | short form of intensifer particle 'же' +тогда | then +кто | who +этот | this +говорил | was saying +того | genitive form of 'that' +потому | for that reason +этого | genitive form of 'this' +какой | which +совсем | altogether +ним | prepositional form of 'его', 'они' +здесь | here +этом | prepositional form of 'этот' +один | one +почти | almost +мой | my +тем | instrumental/dative plural of 'тот', 'то' +чтобы | full form of 'in order that' +нее | her (acc.) +кажется | it seems +сейчас | now +были | they were +куда | where to +зачем | why +сказать | to say +всех | all (acc., gen. preposn. 
plural) +никогда | never +сегодня | today +можно | possible, one can +при | by +наконец | finally +два | two +об | alternative form of 'о', about +другой | another +хоть | even +после | after +над | above +больше | more +тот | that one (masc.) +через | across, in +эти | these +нас | us +про | about +всего | in all, only, of all +них | prepositional form of 'они' (they) +какая | which, feminine +много | lots +разве | interrogative particle +сказала | she said +три | three +эту | this, acc. fem. sing. +моя | my, feminine +впрочем | moreover, besides +хорошо | good +свою | ones own, acc. fem. sing. +этой | oblique form of 'эта', fem. 'this' +перед | in front of +иногда | sometimes +лучше | better +чуть | a little +том | preposn. form of 'that one' +нельзя | one must not +такой | such a one +им | to them +более | more +всегда | always +конечно | of course +всю | acc. fem. sing of 'all' +между | between - | b: some paradigms - | - | personal pronouns - | - | я меня мне мной [мною] - | ты тебя тебе тобой [тобою] - | он его ему им [него, нему, ним] - | она ее эи ею [нее, нэи, нею] - | оно его ему им [него, нему, ним] - | - | мы нас нам нами - | вы вас вам вами - | они их им ими [них, ним, ними] - | - | себя себе собой [собою] - | - | demonstrative pronouns: этот (this), тот (that) - | - | этот эта это эти - | этого эты это эти - | этого этой этого этих - | этому этой этому этим - | этим этой этим [этою] этими - | этом этой этом этих - | - | тот та то те - | того ту то те - | того той того тех - | тому той тому тем - | тем той тем [тою] теми - | том той том тех - | - | determinative pronouns - | - | (a) весь (all) - | - | весь вся все все - | всего всю все все - | всего всей всего всех - | всему всей всему всем - | всем всей всем [всею] всеми - | всем всей всем всех - | - | (b) сам (himself etc) - | - | сам сама само сами - | самого саму само самих - | самого самой самого самих - | самому самой самому самим - | самим самой самим [самою] самими - | самом самой самом самих - 
| - | stems of verbs 'to be', 'to have', 'to do' and modal - | - | быть бы буд быв есть суть - | име - | дел - | мог мож мочь - | уме - | хоч хот - | долж - | можн - | нужн - | нельзя + | b: some paradigms + | + | personal pronouns + | + | я меня мне мной [мною] + | ты тебя тебе тобой [тобою] + | он его ему им [него, нему, ним] + | она ее эи ею [нее, нэи, нею] + | оно его ему им [него, нему, ним] + | + | мы нас нам нами + | вы вас вам вами + | они их им ими [них, ним, ними] + | + | себя себе собой [собою] + | + | demonstrative pronouns: этот (this), тот (that) + | + | этот эта это эти + | этого эты это эти + | этого этой этого этих + | этому этой этому этим + | этим этой этим [этою] этими + | этом этой этом этих + | + | тот та то те + | того ту то те + | того той того тех + | тому той тому тем + | тем той тем [тою] теми + | том той том тех + | + | determinative pronouns + | + | (a) весь (all) + | + | весь вся все все + | всего всю все все + | всего всей всего всех + | всему всей всему всем + | всем всей всем [всею] всеми + | всем всей всем всех + | + | (b) сам (himself etc) + | + | сам сама само сами + | самого саму само самих + | самого самой самого самих + | самому самой самому самим + | самим самой самим [самою] самими + | самом самой самом самих + | + | stems of verbs 'to be', 'to have', 'to do' and modal + | + | быть бы буд быв есть суть + | име + | дел + | мог мож мочь + | уме + | хоч хот + | долж + | можн + | нужн + | нельзя + `) func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { diff --git a/vendor/manifest b/vendor/manifest index 60c5fbb5e..641f276e5 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -17,6 +17,14 @@ "branch": "master", "notests": true }, + { + "importpath": "github.com/blevesearch/snowballstem", + "repository": "https://github.com/blevesearch/snowballstem", + "vcs": "", + "revision": "26b06a2c243d4f8ca5db3486f94409dd5b2a7467", + "branch": "master", + "notests": true + }, { 
"importpath": "github.com/boltdb/bolt", "repository": "https://github.com/boltdb/bolt", From 09a61a7a3821dc044f5e81b5dadef698df9340e7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 10 Jan 2018 16:00:29 -0500 Subject: [PATCH 134/728] add analyzers for several languages Having pure Go snowball stemmers allows us to add support for many languages into the core of bleve. Specifically we just added: Russian, Danish, Finnish, Hungarian, Dutch, Norwegian, Romanian, Swedish, Turkish --- analysis/lang/da/analyzer_da.go | 56 ++++++ analysis/lang/da/analyzer_da_test.go | 71 ++++++++ analysis/lang/da/stemmer_da.go | 49 +++++ analysis/lang/da/stop_filter_da.go | 33 ++++ analysis/lang/da/stop_words_da.go | 134 ++++++++++++++ analysis/lang/fi/analyzer_fi.go | 57 ++++++ analysis/lang/fi/analyzer_fi_test.go | 70 ++++++++ analysis/lang/fi/stemmer_fi.go | 49 +++++ analysis/lang/fi/stop_filter_fi.go | 33 ++++ analysis/lang/fi/stop_words_fi.go | 121 +++++++++++++ analysis/lang/hu/analyzer_hu.go | 57 ++++++ analysis/lang/hu/analyzer_hu_test.go | 70 ++++++++ analysis/lang/hu/stemmer_hu.go | 49 +++++ analysis/lang/hu/stop_filter_hu.go | 33 ++++ analysis/lang/hu/stop_words_hu.go | 235 ++++++++++++++++++++++++ analysis/lang/nl/analyzer_nl.go | 57 ++++++ analysis/lang/nl/analyzer_nl_test.go | 70 ++++++++ analysis/lang/nl/stemmer_nl.go | 49 +++++ analysis/lang/nl/stop_filter_nl.go | 33 ++++ analysis/lang/nl/stop_words_nl.go | 143 +++++++++++++++ analysis/lang/no/analyzer_no.go | 57 ++++++ analysis/lang/no/analyzer_no_test.go | 70 ++++++++ analysis/lang/no/stemmer_no.go | 49 +++++ analysis/lang/no/stop_filter_no.go | 33 ++++ analysis/lang/no/stop_words_no.go | 218 +++++++++++++++++++++++ analysis/lang/ro/analyzer_ro.go | 57 ++++++ analysis/lang/ro/analyzer_ro_test.go | 70 ++++++++ analysis/lang/ro/stemmer_ro.go | 49 +++++ analysis/lang/ro/stop_filter_ro.go | 33 ++++ analysis/lang/ro/stop_words_ro.go | 257 +++++++++++++++++++++++++++ analysis/lang/sv/analyzer_sv.go | 57 ++++++ 
analysis/lang/sv/analyzer_sv_test.go | 70 ++++++++ analysis/lang/sv/stemmer_sv.go | 49 +++++ analysis/lang/sv/stop_filter_sv.go | 33 ++++ analysis/lang/sv/stop_words_sv.go | 157 ++++++++++++++++ analysis/lang/tr/analyzer_tr.go | 63 +++++++ analysis/lang/tr/analyzer_tr_test.go | 90 ++++++++++ analysis/lang/tr/stemmer_tr.go | 49 +++++ analysis/lang/tr/stop_filter_tr.go | 33 ++++ analysis/lang/tr/stop_words_tr.go | 236 ++++++++++++++++++++++++ 40 files changed, 3199 insertions(+) create mode 100644 analysis/lang/da/analyzer_da.go create mode 100644 analysis/lang/da/analyzer_da_test.go create mode 100644 analysis/lang/da/stemmer_da.go create mode 100644 analysis/lang/da/stop_filter_da.go create mode 100644 analysis/lang/da/stop_words_da.go create mode 100644 analysis/lang/fi/analyzer_fi.go create mode 100644 analysis/lang/fi/analyzer_fi_test.go create mode 100644 analysis/lang/fi/stemmer_fi.go create mode 100644 analysis/lang/fi/stop_filter_fi.go create mode 100644 analysis/lang/fi/stop_words_fi.go create mode 100644 analysis/lang/hu/analyzer_hu.go create mode 100644 analysis/lang/hu/analyzer_hu_test.go create mode 100644 analysis/lang/hu/stemmer_hu.go create mode 100644 analysis/lang/hu/stop_filter_hu.go create mode 100644 analysis/lang/hu/stop_words_hu.go create mode 100644 analysis/lang/nl/analyzer_nl.go create mode 100644 analysis/lang/nl/analyzer_nl_test.go create mode 100644 analysis/lang/nl/stemmer_nl.go create mode 100644 analysis/lang/nl/stop_filter_nl.go create mode 100644 analysis/lang/nl/stop_words_nl.go create mode 100644 analysis/lang/no/analyzer_no.go create mode 100644 analysis/lang/no/analyzer_no_test.go create mode 100644 analysis/lang/no/stemmer_no.go create mode 100644 analysis/lang/no/stop_filter_no.go create mode 100644 analysis/lang/no/stop_words_no.go create mode 100644 analysis/lang/ro/analyzer_ro.go create mode 100644 analysis/lang/ro/analyzer_ro_test.go create mode 100644 analysis/lang/ro/stemmer_ro.go create mode 100644 
analysis/lang/ro/stop_filter_ro.go create mode 100644 analysis/lang/ro/stop_words_ro.go create mode 100644 analysis/lang/sv/analyzer_sv.go create mode 100644 analysis/lang/sv/analyzer_sv_test.go create mode 100644 analysis/lang/sv/stemmer_sv.go create mode 100644 analysis/lang/sv/stop_filter_sv.go create mode 100644 analysis/lang/sv/stop_words_sv.go create mode 100644 analysis/lang/tr/analyzer_tr.go create mode 100644 analysis/lang/tr/analyzer_tr_test.go create mode 100644 analysis/lang/tr/stemmer_tr.go create mode 100644 analysis/lang/tr/stop_filter_tr.go create mode 100644 analysis/lang/tr/stop_words_tr.go diff --git a/analysis/lang/da/analyzer_da.go b/analysis/lang/da/analyzer_da.go new file mode 100644 index 000000000..dca141779 --- /dev/null +++ b/analysis/lang/da/analyzer_da.go @@ -0,0 +1,56 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" + "github.com/blevesearch/bleve/registry" +) + +const AnalyzerName = "da" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopDaFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerDaFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopDaFilter, + stemmerDaFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/da/analyzer_da_test.go b/analysis/lang/da/analyzer_da_test.go new file mode 100644 index 000000000..d6a1c51a4 --- /dev/null +++ b/analysis/lang/da/analyzer_da_test.go @@ -0,0 +1,71 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package da + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestDanishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("undersøg"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("undersøg"), + Position: 1, + Start: 0, + End: 9, + }, + }, + }, + { + input: []byte("undersøgelse"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("undersøg"), + Position: 1, + Start: 0, + End: 13, + }, + }, + }, + // stop word + { + input: []byte("på"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %v, got %v", test.output, actual) + } + } +} diff --git a/analysis/lang/da/stemmer_da.go b/analysis/lang/da/stemmer_da.go new file mode 100644 index 000000000..e40e623ab --- /dev/null +++ b/analysis/lang/da/stemmer_da.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/danish" +) + +const SnowballStemmerName = "stemmer_da_snowball" + +type DanishStemmerFilter struct { +} + +func NewDanishStemmerFilter() *DanishStemmerFilter { + return &DanishStemmerFilter{} +} + +func (s *DanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + danish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func DanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewDanishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, DanishStemmerFilterConstructor) +} diff --git a/analysis/lang/da/stop_filter_da.go b/analysis/lang/da/stop_filter_da.go new file mode 100644 index 000000000..a146d0b43 --- /dev/null +++ b/analysis/lang/da/stop_filter_da.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/da/stop_words_da.go b/analysis/lang/da/stop_words_da.go new file mode 100644 index 000000000..63a407a0c --- /dev/null +++ b/analysis/lang/da/stop_words_da.go @@ -0,0 +1,134 @@ +package da + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_da" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var DanishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/danish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Danish stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + +og | and +i | in +jeg | I +det | that (dem. pronoun)/it (pers. pronoun) +at | that (in front of a sentence)/to (with infinitive) +en | a/an +den | it (pers. pronoun)/that (dem. 
pronoun) +til | to/at/for/until/against/by/of/into, more +er | present tense of "to be" +som | who, as +på | on/upon/in/on/at/to/after/of/with/for, on +de | they +med | with/by/in, along +han | he +af | of/by/from/off/for/in/with/on, off +for | at/for/to/from/by/of/ago, in front/before, because +ikke | not +der | who/which, there/those +var | past tense of "to be" +mig | me/myself +sig | oneself/himself/herself/itself/themselves +men | but +et | a/an/one, one (number), someone/somebody/one +har | present tense of "to have" +om | round/about/for/in/a, about/around/down, if +vi | we +min | my +havde | past tense of "to have" +ham | him +hun | she +nu | now +over | over/above/across/by/beyond/past/on/about, over/past +da | then, when/as/since +fra | from/off/since, off, since +du | you +ud | out +sin | his/her/its/one's +dem | them +os | us/ourselves +op | up +man | you/one +hans | his +hvor | where +eller | or +hvad | what +skal | must/shall etc. +selv | myself/youself/herself/ourselves etc., even +her | here +alle | all/everyone/everybody etc. 
+vil | will (verb) +blev | past tense of "to stay/to remain/to get/to become" +kunne | could +ind | in +når | when +være | present tense of "to be" +dog | however/yet/after all +noget | something +ville | would +jo | you know/you see (adv), yes +deres | their/theirs +efter | after/behind/according to/for/by/from, later/afterwards +ned | down +skulle | should +denne | this +end | than +dette | this +mit | my/mine +også | also +under | under/beneath/below/during, below/underneath +have | have +dig | you +anden | other +hende | her +mine | my +alt | everything +meget | much/very, plenty of +sit | his, her, its, one's +sine | his, her, its, one's +vor | our +mod | against +disse | these +hvis | if +din | your/yours +nogle | some +hos | by/at +blive | be/become +mange | many +ad | by/through +bliver | present tense of "to be/to become" +hendes | her/hers +været | be +thi | for (conj) +jer | you +sådan | such, like this/like that +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DanishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/fi/analyzer_fi.go b/analysis/lang/fi/analyzer_fi.go new file mode 100644 index 000000000..9482e6b36 --- /dev/null +++ b/analysis/lang/fi/analyzer_fi.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "fi" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopFiFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerFiFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopFiFilter, + stemmerFiFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/fi/analyzer_fi_test.go b/analysis/lang/fi/analyzer_fi_test.go new file mode 100644 index 000000000..035e7fdb5 --- /dev/null +++ b/analysis/lang/fi/analyzer_fi_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fi + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestFinishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("edeltäjiinsä"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("edeltäj"), + }, + }, + }, + { + input: []byte("edeltäjistään"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("edeltäj"), + }, + }, + }, + // stop word + { + input: []byte("olla"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/fi/stemmer_fi.go b/analysis/lang/fi/stemmer_fi.go new file mode 100644 index 000000000..14a6a1cbc --- /dev/null +++ b/analysis/lang/fi/stemmer_fi.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/finnish" +) + +const SnowballStemmerName = "stemmer_fi_snowball" + +type FinnishStemmerFilter struct { +} + +func NewFinnishStemmerFilter() *FinnishStemmerFilter { + return &FinnishStemmerFilter{} +} + +func (s *FinnishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + finnish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func FinnishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewFinnishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, FinnishStemmerFilterConstructor) +} diff --git a/analysis/lang/fi/stop_filter_fi.go b/analysis/lang/fi/stop_filter_fi.go new file mode 100644 index 000000000..f3576a2be --- /dev/null +++ b/analysis/lang/fi/stop_filter_fi.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/fi/stop_words_fi.go b/analysis/lang/fi/stop_words_fi.go new file mode 100644 index 000000000..7cf0c9c13 --- /dev/null +++ b/analysis/lang/fi/stop_words_fi.go @@ -0,0 +1,121 @@ +package fi + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_fi" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var FinnishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/finnish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| forms of BE + +olla +olen +olet +on +olemme +olette +ovat +ole | negative form + +oli +olisi +olisit +olisin +olisimme +olisitte +olisivat +olit +olin +olimme +olitte +olivat +ollut +olleet + +en | negation +et +ei +emme +ette +eivät + +|Nom Gen Acc Part Iness Elat Illat Adess Ablat Allat Ess Trans +minä minun minut minua minussa minusta minuun minulla minulta minulle | I +sinä sinun sinut sinua sinussa sinusta sinuun sinulla sinulta sinulle | you +hän hänen hänet häntä hänessä hänestä häneen hänellä häneltä hänelle | he she +me meidän meidät meitä meissä meistä meihin meillä meiltä meille | we +te teidän teidät teitä teissä teistä teihin teillä teiltä teille | you +he heidän heidät heitä heissä heistä heihin heillä heiltä heille | they + +tämä tämän tätä tässä tästä tähän tallä tältä tälle tänä täksi | this +tuo tuon tuotä tuossa tuosta tuohon tuolla tuolta tuolle tuona tuoksi | that +se sen sitä siinä siitä siihen sillä siltä sille sinä siksi | it +nämä näiden näitä näissä näistä näihin näillä näiltä näille näinä näiksi | these +nuo noiden noita noissa noista noihin noilla noilta noille noina noiksi | those +ne niiden niitä niissä niistä niihin niillä niiltä niille niinä niiksi | they + +kuka kenen kenet ketä kenessä kenestä keneen kenellä keneltä kenelle kenenä keneksi| who +ketkä keiden ketkä keitä keissä keistä keihin keillä keiltä keille keinä keiksi | (pl) +mikä minkä minkä mitä missä mistä mihin millä miltä mille minä miksi | which what +mitkä | (pl) + +joka jonka jota jossa josta johon jolla jolta jolle jona joksi | who which +jotka joiden joita joissa joista joihin joilla joilta joille joina joiksi | (pl) + +| conjunctions + +että | that +ja | and +jos | if +koska | because +kuin | than +mutta | but +niin | so +sekä | and +sillä | for +tai | or +vaan | but +vai | or +vaikka | although + + +| prepositions + +kanssa | with +mukaan | according to +noin | about 
+poikki | across +yli | over, across + +| other + +kun | when +niin | so +nyt | now +itse | self + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(FinnishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/hu/analyzer_hu.go b/analysis/lang/hu/analyzer_hu.go new file mode 100644 index 000000000..6797a91e4 --- /dev/null +++ b/analysis/lang/hu/analyzer_hu.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "hu" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopHuFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerHuFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopHuFilter, + stemmerHuFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/hu/analyzer_hu_test.go b/analysis/lang/hu/analyzer_hu_test.go new file mode 100644 index 000000000..4a14dff68 --- /dev/null +++ b/analysis/lang/hu/analyzer_hu_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hu + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestHungarianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("babakocsi"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("babakocs"), + }, + }, + }, + { + input: []byte("babakocsijáért"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("babakocs"), + }, + }, + }, + // stop word + { + input: []byte("által"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/hu/stemmer_hu.go b/analysis/lang/hu/stemmer_hu.go new file mode 100644 index 000000000..b380818a0 --- /dev/null +++ b/analysis/lang/hu/stemmer_hu.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/hungarian" +) + +const SnowballStemmerName = "stemmer_hu_snowball" + +type HungarianStemmerFilter struct { +} + +func NewHungarianStemmerFilter() *HungarianStemmerFilter { + return &HungarianStemmerFilter{} +} + +func (s *HungarianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + hungarian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func HungarianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewHungarianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, HungarianStemmerFilterConstructor) +} diff --git a/analysis/lang/hu/stop_filter_hu.go b/analysis/lang/hu/stop_filter_hu.go new file mode 100644 index 000000000..a83fd4ccb --- /dev/null +++ b/analysis/lang/hu/stop_filter_hu.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/hu/stop_words_hu.go b/analysis/lang/hu/stop_words_hu.go new file mode 100644 index 000000000..fe45d55ea --- /dev/null +++ b/analysis/lang/hu/stop_words_hu.go @@ -0,0 +1,235 @@ +package hu + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_hu" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var HungarianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/hungarian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. 
+ | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + +| Hungarian stop word list +| prepared by Anna Tordai + +a +ahogy +ahol +aki +akik +akkor +alatt +által +általában +amely +amelyek +amelyekben +amelyeket +amelyet +amelynek +ami +amit +amolyan +amíg +amikor +át +abban +ahhoz +annak +arra +arról +az +azok +azon +azt +azzal +azért +aztán +azután +azonban +bár +be +belül +benne +cikk +cikkek +cikkeket +csak +de +e +eddig +egész +egy +egyes +egyetlen +egyéb +egyik +egyre +ekkor +el +elég +ellen +elő +először +előtt +első +én +éppen +ebben +ehhez +emilyen +ennek +erre +ez +ezt +ezek +ezen +ezzel +ezért +és +fel +felé +hanem +hiszen +hogy +hogyan +igen +így +illetve +ill. +ill +ilyen +ilyenkor +ison +ismét +itt +jó +jól +jobban +kell +kellett +keresztül +keressünk +ki +kívül +között +közül +legalább +lehet +lehetett +legyen +lenne +lenni +lesz +lett +maga +magát +majd +majd +már +más +másik +meg +még +mellett +mert +mely +melyek +mi +mit +míg +miért +milyen +mikor +minden +mindent +mindenki +mindig +mint +mintha +mivel +most +nagy +nagyobb +nagyon +ne +néha +nekem +neki +nem +néhány +nélkül +nincs +olyan +ott +össze +ő +ők +őket +pedig +persze +rá +s +saját +sem +semmi +sok +sokat +sokkal +számára +szemben +szerint +szinte +talán +tehát +teljes +tovább +továbbá +több +úgy +ugyanis +új +újabb +újra +után +utána +utolsó +vagy +vagyis +valaki +valami +valamint +való +vagyok +van +vannak +volt +voltam +voltak +voltunk +vissza +vele +viszont +volna +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(HungarianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/nl/analyzer_nl.go b/analysis/lang/nl/analyzer_nl.go new file mode 100644 index 000000000..69853a9e1 --- /dev/null +++ b/analysis/lang/nl/analyzer_nl.go @@ -0,0 +1,57 @@ +// 
Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "nl" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopNlFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNlFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNlFilter, + stemmerNlFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/nl/analyzer_nl_test.go b/analysis/lang/nl/analyzer_nl_test.go new file mode 100644 index 000000000..21e851c33 --- /dev/null +++ b/analysis/lang/nl/analyzer_nl_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestDutchAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("lichamelijk"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("licham"), + }, + }, + }, + { + input: []byte("lichamelijke"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("licham"), + }, + }, + }, + // stop word + { + input: []byte("van"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/nl/stemmer_nl.go b/analysis/lang/nl/stemmer_nl.go new file mode 100644 index 000000000..049d92160 --- /dev/null +++ b/analysis/lang/nl/stemmer_nl.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/dutch" +) + +const SnowballStemmerName = "stemmer_nl_snowball" + +type DutchStemmerFilter struct { +} + +func NewDutchStemmerFilter() *DutchStemmerFilter { + return &DutchStemmerFilter{} +} + +func (s *DutchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + dutch.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func DutchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewDutchStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, DutchStemmerFilterConstructor) +} diff --git a/analysis/lang/nl/stop_filter_nl.go b/analysis/lang/nl/stop_filter_nl.go new file mode 100644 index 000000000..218f0f42c --- /dev/null +++ b/analysis/lang/nl/stop_filter_nl.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/nl/stop_words_nl.go b/analysis/lang/nl/stop_words_nl.go new file mode 100644 index 000000000..4adae1002 --- /dev/null +++ b/analysis/lang/nl/stop_words_nl.go @@ -0,0 +1,143 @@ +package nl + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_nl" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var DutchStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/dutch/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Dutch stop word list. Comments begin with vertical bar. Each stop + | word is at the start of a line. 
+ + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large sample of Dutch text. + + | Dutch stop words frequently exhibit homonym clashes. These are indicated + | clearly below. + +de | the +en | and +van | of, from +ik | I, the ego +te | (1) chez, at etc, (2) to, (3) too +dat | that, which +die | that, those, who, which +in | in, inside +een | a, an, one +hij | he +het | the, it +niet | not, nothing, naught +zijn | (1) to be, being, (2) his, one's, its +is | is +was | (1) was, past tense of all persons sing. of 'zijn' (to be) (2) wax, (3) the washing, (4) rise of river +op | on, upon, at, in, up, used up +aan | on, upon, to (as dative) +met | with, by +als | like, such as, when +voor | (1) before, in front of, (2) furrow +had | had, past tense all persons sing. of 'hebben' (have) +er | there +maar | but, only +om | round, about, for etc +hem | him +dan | then +zou | should/would, past tense all persons sing. of 'zullen' +of | or, whether, if +wat | what, something, anything +mijn | possessive and noun 'mine' +men | people, 'one' +dit | this +zo | so, thus, in this way +door | through by +over | over, across +ze | she, her, they, them +zich | oneself +bij | (1) a bee, (2) by, near, at +ook | also, too +tot | till, until +je | you +mij | me +uit | out of, from +der | Old Dutch form of 'van der' still found in surnames +daar | (1) there, (2) because +haar | (1) her, their, them, (2) hair +naar | (1) unpleasant, unwell etc, (2) towards, (3) as +heb | present first person sing. of 'to have' +hoe | how, why +heeft | present third person sing. of 'to have' +hebben | 'to have' and various parts thereof +deze | this +u | you +want | (1) for, (2) mitten, (3) rigging +nog | yet, still +zal | 'shall', first and third person sing. 
of verb 'zullen' (will) +me | me +zij | she, they +nu | now +ge | 'thou', still used in Belgium and south Netherlands +geen | none +omdat | because +iets | something, somewhat +worden | to become, grow, get +toch | yet, still +al | all, every, each +waren | (1) 'were' (2) to wander, (3) wares, (3) +veel | much, many +meer | (1) more, (2) lake +doen | to do, to make +toen | then, when +moet | noun 'spot/mote' and present form of 'to must' +ben | (1) am, (2) 'are' in interrogative second person singular of 'to be' +zonder | without +kan | noun 'can' and present form of 'to be able' +hun | their, them +dus | so, consequently +alles | all, everything, anything +onder | under, beneath +ja | yes, of course +eens | once, one day +hier | here +wie | who +werd | imperfect third person sing. of 'become' +altijd | always +doch | yet, but etc +wordt | present third person sing. of 'become' +wezen | (1) to be, (2) 'been' as in 'been fishing', (3) orphans +kunnen | to be able +ons | us/our +zelf | self +tegen | against, towards, at +na | after, near +reeds | already +wil | (1) present tense of 'want', (2) 'will', noun, (3) fender +kon | could; past tense of 'to be able' +niets | nothing +uw | your +iemand | somebody +geweest | been; past participle of 'be' +andere | other +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(DutchStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/no/analyzer_no.go b/analysis/lang/no/analyzer_no.go new file mode 100644 index 000000000..57d749eac --- /dev/null +++ b/analysis/lang/no/analyzer_no.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "no" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopNoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerNoFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopNoFilter, + stemmerNoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/no/analyzer_no_test.go b/analysis/lang/no/analyzer_no_test.go new file mode 100644 index 000000000..c73f5f731 --- /dev/null +++ b/analysis/lang/no/analyzer_no_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestNorwegianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("havnedistriktene"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("havnedistrikt"), + }, + }, + }, + { + input: []byte("havnedistrikter"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("havnedistrikt"), + }, + }, + }, + // stop word + { + input: []byte("det"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/no/stemmer_no.go b/analysis/lang/no/stemmer_no.go new file mode 100644 index 000000000..e61e02477 --- /dev/null +++ b/analysis/lang/no/stemmer_no.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/norwegian" +) + +const SnowballStemmerName = "stemmer_no_snowball" + +type NorwegianStemmerFilter struct { +} + +func NewNorwegianStemmerFilter() *NorwegianStemmerFilter { + return &NorwegianStemmerFilter{} +} + +func (s *NorwegianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + norwegian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func NorwegianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewNorwegianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, NorwegianStemmerFilterConstructor) +} diff --git a/analysis/lang/no/stop_filter_no.go b/analysis/lang/no/stop_filter_no.go new file mode 100644 index 000000000..093688fa7 --- /dev/null +++ b/analysis/lang/no/stop_filter_no.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/no/stop_words_no.go b/analysis/lang/no/stop_words_no.go new file mode 100644 index 000000000..bfca34846 --- /dev/null +++ b/analysis/lang/no/stop_words_no.go @@ -0,0 +1,218 @@ +package no + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_no" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var NorwegianStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/norwegian/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Norwegian stop word list. Comments begin with vertical bar. 
Each stop + | word is at the start of a line. + + | This stop word list is for the dominant bokmål dialect. Words unique + | to nynorsk are marked *. + + | Revised by Jan Bruusgaard , Jan 2005 + +og | and +i | in +jeg | I +det | it/this/that +at | to (w. inf.) +en | a/an +et | a/an +den | it/this/that +til | to +er | is/am/are +som | who/that +på | on +de | they / you(formal) +med | with +han | he +av | of +ikke | not +ikkje | not * +der | there +så | so +var | was/were +meg | me +seg | you +men | but +ett | one +har | have +om | about +vi | we +min | my +mitt | my +ha | have +hadde | had +hun | she +nå | now +over | over +da | when/as +ved | by/know +fra | from +du | you +ut | out +sin | your +dem | them +oss | us +opp | up +man | you/one +kan | can +hans | his +hvor | where +eller | or +hva | what +skal | shall/must +selv | self (reflective) +sjøl | self (reflective) +her | here +alle | all +vil | will +bli | become +ble | became +blei | became * +blitt | have become +kunne | could +inn | in +når | when +være | be +kom | come +noen | some +noe | some +ville | would +dere | you +som | who/which/that +deres | their/theirs +kun | only/just +ja | yes +etter | after +ned | down +skulle | should +denne | this +for | for/because +deg | you +si | hers/his +sine | hers/his +sitt | hers/his +mot | against +å | to +meget | much +hvorfor | why +dette | this +disse | these/those +uten | without +hvordan | how +ingen | none +din | your +ditt | your +blir | become +samme | same +hvilken | which +hvilke | which (plural) +sånn | such a +inni | inside/within +mellom | between +vår | our +hver | each +hvem | who +vors | us/ours +hvis | whose +både | both +bare | only/just +enn | than +fordi | as/because +før | before +mange | many +også | also +slik | just +vært | been +være | to be +båe | both * +begge | both +siden | since +dykk | your * +dykkar | yours * +dei | they * +deira | them * +deires | theirs * +deim | them * +di | your (fem.) 
* +då | as/when * +eg | I * +ein | a/an * +eit | a/an * +eitt | a/an * +elles | or * +honom | he * +hjå | at * +ho | she * +hoe | she * +henne | her +hennar | her/hers +hennes | hers +hoss | how * +hossen | how * +ikkje | not * +ingi | noone * +inkje | noone * +korleis | how * +korso | how * +kva | what/which * +kvar | where * +kvarhelst | where * +kven | who/whom * +kvi | why * +kvifor | why * +me | we * +medan | while * +mi | my * +mine | my * +mykje | much * +no | now * +nokon | some (masc./neut.) * +noka | some (fem.) * +nokor | some * +noko | some * +nokre | some * +si | his/hers * +sia | since * +sidan | since * +so | so * +somt | some * +somme | some * +um | about* +upp | up * +vere | be * +vore | was * +verte | become * +vort | become * +varte | became * +vart | became * + +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(NorwegianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/ro/analyzer_ro.go b/analysis/lang/ro/analyzer_ro.go new file mode 100644 index 000000000..e29388155 --- /dev/null +++ b/analysis/lang/ro/analyzer_ro.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "ro" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopRoFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerRoFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopRoFilter, + stemmerRoFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/ro/analyzer_ro_test.go b/analysis/lang/ro/analyzer_ro_test.go new file mode 100644 index 000000000..ee8b88f80 --- /dev/null +++ b/analysis/lang/ro/analyzer_ro_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ro + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestRomanianAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("absenţa"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("absenţ"), + }, + }, + }, + { + input: []byte("absenţi"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("absenţ"), + }, + }, + }, + // stop word + { + input: []byte("îl"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/ro/stemmer_ro.go b/analysis/lang/ro/stemmer_ro.go new file mode 100644 index 000000000..3966215ff --- /dev/null +++ b/analysis/lang/ro/stemmer_ro.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/romanian" +) + +const SnowballStemmerName = "stemmer_ro_snowball" + +type RomanianStemmerFilter struct { +} + +func NewRomanianStemmerFilter() *RomanianStemmerFilter { + return &RomanianStemmerFilter{} +} + +func (s *RomanianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + romanian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func RomanianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewRomanianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, RomanianStemmerFilterConstructor) +} diff --git a/analysis/lang/ro/stop_filter_ro.go b/analysis/lang/ro/stop_filter_ro.go new file mode 100644 index 000000000..a2f7f6dd9 --- /dev/null +++ b/analysis/lang/ro/stop_filter_ro.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/ro/stop_words_ro.go b/analysis/lang/ro/stop_words_ro.go new file mode 100644 index 000000000..e7d62d414 --- /dev/null +++ b/analysis/lang/ro/stop_words_ro.go @@ -0,0 +1,257 @@ +package ro + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_ro" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/ +// ` was changed to ' to allow for literal string + +var RomanianStopWords = []byte(`# This file was created by Jacques Savoy and is distributed under the BSD license. +# See http://members.unine.ch/jacques.savoy/clef/index.html. 
+# Also see http://www.opensource.org/licenses/bsd-license.html +acea +aceasta +această +aceea +acei +aceia +acel +acela +acele +acelea +acest +acesta +aceste +acestea +aceşti +aceştia +acolo +acum +ai +aia +aibă +aici +al +ăla +ale +alea +ălea +altceva +altcineva +am +ar +are +aş +aşadar +asemenea +asta +ăsta +astăzi +astea +ăstea +ăştia +asupra +aţi +au +avea +avem +aveţi +azi +bine +bucur +bună +ca +că +căci +când +care +cărei +căror +cărui +cât +câte +câţi +către +câtva +ce +cel +ceva +chiar +cînd +cine +cineva +cît +cîte +cîţi +cîtva +contra +cu +cum +cumva +curând +curînd +da +dă +dacă +dar +datorită +de +deci +deja +deoarece +departe +deşi +din +dinaintea +dintr +dintre +drept +după +ea +ei +el +ele +eram +este +eşti +eu +face +fără +fi +fie +fiecare +fii +fim +fiţi +iar +ieri +îi +îl +îmi +împotriva +în +înainte +înaintea +încât +încît +încotro +între +întrucât +întrucît +îţi +la +lângă +le +li +lîngă +lor +lui +mă +mâine +mea +mei +mele +mereu +meu +mi +mine +mult +multă +mulţi +ne +nicăieri +nici +nimeni +nişte +noastră +noastre +noi +noştri +nostru +nu +ori +oricând +oricare +oricât +orice +oricînd +oricine +oricît +oricum +oriunde +până +pe +pentru +peste +pînă +poate +pot +prea +prima +primul +prin +printr +sa +să +săi +sale +sau +său +se +şi +sînt +sîntem +sînteţi +spre +sub +sunt +suntem +sunteţi +ta +tăi +tale +tău +te +ţi +ţie +tine +toată +toate +tot +toţi +totuşi +tu +un +una +unde +undeva +unei +unele +uneori +unor +vă +vi +voastră +voastre +voi +voştri +vostru +vouă +vreo +vreun +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(RomanianStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/sv/analyzer_sv.go b/analysis/lang/sv/analyzer_sv.go new file mode 100644 index 000000000..f650158d4 --- /dev/null +++ b/analysis/lang/sv/analyzer_sv.go @@ -0,0 
+1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "sv" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopSvFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerSvFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + toLowerFilter, + stopSvFilter, + stemmerSvFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/sv/analyzer_sv_test.go b/analysis/lang/sv/analyzer_sv_test.go new file mode 100644 index 000000000..2d358b63e --- /dev/null +++ b/analysis/lang/sv/analyzer_sv_test.go @@ -0,0 +1,70 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSwedishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("jaktkarlarne"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("jaktkarl"), + }, + }, + }, + { + input: []byte("jaktkarlens"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("jaktkarl"), + }, + }, + }, + // stop word + { + input: []byte("och"), + output: analysis.TokenStream{}, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/sv/stemmer_sv.go b/analysis/lang/sv/stemmer_sv.go new file mode 100644 index 000000000..247f11bb2 --- /dev/null +++ b/analysis/lang/sv/stemmer_sv.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/swedish" +) + +const SnowballStemmerName = "stemmer_sv_snowball" + +type SwedishStemmerFilter struct { +} + +func NewSwedishStemmerFilter() *SwedishStemmerFilter { + return &SwedishStemmerFilter{} +} + +func (s *SwedishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + swedish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func SwedishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewSwedishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, SwedishStemmerFilterConstructor) +} diff --git a/analysis/lang/sv/stop_filter_sv.go b/analysis/lang/sv/stop_filter_sv.go new file mode 100644 index 000000000..46a533d17 --- /dev/null +++ b/analysis/lang/sv/stop_filter_sv.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/sv/stop_words_sv.go b/analysis/lang/sv/stop_words_sv.go new file mode 100644 index 000000000..b4022fd90 --- /dev/null +++ b/analysis/lang/sv/stop_words_sv.go @@ -0,0 +1,157 @@ +package sv + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_sv" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var SwedishStopWords = []byte(` | From svn.tartarus.org/snowball/trunk/website/algorithms/swedish/stop.txt + | This file is distributed under the BSD License. + | See http://snowball.tartarus.org/license.php + | Also see http://www.opensource.org/licenses/bsd-license.html + | - Encoding was converted to UTF-8. + | - This notice was added. + | + | NOTE: To use this file with StopFilterFactory, you must specify format="snowball" + + | A Swedish stop word list. Comments begin with vertical bar. 
Each stop + | word is at the start of a line. + + | This is a ranked list (commonest to rarest) of stopwords derived from + | a large text sample. + + | Swedish stop words occasionally exhibit homonym clashes. For example + | så = so, but also seed. These are indicated clearly below. + +och | and +det | it, this/that +att | to (with infinitive) +i | in, at +en | a +jag | I +hon | she +som | who, that +han | he +på | on +den | it, this/that +med | with +var | where, each +sig | him(self) etc +för | for +så | so (also: seed) +till | to +är | is +men | but +ett | a +om | if; around, about +hade | had +de | they, these/those +av | of +icke | not, no +mig | me +du | you +henne | her +då | then, when +sin | his +nu | now +har | have +inte | inte någon = no one +hans | his +honom | him +skulle | 'sake' +hennes | her +där | there +min | my +man | one (pronoun) +ej | nor +vid | at, by, on (also: vast) +kunde | could +något | some etc +från | from, off +ut | out +när | when +efter | after, behind +upp | up +vi | we +dem | them +vara | be +vad | what +över | over +än | than +dig | you +kan | can +sina | his +här | here +ha | have +mot | towards +alla | all +under | under (also: wonder) +någon | some etc +eller | or (else) +allt | all +mycket | much +sedan | since +ju | why +denna | this/that +själv | myself, yourself etc +detta | this/that +åt | to +utan | without +varit | was +hur | how +ingen | no +mitt | my +ni | you +bli | to be, become +blev | from bli +oss | us +din | thy +dessa | these/those +några | some etc +deras | their +blir | from bli +mina | my +samma | (the) same +vilken | who, that +er | you, your +sådan | such a +vår | our +blivit | from bli +dess | its +inom | within +mellan | between +sådant | such a +varför | why +varje | each +vilka | who, that +ditt | thy +vem | who +vilket | who, that +sitta | his +sådana | such a +vart | each +dina | thy +vars | whose +vårt | our +våra | our +ert | your +era | your +vilkas | whose + +`) + +func 
TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(SwedishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} diff --git a/analysis/lang/tr/analyzer_tr.go b/analysis/lang/tr/analyzer_tr.go new file mode 100644 index 000000000..d52a1d5cc --- /dev/null +++ b/analysis/lang/tr/analyzer_tr.go @@ -0,0 +1,63 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/bleve/analysis/token/apostrophe" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/unicode" +) + +const AnalyzerName = "tr" + +func AnalyzerConstructor(config map[string]interface{}, cache *registry.Cache) (*analysis.Analyzer, error) { + unicodeTokenizer, err := cache.TokenizerNamed(unicode.Name) + if err != nil { + return nil, err + } + aposFilter, err := cache.TokenFilterNamed(apostrophe.Name) + if err != nil { + return nil, err + } + toLowerFilter, err := cache.TokenFilterNamed(lowercase.Name) + if err != nil { + return nil, err + } + stopTrFilter, err := cache.TokenFilterNamed(StopName) + if err != nil { + return nil, err + } + stemmerTrFilter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + return nil, err + } + rv := analysis.Analyzer{ + Tokenizer: unicodeTokenizer, + TokenFilters: []analysis.TokenFilter{ + aposFilter, + toLowerFilter, + stopTrFilter, + stemmerTrFilter, + }, + } + return &rv, nil +} + +func init() { + registry.RegisterAnalyzer(AnalyzerName, AnalyzerConstructor) +} diff --git a/analysis/lang/tr/analyzer_tr_test.go b/analysis/lang/tr/analyzer_tr_test.go new file mode 100644 index 000000000..fe8980938 --- /dev/null +++ b/analysis/lang/tr/analyzer_tr_test.go @@ -0,0 +1,90 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestTurkishAnalyzer(t *testing.T) { + tests := []struct { + input []byte + output analysis.TokenStream + }{ + // stemming + { + input: []byte("ağacı"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ağaç"), + }, + }, + }, + { + input: []byte("ağaç"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("ağaç"), + }, + }, + }, + // stop word + { + input: []byte("dolayı"), + output: analysis.TokenStream{}, + }, + // apostrophes + { + input: []byte("Kıbrıs'ta"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("kıbrıs"), + }, + }, + }, + { + input: []byte("Van Gölü'ne"), + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("van"), + }, + &analysis.Token{ + Term: []byte("göl"), + }, + }, + }, + } + + cache := registry.NewCache() + analyzer, err := cache.AnalyzerNamed(AnalyzerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := analyzer.Analyze(test.input) + if len(actual) != len(test.output) { + t.Fatalf("expected length: %d, got %d", len(test.output), len(actual)) + } + for i, tok := range actual { + if !reflect.DeepEqual(tok.Term, test.output[i].Term) { + t.Errorf("expected term %s (% x) got %s (% x)", test.output[i].Term, test.output[i].Term, tok.Term, tok.Term) + } + } + } +} diff --git a/analysis/lang/tr/stemmer_tr.go b/analysis/lang/tr/stemmer_tr.go new file mode 100644 index 000000000..ba3034e1a --- /dev/null +++ b/analysis/lang/tr/stemmer_tr.go @@ -0,0 +1,49 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/turkish" +) + +const SnowballStemmerName = "stemmer_tr_snowball" + +type TurkishStemmerFilter struct { +} + +func NewTurkishStemmerFilter() *TurkishStemmerFilter { + return &TurkishStemmerFilter{} +} + +func (s *TurkishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + turkish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func TurkishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewTurkishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, TurkishStemmerFilterConstructor) +} diff --git a/analysis/lang/tr/stop_filter_tr.go b/analysis/lang/tr/stop_filter_tr.go new file mode 100644 index 000000000..5b616eb9c --- /dev/null +++ b/analysis/lang/tr/stop_filter_tr.go @@ -0,0 +1,33 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/token/stop" + "github.com/blevesearch/bleve/registry" +) + +func StopTokenFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + tokenMap, err := cache.TokenMapNamed(StopName) + if err != nil { + return nil, err + } + return stop.NewStopTokensFilter(tokenMap), nil +} + +func init() { + registry.RegisterTokenFilter(StopName, StopTokenFilterConstructor) +} diff --git a/analysis/lang/tr/stop_words_tr.go b/analysis/lang/tr/stop_words_tr.go new file mode 100644 index 000000000..f96fb07ed --- /dev/null +++ b/analysis/lang/tr/stop_words_tr.go @@ -0,0 +1,236 @@ +package tr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const StopName = "stop_tr" + +// this content was obtained from: +// lucene-4.7.2/analysis/common/src/resources/org/apache/lucene/analysis/snowball/ +// ` was changed to ' to allow for literal string + +var TurkishStopWords = []byte(`# Turkish stopwords from LUCENE-559 +# merged with the list from "Information Retrieval on Turkish Texts" +# (http://www.users.muohio.edu/canf/papers/JASIST2008offPrint.pdf) +acaba +altmış +altı +ama +ancak +arada +aslında +ayrıca +bana +bazı +belki +ben +benden +beni +benim +beri +beş +bile +bin +bir +birçok +biri +birkaç +birkez +birşey +birşeyi +biz +bize +bizden +bizi +bizim +böyle +böylece +bu +buna +bunda +bundan +bunlar +bunları +bunların +bunu +bunun +burada +çok +çünkü +da +daha +dahi +de 
+defa +değil +diğer +diye +doksan +dokuz +dolayı +dolayısıyla +dört +edecek +eden +ederek +edilecek +ediliyor +edilmesi +ediyor +eğer +elli +en +etmesi +etti +ettiği +ettiğini +gibi +göre +halen +hangi +hatta +hem +henüz +hep +hepsi +her +herhangi +herkesin +hiç +hiçbir +için +iki +ile +ilgili +ise +işte +itibaren +itibariyle +kadar +karşın +katrilyon +kendi +kendilerine +kendini +kendisi +kendisine +kendisini +kez +ki +kim +kimden +kime +kimi +kimse +kırk +milyar +milyon +mu +mü +mı +nasıl +ne +neden +nedenle +nerde +nerede +nereye +niye +niçin +o +olan +olarak +oldu +olduğu +olduğunu +olduklarını +olmadı +olmadığı +olmak +olması +olmayan +olmaz +olsa +olsun +olup +olur +olursa +oluyor +on +ona +ondan +onlar +onlardan +onları +onların +onu +onun +otuz +oysa +öyle +pek +rağmen +sadece +sanki +sekiz +seksen +sen +senden +seni +senin +siz +sizden +sizi +sizin +şey +şeyden +şeyi +şeyler +şöyle +şu +şuna +şunda +şundan +şunları +şunu +tarafından +trilyon +tüm +üç +üzere +var +vardı +ve +veya +ya +yani +yapacak +yapılan +yapılması +yapıyor +yapmak +yaptı +yaptığı +yaptığını +yaptıkları +yedi +yerine +yetmiş +yine +yirmi +yoksa +yüz +zaten +`) + +func TokenMapConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenMap, error) { + rv := analysis.NewTokenMap() + err := rv.LoadBytes(TurkishStopWords) + return rv, err +} + +func init() { + registry.RegisterTokenMap(StopName, TokenMapConstructor) +} From 3afc5458e00f75c677b83f537e31efa1b9ca5152 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 11 Jan 2018 14:44:05 +0530 Subject: [PATCH 135/728] MB-27498 - date range facet query panics Initialise the facet results map in case of an empty partial hits with a multi node cluster --- search.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/search.go b/search.go index c2ebafbb5..a57a11cb2 100644 --- a/search.go +++ b/search.go @@ -481,5 +481,8 @@ func (sr *SearchResult) Merge(other *SearchResult) { if other.MaxScore > sr.MaxScore { 
sr.MaxScore = other.MaxScore } + if len(sr.Facets) == 0 { + sr.Facets = make(search.FacetResults) + } sr.Facets.Merge(other.Facets) } From 039a4df33be99e8caea443e70d9d96aefcd4ec26 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 11 Jan 2018 15:09:27 +0530 Subject: [PATCH 136/728] initialize only with an imminent merge --- search.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/search.go b/search.go index a57a11cb2..5d12b7a25 100644 --- a/search.go +++ b/search.go @@ -481,7 +481,7 @@ func (sr *SearchResult) Merge(other *SearchResult) { if other.MaxScore > sr.MaxScore { sr.MaxScore = other.MaxScore } - if len(sr.Facets) == 0 { + if len(sr.Facets) == 0 && len(other.Facets) != 0 { sr.Facets = make(search.FacetResults) } sr.Facets.Merge(other.Facets) From 4d71e901e866e06c006302d8e2b5bc71b7ce0793 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 11 Jan 2018 11:00:18 -0500 Subject: [PATCH 137/728] make new analyzers available to consumers of the config pkg many tools and applications using bleve use the config pkg to include support for many languages out of the box by forcing import of optional packages. 
--- config/config.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/config/config.go b/config/config.go index c4c5e9153..ad0bdcb9a 100644 --- a/config/config.go +++ b/config/config.go @@ -75,19 +75,30 @@ import ( _ "github.com/blevesearch/bleve/analysis/lang/cjk" _ "github.com/blevesearch/bleve/analysis/lang/ckb" _ "github.com/blevesearch/bleve/analysis/lang/cs" + _ "github.com/blevesearch/bleve/analysis/lang/da" + _ "github.com/blevesearch/bleve/analysis/lang/de" _ "github.com/blevesearch/bleve/analysis/lang/el" _ "github.com/blevesearch/bleve/analysis/lang/en" + _ "github.com/blevesearch/bleve/analysis/lang/es" _ "github.com/blevesearch/bleve/analysis/lang/eu" _ "github.com/blevesearch/bleve/analysis/lang/fa" + _ "github.com/blevesearch/bleve/analysis/lang/fi" _ "github.com/blevesearch/bleve/analysis/lang/fr" _ "github.com/blevesearch/bleve/analysis/lang/ga" _ "github.com/blevesearch/bleve/analysis/lang/gl" _ "github.com/blevesearch/bleve/analysis/lang/hi" + _ "github.com/blevesearch/bleve/analysis/lang/hu" _ "github.com/blevesearch/bleve/analysis/lang/hy" _ "github.com/blevesearch/bleve/analysis/lang/id" _ "github.com/blevesearch/bleve/analysis/lang/in" _ "github.com/blevesearch/bleve/analysis/lang/it" + _ "github.com/blevesearch/bleve/analysis/lang/nl" + _ "github.com/blevesearch/bleve/analysis/lang/no" _ "github.com/blevesearch/bleve/analysis/lang/pt" + _ "github.com/blevesearch/bleve/analysis/lang/ro" + _ "github.com/blevesearch/bleve/analysis/lang/ru" + _ "github.com/blevesearch/bleve/analysis/lang/sv" + _ "github.com/blevesearch/bleve/analysis/lang/tr" // kv stores _ "github.com/blevesearch/bleve/index/store/boltdb" From d777d7c3652942a752147f747089d13151b4bd5a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 11:06:44 -0800 Subject: [PATCH 138/728] scorch mem segment comments consistency --- index/scorch/segment/mem/build.go | 2 +- index/scorch/segment/mem/segment.go | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) 
diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index cd11fb401..1dc0a788c 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -24,7 +24,7 @@ import ( "github.com/blevesearch/bleve/index" ) -// NewFromAnalyzedDocs places the analyzed document mutations into this segment +// NewFromAnalyzedDocs places the analyzed document mutations into a new segment func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { s := New() diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 5ef3e1f34..40c071f60 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -46,7 +46,7 @@ type Segment struct { FieldsInv []string // term dictionary - // field id -> term -> posting id + 1 + // field id -> term -> postings list id + 1 Dicts []map[string]uint64 // term dictionary keys @@ -54,7 +54,7 @@ type Segment struct { DictKeys [][]string // Postings list - // Postings list id -> Postings bitmap + // postings list id -> Postings bitmap Postings []*roaring.Bitmap // Postings List has locations From e7bd6026eb239f7e8b452b02958b8d37b68bc0ad Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 11:52:18 -0800 Subject: [PATCH 139/728] scorch mem segment preallocs docMap/fieldLens with capacity The first time through, startNumFields should be 0, where there ought to be more optimization assuming later docs have similar fields as the first doc. 
--- index/scorch/segment/mem/build.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 1dc0a788c..29c41d5de 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -83,9 +83,12 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { } func (s *Segment) processDocument(result *index.AnalysisResult) { + startNumFields := len(s.FieldsMap) + // used to collate information across fields - docMap := map[uint16]analysis.TokenFrequencies{} - fieldLens := map[uint16]int{} + docMap := make(map[uint16]analysis.TokenFrequencies, startNumFields) + fieldLens := make(map[uint16]int, startNumFields) + docNum := uint64(s.addDocument()) processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { From 917c47079122dc59c79ec5e131811ca9dfdd48c3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 11:54:46 -0800 Subject: [PATCH 140/728] scorch mem segment VisitDocument() accesses StoredTypes/Pos outside of loop --- index/scorch/segment/mem/segment.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 40c071f60..3c400b531 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -188,9 +188,11 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi return nil } docFields := s.Stored[int(num)] + st := s.StoredTypes[int(num)] + sp := s.StoredPos[int(num)] for field, values := range docFields { for i, value := range values { - keepGoing := visitor(s.FieldsInv[field], s.StoredTypes[int(num)][field][i], value, s.StoredPos[int(num)][field][i]) + keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) if !keepGoing { return nil } From a4110d325c2ff0b790d509893328f07be6234cf5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 
2018 16:37:06 -0800 Subject: [PATCH 141/728] scorch mem segment preallocates slices that are key'ed by postingId The scorch mem segment build phase uses the append() idiom to populate various slices that are keyed by postings list id's. These slices include... * Postings * PostingsLocs * Freqs * Norms * Locfields * Locstarts * Locends * Locpos * Locarraypos This change introduces an initialization step that preallocates those slices up-front, by assigning postings list id's to terms up-front. This change also has an additional effect of simplifying the processDocument() logic to no longer have to worry about a first-time initialization case, removing some duplicate'ish code. --- index/scorch/segment/mem/build.go | 138 +++++++++++++++--------------- 1 file changed, 67 insertions(+), 71 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 29c41d5de..8f080338b 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -31,6 +31,9 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { // ensure that _id field get fieldID 0 s.getOrDefineField("_id") + // fill Dicts/DictKeys and preallocate memory + s.initializeDict(results) + // walk each doc for _, result := range results { s.processDocument(result) @@ -82,12 +85,58 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { return s } -func (s *Segment) processDocument(result *index.AnalysisResult) { - startNumFields := len(s.FieldsMap) +// fill Dicts/DictKeys and preallocate memory for postings +func (s *Segment) initializeDict(results []*index.AnalysisResult) { + var numPostings int + + processField := func(fieldID uint16, tf analysis.TokenFrequencies) { + for term, _ := range tf { + _, exists := s.Dicts[fieldID][term] + if !exists { + numPostings++ + s.Dicts[fieldID][term] = uint64(numPostings) + s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) + } + } + } + + for _, result := range results { + // 
walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + processField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + processField(fieldID, tf) + } + } + s.Postings = make([]*roaring.Bitmap, numPostings) + for i := 0; i < numPostings; i++ { + s.Postings[i] = roaring.New() + } + s.PostingsLocs = make([]*roaring.Bitmap, numPostings) + for i := 0; i < numPostings; i++ { + s.PostingsLocs[i] = roaring.New() + } + s.Freqs = make([][]uint64, numPostings) + s.Norms = make([][]float32, numPostings) + s.Locfields = make([][]uint16, numPostings) + s.Locstarts = make([][]uint64, numPostings) + s.Locends = make([][]uint64, numPostings) + s.Locpos = make([][]uint64, numPostings) + s.Locarraypos = make([][][]uint64, numPostings) +} + +func (s *Segment) processDocument(result *index.AnalysisResult) { // used to collate information across fields - docMap := make(map[uint16]analysis.TokenFrequencies, startNumFields) - fieldLens := make(map[uint16]int, startNumFields) + docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap)) + fieldLens := make(map[uint16]int, len(s.FieldsMap)) docNum := uint64(s.addDocument()) @@ -132,80 +181,27 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { for fieldID, tokenFrequencies := range docMap { for term, tokenFreq := range tokenFrequencies { fieldTermPostings := s.Dicts[fieldID][term] - - // FIXME this if/else block has duplicate code that has resulted in - // bugs fixed/missed more than once, need to refactor - if fieldTermPostings == 0 { - // need to build new posting - bs := roaring.New() - bs.AddInt(int(docNum)) - - newPostingID := uint64(len(s.Postings) + 1) - // add this new bitset to the postings slice - s.Postings = append(s.Postings, bs) - - locationBS := roaring.New() - 
s.PostingsLocs = append(s.PostingsLocs, locationBS) - // add this to the details slice - s.Freqs = append(s.Freqs, []uint64{uint64(tokenFreq.Frequency())}) - s.Norms = append(s.Norms, []float32{float32(1.0 / math.Sqrt(float64(fieldLens[fieldID])))}) - // add to locations - var locfields []uint16 - var locstarts []uint64 - var locends []uint64 - var locpos []uint64 - var locarraypos [][]uint64 - if len(tokenFreq.Locations) > 0 { - locationBS.AddInt(int(docNum)) - } - for _, loc := range tokenFreq.Locations { - var locf = fieldID - if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field)) - } - locfields = append(locfields, locf) - locstarts = append(locstarts, uint64(loc.Start)) - locends = append(locends, uint64(loc.End)) - locpos = append(locpos, uint64(loc.Position)) - if len(loc.ArrayPositions) > 0 { - locarraypos = append(locarraypos, loc.ArrayPositions) - } else { - locarraypos = append(locarraypos, nil) - } - } - s.Locfields = append(s.Locfields, locfields) - s.Locstarts = append(s.Locstarts, locstarts) - s.Locends = append(s.Locends, locends) - s.Locpos = append(s.Locpos, locpos) - s.Locarraypos = append(s.Locarraypos, locarraypos) - // record it - s.Dicts[fieldID][term] = newPostingID - // this term was new for this field, add it to dictKeys - s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) - } else { - // posting already started for this field/term - // the actual offset is - 1, because 0 is zero value - bs := s.Postings[fieldTermPostings-1] - bs.AddInt(int(docNum)) - locationBS := s.PostingsLocs[fieldTermPostings-1] - s.Freqs[fieldTermPostings-1] = append(s.Freqs[fieldTermPostings-1], uint64(tokenFreq.Frequency())) - s.Norms[fieldTermPostings-1] = append(s.Norms[fieldTermPostings-1], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) - if len(tokenFreq.Locations) > 0 { - locationBS.AddInt(int(docNum)) - } + pid := fieldTermPostings-1 + bs := s.Postings[pid] + bs.AddInt(int(docNum)) + s.Freqs[pid] = append(s.Freqs[pid], 
uint64(tokenFreq.Frequency())) + s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + locationBS := s.PostingsLocs[pid] + if len(tokenFreq.Locations) > 0 { + locationBS.AddInt(int(docNum)) for _, loc := range tokenFreq.Locations { var locf = fieldID if loc.Field != "" { locf = uint16(s.getOrDefineField(loc.Field)) } - s.Locfields[fieldTermPostings-1] = append(s.Locfields[fieldTermPostings-1], locf) - s.Locstarts[fieldTermPostings-1] = append(s.Locstarts[fieldTermPostings-1], uint64(loc.Start)) - s.Locends[fieldTermPostings-1] = append(s.Locends[fieldTermPostings-1], uint64(loc.End)) - s.Locpos[fieldTermPostings-1] = append(s.Locpos[fieldTermPostings-1], uint64(loc.Position)) + s.Locfields[pid] = append(s.Locfields[pid], locf) + s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start)) + s.Locends[pid] = append(s.Locends[pid], uint64(loc.End)) + s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position)) if len(loc.ArrayPositions) > 0 { - s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], loc.ArrayPositions) + s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions) } else { - s.Locarraypos[fieldTermPostings-1] = append(s.Locarraypos[fieldTermPostings-1], nil) + s.Locarraypos[pid] = append(s.Locarraypos[pid], nil) } } } From a84bd122d2c6f18dc16dbd44a2ef8a5b0f184d34 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 17:04:27 -0800 Subject: [PATCH 142/728] scorch mem segment preallocates sub-slices via # terms This change tracks the number of terms per posting list to preallocate the sub-slices for the Freqs & Norms. 
--- index/scorch/segment/mem/build.go | 54 +++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 8f080338b..eaf368336 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -87,17 +87,26 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { // fill Dicts/DictKeys and preallocate memory for postings func (s *Segment) initializeDict(results []*index.AnalysisResult) { - var numPostings int + var numPostingsLists int + + numTermsPerPostingsList := make([]int, 0, 64) + + var numTokenFrequencies int processField := func(fieldID uint16, tf analysis.TokenFrequencies) { for term, _ := range tf { - _, exists := s.Dicts[fieldID][term] + pidPlus1, exists := s.Dicts[fieldID][term] if !exists { - numPostings++ - s.Dicts[fieldID][term] = uint64(numPostings) + numPostingsLists++ + pidPlus1 = uint64(numPostingsLists) + s.Dicts[fieldID][term] = pidPlus1 s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) + numTermsPerPostingsList = append(numTermsPerPostingsList, 0) } + pid := pidPlus1 - 1 + numTermsPerPostingsList[pid]++ } + numTokenFrequencies += len(tf) } for _, result := range results { @@ -116,21 +125,33 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { } } - s.Postings = make([]*roaring.Bitmap, numPostings) - for i := 0; i < numPostings; i++ { + s.Postings = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { s.Postings[i] = roaring.New() } - s.PostingsLocs = make([]*roaring.Bitmap, numPostings) - for i := 0; i < numPostings; i++ { + s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { s.PostingsLocs[i] = roaring.New() } - s.Freqs = make([][]uint64, numPostings) - s.Norms = make([][]float32, numPostings) - s.Locfields = make([][]uint16, numPostings) - s.Locstarts = make([][]uint64, numPostings) - s.Locends 
= make([][]uint64, numPostings) - s.Locpos = make([][]uint64, numPostings) - s.Locarraypos = make([][][]uint64, numPostings) + + s.Freqs = make([][]uint64, numPostingsLists) + s.Norms = make([][]float32, numPostingsLists) + s.Locfields = make([][]uint16, numPostingsLists) + s.Locstarts = make([][]uint64, numPostingsLists) + s.Locends = make([][]uint64, numPostingsLists) + s.Locpos = make([][]uint64, numPostingsLists) + s.Locarraypos = make([][][]uint64, numPostingsLists) + + uint64Backing := make([]uint64, numTokenFrequencies) + float32Backing := make([]float32, numTokenFrequencies) + + for i, numTerms := range numTermsPerPostingsList { + s.Freqs[i] = uint64Backing[0:0] + uint64Backing = uint64Backing[numTerms:] + + s.Norms[i] = float32Backing[0:0] + float32Backing = float32Backing[numTerms:] + } } func (s *Segment) processDocument(result *index.AnalysisResult) { @@ -180,8 +201,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { for term, tokenFreq := range tokenFrequencies { - fieldTermPostings := s.Dicts[fieldID][term] - pid := fieldTermPostings-1 + pid := s.Dicts[fieldID][term]-1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) From 0f19b542a3f91fe5f55b8f38b5e6f7af864be644 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 18:40:28 -0800 Subject: [PATCH 143/728] scorch mem segment prealloc's Locfields/starts/ends/pos/arraypos This change preallocates more of the backing arrays for Locfields, Locstarts, Locends, Locpos, Locaaraypos sub-slices of a scorch mem segment. On small bleve-blast tests (50K wiki docs) on a dev macbook, scorch indexing throughput seems to improve from 15MB/sec to 20MB/sec after the recent series of preallocation changes. 
--- index/scorch/segment/mem/build.go | 53 +++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 13 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index eaf368336..14cb1cbc4 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -89,12 +89,14 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { func (s *Segment) initializeDict(results []*index.AnalysisResult) { var numPostingsLists int - numTermsPerPostingsList := make([]int, 0, 64) + numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. + numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. var numTokenFrequencies int + var numLocs int - processField := func(fieldID uint16, tf analysis.TokenFrequencies) { - for term, _ := range tf { + processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + for term, tf := range tfs { pidPlus1, exists := s.Dicts[fieldID][term] if !exists { numPostingsLists++ @@ -102,11 +104,14 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { s.Dicts[fieldID][term] = pidPlus1 s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) numTermsPerPostingsList = append(numTermsPerPostingsList, 0) + numLocsPerPostingsList = append(numLocsPerPostingsList, 0) } pid := pidPlus1 - 1 - numTermsPerPostingsList[pid]++ + numTermsPerPostingsList[pid] += 1 + numLocsPerPostingsList[pid] += len(tf.Locations) + numLocs += len(tf.Locations) } - numTokenFrequencies += len(tf) + numTokenFrequencies += len(tfs) } for _, result := range results { @@ -136,21 +141,43 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { s.Freqs = make([][]uint64, numPostingsLists) s.Norms = make([][]float32, numPostingsLists) + + uint64Backing := make([]uint64, numTokenFrequencies) + float32Backing := make([]float32, numTokenFrequencies) + + for pid, numTerms := range numTermsPerPostingsList { + s.Freqs[pid] = uint64Backing[0:0] + 
uint64Backing = uint64Backing[numTerms:] + + s.Norms[pid] = float32Backing[0:0] + float32Backing = float32Backing[numTerms:] + } + s.Locfields = make([][]uint16, numPostingsLists) s.Locstarts = make([][]uint64, numPostingsLists) s.Locends = make([][]uint64, numPostingsLists) s.Locpos = make([][]uint64, numPostingsLists) s.Locarraypos = make([][][]uint64, numPostingsLists) - uint64Backing := make([]uint64, numTokenFrequencies) - float32Backing := make([]float32, numTokenFrequencies) + uint16Backing := make([]uint16, numLocs) // For Locfields. + uint64Backing = make([]uint64, numLocs*3) // For Locstarts, Locends, Locpos. + auint64Backing := make([][]uint64, numLocs) // For Locarraypos. - for i, numTerms := range numTermsPerPostingsList { - s.Freqs[i] = uint64Backing[0:0] - uint64Backing = uint64Backing[numTerms:] + for pid, numLocs := range numLocsPerPostingsList { + s.Locfields[pid] = uint16Backing[0:0] + uint16Backing = uint16Backing[numLocs:] - s.Norms[i] = float32Backing[0:0] - float32Backing = float32Backing[numTerms:] + s.Locstarts[pid] = uint64Backing[0:0] + uint64Backing = uint64Backing[numLocs:] + + s.Locends[pid] = uint64Backing[0:0] + uint64Backing = uint64Backing[numLocs:] + + s.Locpos[pid] = uint64Backing[0:0] + uint64Backing = uint64Backing[numLocs:] + + s.Locarraypos[pid] = auint64Backing[0:0] + auint64Backing = auint64Backing[numLocs:] } } @@ -201,7 +228,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { for term, tokenFreq := range tokenFrequencies { - pid := s.Dicts[fieldID][term]-1 + pid := s.Dicts[fieldID][term] - 1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) From d682c85a7b599b3d8053caa4b2937860c1febf14 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 19:17:39 -0800 Subject: [PATCH 144/728] scorch mem segments uses backing array trick even more 
This change invokes make() only once per distinct type to allocate the large, contiguous backing arrays for the mem segment. --- index/scorch/segment/mem/build.go | 40 +++++++++++++++++++------------ 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 14cb1cbc4..554de8906 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -93,7 +93,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. var numTokenFrequencies int - var numLocs int + var totLocs int processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { for term, tf := range tfs { @@ -109,7 +109,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { pid := pidPlus1 - 1 numTermsPerPostingsList[pid] += 1 numLocsPerPostingsList[pid] += len(tf.Locations) - numLocs += len(tf.Locations) + totLocs += len(tf.Locations) } numTokenFrequencies += len(tfs) } @@ -139,12 +139,32 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { s.PostingsLocs[i] = roaring.New() } - s.Freqs = make([][]uint64, numPostingsLists) + // Preallocate big, contiguous backing arrays. + auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. + uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. + float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. + uint16Backing := make([]uint16, totLocs) // For sub-Locfields. + + // Point top-level slices to the backing arrays. 
+ s.Freqs = auint64Backing[0:numPostingsLists] + auint64Backing = auint64Backing[numPostingsLists:] + s.Norms = make([][]float32, numPostingsLists) - uint64Backing := make([]uint64, numTokenFrequencies) - float32Backing := make([]float32, numTokenFrequencies) + s.Locfields = make([][]uint16, numPostingsLists) + + s.Locstarts = auint64Backing[0:numPostingsLists] + auint64Backing = auint64Backing[numPostingsLists:] + + s.Locends = auint64Backing[0:numPostingsLists] + auint64Backing = auint64Backing[numPostingsLists:] + + s.Locpos = auint64Backing[0:numPostingsLists] + auint64Backing = auint64Backing[numPostingsLists:] + s.Locarraypos = make([][][]uint64, numPostingsLists) + + // Point sub-slices to the backing arrays. for pid, numTerms := range numTermsPerPostingsList { s.Freqs[pid] = uint64Backing[0:0] uint64Backing = uint64Backing[numTerms:] @@ -153,16 +173,6 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { float32Backing = float32Backing[numTerms:] } - s.Locfields = make([][]uint16, numPostingsLists) - s.Locstarts = make([][]uint64, numPostingsLists) - s.Locends = make([][]uint64, numPostingsLists) - s.Locpos = make([][]uint64, numPostingsLists) - s.Locarraypos = make([][][]uint64, numPostingsLists) - - uint16Backing := make([]uint16, numLocs) // For Locfields. - uint64Backing = make([]uint64, numLocs*3) // For Locstarts, Locends, Locpos. - auint64Backing := make([][]uint64, numLocs) // For Locarraypos. 
- for pid, numLocs := range numLocsPerPostingsList { s.Locfields[pid] = uint16Backing[0:0] uint16Backing = uint16Backing[numLocs:] From d14b290235caff80127fe490fb3068e53269fbe1 Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Mon, 15 Jan 2018 22:23:41 -0800 Subject: [PATCH 145/728] Fix coverage badge in README --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fa11f906d..7c1a7c7c4 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # ![bleve](docs/bleve.png) bleve -[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/blevesearch/bleve/badge.png?branch=master)](https://coveralls.io/r/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) +[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve) [![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve) From 71d6d1691b992a25f4511f4ec86c25cf476e3ff8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 15 Jan 2018 22:43:08 -0800 Subject: [PATCH 146/728] scorch zap optimizations of inner loops and easy preallocs --- 
index/scorch/segment/zap/build.go | 54 ++++++++++++++++++++----------- 1 file changed, 35 insertions(+), 19 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index c7f73769e..1b16b5e35 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -140,12 +140,18 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + st := memSegment.StoredTypes[docNum] + sp := memSegment.StoredPos[docNum] + // encode fields in order for fieldID := range memSegment.FieldsInv { if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { // has stored values for this field num := len(storedFieldValues) + stf := st[uint16(fieldID)] + spf := sp[uint16(fieldID)] + // process each value for i := 0; i < num; i++ { // encode field @@ -154,7 +160,7 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) return 0, err2 } // encode type - _, err2 = metaEncoder.PutU64(uint64(memSegment.StoredTypes[docNum][uint16(fieldID)][i])) + _, err2 = metaEncoder.PutU64(uint64(stf[i])) if err2 != nil { return 0, err2 } @@ -169,13 +175,13 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) return 0, err2 } // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(memSegment.StoredPos[docNum][uint16(fieldID)][i]))) + _, err2 = metaEncoder.PutU64(uint64(len(spf[i]))) if err2 != nil { return 0, err2 } // encode all array positions - for j := 0; j < len(memSegment.StoredPos[docNum][uint16(fieldID)][i]); j++ { - _, err2 = metaEncoder.PutU64(memSegment.StoredPos[docNum][uint16(fieldID)][i][j]) + for _, pos := range spf[i] { + _, err2 = metaEncoder.PutU64(pos) if err2 != nil { return 0, err2 } @@ -235,6 +241,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac if postingID != 0 { tfEncoder.Reset() } + freqs := memSegment.Freqs[postingID] + norms := 
memSegment.Norms[postingID] postingsListItr := memSegment.Postings[postingID].Iterator() var offset int for postingsListItr.HasNext() { @@ -242,13 +250,13 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac docNum := uint64(postingsListItr.Next()) // put freq - err := tfEncoder.Add(docNum, memSegment.Freqs[postingID][offset]) + err := tfEncoder.Add(docNum, freqs[offset]) if err != nil { return nil, nil, err } // put norm - norm := memSegment.Norms[postingID][offset] + norm := norms[offset] normBits := math.Float32bits(norm) err = tfEncoder.Add(docNum, uint64(normBits)) if err != nil { @@ -275,40 +283,46 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac if postingID != 0 { locEncoder.Reset() } + freqs := memSegment.Freqs[postingID] + locfields := memSegment.Locfields[postingID] + locpos := memSegment.Locpos[postingID] + locstarts := memSegment.Locstarts[postingID] + locends := memSegment.Locends[postingID] + locarraypos := memSegment.Locarraypos[postingID] postingsListItr := memSegment.Postings[postingID].Iterator() var offset int var locOffset int for postingsListItr.HasNext() { docNum := uint64(postingsListItr.Next()) - for i := 0; i < int(memSegment.Freqs[postingID][offset]); i++ { - if len(memSegment.Locfields[postingID]) > 0 { + for i := 0; i < int(freqs[offset]); i++ { + if len(locfields) > 0 { // put field - err := locEncoder.Add(docNum, uint64(memSegment.Locfields[postingID][locOffset])) + err := locEncoder.Add(docNum, uint64(locfields[locOffset])) if err != nil { return nil, nil, err } // put pos - err = locEncoder.Add(docNum, memSegment.Locpos[postingID][locOffset]) + err = locEncoder.Add(docNum, locpos[locOffset]) if err != nil { return nil, nil, err } // put start - err = locEncoder.Add(docNum, memSegment.Locstarts[postingID][locOffset]) + err = locEncoder.Add(docNum, locstarts[locOffset]) if err != nil { return nil, nil, err } // put end - err = locEncoder.Add(docNum, 
memSegment.Locends[postingID][locOffset]) + err = locEncoder.Add(docNum, locends[locOffset]) if err != nil { return nil, nil, err } // put array positions - num := len(memSegment.Locarraypos[postingID][locOffset]) + num := len(locarraypos[locOffset]) // put the number of array positions to follow err = locEncoder.Add(docNum, uint64(num)) @@ -317,8 +331,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // put each array position - for j := 0; j < num; j++ { - err = locEncoder.Add(docNum, memSegment.Locarraypos[postingID][locOffset][j]) + for _, pos := range locarraypos[locOffset] { + err = locEncoder.Add(docNum, pos) if err != nil { return nil, nil, err } @@ -341,6 +355,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { + rv = make([]uint64, 0, len(memSegment.PostingsLocs)) for postingID := range memSegment.PostingsLocs { // record where we start this posting loc rv = append(rv, uint64(w.Count())) @@ -355,6 +370,7 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { + rv = make([]uint64, 0, len(memSegment.Postings)) for postingID := range memSegment.Postings { // record where we start this posting list rv = append(rv, uint64(w.Count())) @@ -376,7 +392,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, } func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { - var rv []uint64 + rv := make([]uint64, 0, len(memSegment.DictKeys)) var buffer bytes.Buffer for fieldID, fieldTerms := range memSegment.DictKeys { @@ -392,10 +408,10 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs dict := memSegment.Dicts[fieldID] // 
now walk the dictionary in order of fieldTerms (already sorted) - for i := range fieldTerms { - postingID := dict[fieldTerms[i]] - 1 + for _, fieldTerm := range fieldTerms { + postingID := dict[fieldTerm] - 1 postingsAddr := postingsLocs[postingID] - err = builder.Insert([]byte(fieldTerms[i]), postingsAddr) + err = builder.Insert([]byte(fieldTerm), postingsAddr) if err != nil { return nil, err } From 012d436dd7db71913fc3221e832f21229c18cf0e Mon Sep 17 00:00:00 2001 From: Ethan Koenig Date: Mon, 15 Jan 2018 22:02:18 -0800 Subject: [PATCH 147/728] Add UniqueTerm token filter --- analysis/token/unique/unique.go | 53 ++++++++++++++++++ analysis/token/unique/unique_test.go | 84 ++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 analysis/token/unique/unique.go create mode 100644 analysis/token/unique/unique_test.go diff --git a/analysis/token/unique/unique.go b/analysis/token/unique/unique.go new file mode 100644 index 000000000..f0d96c504 --- /dev/null +++ b/analysis/token/unique/unique.go @@ -0,0 +1,53 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unique + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const Name = "unique" + +// UniqueTermFilter retains only the tokens which mark the first occurence of +// a term. Tokens whose term appears in a preceding token are dropped. 
+type UniqueTermFilter struct{} + +func NewUniqueTermFilter() *UniqueTermFilter { + return &UniqueTermFilter{} +} + +func (f *UniqueTermFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + encounteredTerms := make(map[string]struct{}, len(input)/4) + j := 0 + for _, token := range input { + term := string(token.Term) + if _, ok := encounteredTerms[term]; ok { + continue + } + encounteredTerms[term] = struct{}{} + input[j] = token + j++ + } + return input[:j] +} + +func UniqueTermFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewUniqueTermFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(Name, UniqueTermFilterConstructor) +} diff --git a/analysis/token/unique/unique_test.go b/analysis/token/unique/unique_test.go new file mode 100644 index 000000000..216d8f1fa --- /dev/null +++ b/analysis/token/unique/unique_test.go @@ -0,0 +1,84 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package unique + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" +) + +func TestUniqueTermFilter(t *testing.T) { + var tests = []struct { + input analysis.TokenStream + // expected indices of input which should be included in the output. We + // use indices instead of another TokenStream, since position/start/end + // should be preserved. 
+ expectedIndices []int + }{ + { + input: tokenStream(), + expectedIndices: []int{}, + }, + { + input: tokenStream("a"), + expectedIndices: []int{0}, + }, + { + input: tokenStream("each", "term", "in", "this", "sentence", "is", "unique"), + expectedIndices: []int{0, 1, 2, 3, 4, 5, 6}, + }, + { + input: tokenStream("Lui", "è", "alto", "e", "lei", "è", "bassa"), + expectedIndices: []int{0, 1, 2, 3, 4, 6}, + }, + { + input: tokenStream("a", "a", "A", "a", "a", "A"), + expectedIndices: []int{0, 2}, + }, + } + uniqueTermFilter := NewUniqueTermFilter() + for _, test := range tests { + expected := subStream(test.input, test.expectedIndices) + actual := uniqueTermFilter.Filter(test.input) + if !reflect.DeepEqual(actual, expected) { + t.Errorf("expected %s \n\n got %s", expected, actual) + } + } +} + +func tokenStream(termStrs ...string) analysis.TokenStream { + tokenStream := make([]*analysis.Token, len(termStrs)) + index := 0 + for i, termStr := range termStrs { + tokenStream[i] = &analysis.Token{ + Term: []byte(termStr), + Position: i + 1, + Start: index, + End: index + len(termStr), + } + index += len(termStr) + } + return analysis.TokenStream(tokenStream) +} + +func subStream(stream analysis.TokenStream, indices []int) analysis.TokenStream { + result := make(analysis.TokenStream, len(indices)) + for i, index := range indices { + result[i] = stream[index] + } + return result +} From 1176c73a9cf3d5272b962575f8c682c0ff3a34dc Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 12 Jan 2018 12:11:11 -0800 Subject: [PATCH 148/728] Include overhead from data structures in segment's SizeInBytes + Account for all the overhead incurred from the data structures within mem.Segment and zap.Segment. - SizeOfMap = 8 - SizeOfPointer = 8 - SizeOfSlice = 24 - SizeOfString = 16 + Include overhead from certain new fields as well. 
--- index/scorch/segment/mem/segment.go | 50 ++++++++++++++++++++++----- index/scorch/segment/segment.go | 6 ++++ index/scorch/segment/zap/docvalues.go | 18 ++++++++++ index/scorch/segment/zap/segment.go | 39 +++++++++++++++------ 4 files changed, 93 insertions(+), 20 deletions(-) diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index 3c400b531..baa4811a4 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -107,27 +107,41 @@ func New() *Segment { func (s *Segment) updateSizeInBytes() { var sizeInBytes uint64 + // FieldsMap, FieldsInv for k, _ := range s.FieldsMap { - sizeInBytes += uint64(len(k)*2 /* FieldsMap + FieldsInv */ + + sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + 2 /* size of uint16 */) } + // overhead from the data structures + sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) + // Dicts, DictKeys for _, entry := range s.Dicts { for k, _ := range entry { - sizeInBytes += uint64(len(k)*2 /* Dicts + DictKeys */ + + sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + 8 /* size of uint64 */) } + // overhead from the data structures + sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) } + sizeInBytes += (segment.SizeOfSlice * 2) + // Postings, PostingsLocs for i := 0; i < len(s.Postings); i++ { - sizeInBytes += s.Postings[i].GetSizeInBytes() + s.PostingsLocs[i].GetSizeInBytes() + sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + + (s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) } + sizeInBytes += (segment.SizeOfSlice * 2) + // Freqs, Norms for i := 0; i < len(s.Freqs); i++ { sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + - len(s.Norms[i])*4 /* size of float32 */) + len(s.Norms[i])*4 /* size of float32 */) + + (segment.SizeOfSlice * 2) } + sizeInBytes += (segment.SizeOfSlice * 2) + // Location data for i := 0; i < len(s.Locfields); i++ { sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of 
uint16 */ + len(s.Locstarts[i])*8 /* size of uint64 */ + @@ -135,31 +149,49 @@ func (s *Segment) updateSizeInBytes() { len(s.Locpos[i])*8 /* size of uint64 */) for j := 0; j < len(s.Locarraypos[i]); j++ { - sizeInBytes += uint64(len(s.Locarraypos[i][j]) * 8 /* size of uint64 */) + sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + + segment.SizeOfSlice } + + sizeInBytes += (segment.SizeOfSlice * 5) } + sizeInBytes += (segment.SizeOfSlice * 5) + // Stored data for i := 0; i < len(s.Stored); i++ { for _, v := range s.Stored[i] { sizeInBytes += uint64(2 /* size of uint16 */) for _, arr := range v { - sizeInBytes += uint64(len(arr)) + sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice } + sizeInBytes += segment.SizeOfSlice } for _, v := range s.StoredTypes[i] { - sizeInBytes += uint64(2 /* size of uint16 */ + len(v)) + sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice } for _, v := range s.StoredPos[i] { sizeInBytes += uint64(2 /* size of uint16 */) for _, arr := range v { - sizeInBytes += uint64(len(arr) * 8 /* size of uint64 */) + sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + + segment.SizeOfSlice } + sizeInBytes += segment.SizeOfSlice } + + // overhead from map(s) within Stored, StoredTypes, StoredPos + sizeInBytes += (segment.SizeOfMap * 3) } + // overhead from data structures: Stored, StoredTypes, StoredPos + sizeInBytes += (segment.SizeOfSlice * 3) + + // DocValueFields + sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + + segment.SizeOfMap - sizeInBytes += uint64(8 /* size of sizeInBytes -> uint64*/) + // SizeInBytes + sizeInBytes += uint64(8) s.sizeInBytes = sizeInBytes } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 858ac3590..d5435ab96 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -19,6 +19,12 @@ import ( "github.com/blevesearch/bleve/index" ) +// Overhead from go data structures when deployed 
on a 64-bit system. +const SizeOfMap uint64 = 8 +const SizeOfPointer uint64 = 8 +const SizeOfSlice uint64 = 24 +const SizeOfString uint64 = 16 + // DocumentFieldValueVisitor defines a callback to be visited for each // stored field value. The return value determines if the visitor // should keep going. Returning true continues visiting, false stops. diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index cdb16ccb9..e37ecc74e 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -22,6 +22,7 @@ import ( "sort" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" "github.com/golang/snappy" ) @@ -35,6 +36,23 @@ type docValueIterator struct { curChunkData []byte // compressed data cache } +func (di *docValueIterator) sizeInBytes() uint64 { + // curChunkNum, numChunks, dvDataLoc --> uint64 + sizeInBytes := 24 + + // field + sizeInBytes += (len(di.field) + int(segment.SizeOfString)) + + // chunkLens, curChunkHeader + sizeInBytes += len(di.chunkLens)*8 + + len(di.curChunkHeader)*24 + + int(segment.SizeOfSlice*2) /* overhead from slices */ + + // curChunkData is mmap'ed, not included + + return uint64(sizeInBytes) +} + func (di *docValueIterator) fieldName() string { return di.field } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 9f9910366..18d4ea56c 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -97,27 +97,44 @@ type Segment struct { } func (s *Segment) SizeInBytes() uint64 { - // 4 /* size of crc -> uint32 */ + - // 4 /* size of version -> uint32 */ + - // 4 /* size of chunkFactor -> uint32 */ + - // 8 /* size of numDocs -> uint64 */ + - // 8 /* size of storedIndexOffset -> uint64 */ + + // 8 /* size of file pointer */ + // 4 /* size of crc -> uint32 */ + // 4 /* size of version -> uint32 */ + // 4 /* size of chunkFactor -> uint32 */ + // 8 /* size of numDocs 
-> uint64 */ + // 8 /* size of storedIndexOffset -> uint64 */ // 8 /* size of fieldsIndexOffset -> uint64 */ - sizeOfUints := 36 + // 8 /* size of docValueOffset -> uint64 */ + sizeOfUints := 52 // Do not include the mmap'ed part - sizeInBytes := len(s.path) + sizeOfUints + sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + // fieldsMap for k, _ := range s.fieldsMap { - sizeInBytes += len(k) + 2 /* size of uint16 */ + sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ } + sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ + // fieldsInv, fieldsOffsets for _, entry := range s.fieldsInv { - sizeInBytes += len(entry) + sizeInBytes += (len(entry) + int(segment.SizeOfString)) } + sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */ + sizeInBytes += int(segment.SizeOfSlice) * 2 /* overhead from slices */ + + // fieldDvIterMap + sizeInBytes += len(s.fieldDvIterMap) * + int(segment.SizeOfPointer+2 /* size of uint16 */) + for _, entry := range s.fieldDvIterMap { + if entry != nil { + sizeInBytes += int(entry.sizeInBytes()) + } + } + sizeInBytes += int(segment.SizeOfMap) - sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */ - sizeInBytes += 8 /* size of refs -> int64 */ + // mutex, refs -> int64 + sizeInBytes += 16 return uint64(sizeInBytes) } From 47f1c66889db55545776d2cbe6d0c137a0556ccf Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 19 Jan 2018 11:47:28 +0530 Subject: [PATCH 149/728] adding UT --- search.go | 6 +++-- search_test.go | 73 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 2 deletions(-) diff --git a/search.go b/search.go index 5d12b7a25..46d849c1b 100644 --- a/search.go +++ b/search.go @@ -481,8 +481,10 @@ func (sr *SearchResult) Merge(other *SearchResult) { if other.MaxScore > sr.MaxScore { sr.MaxScore = other.MaxScore } - if len(sr.Facets) == 0 && len(other.Facets) != 0 { - sr.Facets = make(search.FacetResults) + if sr.Facets == 
nil && len(other.Facets) != 0 { + sr.Facets = other.Facets + return } + sr.Facets.Merge(other.Facets) } diff --git a/search_test.go b/search_test.go index 7f5018950..242494132 100644 --- a/search_test.go +++ b/search_test.go @@ -326,3 +326,76 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { } } + +func TestSearchResultFacetsMerge(t *testing.T) { + lowmed := "2010-01-01" + medhi := "2011-01-01" + hihigher := "2012-01-01" + + fr := &search.FacetResult{ + Field: "birthday", + Total: 100, + Missing: 25, + Other: 25, + DateRanges: []*search.DateRangeFacet{ + { + Name: "low", + End: &lowmed, + Count: 25, + }, + { + Name: "med", + Count: 24, + Start: &lowmed, + End: &medhi, + }, + { + Name: "hi", + Count: 1, + Start: &medhi, + End: &hihigher, + }, + }, + } + frs := search.FacetResults{ + "birthdays": fr, + } + + l := &SearchResult{ + Status: &SearchStatus{ + Total: 10, + Successful: 1, + Errors: make(map[string]error), + }, + Total: 10, + MaxScore: 1, + } + + r := &SearchResult{ + Status: &SearchStatus{ + Total: 1, + Successful: 1, + Errors: make(map[string]error), + }, + Total: 1, + MaxScore: 2, + Facets: frs, + } + + expected := &SearchResult{ + Status: &SearchStatus{ + Total: 11, + Successful: 2, + Errors: make(map[string]error), + }, + Total: 11, + MaxScore: 2, + Facets: frs, + } + + l.Merge(r) + + if !reflect.DeepEqual(l, expected) { + t.Errorf("expected %#v, got %#v", expected, l) + } +} From 34fd77709f6e58c9005d59d56b74844277f6f0b6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 20 Jan 2018 17:14:07 -0800 Subject: [PATCH 150/728] scorch unlocks in introduceSegment's DocNumbers() error codepath --- index/scorch/introducer.go | 1 + 1 file changed, 1 insertion(+) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 0b9c48537..4499fa41b 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -117,6 +117,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { var err error delta, err = 
s.root.segment[i].segment.DocNumbers(next.ids) if err != nil { + s.rootLock.Unlock() next.applied <- fmt.Errorf("error computing doc numbers: %v", err) close(next.applied) _ = newSnapshot.DecRef() From 567d756c27e0733ee4f32ab2eedd2ba985048ac7 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 24 Jan 2018 14:10:14 -0800 Subject: [PATCH 151/728] Add support for certain disk stats + num_bytes_used_disk + num_files_on_disk --- index/scorch/scorch.go | 5 +++-- index/scorch/stats.go | 30 +++++++++++++++++++++++++++--- 2 files changed, 30 insertions(+), 5 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 99d6dcd5c..b77c69239 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -75,11 +75,11 @@ func NewScorch(storeName string, version: Version, config: config, analysisQueue: analysisQueue, - stats: &Stats{}, nextSnapshotEpoch: 1, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, } + rv.stats = &Stats{i: rv} rv.root = &IndexSnapshot{parent: rv, refs: 1} ro, ok := config["read_only"].(bool) if ok { @@ -359,7 +359,8 @@ func (s *Scorch) Stats() json.Marshaler { return s.stats } func (s *Scorch) StatsMap() map[string]interface{} { - return s.stats.statsMap() + m, _ := s.stats.statsMap() + return m } func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { diff --git a/index/scorch/stats.go b/index/scorch/stats.go index abd054c81..c44a977bf 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -16,6 +16,7 @@ package scorch import ( "encoding/json" + "io/ioutil" "sync/atomic" ) @@ -28,9 +29,10 @@ type Stats struct { numPlainTextBytesIndexed uint64 numItemsIntroduced uint64 numItemsPersisted uint64 + i *Scorch } -func (s *Stats) statsMap() map[string]interface{} { +func (s *Stats) statsMap() (map[string]interface{}, error) { m := map[string]interface{}{} m["updates"] = atomic.LoadUint64(&s.updates) m["deletes"] = atomic.LoadUint64(&s.deletes) @@ -44,11 +46,33 @@ func (s *Stats) 
statsMap() map[string]interface{} { m["num_items_introduced"] = atomic.LoadUint64(&s.numItemsIntroduced) m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) - return m + if s.i.path != "" { + finfos, err := ioutil.ReadDir(s.i.path) + if err != nil { + return nil, err + } + + var numFilesOnDisk, numBytesUsedDisk uint64 + + for _, finfo := range finfos { + if !finfo.IsDir() { + numBytesUsedDisk += uint64(finfo.Size()) + numFilesOnDisk++ + } + } + + m["num_bytes_used_disk"] = numBytesUsedDisk + m["num_files_on_disk"] = numFilesOnDisk + } + + return m, nil } // MarshalJSON implements json.Marshaler func (s *Stats) MarshalJSON() ([]byte, error) { - m := s.statsMap() + m, err := s.statsMap() + if err != nil { + return nil, err + } return json.Marshal(m) } From dc62324e021e5772d45bdcb764aa825082ac8509 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 17 Jan 2018 11:29:32 -0800 Subject: [PATCH 152/728] scorch zap miscellaneous typos --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/docvalues.go | 2 +- index/scorch/segment/zap/segment.go | 6 ++++-- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 1b16b5e35..1928bdf3a 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -43,7 +43,7 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e return err } - // bufer the output + // buffer the output br := bufio.NewWriter(f) // wrap it for counting (tracking offsets) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index e37ecc74e..11b3b99d4 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -65,7 +65,7 @@ func (s *Segment) loadFieldDocValueIterator(field string, fieldDvLoc uint64) (*docValueIterator, error) { // get the docValue offset for the given fields if fieldDvLoc == fieldNotUninverted { - return 
nil, fmt.Errorf("loadFieldDocValueConfigs: "+ + return nil, fmt.Errorf("loadFieldDocValueIterator: "+ "no docValues found for field: %s", field) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 18d4ea56c..60cc034b9 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -168,11 +168,13 @@ func (s *Segment) loadConfig() error { docValueOffset := chunkOffset - 8 s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) - fieldsOffset := docValueOffset - 8 + fieldsOffset := docValueOffset - 8 s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8]) + storedOffset := fieldsOffset - 8 s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8]) + docNumOffset := storedOffset - 8 s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8]) return nil @@ -181,7 +183,7 @@ func (s *Segment) loadConfig() error { func (s *Segment) loadFields() error { // NOTE for now we assume the fields index immediately preceeds the footer - // if this changes, need to adjust accordingly (or store epxlicit length) + // if this changes, need to adjust accordingly (or store explicit length) fieldsIndexEnd := uint64(len(s.mm) - FooterSize) // iterate through fields index From 5a035dc9aa4a7660efe56ac3e8a7e64ff27172c1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 17 Jan 2018 18:46:57 -0800 Subject: [PATCH 153/728] scorch zap in-memory segment representation (SegmentBase) The zap SegmentBase struct is a refactoring of the zap Segment into the subset of fields that are needed for read-only ops, without any persistence related info. This allows us to use zap's optimized data encoding as scorch's in-memory segments. The zap Segment struct now embeds a zap SegmentBase struct, and layers on persistence. Both the zap Segment and zap SegmentBase implement scorch's Segment interface. 
--- index/scorch/persister.go | 7 +- index/scorch/scorch.go | 10 +- index/scorch/scorch_test.go | 2 +- index/scorch/segment/mem/build.go | 8 +- index/scorch/segment/mem/segment.go | 31 ++-- index/scorch/segment/zap/build.go | 199 +++++++++++++++++++------- index/scorch/segment/zap/count.go | 18 +-- index/scorch/segment/zap/dict.go | 22 +-- index/scorch/segment/zap/docvalues.go | 26 ++-- index/scorch/segment/zap/merge.go | 5 +- index/scorch/segment/zap/posting.go | 20 +-- index/scorch/segment/zap/read.go | 12 +- index/scorch/segment/zap/segment.go | 166 +++++++++++---------- index/scorch/segment/zap/write.go | 18 +-- 14 files changed, 333 insertions(+), 211 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 658e57aee..cdcee37c2 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -28,11 +28,12 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/boltdb/bolt" ) +var DefaultChunkFactor uint32 = 1024 + type notificationChan chan struct{} func (s *Scorch) persisterLoop() { @@ -178,11 +179,11 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { return err2 } switch seg := segmentSnapshot.segment.(type) { - case *mem.Segment: + case *zap.SegmentBase: // need to persist this to disk filename := zapFileName(segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename - err2 := zap.PersistSegment(seg, path, 1024) + err2 := zap.PersistSegmentBase(seg, path) if err2 != nil { return fmt.Errorf("error persisting segment: %v", err2) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 99d6dcd5c..69328d9b8 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -28,6 +28,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" 
"github.com/blevesearch/bleve/index/scorch/segment/mem" + "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" "github.com/boltdb/bolt" @@ -217,7 +218,7 @@ func (s *Scorch) Delete(id string) error { } // Batch applices a batch of changes to the index atomically -func (s *Scorch) Batch(batch *index.Batch) error { +func (s *Scorch) Batch(batch *index.Batch) (err error) { start := time.Now() defer func() { @@ -271,10 +272,13 @@ func (s *Scorch) Batch(batch *index.Batch) error { var newSegment segment.Segment if len(analysisResults) > 0 { - newSegment = mem.NewFromAnalyzedDocs(analysisResults) + newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor) + if err != nil { + return err + } } - err := s.prepareSegment(newSegment, ids, batch.InternalOps) + err = s.prepareSegment(newSegment, ids, batch.InternalOps) if err != nil { if newSegment != nil { _ = newSegment.Close() diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 6e8ecb0cf..87e9bdb21 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -1395,7 +1395,7 @@ func TestConcurrentUpdate(t *testing.T) { // do some concurrent updates var wg sync.WaitGroup - for i := 0; i < 10; i++ { + for i := 0; i < 100; i++ { wg.Add(1) go func(i int) { doc := document.NewDocument("1") diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 554de8906..d3344ce30 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -267,15 +267,15 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { } func (s *Segment) getOrDefineField(name string) int { - fieldID, ok := s.FieldsMap[name] + fieldIDPlus1, ok := s.FieldsMap[name] if !ok { - fieldID = uint16(len(s.FieldsInv) + 1) - s.FieldsMap[name] = fieldID + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[name] = fieldIDPlus1 
s.FieldsInv = append(s.FieldsInv, name) s.Dicts = append(s.Dicts, make(map[string]uint64)) s.DictKeys = append(s.DictKeys, make([]string, 0)) } - return int(fieldID - 1) + return int(fieldIDPlus1 - 1) } func (s *Segment) addDocument() int { diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go index baa4811a4..04bdb368a 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -40,35 +40,38 @@ const idFieldID uint16 = 0 // Segment is an in memory implementation of scorch.Segment type Segment struct { - // FieldsMap name -> id+1 + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 FieldsMap map[string]uint16 - // fields id -> name + + // FieldsInv is the inverse of FieldsMap + // field id -> name FieldsInv []string - // term dictionary + // Term dictionaries for each field // field id -> term -> postings list id + 1 Dicts []map[string]uint64 - // term dictionary keys - // field id -> []dictionary keys + // Terms for each field, where terms are sorted ascending + // field id -> []term DictKeys [][]string // Postings list - // postings list id -> Postings bitmap + // postings list id -> bitmap by docNum Postings []*roaring.Bitmap - // Postings List has locations + // Postings list has locations PostingsLocs []*roaring.Bitmap - // term frequencies + // Term frequencies // postings list id -> Freqs (one for each hit in bitmap) Freqs [][]uint64 - // field Norms + // Field norms // postings list id -> Norms (one for each hit in bitmap) Norms [][]float32 - // field/start/end/pos/locarraypos + // Field/start/end/pos/locarraypos // postings list id -> start/end/pos/locarraypos (one for each freq) Locfields [][]uint16 Locstarts [][]uint64 @@ -80,18 +83,18 @@ type Segment struct { // docNum -> field id -> slice of values (each value []byte) Stored []map[uint16][][]byte - // stored field types + // Stored field types // docNum -> field id -> slice of types (each type byte) 
StoredTypes []map[uint16][]byte - // stored field array positions + // Stored field array positions // docNum -> field id -> slice of array positions (each is []uint64) StoredPos []map[uint16][][]uint64 - // for storing the docValue persisted fields + // For storing the docValue persisted fields DocValueFields map[uint16]bool - // footprint of the segment, updated when analyzed document mutations + // Footprint of the segment, updated when analyzed document mutations // are added into the segment sizeInBytes uint64 } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 1928bdf3a..c9361cbd4 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -32,10 +32,8 @@ const version uint32 = 2 const fieldNotUninverted = math.MaxUint64 -// PersistSegment takes the in-memory segment and persists it to the specified -// path in the zap file format. -func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (err error) { - +// PersistSegmentBase persists SegmentBase in the zap file format. 
+func PersistSegmentBase(sb *SegmentBase, path string) error { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) @@ -43,84 +41,151 @@ func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) (e return err } - // buffer the output + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + br := bufio.NewWriter(f) - // wrap it for counting (tracking offsets) - cr := NewCountHashWriter(br) + _, err = br.Write(sb.mem) + if err != nil { + cleanup() + return err + } - var storedIndexOffset uint64 - var dictLocs []uint64 - docValueOffset := uint64(fieldNotUninverted) - if len(memSegment.Stored) > 0 { + err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset, + sb.chunkFactor, sb.memCRC, br) + if err != nil { + cleanup() + return err + } - storedIndexOffset, err = persistStored(memSegment, cr) - if err != nil { - return err - } + err = br.Flush() + if err != nil { + cleanup() + return err + } - var freqOffsets, locOffsets []uint64 - freqOffsets, locOffsets, err = persistPostingDetails(memSegment, cr, chunkFactor) - if err != nil { - return err - } + err = f.Sync() + if err != nil { + cleanup() + return err + } - var postingsListLocs []uint64 - postingsListLocs, err = persistPostingsLocs(memSegment, cr) - if err != nil { - return err - } + err = f.Close() + if err != nil { + cleanup() + return err + } - var postingsLocs []uint64 - postingsLocs, err = persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) - if err != nil { - return err - } + return nil +} - dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) - if err != nil { - return err - } +// PersistSegment takes the in-memory segment and persists it to +// the specified path in the zap file format. 
+func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error { + flag := os.O_RDWR | os.O_CREATE - docValueOffset, err = persistFieldDocValues(cr, chunkFactor, memSegment) - if err != nil { - return err - } + f, err := os.OpenFile(path, flag, 0600) + if err != nil { + return err + } - } else { - dictLocs = make([]uint64, len(memSegment.FieldsInv)) + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) } - var fieldIndexStart uint64 - fieldIndexStart, err = persistFields(memSegment.FieldsInv, cr, dictLocs) + // buffer the output + br := bufio.NewWriter(f) + + // wrap it for counting (tracking offsets) + cr := NewCountHashWriter(br) + + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err := + persistBase(memSegment, cr, chunkFactor) if err != nil { + cleanup() return err } - err = persistFooter(uint64(len(memSegment.Stored)), storedIndexOffset, - fieldIndexStart, docValueOffset, chunkFactor, cr) + err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, + chunkFactor, cr.Sum32(), cr) if err != nil { + cleanup() return err } err = br.Flush() if err != nil { + cleanup() return err } err = f.Sync() if err != nil { + cleanup() return err } err = f.Close() if err != nil { + cleanup() return err } return nil } +func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) ( + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + dictLocs []uint64, err error) { + docValueOffset = uint64(fieldNotUninverted) + + if len(memSegment.Stored) > 0 { + storedIndexOffset, err = persistStored(memSegment, cr) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + postingsListLocs, err := persistPostingsLocs(memSegment, cr) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + postingsLocs, err := persistPostingsLists(memSegment, cr, 
postingsListLocs, freqOffsets, locOffsets) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor) + if err != nil { + return 0, 0, 0, 0, nil, err + } + } else { + dictLocs = make([]uint64, len(memSegment.FieldsInv)) + } + + fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs) + if err != nil { + return 0, 0, 0, 0, nil, err + } + + return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset, + dictLocs, nil +} + func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { var curr int @@ -394,6 +459,8 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { rv := make([]uint64, 0, len(memSegment.DictKeys)) + varintBuf := make([]byte, binary.MaxVarintLen64) + var buffer bytes.Buffer for fieldID, fieldTerms := range memSegment.DictKeys { if fieldID != 0 { @@ -427,10 +494,8 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs vellumData := buffer.Bytes() // write out the length of the vellum data - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(len(vellumData))) - _, err = w.Write(buf[:n]) + n := binary.PutUvarint(varintBuf, uint64(len(vellumData))) + _, err = w.Write(varintBuf[:n]) if err != nil { return nil, err } @@ -521,9 +586,8 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, return fieldChunkOffsets, nil } -func persistFieldDocValues(w *CountHashWriter, chunkFactor uint32, - memSegment *mem.Segment) (uint64, error) { - +func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter, + chunkFactor uint32) (uint64, error) { fieldDvOffsets, err := 
persistDocValues(memSegment, w, chunkFactor) if err != nil { return 0, err @@ -548,3 +612,36 @@ func persistFieldDocValues(w *CountHashWriter, chunkFactor uint32, return fieldDocValuesOffset, nil } + +func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) { + var br bytes.Buffer + + cr := NewCountHashWriter(&br) + + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err := + persistBase(memSegment, cr, chunkFactor) + if err != nil { + return nil, err + } + + sb := &SegmentBase{ + mem: br.Bytes(), + memCRC: cr.Sum32(), + chunkFactor: chunkFactor, + fieldsMap: memSegment.FieldsMap, + fieldsInv: memSegment.FieldsInv, + numDocs: numDocs, + storedIndexOffset: storedIndexOffset, + fieldsIndexOffset: fieldsIndexOffset, + docValueOffset: docValueOffset, + dictLocs: dictLocs, + fieldDvIterMap: make(map[uint16]*docValueIterator), + } + + err = sb.loadDvIterators() + if err != nil { + return nil, err + } + + return sb, nil +} diff --git a/index/scorch/segment/zap/count.go b/index/scorch/segment/zap/count.go index 2f0b92de2..d75e83c03 100644 --- a/index/scorch/segment/zap/count.go +++ b/index/scorch/segment/zap/count.go @@ -15,32 +15,28 @@ package zap import ( - "hash" "hash/crc32" "io" ) // CountHashWriter is a wrapper around a Writer which counts the number of -// bytes which have been written +// bytes which have been written and computes a crc32 hash type CountHashWriter struct { - w io.Writer - h hash.Hash32 - n int + w io.Writer + crc uint32 + n int } // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer func NewCountHashWriter(w io.Writer) *CountHashWriter { - return &CountHashWriter{ - w: w, - h: crc32.NewIEEE(), - } + return &CountHashWriter{w: w} } // Write writes the provided bytes to the wrapped writer and counts the bytes func (c *CountHashWriter) Write(b []byte) (int, error) { n, err := c.w.Write(b) + c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) c.n += n - _, _ = c.h.Write(b) 
return n, err } @@ -51,5 +47,5 @@ func (c *CountHashWriter) Count() int { // Sum32 returns the CRC-32 hash of the content written to this writer func (c *CountHashWriter) Sum32() uint32 { - return c.h.Sum32() + return c.crc } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3221d0616..284bc1898 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -27,7 +27,7 @@ import ( // Dictionary is the zap representation of the term dictionary type Dictionary struct { - segment *Segment + sb *SegmentBase field string fieldID uint16 fst *vellum.FST @@ -40,9 +40,9 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment. func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { rv := &PostingsList{ - dictionary: d, - term: term, - except: except, + sb: d.sb, + term: term, + except: except, } if d.fst != nil { @@ -56,19 +56,19 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting var n uint64 var read int - rv.freqOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) n += uint64(read) - rv.locOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) // go ahead and load loc bitmap var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.segment.mm[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - locRoaringBytes := 
d.segment.mm[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] + locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) + locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] rv.locBitmap = roaring.NewBitmap() _, err := rv.locBitmap.FromBuffer(locRoaringBytes) if err != nil { @@ -76,10 +76,10 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting } var postingsLen uint64 - postingsLen, read = binary.Uvarint(d.segment.mm[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - roaringBytes := d.segment.mm[postingsOffset+n : postingsOffset+n+postingsLen] + roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] bitmap := roaring.NewBitmap() _, err = bitmap.FromBuffer(roaringBytes) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 11b3b99d4..fb5b348a5 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -61,7 +61,7 @@ func (di *docValueIterator) curChunkNumber() uint64 { return di.curChunkNum } -func (s *Segment) loadFieldDocValueIterator(field string, +func (s *SegmentBase) loadFieldDocValueIterator(field string, fieldDvLoc uint64) (*docValueIterator, error) { // get the docValue offset for the given fields if fieldDvLoc == fieldNotUninverted { @@ -71,7 +71,7 @@ func (s *Segment) loadFieldDocValueIterator(field string, // read the number of chunks, chunk lengths var offset, clen uint64 - numChunks, read := binary.Uvarint(s.mm[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) + numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("failed to read the field "+ "doc values for field %s", field) @@ -84,7 +84,7 @@ func (s 
*Segment) loadFieldDocValueIterator(field string, chunkLens: make([]uint64, int(numChunks)), } for i := 0; i < int(numChunks); i++ { - clen, read = binary.Uvarint(s.mm[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("corrupted chunk length during segment load") } @@ -97,7 +97,7 @@ func (s *Segment) loadFieldDocValueIterator(field string, } func (di *docValueIterator) loadDvChunk(chunkNumber, - localDocNum uint64, s *Segment) error { + localDocNum uint64, s *SegmentBase) error { // advance to the chunk where the docValues // reside for the given docID destChunkDataLoc := di.dvDataLoc @@ -107,7 +107,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, curChunkSize := di.chunkLens[chunkNumber] // read the number of docs reside in the chunk - numDocs, read := binary.Uvarint(s.mm[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) + numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) if read <= 0 { return fmt.Errorf("failed to read the chunk") } @@ -116,17 +116,17 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, offset := uint64(0) di.curChunkHeader = make([]MetaData, int(numDocs)) for i := 0; i < int(numDocs); i++ { - di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mm[chunkMetaLoc+offset : 
chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) } compressedDataLoc := chunkMetaLoc + offset dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc - di.curChunkData = s.mm[compressedDataLoc : compressedDataLoc+dataLength] + di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkNum = chunkNumber return nil } @@ -171,18 +171,18 @@ func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) { // VisitDocumentFieldTerms is an implementation of the // DocumentFieldTermVisitable interface -func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, +func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, visitor index.DocumentFieldTermVisitor) error { - fieldID := uint16(0) + fieldIDPlus1 := uint16(0) ok := true for _, field := range fields { - if fieldID, ok = s.fieldsMap[field]; !ok { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { continue } // find the chunkNumber where the docValues are stored docInChunk := localDocNum / uint64(s.chunkFactor) - if dvIter, exists := s.fieldDvIterMap[fieldID-1]; exists && + if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && dvIter != nil { // check if the chunk is already loaded if docInChunk != dvIter.curChunkNumber() { diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 16ec848b2..07b9bafcc 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -72,14 +72,13 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, dictLocs = make([]uint64, len(fieldsInv)) } - var fieldsIndexOffset uint64 - fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) + fieldsIndexOffset, err := persistFields(fieldsInv, cr, dictLocs) if err != nil { return nil, err } err = 
persistFooter(newSegDocCount, storedIndexOffset, - fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr) + fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr.Sum32(), cr) if err != nil { return nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 1b7a0a587..95f350130 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -27,7 +27,7 @@ import ( // PostingsList is an in-memory represenation of a postings list type PostingsList struct { - dictionary *Dictionary + sb *SegmentBase term string postingsOffset uint64 freqOffset uint64 @@ -48,11 +48,11 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { var n uint64 var read int var numFreqChunks uint64 - numFreqChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) rv.freqChunkLens = make([]uint64, int(numFreqChunks)) for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n @@ -60,11 +60,11 @@ func (p *PostingsList) Iterator() segment.PostingsIterator { // prepare the loc chunk details n = 0 var numLocChunks uint64 - numLocChunks, read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) rv.locChunkLens = make([]uint64, int(numLocChunks)) for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.dictionary.segment.mm[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + 
rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n @@ -133,7 +133,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error { start += i.freqChunkLens[j] } end := start + i.freqChunkLens[chunk] - i.currChunkFreqNorm = i.postings.dictionary.segment.mm[start:end] + i.currChunkFreqNorm = i.postings.sb.mem[start:end] i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) start = i.locChunkStart @@ -141,7 +141,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error { start += i.locChunkLens[j] } end = start + i.locChunkLens[chunk] - i.currChunkLoc = i.postings.dictionary.segment.mm[start:end] + i.currChunkLoc = i.postings.sb.mem[start:end] i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) i.currChunk = uint32(chunk) return nil @@ -192,7 +192,7 @@ func (i *PostingsIterator) readLocation(l *Location) error { // group these together for less branching if l != nil { - l.field = i.postings.dictionary.segment.fieldsInv[fieldID] + l.field = i.postings.sb.fieldsInv[fieldID] l.pos = pos l.start = start l.end = end @@ -221,9 +221,9 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return nil, nil } n := i.actual.Next() - nChunk := n / i.postings.dictionary.segment.chunkFactor + nChunk := n / i.postings.sb.chunkFactor allN := i.all.Next() - allNChunk := allN / i.postings.dictionary.segment.chunkFactor + allNChunk := allN / i.postings.sb.chunkFactor // n is the next actual hit (excluding some postings) // allN is the next hit in the full postings diff --git a/index/scorch/segment/zap/read.go b/index/scorch/segment/zap/read.go index c9b3e7720..0c5b9e17f 100644 --- a/index/scorch/segment/zap/read.go +++ b/index/scorch/segment/zap/read.go @@ -16,16 +16,16 @@ package zap import "encoding/binary" -func (s *Segment) getStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { +func (s *SegmentBase) 
getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { docStoredStartAddr := s.storedIndexOffset + (8 * docNum) - docStoredStart := binary.BigEndian.Uint64(s.mm[docStoredStartAddr : docStoredStartAddr+8]) + docStoredStart := binary.BigEndian.Uint64(s.mem[docStoredStartAddr : docStoredStartAddr+8]) var n uint64 - metaLen, read := binary.Uvarint(s.mm[docStoredStart : docStoredStart+binary.MaxVarintLen64]) + metaLen, read := binary.Uvarint(s.mem[docStoredStart : docStoredStart+binary.MaxVarintLen64]) n += uint64(read) var dataLen uint64 - dataLen, read = binary.Uvarint(s.mm[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) + dataLen, read = binary.Uvarint(s.mem[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) n += uint64(read) - meta := s.mm[docStoredStart+n : docStoredStart+n+metaLen] - data := s.mm[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] + meta := s.mem[docStoredStart+n : docStoredStart+n+metaLen] + data := s.mem[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] return meta, data } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 60cc034b9..df71f1d62 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -44,12 +44,15 @@ func Open(path string) (segment.Segment, error) { } rv := &Segment{ - f: f, - mm: mm, - path: path, - fieldsMap: make(map[string]uint16), - fieldDvIterMap: make(map[uint16]*docValueIterator), - refs: 1, + SegmentBase: SegmentBase{ + mem: mm[0 : len(mm)-FooterSize], + fieldsMap: make(map[string]uint16), + fieldDvIterMap: make(map[uint16]*docValueIterator), + }, + f: f, + mm: mm, + path: path, + refs: 1, } err = rv.loadConfig() @@ -73,24 +76,36 @@ func Open(path string) (segment.Segment, error) { return rv, nil } -// Segment implements the segment.Segment inteface over top the zap file format -type Segment struct { - f *os.File - mm mmap.MMap - path string - crc uint32 - version uint32 +// SegmentBase is a 
memory only, read-only implementation of the +// segment.Segment interface, using zap's data representation. +type SegmentBase struct { + mem []byte + memCRC uint32 chunkFactor uint32 + fieldsMap map[string]uint16 // fieldName -> fieldID+1 + fieldsInv []string // fieldID -> fieldName numDocs uint64 storedIndexOffset uint64 fieldsIndexOffset uint64 + docValueOffset uint64 + dictLocs []uint64 + fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field +} - fieldsMap map[string]uint16 - fieldsInv []string - fieldsOffsets []uint64 +func (sb *SegmentBase) AddRef() {} +func (sb *SegmentBase) DecRef() (err error) { return nil } +func (sb *SegmentBase) Close() (err error) { return nil } + +// Segment implements a persisted segment.Segment interface, by +// embedding an mmap()'ed SegmentBase. +type Segment struct { + SegmentBase - docValueOffset uint64 - fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field + f *os.File + mm mmap.MMap + path string + version uint32 + crc uint32 m sync.Mutex // Protects the fields that follow. 
refs int64 @@ -98,17 +113,29 @@ type Segment struct { func (s *Segment) SizeInBytes() uint64 { // 8 /* size of file pointer */ - // 4 /* size of crc -> uint32 */ // 4 /* size of version -> uint32 */ + // 4 /* size of crc -> uint32 */ + sizeOfUints := 16 + + sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + + // mutex, refs -> int64 + sizeInBytes += 16 + + // do not include the mmap'ed part + return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) +} + +func (s *SegmentBase) SizeInBytes() uint64 { + // 4 /* size of memCRC -> uint32 */ // 4 /* size of chunkFactor -> uint32 */ // 8 /* size of numDocs -> uint64 */ // 8 /* size of storedIndexOffset -> uint64 */ // 8 /* size of fieldsIndexOffset -> uint64 */ // 8 /* size of docValueOffset -> uint64 */ - sizeOfUints := 52 + sizeInBytes := 40 - // Do not include the mmap'ed part - sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) // fieldsMap for k, _ := range s.fieldsMap { @@ -116,12 +143,12 @@ func (s *Segment) SizeInBytes() uint64 { } sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ - // fieldsInv, fieldsOffsets + // fieldsInv, dictLocs for _, entry := range s.fieldsInv { sizeInBytes += (len(entry) + int(segment.SizeOfString)) } - sizeInBytes += len(s.fieldsOffsets) * 8 /* size of uint64 */ - sizeInBytes += int(segment.SizeOfSlice) * 2 /* overhead from slices */ + sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ + sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ // fieldDvIterMap sizeInBytes += len(s.fieldDvIterMap) * @@ -133,9 +160,6 @@ func (s *Segment) SizeInBytes() uint64 { } sizeInBytes += int(segment.SizeOfMap) - // mutex, refs -> int64 - sizeInBytes += 16 - return uint64(sizeInBytes) } @@ -158,49 +182,50 @@ func (s *Segment) DecRef() (err error) { func (s *Segment) loadConfig() error { crcOffset := len(s.mm) - 4 s.crc = 
binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) + verOffset := crcOffset - 4 s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) if s.version != version { return fmt.Errorf("unsupported version %d", s.version) } + chunkOffset := verOffset - 4 s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) docValueOffset := chunkOffset - 8 s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) - fieldsOffset := docValueOffset - 8 - s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsOffset : fieldsOffset+8]) + fieldsIndexOffset := docValueOffset - 8 + s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) - storedOffset := fieldsOffset - 8 - s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedOffset : storedOffset+8]) + storedIndexOffset := fieldsIndexOffset - 8 + s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) - docNumOffset := storedOffset - 8 - s.numDocs = binary.BigEndian.Uint64(s.mm[docNumOffset : docNumOffset+8]) + numDocsOffset := storedIndexOffset - 8 + s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) return nil - } -func (s *Segment) loadFields() error { - // NOTE for now we assume the fields index immediately preceeds the footer - // if this changes, need to adjust accordingly (or store explicit length) - fieldsIndexEnd := uint64(len(s.mm) - FooterSize) +func (s *SegmentBase) loadFields() error { + // NOTE for now we assume the fields index immediately preceeds + // the footer, and if this changes, need to adjust accordingly (or + // store explicit length), where s.mem was sliced from s.mm in Open(). 
+ fieldsIndexEnd := uint64(len(s.mem)) // iterate through fields index var fieldID uint64 for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { - addr := binary.BigEndian.Uint64(s.mm[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) - var n uint64 + addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) - dictLoc, read := binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) - n += uint64(read) - s.fieldsOffsets = append(s.fieldsOffsets, dictLoc) + dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) + n := uint64(read) + s.dictLocs = append(s.dictLocs, dictLoc) var nameLen uint64 - nameLen, read = binary.Uvarint(s.mm[addr+n : fieldsIndexEnd]) + nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) n += uint64(read) - name := string(s.mm[addr+n : addr+n+nameLen]) + name := string(s.mem[addr+n : addr+n+nameLen]) s.fieldsInv = append(s.fieldsInv, name) s.fieldsMap[name] = uint16(fieldID + 1) @@ -210,7 +235,7 @@ func (s *Segment) loadFields() error { } // Dictionary returns the term dictionary for the specified field -func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { +func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { dict, err := s.dictionary(field) if err == nil && dict == nil { return &segment.EmptyDictionary{}, nil @@ -218,21 +243,20 @@ func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { return dict, err } -func (s *Segment) dictionary(field string) (rv *Dictionary, err error) { - rv = &Dictionary{ - segment: s, - field: field, - } - - rv.fieldID = s.fieldsMap[field] - if rv.fieldID > 0 { - rv.fieldID = rv.fieldID - 1 +func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { + fieldIDPlus1 := sb.fieldsMap[field] + if fieldIDPlus1 > 0 { + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } - dictStart := s.fieldsOffsets[rv.fieldID] + dictStart := 
sb.dictLocs[rv.fieldID] if dictStart > 0 { // read the length of the vellum data - vellumLen, read := binary.Uvarint(s.mm[dictStart : dictStart+binary.MaxVarintLen64]) - fstBytes := s.mm[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] if fstBytes != nil { rv.fst, err = vellum.Load(fstBytes) if err != nil { @@ -240,9 +264,6 @@ func (s *Segment) dictionary(field string) (rv *Dictionary, err error) { } } } - - } else { - return nil, nil } return rv, nil @@ -250,10 +271,10 @@ func (s *Segment) dictionary(field string) (rv *Dictionary, err error) { // VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number -func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { +func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { // first make sure this is a valid number in this segment if num < s.numDocs { - meta, compressed := s.getStoredMetaAndCompressed(num) + meta, compressed := s.getDocStoredMetaAndCompressed(num) uncompressed, err := snappy.Decode(nil, compressed) if err != nil { return err @@ -307,13 +328,13 @@ func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVi } // Count returns the number of documents in this segment. 
-func (s *Segment) Count() uint64 { +func (s *SegmentBase) Count() uint64 { return s.numDocs } // DocNumbers returns a bitset corresponding to the doc numbers of all the // provided _id strings -func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { +func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { rv := roaring.New() if len(s.fieldsMap) > 0 { @@ -337,7 +358,7 @@ func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { } // Fields returns the field names used in this segment -func (s *Segment) Fields() []string { +func (s *SegmentBase) Fields() []string { return s.fieldsInv } @@ -411,23 +432,22 @@ func (s *Segment) NumDocs() uint64 { // DictAddr is a helper function to compute the file offset where the // dictionary is stored for the specified field. func (s *Segment) DictAddr(field string) (uint64, error) { - var fieldID uint16 - var ok bool - if fieldID, ok = s.fieldsMap[field]; !ok { + fieldIDPlus1, ok := s.fieldsMap[field] + if !ok { return 0, fmt.Errorf("no such field '%s'", field) } - return s.fieldsOffsets[fieldID-1], nil + return s.dictLocs[fieldIDPlus1-1], nil } -func (s *Segment) loadDvIterators() error { +func (s *SegmentBase) loadDvIterators() error { if s.docValueOffset == fieldNotUninverted { return nil } var read uint64 for fieldID, field := range s.fieldsInv { - fieldLoc, n := binary.Uvarint(s.mm[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) if n <= 0 { return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) } diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index cfb7e46e9..246710743 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -53,12 +53,11 @@ func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { func persistFields(fieldsInv 
[]string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { var rv uint64 + var fieldsOffsets []uint64 - var fieldStarts []uint64 for fieldID, fieldName := range fieldsInv { - // record start of this field - fieldStarts = append(fieldStarts, uint64(w.Count())) + fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) // write out the dict location and field name length _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) @@ -76,7 +75,7 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u // now write out the fields index rv = uint64(w.Count()) for fieldID := range fieldsInv { - err := binary.Write(w, binary.BigEndian, fieldStarts[fieldID]) + err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) if err != nil { return 0, err } @@ -89,8 +88,11 @@ func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (u // crc + ver + chunk + field offset + stored offset + num docs + docValueOffset const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 -func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset uint64, - chunkFactor uint32, w *CountHashWriter) error { +func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + chunkFactor uint32, crcBeforeFooter uint32, writerIn io.Writer) error { + w := NewCountHashWriter(writerIn) + w.crc = crcBeforeFooter + // write out the number of docs err := binary.Write(w, binary.BigEndian, numDocs) if err != nil { @@ -102,7 +104,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset return err } // write out the field index location - err = binary.Write(w, binary.BigEndian, fieldIndexOffset) + err = binary.Write(w, binary.BigEndian, fieldsIndexOffset) if err != nil { return err } @@ -122,7 +124,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldIndexOffset, docValueOffset return err } // write out CRC-32 of everything upto but not including this CRC - err = binary.Write(w, 
binary.BigEndian, w.Sum32()) + err = binary.Write(w, binary.BigEndian, w.crc) if err != nil { return err } From 37121c3b4927a497b62d69d752520a2b4a395e29 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 21 Jan 2018 10:53:58 -0800 Subject: [PATCH 154/728] scorch zap writeRoaringWithLen optimized with reused bufs --- index/scorch/segment/zap/build.go | 8 ++++++-- index/scorch/segment/zap/merge.go | 5 +++-- index/scorch/segment/zap/write.go | 19 +++++++++---------- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index c9361cbd4..769c07958 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -421,11 +421,13 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { rv = make([]uint64, 0, len(memSegment.PostingsLocs)) + var reuseBuf bytes.Buffer + reuseBufVarint := make([]byte, binary.MaxVarintLen64) for postingID := range memSegment.PostingsLocs { // record where we start this posting loc rv = append(rv, uint64(w.Count())) // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w) + _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint) if err != nil { return nil, err } @@ -436,6 +438,8 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { rv = make([]uint64, 0, len(memSegment.Postings)) + var reuseBuf bytes.Buffer + reuseBufVarint := make([]byte, binary.MaxVarintLen64) for postingID := range memSegment.Postings { // record where we start this posting list rv = append(rv, uint64(w.Count())) @@ -448,7 +452,7 @@ func persistPostingsLists(memSegment *mem.Segment, w 
*CountHashWriter, } // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.Postings[postingID], w) + _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint) if err != nil { return nil, err } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 07b9bafcc..f8ca142f1 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -129,6 +129,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, uint64, error) { + var bufReuse bytes.Buffer var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 @@ -258,7 +259,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, return nil, 0, err } postingLocOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(newRoaringLocs, w) + _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) if err != nil { return nil, 0, err } @@ -284,7 +285,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if err != nil { return nil, 0, err } - _, err = writeRoaringWithLen(newRoaring, w) + _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) if err != nil { return nil, 0, err } diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index 246710743..c5316a99f 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -23,31 +23,30 @@ import ( ) // writes out the length of the roaring bitmap in bytes as varint -// then writs out the roaring bitmap itself -func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer) (int, error) { - var buffer bytes.Buffer +// then writes out the roaring bitmap itself +func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, + reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { + reuseBuf.Reset() + // write out postings list to 
memory so we know the len - postingsListLen, err := r.WriteTo(&buffer) + postingsListLen, err := r.WriteTo(reuseBuf) if err != nil { return 0, err } var tw int // write out the length of this postings list - buf := make([]byte, binary.MaxVarintLen64) - n := binary.PutUvarint(buf, uint64(postingsListLen)) - nw, err := w.Write(buf[:n]) + n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) + nw, err := w.Write(reuseBufVarint[:n]) tw += nw if err != nil { return tw, err } - // write out the postings list itself - nw, err = w.Write(buffer.Bytes()) + nw, err = w.Write(reuseBuf.Bytes()) tw += nw if err != nil { return tw, err } - return tw, nil } From 603425c2c585c103a69691fa0d2d399bc2ff282a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 24 Jan 2018 08:39:50 -0800 Subject: [PATCH 155/728] scorch zap mergerLoop missing fireAsyncError case --- index/scorch/merge.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 78c27ddb1..cdf0b40b2 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -54,7 +54,6 @@ OUTER: lastEpochMergePlanned = ourSnapshot.epoch s.fireEvent(EventKindMergerProgress, time.Since(startTime)) - } _ = ourSnapshot.DecRef() @@ -81,6 +80,7 @@ OUTER: // lets get started err := s.planMergeAtSnapshot(ourSnapshot) if err != nil { + s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() continue OUTER } From 29d526a7c22f91a8c45adfe9c8e5141a373fed2d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 24 Jan 2018 09:13:16 -0800 Subject: [PATCH 156/728] scorch zap merge uses DefaultChunkFactor --- index/scorch/merge.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index cdf0b40b2..5ded29b5a 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -141,7 +141,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { filename := zapFileName(newSegmentID) 
s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename - newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor) if err != nil { s.unmarkIneligibleForRemoval(filename) return fmt.Errorf("merging failed: %v", err) From d389e2bb40dc3a9c2feabb7e1703e6b48c35f1c9 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 24 Jan 2018 09:22:10 -0800 Subject: [PATCH 157/728] scorch zap merge file cleanup on error, and some minor prealloc's --- index/scorch/segment/zap/merge.go | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index f8ca142f1..d19f3938d 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -29,10 +29,10 @@ import ( "github.com/golang/snappy" ) -// Merge takes a slice of zap segments, bit masks describing which documents -// from the may be dropped, and creates a new segment containing the remaining -// data. This new segment is built at the specified path, with the provided -// chunkFactor. +// Merge takes a slice of zap segments and bit masks describing which +// documents may be dropped, and creates a new segment containing the +// remaining data. This new segment is built at the specified path, +// with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, chunkFactor uint32) ([][]uint64, error) { flag := os.O_RDWR | os.O_CREATE @@ -42,6 +42,11 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return nil, err } + cleanup := func() { + _ = f.Close() + _ = os.Remove(path) + } + // buffer the output br := bufio.NewWriter(f) @@ -50,22 +55,25 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, fieldsInv := mergeFields(segments) fieldsMap := mapFields(fieldsInv) - newSegDocCount := computeNewDocCount(segments, drops) var newDocNums [][]uint64 var storedIndexOffset uint64 fieldDvLocsOffset := uint64(fieldNotUninverted) var dictLocs []uint64 + + newSegDocCount := computeNewDocCount(segments, drops) if newSegDocCount > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, fieldsMap, fieldsInv, newSegDocCount, cr) if err != nil { + cleanup() return nil, err } dictLocs, fieldDvLocsOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, newDocNums, newSegDocCount, chunkFactor, cr) if err != nil { + cleanup() return nil, err } } else { @@ -74,27 +82,32 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, fieldsIndexOffset, err := persistFields(fieldsInv, cr, dictLocs) if err != nil { + cleanup() return nil, err } err = persistFooter(newSegDocCount, storedIndexOffset, fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr.Sum32(), cr) if err != nil { + cleanup() return nil, err } err = br.Flush() if err != nil { + cleanup() return nil, err } err = f.Sync() if err != nil { + cleanup() return nil, err } err = f.Close() if err != nil { + cleanup() return nil, err } @@ -103,7 +116,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, // mapFields takes the fieldsInv list and builds the map func mapFields(fields []string) map[string]uint16 { - rv := make(map[string]uint16) + rv := make(map[string]uint16, len(fields)) for i, fieldName := range 
fields { rv[fieldName] = uint16(i) } @@ -327,7 +340,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, rv1[fieldID] = dictOffset // update the doc value - var docNumbers docIDRange + docNumbers := make(docIDRange, 0, len(docTermMap)) for k := range docTermMap { docNumbers = append(docNumbers, k) } @@ -353,7 +366,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } fieldDvLocsOffset = uint64(w.Count()) - buf := make([]byte, binary.MaxVarintLen64) + buf := bufMaxVarintLen64 for _, offset := range fieldDvLocs { n := binary.PutUvarint(buf, uint64(offset)) _, err := w.Write(buf[:n]) From 6a17ff48c71f265da517e1db209e889338f9bb96 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 09:37:57 -0800 Subject: [PATCH 158/728] scorch zap removed uneeded []byte cast of term --- index/scorch/segment/zap/merge.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index d19f3938d..1acbbf5d7 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -247,7 +247,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } } - docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], []byte(term)...) + docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], term...) docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], termSeparator) next, err2 = postItr.Next() } From 10dd5489c21a47a6a3138c25a531925961000acf Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 09:42:56 -0800 Subject: [PATCH 159/728] scorch zap Dict.postingsList() takes []byte for more mem control This allows callers that already have a []byte term to avoid string'ification garbage. 
--- index/scorch/segment/zap/dict.go | 7 +++---- index/scorch/segment/zap/merge.go | 2 +- index/scorch/segment/zap/posting.go | 2 +- index/scorch/segment/zap/segment.go | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 284bc1898..bb6fd9478 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -35,10 +35,10 @@ type Dictionary struct { // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return d.postingsList(term, except) + return d.postingsList([]byte(term), except) } -func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*PostingsList, error) { +func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap) (*PostingsList, error) { rv := &PostingsList{ sb: d.sb, term: term, @@ -46,7 +46,7 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting } if d.fst != nil { - postingsOffset, exists, err := d.fst.Get([]byte(term)) + postingsOffset, exists, err := d.fst.Get(term) if err != nil { return nil, fmt.Errorf("vellum err: %v", err) } @@ -96,7 +96,6 @@ func (d *Dictionary) postingsList(term string, except *roaring.Bitmap) (*Posting // Iterator returns an iterator for this dictionary func (d *Dictionary) Iterator() segment.DictionaryIterator { - rv := &DictionaryIterator{ d: d, } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1acbbf5d7..c1a1a38aa 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -206,7 +206,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if dict == nil { continue } - postings, err2 := dict.postingsList(string(term), drops[dictI]) + postings, err2 := dict.postingsList(term, drops[dictI]) if err2 != nil { return nil, 0, err2 } diff --git 
a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 95f350130..e8533a12a 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -28,7 +28,7 @@ import ( // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase - term string + term []byte postingsOffset uint64 freqOffset uint64 locOffset uint64 diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index df71f1d62..94268cace 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -344,7 +344,7 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } for _, id := range ids { - postings, err := idDict.postingsList(id, nil) + postings, err := idDict.postingsList([]byte(id), nil) if err != nil { return nil, err } From 9038d75c98e8597bfd0d0c5e992b198e3709b89f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 09:56:56 -0800 Subject: [PATCH 160/728] scorch zap allocate govarint.U64Base128Encoder just once Instead of allocating a govarint.U64Base128Encoder in the inner loop, allocate it just once on the outside, as it appears that it's just a thin wrapper around binary.PutUvarint(). 
--- index/scorch/segment/zap/merge.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index c1a1a38aa..f7e4be3d8 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -390,6 +390,8 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, var metaBuf bytes.Buffer var data, compressed []byte + metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + vals := make([][][]byte, len(fieldsInv)) typs := make([][]byte, len(fieldsInv)) poss := make([][][]uint64, len(fieldsInv)) @@ -407,8 +409,6 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, compressed = compressed[:0] curr = 0 - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) - if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { segNewDocNums = append(segNewDocNums, docDropped) } else { From 3030d4edb5d64dba92d2a1be291588f266005f48 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:05:12 -0800 Subject: [PATCH 161/728] scorch zap merge preallocs segNewDocNums capacity --- index/scorch/segment/zap/merge.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index f7e4be3d8..2c290bba5 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -366,6 +366,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } fieldDvLocsOffset = uint64(w.Count()) + buf := bufMaxVarintLen64 for _, offset := range fieldDvLocs { n := binary.PutUvarint(buf, uint64(offset)) @@ -400,14 +401,14 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // for each segment for segI, segment := range segments { - var segNewDocNums []uint64 + segNewDocNums := make([]uint64, 0, segment.numDocs) // for each doc num for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + curr = 0 
metaBuf.Reset() data = data[:0] compressed = compressed[:0] - curr = 0 if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { segNewDocNums = append(segNewDocNums, docDropped) @@ -481,7 +482,9 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, metaEncoder.Close() metaBytes := metaBuf.Bytes() + compressed = snappy.Encode(compressed, data) + // record where we're about to start writing docNumOffsets[newDocNum] = uint64(w.Count()) @@ -510,6 +513,7 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // return value is the start of the stored index offset := uint64(w.Count()) + // now write out the stored doc index for docNum := range docNumOffsets { err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) @@ -524,13 +528,13 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // mergeFields builds a unified list of fields used across all the input segments func mergeFields(segments []*Segment) []string { fieldsMap := map[string]struct{}{} - for _, segment := range segments { fields := segment.Fields() for _, field := range fields { fieldsMap[field] = struct{}{} } } + rv := make([]string, 0, len(fieldsMap)) // ensure _id stays first rv = append(rv, "_id") @@ -539,6 +543,5 @@ func mergeFields(segments []*Segment) []string { rv = append(rv, k) } } - return rv } From 56cdb68f35624c8075d1268b68bc788406de4461 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:18:40 -0800 Subject: [PATCH 162/728] scorch zap merge checks err2 not err Also, optimize the appending of the termSeparator so that the docTermMap is accessed and updated just once. 
--- index/scorch/segment/zap/merge.go | 25 ++++++++++++++----------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 2c290bba5..5e7dae1f8 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -146,7 +146,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 - rv1 := make([]uint64, len(fieldsInv)) + rv := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocsOffset := uint64(fieldNotUninverted) @@ -191,12 +191,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) + docTermMap := make(map[uint64][]byte, 0) for err == nil { term, _ := mergeItr.Current() newRoaring := roaring.NewBitmap() newRoaringLocs := roaring.NewBitmap() + tfEncoder.Reset() locEncoder.Reset() @@ -222,9 +224,9 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, // encode norm bits norm := next.Norm() normBits := math.Float32bits(float32(norm)) - err3 := tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) - if err3 != nil { - return nil, 0, err3 + err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) + if err != nil { + return nil, 0, err } locs := next.Locations() if len(locs) > 0 { @@ -247,15 +249,16 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } } - docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], term...) 
- docTermMap[hitNewDocNum] = append(docTermMap[hitNewDocNum], termSeparator) + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + next, err2 = postItr.Next() } - if err != nil { - return nil, 0, err + if err2 != nil { + return nil, 0, err2 } - } + tfEncoder.Close() locEncoder.Close() @@ -337,7 +340,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, return nil, 0, err } - rv1[fieldID] = dictOffset + rv[fieldID] = dictOffset // update the doc value docNumbers := make(docIDRange, 0, len(docTermMap)) @@ -376,7 +379,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } } - return rv1, fieldDvLocsOffset, nil + return rv, fieldDvLocsOffset, nil } const docDropped = math.MaxUint64 From 916bbf41257935f949cdbd80f95f327fbea9596b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:33:38 -0800 Subject: [PATCH 163/728] scorch zap merge prealloc's docTermMap capacity --- index/scorch/segment/zap/merge.go | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 5e7dae1f8..4f0333bb4 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -151,6 +151,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, fieldDvLocsOffset := uint64(fieldNotUninverted) var vellumBuf bytes.Buffer + // for each field for fieldID, fieldName := range fieldsInv { if fieldID != 0 { @@ -190,9 +191,9 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) - docTermMap := make(map[uint64][]byte, 0) + docTermMap := make(map[uint64][]byte, newSegDocCount) + for err == nil { term, _ := mergeItr.Current() @@ -319,6 
+320,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, } dictOffset := uint64(w.Count()) + err = newVellum.Close() if err != nil { return nil, 0, err @@ -326,10 +328,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, vellumData := vellumBuf.Bytes() // write out the length of the vellum data - buf := bufMaxVarintLen64 - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(len(vellumData))) - _, err = w.Write(buf[:n]) + n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) + _, err = w.Write(bufMaxVarintLen64[:n]) if err != nil { return nil, 0, err } @@ -342,25 +342,28 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, rv[fieldID] = dictOffset - // update the doc value + // update the doc nums docNumbers := make(docIDRange, 0, len(docTermMap)) for k := range docTermMap { docNumbers = append(docNumbers, k) } sort.Sort(docNumbers) + fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) for _, docNum := range docNumbers { err = fdvEncoder.Add(docNum, docTermMap[docNum]) if err != nil { return nil, 0, err } } - // get the field doc value offset - fieldDvLocs[fieldID] = uint64(w.Count()) err = fdvEncoder.Close() if err != nil { return nil, 0, err } + + // get the field doc value offset + fieldDvLocs[fieldID] = uint64(w.Count()) + // persist the doc value details for this field _, err = fdvEncoder.Write(w) if err != nil { From 6985db13a0f626de144ff5ae5f0cbee183d47ee2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:39:02 -0800 Subject: [PATCH 164/728] scorch zap merge reuses docNumbers array --- index/scorch/segment/zap/merge.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 4f0333bb4..9156675de 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -150,6 +150,8 @@ func persistMergedRest(segments 
[]*Segment, drops []*roaring.Bitmap, fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocsOffset := uint64(fieldNotUninverted) + var docNumbers docIDRange + var vellumBuf bytes.Buffer // for each field @@ -343,7 +345,10 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, rv[fieldID] = dictOffset // update the doc nums - docNumbers := make(docIDRange, 0, len(docTermMap)) + if cap(docNumbers) < len(docTermMap) { + docNumbers = make(docIDRange, 0, len(docTermMap)) + } + docNumbers = docNumbers[:0] for k := range docTermMap { docNumbers = append(docNumbers, k) } From 0041664bc412ed5d93110de5c6648c573557e82a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:45:23 -0800 Subject: [PATCH 165/728] scorch zap merge computeNewDocCount() optimize 1 variable --- index/scorch/segment/zap/merge.go | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 9156675de..a239740cf 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -126,15 +126,14 @@ func mapFields(fields []string) map[string]uint16 { // computeNewDocCount determines how many documents will be in the newly // merged segment when obsoleted docs are dropped func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { - var newSegDocCount uint64 + var newDocCount uint64 for segI, segment := range segments { - segIAfterDrop := segment.NumDocs() + newDocCount += segment.NumDocs() if drops[segI] != nil { - segIAfterDrop -= drops[segI].GetCardinality() + newDocCount -= drops[segI].GetCardinality() } - newSegDocCount += segIAfterDrop } - return newSegDocCount + return newDocCount } func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, From 8dd17a3b2035cd7a82b7b629e06ca821d36a0688 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 10:55:29 -0800 Subject: [PATCH 166/728] scorch zap mergeStoredAndRemap uses continue 
for less indentation --- index/scorch/segment/zap/merge.go | 171 +++++++++++++++--------------- 1 file changed, 86 insertions(+), 85 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index a239740cf..db99dc607 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -394,7 +394,7 @@ const docDropped = math.MaxUint64 func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { - var rv [][]uint64 + var rv [][]uint64 // The remapped or newDocNums for each segment. var newDocNum int var curr int @@ -415,109 +415,110 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // for each doc num for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { + segNewDocNums = append(segNewDocNums, docDropped) + continue + } + + segNewDocNums = append(segNewDocNums, uint64(newDocNum)) + curr = 0 metaBuf.Reset() data = data[:0] compressed = compressed[:0] - if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { - segNewDocNums = append(segNewDocNums, docDropped) - } else { - segNewDocNums = append(segNewDocNums, uint64(newDocNum)) - // collect all the data - for i := 0; i < len(fieldsInv); i++ { - vals[i] = vals[i][:0] - typs[i] = typs[i][:0] - poss[i] = poss[i][:0] - } - err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { - fieldID := int(fieldsMap[field]) - vals[fieldID] = append(vals[fieldID], value) - typs[fieldID] = append(typs[fieldID], typ) - poss[fieldID] = append(poss[fieldID], pos) - return true - }) - if err != nil { - return 0, nil, err - } - - // now walk the fields in order - for fieldID := range fieldsInv { + // collect all the data + for i := 0; i < len(fieldsInv); i++ { + vals[i] = vals[i][:0] + typs[i] = 
typs[i][:0] + poss[i] = poss[i][:0] + } + err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + fieldID := int(fieldsMap[field]) + vals[fieldID] = append(vals[fieldID], value) + typs[fieldID] = append(typs[fieldID], typ) + poss[fieldID] = append(poss[fieldID], pos) + return true + }) + if err != nil { + return 0, nil, err + } - storedFieldValues := vals[int(fieldID)] + // now walk the fields in order + for fieldID := range fieldsInv { + storedFieldValues := vals[int(fieldID)] - // has stored values for this field - num := len(storedFieldValues) + // has stored values for this field + num := len(storedFieldValues) - // process each value - for i := 0; i < num; i++ { - // encode field - _, err2 := metaEncoder.PutU64(uint64(fieldID)) - if err2 != nil { - return 0, nil, err2 - } - // encode type - _, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i])) - if err2 != nil { - return 0, nil, err2 - } - // encode start offset - _, err2 = metaEncoder.PutU64(uint64(curr)) - if err2 != nil { - return 0, nil, err2 - } - // end len - _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) - if err2 != nil { - return 0, nil, err2 - } - // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i]))) + // process each value + for i := 0; i < num; i++ { + // encode field + _, err2 := metaEncoder.PutU64(uint64(fieldID)) + if err2 != nil { + return 0, nil, err2 + } + // encode type + _, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i])) + if err2 != nil { + return 0, nil, err2 + } + // encode start offset + _, err2 = metaEncoder.PutU64(uint64(curr)) + if err2 != nil { + return 0, nil, err2 + } + // end len + _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if err2 != nil { + return 0, nil, err2 + } + // encode number of array pos + _, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i]))) + if err2 != nil { + return 0, nil, err2 + } + // encode all array 
positions + for j := 0; j < len(poss[int(fieldID)][i]); j++ { + _, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j]) if err2 != nil { return 0, nil, err2 } - // encode all array positions - for j := 0; j < len(poss[int(fieldID)][i]); j++ { - _, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j]) - if err2 != nil { - return 0, nil, err2 - } - } - // append data - data = append(data, storedFieldValues[i]...) - // update curr - curr += len(storedFieldValues[i]) } + // append data + data = append(data, storedFieldValues[i]...) + // update curr + curr += len(storedFieldValues[i]) } + } - metaEncoder.Close() - metaBytes := metaBuf.Bytes() - - compressed = snappy.Encode(compressed, data) + metaEncoder.Close() + metaBytes := metaBuf.Bytes() - // record where we're about to start writing - docNumOffsets[newDocNum] = uint64(w.Count()) + compressed = snappy.Encode(compressed, data) - // write out the meta len and compressed data len - _, err = writeUvarints(w, - uint64(len(metaBytes)), uint64(len(compressed))) - if err != nil { - return 0, nil, err - } - // now write the meta - _, err = w.Write(metaBytes) - if err != nil { - return 0, nil, err - } - // now write the compressed data - _, err = w.Write(compressed) - if err != nil { - return 0, nil, err - } + // record where we're about to start writing + docNumOffsets[newDocNum] = uint64(w.Count()) - newDocNum++ + // write out the meta len and compressed data len + _, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) + if err != nil { + return 0, nil, err } + // now write the meta + _, err = w.Write(metaBytes) + if err != nil { + return 0, nil, err + } + // now write the compressed data + _, err = w.Write(compressed) + if err != nil { + return 0, nil, err + } + + newDocNum++ } + rv = append(rv, segNewDocNums) } From 745575a6c13b327bc636c91d5281d05512993403 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 11:02:53 -0800 Subject: [PATCH 167/728] scorch zap mergeStoredAndRemap uses array 
indexing, not append() Since we have right array size preallocated, we don't need the extra capacity checking of append(). --- index/scorch/segment/zap/merge.go | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index db99dc607..8c06f2fed 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -395,7 +395,8 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { var rv [][]uint64 // The remapped or newDocNums for each segment. - var newDocNum int + + var newDocNum uint64 var curr int var metaBuf bytes.Buffer @@ -411,16 +412,17 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // for each segment for segI, segment := range segments { - segNewDocNums := make([]uint64, 0, segment.numDocs) + segNewDocNums := make([]uint64, segment.numDocs) // for each doc num for docNum := uint64(0); docNum < segment.numDocs; docNum++ { + // TODO: roaring's API limits docNums to 32-bits? if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { - segNewDocNums = append(segNewDocNums, docDropped) + segNewDocNums[docNum] = docDropped continue } - segNewDocNums = append(segNewDocNums, uint64(newDocNum)) + segNewDocNums[docNum] = newDocNum curr = 0 metaBuf.Reset() From a444c25ddf2101071f27fc1acacf90354023ada7 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 13:23:33 -0800 Subject: [PATCH 168/728] scorch zap merge uses array for docTermMap with no sorting Instead of sorting docNum keys from a hashmap, this change instead iterates from docNum 0 to N and uses an array instead of hashmap. The array is also reused across outer loop iterations. This optimizes for when there's a lot of structural similarity between docs, where many/most docs have the same fields. i.e., beers, breweries. 
If every doc has completely different fields, then this change might produce worse behavior compared to the previous sparse hashmap approach. --- index/scorch/segment/zap/merge.go | 37 +++++++++++++++++-------------- 1 file changed, 20 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 8c06f2fed..ed9c7f98b 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -21,7 +21,6 @@ import ( "fmt" "math" "os" - "sort" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -149,7 +148,11 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocsOffset := uint64(fieldNotUninverted) - var docNumbers docIDRange + // docTermMap is keyed by docNum, where the array impl provides + // better memory usage behavior than a sparse-friendlier hashmap + // for when docs have much structural similarity (i.e., every doc + // has a given field) + var docTermMap [][]byte var vellumBuf bytes.Buffer @@ -193,7 +196,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - docTermMap := make(map[uint64][]byte, newSegDocCount) + if uint64(cap(docTermMap)) < newSegDocCount { + docTermMap = make([][]byte, newSegDocCount) + } else { + docTermMap = docTermMap[0:newSegDocCount] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } for err == nil { term, _ := mergeItr.Current() @@ -343,21 +353,14 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, rv[fieldID] = dictOffset - // update the doc nums - if cap(docNumbers) < len(docTermMap) { - docNumbers = make(docIDRange, 0, len(docTermMap)) - } - docNumbers = docNumbers[:0] - for k := range docTermMap { - docNumbers = 
append(docNumbers, k) - } - sort.Sort(docNumbers) - + // update the field doc values fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) - for _, docNum := range docNumbers { - err = fdvEncoder.Add(docNum, docTermMap[docNum]) - if err != nil { - return nil, 0, err + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + if err != nil { + return nil, 0, err + } } } err = fdvEncoder.Close() From 634cfa05606be02b4e532ff449f2c0d9b8d2aac1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 27 Jan 2018 14:38:04 -0800 Subject: [PATCH 169/728] scorch zap chunkedIntCoder optimization to prealloc some final buf --- index/scorch/segment/zap/build.go | 5 +---- index/scorch/segment/zap/intcoder.go | 1 + index/scorch/segment/zap/merge.go | 12 ++++++------ 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 769c07958..58f9faeaf 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -368,7 +368,6 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // put pos - err = locEncoder.Add(docNum, locpos[locOffset]) if err != nil { return nil, nil, err @@ -386,10 +385,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac return nil, nil, err } - // put array positions - num := len(locarraypos[locOffset]) - // put the number of array positions to follow + num := len(locarraypos[locOffset]) err = locEncoder.Add(docNum, uint64(num)) if err != nil { return nil, nil, err diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 7e268bcf3..e9f295023 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -41,6 +41,7 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { chunkSize: chunkSize, maxDocNum: maxDocNum, 
chunkLens: make([]uint64, total), + final: make([]byte, 0, 64), } rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index ed9c7f98b..cc348d720 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -247,12 +247,12 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if cap(bufLoc) < 5+len(loc.ArrayPositions()) { bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) } - args := bufLoc[0:0] - args = append(args, uint64(fieldsMap[loc.Field()])) - args = append(args, loc.Pos()) - args = append(args, loc.Start()) - args = append(args, loc.End()) - args = append(args, uint64(len(loc.ArrayPositions()))) + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()]) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(loc.ArrayPositions())) args = append(args, loc.ArrayPositions()...) err = locEncoder.Add(hitNewDocNum, args...) if err != nil { From 6451c8c37f30ee83e9cb45bb948ee8ada1df55ba Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 29 Jan 2018 17:09:18 -0800 Subject: [PATCH 170/728] MB-26396: Handling documents with geopoints in slice format + The issue lies with parsing documents containing a geopoint in slice format - which wasn't handled. + Unit test that verifies the fix. 
--- mapping/document.go | 2 +- mapping/mapping_test.go | 66 +++++++++++++++++++++++++++++------------ 2 files changed, 48 insertions(+), 20 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index d4c9a8f9d..6ec0c66bb 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -504,7 +504,7 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, } dm.walkDocument(property, path, indexes, context) } - case reflect.Map: + case reflect.Map, reflect.Slice: if subDocMapping != nil { for _, fieldMapping := range subDocMapping.Fields { if fieldMapping.Type == "geopoint" { diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 5d7527e0a..1a7709049 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -869,37 +869,65 @@ func TestMappingForGeo(t *testing.T) { mapping := NewIndexMapping() mapping.DefaultMapping = thingMapping - x := struct { + geopoints := []interface{}{} + + // geopoint as a struct + geopoints = append(geopoints, struct { Name string `json:"name"` Location *Location `json:"location"` }{ - Name: "marty", + Name: "struct", Location: &Location{ Lon: -180, Lat: -90, }, - } + }) - doc := document.NewDocument("1") - err := mapping.MapDocument(doc, x) - if err != nil { - t.Fatal(err) - } + // geopoint as a map + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location map[string]interface{} `json:"location"` + }{ + Name: "map", + Location: map[string]interface{}{ + "lon": -180, + "lat": -90, + }, + }) - var foundGeo bool - for _, f := range doc.Fields { - if f.Name() == "location" { - foundGeo = true - got := f.Value() - expect := []byte(numeric.MustNewPrefixCodedInt64(0, 0)) - if !reflect.DeepEqual(got, expect) { - t.Errorf("expected geo value: %v, got %v", expect, got) + // geopoint as a slice + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location []interface{} `json:"location"` + }{ + Name: "slice", + Location: []interface{}{ + -180, -90, + 
}, + }) + + for i, geopoint := range geopoints { + doc := document.NewDocument(string(i)) + err := mapping.MapDocument(doc, geopoint) + if err != nil { + t.Fatal(err) + } + + var foundGeo bool + for _, f := range doc.Fields { + if f.Name() == "location" { + foundGeo = true + got := f.Value() + expect := []byte(numeric.MustNewPrefixCodedInt64(0, 0)) + if !reflect.DeepEqual(got, expect) { + t.Errorf("expected geo value: %v, got %v", expect, got) + } } } - } - if !foundGeo { - t.Errorf("expected to find geo point, did not") + if !foundGeo { + t.Errorf("expected to find geo point, did not") + } } } From 684ee3c0e759314a4208c1ace210519a16c0f88a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 30 Jan 2018 10:27:07 -0800 Subject: [PATCH 171/728] scorch zap DictIterator term count fixed and more merge unit tests The zap DictionaryIterator Next() was incorrectly returning the postingsList offset as the term count. As part of this, refactored out a PostingsList.read() helper method. Also added more merge unit test scenarios, including merging a segment for a few rounds to see if there are differences before/after merging. 
--- index/scorch/segment/zap/dict.go | 47 +---- index/scorch/segment/zap/merge_test.go | 272 ++++++++++++++++++++++++- index/scorch/segment/zap/posting.go | 43 ++++ 3 files changed, 318 insertions(+), 44 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index bb6fd9478..0f5145fba 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -15,7 +15,6 @@ package zap import ( - "encoding/binary" "fmt" "github.com/RoaringBitmap/roaring" @@ -51,43 +50,10 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap) (*Posting return nil, fmt.Errorf("vellum err: %v", err) } if exists { - rv.postingsOffset = postingsOffset - // read the location of the freq/norm details - var n uint64 - var read int - - rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) - n += uint64(read) - rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - // go ahead and load loc bitmap - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - rv.locBitmap = roaring.NewBitmap() - _, err := rv.locBitmap.FromBuffer(locRoaringBytes) + err = rv.read(postingsOffset, d) if err != nil { - return nil, fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) + return nil, err } - - var postingsLen uint64 - postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] - - bitmap := roaring.NewBitmap() - _, err = 
bitmap.FromBuffer(roaringBytes) - if err != nil { - return nil, fmt.Errorf("error loading roaring bitmap: %v", err) - } - - rv.postings = bitmap } } @@ -160,6 +126,7 @@ type DictionaryIterator struct { d *Dictionary itr vellum.Iterator err error + tmp PostingsList } // Next returns the next entry in the dictionary @@ -169,10 +136,14 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { } else if i.err != nil { return nil, i.err } - term, count := i.itr.Current() + term, postingsOffset := i.itr.Current() + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } rv := &index.DictEntry{ Term: string(term), - Count: count, + Count: i.tmp.Count(), } i.err = i.itr.Next() return rv, nil diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 1e0110418..13807d8eb 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -15,7 +15,11 @@ package zap import ( + "fmt" "os" + "reflect" + "sort" + "strings" "testing" "github.com/RoaringBitmap/roaring" @@ -72,9 +76,251 @@ func TestMerge(t *testing.T) { if err != nil { t.Fatal(err) } + + segm, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + seg3 := segm.(*Segment) + defer func() { + cerr := seg3.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if seg3.Path() != "/tmp/scorch3.zap" { + t.Fatalf("wrong path") + } + if seg3.Count() != 4 { + t.Fatalf("wrong count") + } + if len(seg3.Fields()) != 5 { + t.Fatalf("wrong # fields: %#v\n", seg3.Fields()) + } + + testMergeWithSelf(t, seg3, 4) +} + +func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { + // trying merging the segment with itself for a few rounds + var diffs []string + + for i := 0; i < 10; i++ { + fname := fmt.Sprintf("scorch-self-%d.zap", i) + + _ = os.RemoveAll("/tmp/" + fname) + + segsToMerge := make([]*Segment, 1) + 
segsToMerge[0] = segCur + + _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024) + if err != nil { + t.Fatal(err) + } + + segm, err := Open("/tmp/" + fname) + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + segNew := segm.(*Segment) + defer func(s *Segment) { + cerr := s.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }(segNew) + + if segNew.Count() != expectedCount { + t.Fatalf("wrong count") + } + if len(segNew.Fields()) != 5 { + t.Fatalf("wrong # fields: %#v\n", segNew.Fields()) + } + + diff := compareSegments(segCur, segNew) + if diff != "" { + diffs = append(diffs, fname+" is different than previous:\n"+diff) + } + + segCur = segNew + } + + if len(diffs) > 0 { + t.Errorf("mismatches after repeated self-merging: %v", strings.Join(diffs, "\n")) + } +} + +func compareSegments(a, b *Segment) string { + var rv []string + + if a.Count() != b.Count() { + return "counts" + } + + afields := append([]string(nil), a.Fields()...) + bfields := append([]string(nil), b.Fields()...) 
+ sort.Strings(afields) + sort.Strings(bfields) + if !reflect.DeepEqual(afields, bfields) { + return "fields" + } + + for _, fieldName := range afields { + adict, err := a.Dictionary(fieldName) + if err != nil { + return fmt.Sprintf("adict err: %v", err) + } + bdict, err := b.Dictionary(fieldName) + if err != nil { + return fmt.Sprintf("bdict err: %v", err) + } + + if adict.(*Dictionary).fst.Len() != bdict.(*Dictionary).fst.Len() { + rv = append(rv, fmt.Sprintf("field %s, dict fst Len()'s different: %v %v", + fieldName, adict.(*Dictionary).fst.Len(), bdict.(*Dictionary).fst.Len())) + } + + aitr := adict.Iterator() + bitr := bdict.Iterator() + for { + anext, aerr := aitr.Next() + bnext, berr := bitr.Next() + if aerr != berr { + rv = append(rv, fmt.Sprintf("field %s, dict iterator Next() errors different: %v %v", + fieldName, aerr, berr)) + break + } + if !reflect.DeepEqual(anext, bnext) { + rv = append(rv, fmt.Sprintf("field %s, dict iterator Next() results different: %#v %#v", + fieldName, anext, bnext)) + // keep going to try to see more diff details at the postingsList level + } + if aerr != nil || anext == nil || + berr != nil || bnext == nil { + break + } + + for _, next := range []*index.DictEntry{anext, bnext} { + if next == nil { + continue + } + + aplist, aerr := adict.(*Dictionary).postingsList([]byte(next.Term), nil) + bplist, berr := bdict.(*Dictionary).postingsList([]byte(next.Term), nil) + if aerr != berr { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList() errors different: %v %v", + fieldName, next.Term, aerr, berr)) + } + + if (aplist != nil) != (bplist != nil) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList() results different: %v %v", + fieldName, next.Term, aplist, bplist)) + break + } + + if aerr != nil || aplist == nil || + berr != nil || bplist == nil { + break + } + + if aplist.Count() != bplist.Count() { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList().Count()'s different: %v %v", + 
fieldName, next.Term, aplist.Count(), bplist.Count())) + } + + apitr := aplist.Iterator() + bpitr := bplist.Iterator() + if (apitr != nil) != (bpitr != nil) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList.Iterator() results different: %v %v", + fieldName, next.Term, apitr, bpitr)) + break + } + + for { + apitrn, aerr := apitr.Next() + bpitrn, aerr := bpitr.Next() + if aerr != berr { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() errors different: %v %v", + fieldName, next.Term, aerr, berr)) + } + + if (apitrn != nil) != (bpitrn != nil) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() results different: %v %v", + fieldName, next.Term, apitrn, bpitrn)) + break + } + + if aerr != nil || apitrn == nil || + berr != nil || bpitrn == nil { + break + } + + if apitrn.Number() != bpitrn.Number() { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Number()'s different: %v %v", + fieldName, next.Term, apitrn.Number(), bpitrn.Number())) + } + + if apitrn.Frequency() != bpitrn.Frequency() { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Frequency()'s different: %v %v", + fieldName, next.Term, apitrn.Frequency(), bpitrn.Frequency())) + } + + if apitrn.Norm() != bpitrn.Norm() { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Norm()'s different: %v %v", + fieldName, next.Term, apitrn.Norm(), bpitrn.Norm())) + } + + if len(apitrn.Locations()) != len(bpitrn.Locations()) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Locations() len's different: %v %v", + fieldName, next.Term, len(apitrn.Locations()), len(bpitrn.Locations()))) + } + + for loci, aloc := range apitrn.Locations() { + bloc := bpitrn.Locations()[loci] + + if (aloc != nil) != (bloc != nil) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() loc different: %v %v", + fieldName, next.Term, 
aloc, bloc)) + break + } + + if aloc.Field() != bloc.Field() || + aloc.Start() != bloc.Start() || + aloc.End() != bloc.End() || + aloc.Pos() != bloc.Pos() || + !reflect.DeepEqual(aloc.ArrayPositions(), bloc.ArrayPositions()) { + rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() loc details different: %v %v", + fieldName, next.Term, aloc, bloc)) + } + } + } + } + } + } + + return strings.Join(rv, "\n") } func TestMergeAndDrop(t *testing.T) { + docsToDrop := make([]*roaring.Bitmap, 2) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(1) + docsToDrop[1] = roaring.NewBitmap() + docsToDrop[1].AddInt(1) + testMergeAndDrop(t, docsToDrop) +} + +func TestMergeAndDropAllFromOneSegment(t *testing.T) { + docsToDrop := make([]*roaring.Bitmap, 2) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(0) + docsToDrop[0].AddInt(1) + docsToDrop[1] = roaring.NewBitmap() + testMergeAndDrop(t, docsToDrop) +} + +func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { _ = os.RemoveAll("/tmp/scorch.zap") _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") @@ -117,16 +363,30 @@ func TestMergeAndDrop(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - docsToDrop := make([]*roaring.Bitmap, 2) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(1) - docsToDrop[1] = roaring.NewBitmap() - docsToDrop[1].AddInt(1) - _, err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } + + segm, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + defer func() { + cerr := segm.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segm.Count() != 2 { + t.Fatalf("wrong count, got: %d", segm.Count()) + } + if len(segm.Fields()) != 5 { + t.Fatalf("wrong # fields: %#v\n", segm.Fields()) + } + + testMergeWithSelf(t, segm.(*Segment), 2) } func 
buildMemSegmentMulti2() *mem.Segment { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index e8533a12a..67e08d1ae 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -98,6 +98,49 @@ func (p *PostingsList) Count() uint64 { return 0 } +func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { + rv.postingsOffset = postingsOffset + + // read the location of the freq/norm details + var n uint64 + var read int + + rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var locBitmapOffset uint64 + locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var locBitmapLen uint64 + locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) + + locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] + + rv.locBitmap = roaring.NewBitmap() + _, err := rv.locBitmap.FromBuffer(locRoaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) + } + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] + + rv.postings = roaring.NewBitmap() + _, err = rv.postings.FromBuffer(roaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap: %v", err) + } + + return nil +} + // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList From 4dd64b68fa31081cfdca75a1fe4ab8c3f5cc8759 Mon Sep 17 00:00:00 2001 From: Steve Yen 
Date: Tue, 30 Jan 2018 22:14:10 -0800 Subject: [PATCH 172/728] scorch zap TestMergeWithEmptySegment(s) --- index/scorch/segment/zap/merge_test.go | 104 +++++++++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 13807d8eb..3318c57dd 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -102,6 +102,110 @@ func TestMerge(t *testing.T) { testMergeWithSelf(t, seg3, 4) } +func TestMergeWithEmptySegment(t *testing.T) { + testMergeWithEmptySegments(t, true, 1) +} + +func TestMergeWithEmptySegments(t *testing.T) { + testMergeWithEmptySegments(t, true, 5) +} + +func TestMergeWithEmptySegmentFirst(t *testing.T) { + testMergeWithEmptySegments(t, false, 1) +} + +func TestMergeWithEmptySegmentsFirst(t *testing.T) { + testMergeWithEmptySegments(t, false, 5) +} + +func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) { + _ = os.RemoveAll("/tmp/scorch.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatal(err) + } + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + var segsToMerge []*Segment + + if before { + segsToMerge = append(segsToMerge, segment.(*Segment)) + } + + for i := 0; i < numEmptySegments; i++ { + fname := fmt.Sprintf("scorch-empty-%d.zap", i) + + _ = os.RemoveAll("/tmp/" + fname) + + emptySegment := mem.NewFromAnalyzedDocs([]*index.AnalysisResult{}) + err = PersistSegment(emptySegment, "/tmp/"+fname, 1024) + if err != nil { + t.Fatal(err) + } + + emptyFileSegment, err := Open("/tmp/" + fname) + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func(emptyFileSegment *Segment) { + cerr := emptyFileSegment.Close() + if cerr 
!= nil { + t.Fatalf("error closing segment: %v", err) + } + }(emptyFileSegment.(*Segment)) + + segsToMerge = append(segsToMerge, emptyFileSegment.(*Segment)) + } + + if !before { + segsToMerge = append(segsToMerge, segment.(*Segment)) + } + + _ = os.RemoveAll("/tmp/scorch3.zap") + + drops := make([]*roaring.Bitmap, len(segsToMerge)) + + _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024) + if err != nil { + t.Fatal(err) + } + + segm, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + segCur := segm.(*Segment) + defer func() { + cerr := segCur.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if segCur.Path() != "/tmp/scorch3.zap" { + t.Fatalf("wrong path") + } + if segCur.Count() != 2 { + t.Fatalf("wrong count, numEmptySegments: %d, got count: %d", numEmptySegments, segCur.Count()) + } + if len(segCur.Fields()) != 5 { + t.Fatalf("wrong # fields: %#v\n", segCur.Fields()) + } + + testMergeWithSelf(t, segCur, 2) +} + func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { // trying merging the segment with itself for a few rounds var diffs []string From 93b037cdbb63014054d615c088b062ff8d3052fd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 31 Jan 2018 11:40:21 -0800 Subject: [PATCH 173/728] scorch zap TestMergeWithUpdates() --- index/scorch/segment/zap/merge_test.go | 126 +++++++++++++++++++++---- 1 file changed, 109 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 3318c57dd..4453cfcd9 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -427,20 +427,12 @@ func TestMergeAndDropAllFromOneSegment(t *testing.T) { func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { _ = os.RemoveAll("/tmp/scorch.zap") _ = os.RemoveAll("/tmp/scorch2.zap") - _ = os.RemoveAll("/tmp/scorch3.zap") memSegment := 
buildMemSegmentMulti() err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) if err != nil { t.Fatal(err) } - - memSegment2 := buildMemSegmentMulti2() - err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) - if err != nil { - t.Fatal(err) - } - segment, err := Open("/tmp/scorch.zap") if err != nil { t.Fatalf("error opening segment: %v", err) @@ -452,6 +444,12 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { } }() + memSegment2 := buildMemSegmentMulti2() + err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + if err != nil { + t.Fatal(err) + } + segment2, err := Open("/tmp/scorch2.zap") if err != nil { t.Fatalf("error opening segment: %v", err) @@ -467,12 +465,103 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, err = Merge(segsToMerge, docsToDrop, "/tmp/scorch3.zap", 1024) + testMergeAndDropSegments(t, segsToMerge, docsToDrop, 2) +} + +func TestMergeWithUpdates(t *testing.T) { + segmentDocIds := [][]string{ + []string{"a", "b"}, + []string{"b", "c"}, // doc "b" updated + } + + docsToDrop := make([]*roaring.Bitmap, 2) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(1) // doc "b" updated + docsToDrop[1] = roaring.NewBitmap() + + testMergeWithUpdates(t, segmentDocIds, docsToDrop, 3) +} + +func TestMergeWithUpdatesOnManySegments(t *testing.T) { + segmentDocIds := [][]string{ + []string{"a", "b"}, + []string{"b", "c"}, // doc "b" updated + []string{"c", "d"}, // doc "c" updated + []string{"d", "e"}, // doc "d" updated + } + + docsToDrop := make([]*roaring.Bitmap, 4) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(1) // doc "b" updated + docsToDrop[1] = roaring.NewBitmap() + docsToDrop[1].AddInt(1) // doc "c" updated + docsToDrop[2] = roaring.NewBitmap() + docsToDrop[2].AddInt(1) // doc "d" updated + docsToDrop[3] = roaring.NewBitmap() + + testMergeWithUpdates(t, segmentDocIds, docsToDrop, 5) +} + +func 
TestMergeWithUpdatesOnOneDoc(t *testing.T) { + segmentDocIds := [][]string{ + []string{"a", "b"}, + []string{"a", "c"}, // doc "a" updated + []string{"a", "d"}, // doc "a" updated + []string{"a", "e"}, // doc "a" updated + } + + docsToDrop := make([]*roaring.Bitmap, 4) + docsToDrop[0] = roaring.NewBitmap() + docsToDrop[0].AddInt(0) // doc "a" updated + docsToDrop[1] = roaring.NewBitmap() + docsToDrop[1].AddInt(0) // doc "a" updated + docsToDrop[2] = roaring.NewBitmap() + docsToDrop[2].AddInt(0) // doc "a" updated + docsToDrop[3] = roaring.NewBitmap() + + testMergeWithUpdates(t, segmentDocIds, docsToDrop, 5) +} + +func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { + var segsToMerge []*Segment + + // convert segmentDocIds to segsToMerge + for i, docIds := range segmentDocIds { + fname := fmt.Sprintf("scorch%d.zap", i) + + _ = os.RemoveAll("/tmp/" + fname) + + memSegment := buildMemSegmentMultiHelper(docIds) + err := PersistSegment(memSegment, "/tmp/"+fname, 1024) + if err != nil { + t.Fatal(err) + } + segment, err := Open("/tmp/" + fname) + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func(segment *Segment) { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }(segment.(*Segment)) + + segsToMerge = append(segsToMerge, segment.(*Segment)) + } + + testMergeAndDropSegments(t, segsToMerge, docsToDrop, expectedNumDocs) +} + +func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { + _ = os.RemoveAll("/tmp/scorch-merged.zap") + + _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024) if err != nil { t.Fatal(err) } - segm, err := Open("/tmp/scorch3.zap") + segm, err := Open("/tmp/scorch-merged.zap") if err != nil { t.Fatalf("error opening merged segment: %v", err) } @@ -483,22 +572,25 @@ func testMergeAndDrop(t *testing.T, docsToDrop 
[]*roaring.Bitmap) { } }() - if segm.Count() != 2 { - t.Fatalf("wrong count, got: %d", segm.Count()) + if segm.Count() != expectedNumDocs { + t.Fatalf("wrong count, got: %d, wanted: %d", segm.Count(), expectedNumDocs) } if len(segm.Fields()) != 5 { t.Fatalf("wrong # fields: %#v\n", segm.Fields()) } - testMergeWithSelf(t, segm.(*Segment), 2) + testMergeWithSelf(t, segm.(*Segment), expectedNumDocs) } func buildMemSegmentMulti2() *mem.Segment { + return buildMemSegmentMultiHelper([]string{"c", "d"}) +} +func buildMemSegmentMultiHelper(docIds []string) *mem.Segment { doc := &document.Document{ ID: "c", Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("c"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("_id", nil, []byte(docIds[0]), document.IndexField|document.StoreField, nil), document.NewTextFieldCustom("name", nil, []byte("mat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), @@ -512,7 +604,7 @@ func buildMemSegmentMulti2() *mem.Segment { doc2 := &document.Document{ ID: "d", Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("d"), document.IndexField|document.StoreField, nil), + document.NewTextFieldCustom("_id", nil, []byte(docIds[1]), document.IndexField|document.StoreField, nil), document.NewTextFieldCustom("name", nil, []byte("joa"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), @@ 
-533,7 +625,7 @@ func buildMemSegmentMulti2() *mem.Segment { Start: 0, End: 1, Position: 1, - Term: []byte("c"), + Term: []byte(docIds[0]), }, }, nil, false), analysis.TokenFrequency(analysis.TokenStream{ @@ -591,7 +683,7 @@ func buildMemSegmentMulti2() *mem.Segment { Start: 0, End: 1, Position: 1, - Term: []byte("d"), + Term: []byte(docIds[1]), }, }, nil, false), analysis.TokenFrequency(analysis.TokenStream{ From 714f5321e0dec8e1ff3b48ea0f7411775de9dcc7 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 31 Jan 2018 14:46:28 -0800 Subject: [PATCH 174/728] scorch zap merge storedFieldVals inner loop optimization --- index/scorch/segment/zap/merge.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index cc348d720..1afe99f49 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -456,6 +456,9 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, // has stored values for this field num := len(storedFieldValues) + stf := typs[int(fieldID)] + spf := poss[int(fieldID)] + // process each value for i := 0; i < num; i++ { // encode field @@ -464,7 +467,7 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, return 0, nil, err2 } // encode type - _, err2 = metaEncoder.PutU64(uint64(typs[int(fieldID)][i])) + _, err2 = metaEncoder.PutU64(uint64(stf[i])) if err2 != nil { return 0, nil, err2 } @@ -479,13 +482,13 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, return 0, nil, err2 } // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(poss[int(fieldID)][i]))) + _, err2 = metaEncoder.PutU64(uint64(len(spf[i]))) if err2 != nil { return 0, nil, err2 } // encode all array positions - for j := 0; j < len(poss[int(fieldID)][i]); j++ { - _, err2 = metaEncoder.PutU64(poss[int(fieldID)][i][j]) + for _, pos := range spf[i] { + _, err2 = metaEncoder.PutU64(pos) if err2 != nil 
{ return 0, nil, err2 } From 678c412157db7c91b9c99a4e43fde441e7ba522f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 2 Feb 2018 14:44:02 +0530 Subject: [PATCH 175/728] unblock the files for clean up, esp for merged new segment files --- index/scorch/persister.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index cdcee37c2..552a25e58 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -266,9 +266,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { for k, v := range s.root.internal { newIndexSnapshot.internal[k] = v } - for _, filename := range filenames { - delete(s.ineligibleForRemoval, filename) - } + rootPrev := s.root s.root = newIndexSnapshot s.rootLock.Unlock() @@ -276,6 +274,12 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { _ = rootPrev.DecRef() } } + // unlock the files for clean up + s.rootLock.Lock() + for _, filename := range filenames { + delete(s.ineligibleForRemoval, filename) + } + s.rootLock.Unlock() return nil } From 9636209ae51fb46b004763ff65facfb3a043a015 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 5 Feb 2018 20:49:30 +0530 Subject: [PATCH 176/728] Update persister.go comment updated --- index/scorch/persister.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 552a25e58..d656c4d9f 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -274,7 +274,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { _ = rootPrev.DecRef() } } - // unlock the files for clean up + // allow files to become eligible for removal s.rootLock.Lock() for _, filename := range filenames { delete(s.ineligibleForRemoval, filename) From eb21bf83154344c183df51c35f67159df1f2383b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 31 Jan 2018 15:08:31 -0800 Subject: [PATCH 177/728] 
scorch zap merge & build share persistStoredFieldValues() Refactored out a helper func, persistStoredFieldValues(), that both the persistence and merge codepaths now share. --- index/scorch/segment/zap/build.go | 100 ++++++++++++++++-------------- index/scorch/segment/zap/merge.go | 46 ++------------ 2 files changed, 59 insertions(+), 87 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 58f9faeaf..e6625528a 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -187,79 +187,42 @@ func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint3 } func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { - var curr int var metaBuf bytes.Buffer var data, compressed []byte + metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) for docNum, storedValues := range memSegment.Stored { if docNum != 0 { // reset buffer if necessary + curr = 0 metaBuf.Reset() data = data[:0] compressed = compressed[:0] - curr = 0 } - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) - st := memSegment.StoredTypes[docNum] sp := memSegment.StoredPos[docNum] // encode fields in order for fieldID := range memSegment.FieldsInv { if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { - // has stored values for this field - num := len(storedFieldValues) - stf := st[uint16(fieldID)] spf := sp[uint16(fieldID)] - // process each value - for i := 0; i < num; i++ { - // encode field - _, err2 := metaEncoder.PutU64(uint64(fieldID)) - if err2 != nil { - return 0, err2 - } - // encode type - _, err2 = metaEncoder.PutU64(uint64(stf[i])) - if err2 != nil { - return 0, err2 - } - // encode start offset - _, err2 = metaEncoder.PutU64(uint64(curr)) - if err2 != nil { - return 0, err2 - } - // end len - _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) - if err2 != nil { - return 0, err2 - } - // 
encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(spf[i]))) - if err2 != nil { - return 0, err2 - } - // encode all array positions - for _, pos := range spf[i] { - _, err2 = metaEncoder.PutU64(pos) - if err2 != nil { - return 0, err2 - } - } - // append data - data = append(data, storedFieldValues[i]...) - // update curr - curr += len(storedFieldValues[i]) + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncoder, data) + if err2 != nil { + return 0, err2 } } } - metaEncoder.Close() + metaEncoder.Close() metaBytes := metaBuf.Bytes() // compress the data @@ -299,6 +262,51 @@ func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) return rv, nil } +func persistStoredFieldValues(fieldID int, + storedFieldValues [][]byte, stf []byte, spf [][]uint64, + curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( + int, []byte, error) { + for i := 0; i < len(storedFieldValues); i++ { + // encode field + _, err := metaEncoder.PutU64(uint64(fieldID)) + if err != nil { + return 0, nil, err + } + // encode type + _, err = metaEncoder.PutU64(uint64(stf[i])) + if err != nil { + return 0, nil, err + } + // encode start offset + _, err = metaEncoder.PutU64(uint64(curr)) + if err != nil { + return 0, nil, err + } + // end len + _, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + if err != nil { + return 0, nil, err + } + // encode number of array pos + _, err = metaEncoder.PutU64(uint64(len(spf[i]))) + if err != nil { + return 0, nil, err + } + // encode all array positions + for _, pos := range spf[i] { + _, err = metaEncoder.PutU64(pos) + if err != nil { + return 0, nil, err + } + } + + data = append(data, storedFieldValues[i]...) 
+ curr += len(storedFieldValues[i]) + } + + return curr, data, nil +} + func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { var freqOffsets, locOfffsets []uint64 tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1afe99f49..8fdb07afc 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -453,50 +453,14 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, for fieldID := range fieldsInv { storedFieldValues := vals[int(fieldID)] - // has stored values for this field - num := len(storedFieldValues) - stf := typs[int(fieldID)] spf := poss[int(fieldID)] - // process each value - for i := 0; i < num; i++ { - // encode field - _, err2 := metaEncoder.PutU64(uint64(fieldID)) - if err2 != nil { - return 0, nil, err2 - } - // encode type - _, err2 = metaEncoder.PutU64(uint64(stf[i])) - if err2 != nil { - return 0, nil, err2 - } - // encode start offset - _, err2 = metaEncoder.PutU64(uint64(curr)) - if err2 != nil { - return 0, nil, err2 - } - // end len - _, err2 = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) - if err2 != nil { - return 0, nil, err2 - } - // encode number of array pos - _, err2 = metaEncoder.PutU64(uint64(len(spf[i]))) - if err2 != nil { - return 0, nil, err2 - } - // encode all array positions - for _, pos := range spf[i] { - _, err2 = metaEncoder.PutU64(pos) - if err2 != nil { - return 0, nil, err2 - } - } - // append data - data = append(data, storedFieldValues[i]...) 
- // update curr - curr += len(storedFieldValues[i]) + var err2 error + curr, data, err2 = persistStoredFieldValues(fieldID, + storedFieldValues, stf, spf, curr, metaEncoder, data) + if err2 != nil { + return 0, nil, err2 } } From 65786557584031cb74749681939f673245842b42 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 31 Jan 2018 15:48:03 -0800 Subject: [PATCH 178/728] scorch zap refactored out mergeToWriter() func This is a step towards supporting in-memory zap segment merging. --- index/scorch/segment/zap/merge.go | 71 ++++++++++++++++++------------- 1 file changed, 41 insertions(+), 30 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 8fdb07afc..327446c5c 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -52,41 +52,15 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, // wrap it for counting (tracking offsets) cr := NewCountHashWriter(br) - fieldsInv := mergeFields(segments) - fieldsMap := mapFields(fieldsInv) - - var newDocNums [][]uint64 - var storedIndexOffset uint64 - fieldDvLocsOffset := uint64(fieldNotUninverted) - var dictLocs []uint64 - - newSegDocCount := computeNewDocCount(segments, drops) - if newSegDocCount > 0 { - storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, newSegDocCount, cr) - if err != nil { - cleanup() - return nil, err - } - - dictLocs, fieldDvLocsOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, - newDocNums, newSegDocCount, chunkFactor, cr) - if err != nil { - cleanup() - return nil, err - } - } else { - dictLocs = make([]uint64, len(fieldsInv)) - } - - fieldsIndexOffset, err := persistFields(fieldsInv, cr, dictLocs) + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, err := + mergeToWriter(segments, drops, chunkFactor, cr) if err != nil { cleanup() return nil, err } - err = persistFooter(newSegDocCount, storedIndexOffset, - 
fieldsIndexOffset, fieldDvLocsOffset, chunkFactor, cr.Sum32(), cr) + err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, + docValueOffset, chunkFactor, cr.Sum32(), cr) if err != nil { cleanup() return nil, err @@ -113,6 +87,43 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return newDocNums, nil } +func mergeToWriter(segments []*Segment, drops []*roaring.Bitmap, + chunkFactor uint32, cr *CountHashWriter) ( + newDocNums [][]uint64, + numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + err error) { + docValueOffset = uint64(fieldNotUninverted) + + var dictLocs []uint64 + + fieldsInv := mergeFields(segments) + fieldsMap := mapFields(fieldsInv) + + numDocs = computeNewDocCount(segments, drops) + if numDocs > 0 { + storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, + fieldsMap, fieldsInv, numDocs, cr) + if err != nil { + return nil, 0, 0, 0, 0, err + } + + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + newDocNums, numDocs, chunkFactor, cr) + if err != nil { + return nil, 0, 0, 0, 0, err + } + } else { + dictLocs = make([]uint64, len(fieldsInv)) + } + + fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) + if err != nil { + return nil, 0, 0, 0, 0, err + } + + return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, nil +} + // mapFields takes the fieldsInv list and builds the map func mapFields(fields []string) map[string]uint16 { rv := make(map[string]uint16, len(fields)) From 3da191852de9b0aa860532de7ec39256008d8252 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 1 Feb 2018 16:59:59 -0800 Subject: [PATCH 179/728] scorch zap tighten up prepareSegment()'s lock area --- index/scorch/scorch.go | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 311077653..08fffa25e 100644 --- a/index/scorch/scorch.go +++ 
b/index/scorch/scorch.go @@ -310,17 +310,21 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, introduction.persisted = make(chan error, 1) } - // get read lock, to optimistically prepare obsoleted info + // optimistically prepare obsoletes outside of rootLock s.rootLock.RLock() - for _, seg := range s.root.segment { + root := s.root + root.AddRef() + s.rootLock.RUnlock() + + for _, seg := range root.segment { delta, err := seg.segment.DocNumbers(ids) if err != nil { - s.rootLock.RUnlock() return err } introduction.obsoletes[seg.id] = delta } - s.rootLock.RUnlock() + + _ = root.DecRef() s.introductions <- introduction From c09e2a08cadda4973bd062baf9a19fcc07b86a5e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Feb 2018 10:51:24 -0800 Subject: [PATCH 180/728] scorch zap chunkedContentCoder reuses chunk metadata slice memory And, renamed the chunk MetaData.DocID field to DocNum for naming correctness, where much of this commit is the mechanical effect of that rename. --- cmd/bleve/cmd/zap/docvalue.go | 27 ++++++++++++------------ index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/contentcoder.go | 12 +++++------ index/scorch/segment/zap/docvalues.go | 18 ++++++++-------- 4 files changed, 29 insertions(+), 30 deletions(-) diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go index 165829fdf..743974955 100644 --- a/cmd/bleve/cmd/zap/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -165,7 +165,7 @@ var docvalueCmd = &cobra.Command{ /* TODO => dump all chunk headers?? 
if len(args) == 3 && args[2] == ">" { - dumpChunkDocIDs(data, ) + dumpChunkDocNums(data, ) }*/ } @@ -187,7 +187,7 @@ var docvalueCmd = &cobra.Command{ docInChunk := uint64(localDocNum) / uint64(segment.ChunkFactor()) if numChunks < docInChunk { - return fmt.Errorf("no chunk exists for chunk number: %d for docID: %d", docInChunk, localDocNum) + return fmt.Errorf("no chunk exists for chunk number: %d for localDocNum: %d", docInChunk, localDocNum) } destChunkDataLoc := fieldDvLoc + offset @@ -207,7 +207,7 @@ var docvalueCmd = &cobra.Command{ offset = uint64(0) curChunkHeader := make([]zap.MetaData, int(numDocs)) for i := 0; i < int(numDocs); i++ { - curChunkHeader[i].DocID, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(nread) curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(nread) @@ -221,8 +221,8 @@ var docvalueCmd = &cobra.Command{ start, length := getDocValueLocs(uint64(localDocNum), curChunkHeader) if start == math.MaxUint64 || length == math.MaxUint64 { - fmt.Printf("no field values found for docID %d\n", localDocNum) - fmt.Printf("Try docIDs present in chunk: %s\n", assortDocID(curChunkHeader)) + fmt.Printf("no field values found for localDocNum: %d\n", localDocNum) + fmt.Printf("Try docNums present in chunk: %s\n", metaDataDocNums(curChunkHeader)) return nil } // uncompress the already loaded data @@ -234,7 +234,7 @@ var docvalueCmd = &cobra.Command{ var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} - // pick the terms for the given docID + // pick the terms for the given docNum uncompressed = uncompressed[start : start+length] for { i := bytes.Index(uncompressed, termSeparatorSplitSlice) @@ -250,23 +250,22 @@ var docvalueCmd = &cobra.Command{ }, } -func 
getDocValueLocs(docID uint64, metaHeader []zap.MetaData) (uint64, uint64) { +func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64) { i := sort.Search(len(metaHeader), func(i int) bool { - return metaHeader[i].DocID >= docID + return metaHeader[i].DocNum >= docNum }) - if i < len(metaHeader) && metaHeader[i].DocID == docID { + if i < len(metaHeader) && metaHeader[i].DocNum == docNum { return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen } return math.MaxUint64, math.MaxUint64 } -func assortDocID(metaHeader []zap.MetaData) string { - docIDs := "" +func metaDataDocNums(metaHeader []zap.MetaData) string { + docNums := "" for _, meta := range metaHeader { - id := fmt.Sprintf("%d", meta.DocID) - docIDs += id + ", " + docNums += fmt.Sprintf("%d", meta.DocNum) + ", " } - return docIDs + return docNums } func init() { diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index e6625528a..60d168e66 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -588,7 +588,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, if err != nil { return nil, err } - // resetting encoder for the next field + // reseting encoder for the next field fdvEncoder.Reset() } diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index b03940497..83457146e 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -39,7 +39,7 @@ type chunkedContentCoder struct { // MetaData represents the data information inside a // chunk. 
type MetaData struct { - DocID uint64 // docid of the data inside the chunk + DocNum uint64 // docNum of the data inside the chunk DocDvLoc uint64 // starting offset for a given docid DocDvLen uint64 // length of data inside the chunk for the given docid } @@ -52,7 +52,7 @@ func newChunkedContentCoder(chunkSize uint64, rv := &chunkedContentCoder{ chunkSize: chunkSize, chunkLens: make([]uint64, total), - chunkMeta: []MetaData{}, + chunkMeta: make([]MetaData, 0, total), } return rv @@ -68,7 +68,7 @@ func (c *chunkedContentCoder) Reset() { for i := range c.chunkLens { c.chunkLens[i] = 0 } - c.chunkMeta = []MetaData{} + c.chunkMeta = c.chunkMeta[:0] } // Close indicates you are done calling Add() this allows @@ -88,7 +88,7 @@ func (c *chunkedContentCoder) flushContents() error { // write out the metaData slice for _, meta := range c.chunkMeta { - _, err := writeUvarints(&c.chunkMetaBuf, meta.DocID, meta.DocDvLoc, meta.DocDvLen) + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) if err != nil { return err } @@ -118,7 +118,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { // clearing the chunk specific meta for next chunk c.chunkBuf.Reset() c.chunkMetaBuf.Reset() - c.chunkMeta = []MetaData{} + c.chunkMeta = c.chunkMeta[:0] c.currChunk = chunk } @@ -130,7 +130,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { } c.chunkMeta = append(c.chunkMeta, MetaData{ - DocID: docNum, + DocNum: docNum, DocDvLoc: uint64(dvOffset), DocDvLen: uint64(dvSize), }) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index fb5b348a5..0514bd307 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -99,7 +99,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, func (di *docValueIterator) loadDvChunk(chunkNumber, localDocNum uint64, s *SegmentBase) error { // advance to the chunk where the docValues - // reside for 
the given docID + // reside for the given docNum destChunkDataLoc := di.dvDataLoc for i := 0; i < int(chunkNumber); i++ { destChunkDataLoc += di.chunkLens[i] @@ -116,7 +116,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, offset := uint64(0) di.curChunkHeader = make([]MetaData, int(numDocs)) for i := 0; i < int(numDocs); i++ { - di.curChunkHeader[i].DocID, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) @@ -131,10 +131,10 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, return nil } -func (di *docValueIterator) visitDocValues(docID uint64, +func (di *docValueIterator) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { - // binary search the term locations for the docID - start, length := di.getDocValueLocs(docID) + // binary search the term locations for the docNum + start, length := di.getDocValueLocs(docNum) if start == math.MaxUint64 || length == math.MaxUint64 { return nil } @@ -144,7 +144,7 @@ func (di *docValueIterator) visitDocValues(docID uint64, return err } - // pick the terms for the given docID + // pick the terms for the given docNum uncompressed = uncompressed[start : start+length] for { i := bytes.Index(uncompressed, termSeparatorSplitSlice) @@ -159,11 +159,11 @@ func (di *docValueIterator) visitDocValues(docID uint64, return nil } -func (di *docValueIterator) getDocValueLocs(docID uint64) (uint64, uint64) { +func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { i := sort.Search(len(di.curChunkHeader), func(i int) bool { - return di.curChunkHeader[i].DocID >= docID + return di.curChunkHeader[i].DocNum >= docNum }) - if i < len(di.curChunkHeader) && 
di.curChunkHeader[i].DocID == docID { + if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen } return math.MaxUint64, math.MaxUint64 From fdb240f5f99cb3301c2adaaa8acfde7c17835e46 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 10:02:44 -0800 Subject: [PATCH 181/728] more zap merge-planner CalcBudget tests at larger sizes Helps provide a sense of how # of segments grows as # of documents grows. Ex: 1B docs => budget of 54 segments. --- index/scorch/mergeplan/merge_plan_test.go | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 4db8eb1e8..9a2c779c8 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -188,6 +188,18 @@ func TestCalcBudget(t *testing.T) { }, 7, }, + {1000, 2000, DefaultMergePlanOptions, + 1}, + {5000, 2000, DefaultMergePlanOptions, + 3}, + {10000, 2000, DefaultMergePlanOptions, + 5}, + {30000, 2000, DefaultMergePlanOptions, + 11}, + {1000000, 2000, DefaultMergePlanOptions, + 24}, + {1000000000, 2000, DefaultMergePlanOptions, + 54}, } for testi, test := range tests { From a280ba7cf8f9db0247e03bb56109d795a95a7ee3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 12:12:12 -0800 Subject: [PATCH 182/728] scorch zap TestIndexRollback fixes The TestIndexRollback unit test was failing more often than ever (perhaps raciness?), so this commit tries to remove avenues of raciness in the test... - The Scorch.Open() method is refactored into an Scorch.openBolt() helper method in order to allow unit tests to control which background goroutines are started. - TestIndexRollback() doesn't start the merger goroutine, to simulate a really slow merger that never gets around to merging old segments. 
- TestIndexRollback() creates a long-lived reader after the first batch, so that the first index snapshot isn't removed due to the long-lived reader's ref-count. - TestIndexRollback() temporarily bumps NumSnapshotsToKeep to a large number so the persister isn't tempted to removeOldData() that we're trying to rollback to. --- index/scorch/scorch.go | 30 ++++++++++----- index/scorch/snapshot_rollback_test.go | 51 ++++++++++++++++++++++---- 2 files changed, 63 insertions(+), 18 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 311077653..4881ff5cb 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -114,6 +114,25 @@ func (s *Scorch) fireAsyncError(err error) { } func (s *Scorch) Open() error { + err := s.openBolt() + if err != nil { + return err + } + + s.asyncTasks.Add(1) + go s.mainLoop() + + if !s.readOnly && s.path != "" { + s.asyncTasks.Add(1) + go s.persisterLoop() + s.asyncTasks.Add(1) + go s.mergerLoop() + } + + return nil +} + +func (s *Scorch) openBolt() error { var ok bool s.path, ok = s.config["path"].(string) if !ok { @@ -136,6 +155,7 @@ func (s *Scorch) Open() error { } } } + rootBoltPath := s.path + string(os.PathSeparator) + "root.bolt" var err error if s.path != "" { @@ -166,16 +186,6 @@ func (s *Scorch) Open() error { } } - s.asyncTasks.Add(1) - go s.mainLoop() - - if !s.readOnly && s.path != "" { - s.asyncTasks.Add(1) - go s.persisterLoop() - s.asyncTasks.Add(1) - go s.mergerLoop() - } - return nil } diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 9816a51e6..42d908243 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -22,7 +22,12 @@ import ( ) func TestIndexRollback(t *testing.T) { + numSnapshotsToKeepOrig := NumSnapshotsToKeep + NumSnapshotsToKeep = 1000 + defer func() { + NumSnapshotsToKeep = numSnapshotsToKeepOrig + err := DestroyTest() if err != nil { t.Fatal(err) @@ -34,10 +39,6 @@ func 
TestIndexRollback(t *testing.T) { if err != nil { t.Fatal(err) } - err = idx.Open() - if err != nil { - t.Fatalf("error opening index: %v", err) - } defer func() { err := idx.Close() if err != nil { @@ -45,6 +46,22 @@ func TestIndexRollback(t *testing.T) { } }() + sh, ok := idx.(*Scorch) + if !ok { + t.Fatalf("Not a scorch index?") + } + + err = sh.openBolt() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + + // start background goroutines except for the merger, which + // simulates a super slow merger + sh.asyncTasks.Add(2) + go sh.mainLoop() + go sh.persisterLoop() + // create a batch, insert 2 new documents batch := index.NewBatch() doc := document.NewDocument("1") @@ -59,14 +76,17 @@ func TestIndexRollback(t *testing.T) { t.Fatal(err) } - sh, ok := idx.(*Scorch) - if !ok { - t.Fatalf("Not a scorch index?") + readerSlow, err := idx.Reader() // keep snapshot around so it's not cleaned up + if err != nil { + t.Fatal(err) } + defer func() { + _ = readerSlow.Close() + }() // fetch rollback points available as of here rollbackPoints, err := sh.RollbackPoints() - if err != nil || len(rollbackPoints) == 0 { + if err != nil || len(rollbackPoints) != 1 { t.Fatal(err, len(rollbackPoints)) } @@ -88,6 +108,21 @@ func TestIndexRollback(t *testing.T) { t.Fatal(err) } + rollbackPointsB, err := sh.RollbackPoints() + if err != nil || len(rollbackPointsB) != 3 { + t.Fatal(err, len(rollbackPointsB)) + } + + found := false + for _, p := range rollbackPointsB { + if rollbackPoint.epoch == p.epoch { + found = true + } + } + if !found { + t.Fatalf("expected rollbackPoint epoch to still be available") + } + reader, err := idx.Reader() if err != nil { t.Fatal(err) From 07274c036d48a1a583d7416bdaeba4981fbdfe4c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 6 Feb 2018 13:48:16 +0530 Subject: [PATCH 183/728] tuning the edge for merge-task execution loop Adjusting the merge task creation loop to accommodate the newly merged segments so that the eventual 
merge results/ number of segments stay within the calculated budget. --- index/scorch/mergeplan/merge_plan.go | 4 ++-- index/scorch/mergeplan/merge_plan_test.go | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index 0afc3ce5c..62f643f43 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -186,13 +186,13 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { // While we’re over budget, keep looping, which might produce // another MergeTask. - for len(eligibles) > budgetNumSegments { + for len(eligibles) > 0 && (len(eligibles)+len(rv.Tasks)) > budgetNumSegments { // Track a current best roster as we examine and score // potential rosters of merges. var bestRoster []Segment var bestRosterScore float64 // Lower score is better. - for startIdx := 0; startIdx < len(eligibles)-o.SegmentsPerMergeTask; startIdx++ { + for startIdx := 0; startIdx < len(eligibles); startIdx++ { var roster []Segment var rosterLiveSize int64 diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 4db8eb1e8..03fd40f74 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -73,7 +73,16 @@ func TestSimplePlan(t *testing.T) { segs[2], }, nil, - &MergePlan{}, + &MergePlan{ + Tasks: []*MergeTask{ + &MergeTask{ + Segments: []Segment{ + segs[2], + segs[1], + }, + }, + }, + }, nil, }, {"3 segments", @@ -83,7 +92,17 @@ func TestSimplePlan(t *testing.T) { segs[9], }, nil, - &MergePlan{}, + &MergePlan{ + Tasks: []*MergeTask{ + &MergeTask{ + Segments: []Segment{ + segs[9], + segs[2], + segs[1], + }, + }, + }, + }, nil, }, {"many segments", From 0dfd73d6cc2c2b3f5e1830201741aecbd4b0f3bd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Feb 2018 17:10:41 -0800 Subject: [PATCH 184/728] scorch zap mergeStoredAndRemap loop 
optimization This change avoids an array/slice access in a loop body. --- index/scorch/segment/zap/merge.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 327446c5c..319d81348 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -506,17 +506,17 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, } // return value is the start of the stored index - offset := uint64(w.Count()) + storedIndexOffset := uint64(w.Count()) // now write out the stored doc index - for docNum := range docNumOffsets { - err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) + for _, docNumOffset := range docNumOffsets { + err := binary.Write(w, binary.BigEndian, docNumOffset) if err != nil { return 0, nil, err } } - return offset, rv, nil + return storedIndexOffset, rv, nil } // mergeFields builds a unified list of fields used across all the input segments From 03c8b2b7ec66795dc9d83365a435d0463202c9e4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Feb 2018 14:16:58 -0800 Subject: [PATCH 185/728] scorch mem segment optimizes DictEntry's across Next() calls This change optimizes the scorch/mem DictionaryIterator by reusing a DictEntry struct across multiple Next() calls. This follows the same optimization trick and Next() semantics as upsidedown's FieldDict implementation. 
--- index/scorch/segment/mem/dict.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index 939c287e9..cf92ef71f 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -76,6 +76,8 @@ type DictionaryIterator struct { prefix string end string offset int + + dictEntry index.DictEntry // reused across Next()'s } // Next returns the next entry in the dictionary @@ -95,8 +97,7 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) { d.offset++ postingID := d.d.segment.Dicts[d.d.fieldID][next] - return &index.DictEntry{ - Term: next, - Count: d.d.segment.Postings[postingID-1].GetCardinality(), - }, nil + d.dictEntry.Term = next + d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() + return &d.dictEntry, nil } From 8c2520d55cb1fca6c01659386785720dfd061577 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Feb 2018 14:29:51 -0800 Subject: [PATCH 186/728] scorch zap optimize via postingsList reuse pprof graphs were showing many postingsList allocations during merging, so this change optimizes by reusing postingList memory in the merging loops. 
--- index/scorch/segment/zap/dict.go | 15 +++++++++------ index/scorch/segment/zap/merge.go | 5 ++++- index/scorch/segment/zap/merge_test.go | 4 ++-- index/scorch/segment/zap/segment.go | 3 ++- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 0f5145fba..137c35d7a 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -34,15 +34,18 @@ type Dictionary struct { // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return d.postingsList([]byte(term), except) + return d.postingsList([]byte(term), except, nil) } -func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap) (*PostingsList, error) { - rv := &PostingsList{ - sb: d.sb, - term: term, - except: except, +func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + if rv == nil { + rv = &PostingsList{} + } else { + *rv = PostingsList{} // clear the struct } + rv.sb = d.sb + rv.term = term + rv.except = except if d.fst != nil { postingsOffset, exists, err := d.fst.Get(term) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 319d81348..1e75439a7 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -155,6 +155,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 + var postings *PostingsList + rv := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) fieldDvLocsOffset := uint64(fieldNotUninverted) @@ -231,7 +233,8 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, if dict == nil { continue } - postings, err2 := dict.postingsList(term, drops[dictI]) + var err2 error + postings, err2 = 
dict.postingsList(term, drops[dictI], postings) if err2 != nil { return nil, 0, err2 } diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 4453cfcd9..323fffed4 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -310,8 +310,8 @@ func compareSegments(a, b *Segment) string { continue } - aplist, aerr := adict.(*Dictionary).postingsList([]byte(next.Term), nil) - bplist, berr := bdict.(*Dictionary).postingsList([]byte(next.Term), nil) + aplist, aerr := adict.(*Dictionary).postingsList([]byte(next.Term), nil, nil) + bplist, berr := bdict.(*Dictionary).postingsList([]byte(next.Term), nil, nil) if aerr != berr { rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList() errors different: %v %v", fieldName, next.Term, aerr, berr)) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 94268cace..40c0af274 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -343,8 +343,9 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { return nil, err } + var postings *PostingsList for _, id := range ids { - postings, err := idDict.postingsList([]byte(id), nil) + postings, err = idDict.postingsList([]byte(id), nil, postings) if err != nil { return nil, err } From a83ee0f364b9dbadac97f9124563196381bcb2fa Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Feb 2018 14:38:10 -0800 Subject: [PATCH 187/728] scorch zap.MergeToWriter() takes SegmentBases instead of Segments This change turns zap.MergeToWriter() into a public func, so that it's now directly callable from outside packages (such as from scorch's top-level merger or persister). And, MergerToWriter() now takes input of SegmentBases instead of Segments, so that it can now work on either in-memory zap segments or file-based zap segments. This is yet another stepping stone towards in-memory merging of zap segments. 
--- index/scorch/segment/zap/merge.go | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1e75439a7..db03c998d 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -46,6 +46,11 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, _ = os.Remove(path) } + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + segmentBases[segmenti] = &segment.SegmentBase + } + // buffer the output br := bufio.NewWriter(f) @@ -53,7 +58,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, cr := NewCountHashWriter(br) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, err := - mergeToWriter(segments, drops, chunkFactor, cr) + MergeToWriter(segmentBases, drops, chunkFactor, cr) if err != nil { cleanup() return nil, err @@ -87,7 +92,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return newDocNums, nil } -func mergeToWriter(segments []*Segment, drops []*roaring.Bitmap, +func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, chunkFactor uint32, cr *CountHashWriter) ( newDocNums [][]uint64, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, @@ -135,10 +140,10 @@ func mapFields(fields []string) map[string]uint16 { // computeNewDocCount determines how many documents will be in the newly // merged segment when obsoleted docs are dropped -func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) uint64 { +func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { var newDocCount uint64 for segI, segment := range segments { - newDocCount += segment.NumDocs() + newDocCount += segment.numDocs if drops[segI] != nil { newDocCount -= drops[segI].GetCardinality() } @@ -146,7 +151,7 @@ func computeNewDocCount(segments []*Segment, drops []*roaring.Bitmap) 
uint64 { return newDocCount } -func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, +func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, uint64, error) { @@ -408,7 +413,7 @@ func persistMergedRest(segments []*Segment, drops []*roaring.Bitmap, const docDropped = math.MaxUint64 -func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, +func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { var rv [][]uint64 // The remapped or newDocNums for each segment. @@ -523,7 +528,7 @@ func mergeStoredAndRemap(segments []*Segment, drops []*roaring.Bitmap, } // mergeFields builds a unified list of fields used across all the input segments -func mergeFields(segments []*Segment) []string { +func mergeFields(segments []*SegmentBase) []string { fieldsMap := map[string]struct{}{} for _, segment := range segments { fields := segment.Fields() From feecce1eb26302e8127a7556fd5082b682a43127 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 5 Feb 2018 16:37:43 +0530 Subject: [PATCH 188/728] fix for merger persister handshake stalemate The slow merger was lagging behind the fast persister to a persister notify send-loop while the persister awaits for any new introductions from introducer totally blocking the merger This fix along with the deleted files eligibilty flipping makes the file count to around 6 to 11 files per shard for both travel and beer samples --- index/scorch/merge.go | 36 ++++++++---------------------------- index/scorch/persister.go | 33 +++++++++++++++++++++++++++------ index/scorch/scorch.go | 4 ++-- 3 files changed, 37 insertions(+), 36 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 
5ded29b5a..41abe0655 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -58,44 +58,24 @@ OUTER: _ = ourSnapshot.DecRef() // tell the persister we're waiting for changes - // first make a notification chan - notifyUs := make(notificationChan) + // first make a epochWatcher chan + ew := &epochWatcher{ + epoch: lastEpochMergePlanned, + notifyCh: make(notificationChan, 1), + } // give it to the persister select { case <-s.closeCh: break OUTER - case s.persisterNotifier <- notifyUs: + case s.persisterNotifier <- ew: } - // check again - s.rootLock.RLock() - ourSnapshot = s.root - ourSnapshot.AddRef() - s.rootLock.RUnlock() - - if ourSnapshot.epoch != lastEpochMergePlanned { - startTime := time.Now() - - // lets get started - err := s.planMergeAtSnapshot(ourSnapshot) - if err != nil { - s.fireAsyncError(fmt.Errorf("merging err: %v", err)) - _ = ourSnapshot.DecRef() - continue OUTER - } - lastEpochMergePlanned = ourSnapshot.epoch - - s.fireEvent(EventKindMergerProgress, time.Since(startTime)) - } - _ = ourSnapshot.DecRef() - - // now wait for it (but also detect close) + // now wait for persister (but also detect close) select { case <-s.closeCh: break OUTER - case <-notifyUs: - // woken up, next loop should pick up work + case <-ew.notifyCh: } } } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index cdcee37c2..c9d91dc53 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -39,15 +39,29 @@ type notificationChan chan struct{} func (s *Scorch) persisterLoop() { defer s.asyncTasks.Done() - var notifyChs []notificationChan + var persistWatchers []*epochWatcher var lastPersistedEpoch uint64 + + notifyWatchers := func() { + var watchersNext []*epochWatcher + for _, w := range persistWatchers { + if w.epoch < lastPersistedEpoch { + close(w.notifyCh) + } else { + watchersNext = append(watchersNext, w) + } + } + persistWatchers = watchersNext + } + OUTER: for { select { case <-s.closeCh: break OUTER - case notifyCh := 
<-s.persisterNotifier: - notifyChs = append(notifyChs, notifyCh) + case ew := <-s.persisterNotifier: + persistWatchers = append(persistWatchers, ew) + notifyWatchers() default: } @@ -81,10 +95,11 @@ OUTER: } lastPersistedEpoch = ourSnapshot.epoch - for _, notifyCh := range notifyChs { - close(notifyCh) + for _, ew := range persistWatchers { + close(ew.notifyCh) } - notifyChs = nil + + persistWatchers = nil _ = ourSnapshot.DecRef() changed := false @@ -120,6 +135,12 @@ OUTER: break OUTER case <-w.notifyCh: // woken up, next loop should pick up work + continue OUTER + case ew := <-s.persisterNotifier: + // if the watchers are already caught up then let them wait, + // else let them continue to do the catch up + persistWatchers = append(persistWatchers, ew) + notifyWatchers() } } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 311077653..b4e06fc25 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -61,7 +61,7 @@ type Scorch struct { merges chan *segmentMerge introducerNotifier chan *epochWatcher revertToSnapshots chan *snapshotReversion - persisterNotifier chan notificationChan + persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup @@ -156,7 +156,7 @@ func (s *Scorch) Open() error { s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan *epochWatcher, 1) s.revertToSnapshots = make(chan *snapshotReversion) - s.persisterNotifier = make(chan notificationChan) + s.persisterNotifier = make(chan *epochWatcher, 1) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. From ffdeb8055efd2e69167caf4aadd19e2cdd6e4e27 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 15:42:47 -0800 Subject: [PATCH 189/728] scorch sorts fields by name to assign fieldID's This is a stepping stone to allow easier future comparisons of field maps and potential merge optimizations. 
In bleve-blast tests on a 2015 macbook (50K wikipedia docs, 8 indexers, batch size 100, ssd), this does not seem to have a distinct effect on indexing throughput. --- index/scorch/segment/mem/build.go | 15 +++++++++++++++ index/scorch/segment/zap/merge.go | 4 ++++ index/scorch/segment/zap/segment_test.go | 2 ++ 3 files changed, 21 insertions(+) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index d3344ce30..57d60dc89 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -95,6 +95,21 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { var numTokenFrequencies int var totLocs int + // initial scan for all fieldID's to sort them + for _, result := range results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { for term, tf := range tfs { pidPlus1, exists := s.Dicts[fieldID][term] diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index db03c998d..53b1ffe54 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -21,6 +21,7 @@ import ( "fmt" "math" "os" + "sort" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -545,5 +546,8 @@ func mergeFields(segments []*SegmentBase) []string { rv = append(rv, k) } } + + sort.Strings(rv[1:]) // leave _id as first + return rv } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 704f9e72e..9ce354ce3 100644 --- a/index/scorch/segment/zap/segment_test.go +++ 
b/index/scorch/segment/zap/segment_test.go @@ -18,6 +18,7 @@ import ( "math" "os" "reflect" + "sort" "testing" "github.com/blevesearch/bleve/index" @@ -574,6 +575,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { t.Fatalf("segment VisitableDocValueFields err: %v", err) } + sort.Strings(expectedFields[1:]) // keep _id as first field if !reflect.DeepEqual(fields, expectedFields) { t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields) } From 822457542efe74b6b6b228ef52db2f56b8f3aea2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 16:03:17 -0800 Subject: [PATCH 190/728] scorch zap VERSION bump: check whether fields are the same at merge COMPATIBILITY NOTE: scorch zap version bumped in this commit. The version bump is because mergeFields() now computes whether fields are the same across segments and it relies on the previous commit where fieldID's are assigned in field name sorted order (albeit with _id field always having fieldID of 0). Potential future commits might rely on this info that "fields are the same across segments" for more optimizations, etc. 
--- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/merge.go | 27 ++++++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 60d168e66..b3bbbab52 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -28,7 +28,7 @@ import ( "github.com/golang/snappy" ) -const version uint32 = 2 +const version uint32 = 3 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 53b1ffe54..bef2a087e 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -102,13 +102,13 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, var dictLocs []uint64 - fieldsInv := mergeFields(segments) + fieldsSame, fieldsInv := mergeFields(segments) fieldsMap := mapFields(fieldsInv) numDocs = computeNewDocCount(segments, drops) if numDocs > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, numDocs, cr) + fieldsMap, fieldsInv, fieldsSame, numDocs, cr) if err != nil { return nil, 0, 0, 0, 0, err } @@ -415,7 +415,7 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, const docDropped = math.MaxUint64 func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, - fieldsMap map[string]uint16, fieldsInv []string, newSegDocCount uint64, + fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { var rv [][]uint64 // The remapped or newDocNums for each segment. 
@@ -528,13 +528,26 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, return storedIndexOffset, rv, nil } -// mergeFields builds a unified list of fields used across all the input segments -func mergeFields(segments []*SegmentBase) []string { +// mergeFields builds a unified list of fields used across all the +// input segments, and computes whether the fields are the same across +// segments (which depends on fields to be sorted in the same way +// across segments) +func mergeFields(segments []*SegmentBase) (bool, []string) { + fieldsSame := true + + var segment0Fields []string + if len(segments) > 0 { + segment0Fields = segments[0].Fields() + } + fieldsMap := map[string]struct{}{} for _, segment := range segments { fields := segment.Fields() - for _, field := range fields { + for fieldi, field := range fields { fieldsMap[field] = struct{}{} + if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { + fieldsSame = false + } } } @@ -549,5 +562,5 @@ func mergeFields(segments []*SegmentBase) []string { sort.Strings(rv[1:]) // leave _id as first - return rv + return fieldsSame, rv } From 0b50a20caccf5c4fae2ab295978af21af8d4c792 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Feb 2018 17:53:47 -0800 Subject: [PATCH 191/728] scorch zap move docDropped const to earlier in file --- index/scorch/segment/zap/merge.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index bef2a087e..dbb33110f 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -29,6 +29,8 @@ import ( "github.com/golang/snappy" ) +const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc + // Merge takes a slice of zap segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. 
This new segment is built at the specified path, @@ -412,8 +414,6 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, return rv, fieldDvLocsOffset, nil } -const docDropped = math.MaxUint64 - func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { From ed4826b189404618f33b57f6add214e86b0353e4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Feb 2018 09:12:48 -0800 Subject: [PATCH 192/728] scorch zap merge optimization to byte-copy storedDocs The optimization to byte-copy all the storedDocs for a given segment during merging kicks in when the fields are the same across all segments and when there are no deletions for that given segment. This can happen, for example, during data loading or insert-only scenarios. As part of this commit, the Segment.copyStoredDocs() method was added, which uses a single Write() call to copy all the stored docs bytes of a segment to a writer in one shot. And, getDocStoredMetaAndCompressed() was refactored into a related helper function, getDocStoredOffsets(), which provides the storedDocs metadata (offsets & lengths) for a doc. 
--- index/scorch/segment/zap/merge.go | 53 ++++++++++++++++++++++++++ index/scorch/segment/zap/merge_test.go | 34 +++++++++++++++++ index/scorch/segment/zap/read.go | 28 ++++++++++---- 3 files changed, 107 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index dbb33110f..b1eed28bc 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -437,6 +437,24 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, for segI, segment := range segments { segNewDocNums := make([]uint64, segment.numDocs) + // optimize when the field mapping is the same across all + // segments and there are no deletions, via byte-copying + // of stored docs bytes directly to the writer + if fieldsSame && (drops[segI] == nil || drops[segI].GetCardinality() == 0) { + err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < segment.numDocs; i++ { + segNewDocNums[i] = newDocNum + newDocNum++ + } + rv = append(rv, segNewDocNums) + + continue + } + // for each doc num for docNum := uint64(0); docNum < segment.numDocs; docNum++ { // TODO: roaring's API limits docNums to 32-bits? @@ -528,6 +546,41 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, return storedIndexOffset, rv, nil } +// copyStoredDocs writes out a segment's stored doc info, optimized by +// using a single Write() call for the entire set of bytes. The +// newDocNumOffsets is filled with the new offsets for each doc. 
+func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, + w *CountHashWriter) error { + if s.numDocs <= 0 { + return nil + } + + indexOffset0, storedOffset0, _, _, _ := + s.getDocStoredOffsets(0) // the segment's first doc + + indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := + s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc + + storedOffset0New := uint64(w.Count()) + + storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] + _, err := w.Write(storedBytes) + if err != nil { + return err + } + + // remap the storedOffset's for the docs into new offsets relative + // to storedOffset0New, filling the given docNumOffsetsOut array + for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New + newDocNumOffsets[newDocNum] = storedOffsetNew + newDocNum += 1 + } + + return nil +} + // mergeFields builds a unified list of fields used across all the // input segments, and computes whether the fields are the same across // segments (which depends on fields to be sorted in the same way diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 323fffed4..bb09f8314 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -398,6 +398,40 @@ func compareSegments(a, b *Segment) string { fieldName, next.Term, aloc, bloc)) } } + + if fieldName == "_id" { + docId := next.Term + docNumA := apitrn.Number() + docNumB := bpitrn.Number() + afields := map[string]interface{}{} + err = a.VisitDocument(apitrn.Number(), + func(field string, typ byte, value []byte, pos []uint64) bool { + afields[field+"-typ"] = typ + afields[field+"-value"] = value + afields[field+"-pos"] = pos + return true + }) + if err != nil { + rv = append(rv, fmt.Sprintf("a.VisitDocument err: %v", err)) + } 
+ bfields := map[string]interface{}{} + err = b.VisitDocument(bpitrn.Number(), + func(field string, typ byte, value []byte, pos []uint64) bool { + bfields[field+"-typ"] = typ + bfields[field+"-value"] = value + bfields[field+"-pos"] = pos + return true + }) + if err != nil { + rv = append(rv, fmt.Sprintf("b.VisitDocument err: %v", err)) + } + if !reflect.DeepEqual(afields, bfields) { + rv = append(rv, fmt.Sprintf("afields != bfields,"+ + " id: %s, docNumA: %d, docNumB: %d,"+ + " afields: %#v, bfields: %#v", + docId, docNumA, docNumB, afields, bfields)) + } + } } } } diff --git a/index/scorch/segment/zap/read.go b/index/scorch/segment/zap/read.go index 0c5b9e17f..e47d4c6ab 100644 --- a/index/scorch/segment/zap/read.go +++ b/index/scorch/segment/zap/read.go @@ -17,15 +17,27 @@ package zap import "encoding/binary" func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { - docStoredStartAddr := s.storedIndexOffset + (8 * docNum) - docStoredStart := binary.BigEndian.Uint64(s.mem[docStoredStartAddr : docStoredStartAddr+8]) + _, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum) + + meta := s.mem[storedOffset+n : storedOffset+n+metaLen] + data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen] + + return meta, data +} + +func (s *SegmentBase) getDocStoredOffsets(docNum uint64) ( + uint64, uint64, uint64, uint64, uint64) { + indexOffset := s.storedIndexOffset + (8 * docNum) + + storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) + var n uint64 - metaLen, read := binary.Uvarint(s.mem[docStoredStart : docStoredStart+binary.MaxVarintLen64]) + + metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64]) n += uint64(read) - var dataLen uint64 - dataLen, read = binary.Uvarint(s.mem[docStoredStart+n : docStoredStart+n+binary.MaxVarintLen64]) + + dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64]) n += uint64(read) - meta := 
s.mem[docStoredStart+n : docStoredStart+n+metaLen] - data := s.mem[docStoredStart+n+metaLen : docStoredStart+n+metaLen+dataLen] - return meta, data + + return indexOffset, storedOffset, n, metaLen, dataLen } From 99852accb04dfdb650e5d622416bc451651239a6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 10:12:20 -0800 Subject: [PATCH 193/728] scorch RollbackPoints() no error at start & fix TestIndexRollback When a scorch is just opened and is "empty", RollbackPoints() no longer considers that an error situation. Also, this commit makes the TestIndexRollback unit tests is a bit more forgiving to races, as we were seeing failures sometimes in travis-CI environments (TestIndexRollback was passing fine on my dev macbook). The theory is the double-looping in the persisterLoop would sometimes be racy, leading to 1 or 2 rollback points. --- index/scorch/snapshot_rollback.go | 9 ++++----- index/scorch/snapshot_rollback_test.go | 24 ++++++++++++++++++------ 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index 43c3ba9f1..c265b4c31 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -31,10 +31,9 @@ func (r *RollbackPoint) GetInternal(key []byte) []byte { return r.meta[string(key)] } -// RollbackPoints returns an array of rollback points available -// for the application to make a decision on where to rollback -// to. A nil return value indicates that there are no available -// rollback points. +// RollbackPoints returns an array of rollback points available for +// the application to rollback to, with more recent rollback points +// (higher epochs) coming first. 
func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { if s.rootBolt == nil { return nil, fmt.Errorf("RollbackPoints: root is nil") @@ -54,7 +53,7 @@ func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { - return nil, fmt.Errorf("RollbackPoints: no snapshots available") + return nil, nil } rollbackPoints := []*RollbackPoint{} diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 42d908243..0065a746d 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -62,6 +62,15 @@ func TestIndexRollback(t *testing.T) { go sh.mainLoop() go sh.persisterLoop() + // should have no rollback points initially + rollbackPoints, err := sh.RollbackPoints() + if err != nil { + t.Fatalf("expected no err, got: %v, %d", err, len(rollbackPoints)) + } + if len(rollbackPoints) != 0 { + t.Fatalf("expected no rollbackPoints, got %d", len(rollbackPoints)) + } + // create a batch, insert 2 new documents batch := index.NewBatch() doc := document.NewDocument("1") @@ -84,10 +93,13 @@ func TestIndexRollback(t *testing.T) { _ = readerSlow.Close() }() - // fetch rollback points available as of here - rollbackPoints, err := sh.RollbackPoints() - if err != nil || len(rollbackPoints) != 1 { - t.Fatal(err, len(rollbackPoints)) + // fetch rollback points after first batch + rollbackPoints, err = sh.RollbackPoints() + if err != nil { + t.Fatalf("expected no err, got: %v, %d", err, len(rollbackPoints)) + } + if len(rollbackPoints) == 0 { + t.Fatalf("expected some rollbackPoints, got none") } // set this as a rollback point for the future @@ -109,8 +121,8 @@ func TestIndexRollback(t *testing.T) { } rollbackPointsB, err := sh.RollbackPoints() - if err != nil || len(rollbackPointsB) != 3 { - t.Fatal(err, len(rollbackPointsB)) + if err != nil || len(rollbackPointsB) <= len(rollbackPoints) { + t.Fatalf("expected no err, got: %v, %d", err, 
len(rollbackPointsB)) } found := false From d0644fec129f08c58f6df3a298e0a314f590a6d0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 12:17:19 -0800 Subject: [PATCH 194/728] scorch persistSnapshot comments update See also: https://github.com/blevesearch/bleve/issues/763 --- index/scorch/persister.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 388c14055..7b1cd837e 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -239,8 +239,13 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { } } - // only alter the root if we actually persisted a segment - // (sometimes its just a new snapshot, possibly with new internal values) + // we need to swap in a new root only when we've persisted 1 or + // more segments -- whereby the new root would have 1-for-1 + // replacements of in-memory segments with file-based segments + // + // other cases like updates to internal values only, and/or when + // there are only deletions, are already covered and persisted by + // the newly populated boltdb snapshotBucket above if len(newSegmentPaths) > 0 { // now try to open all the new snapshots newSegments := make(map[uint64]segment.Segment) From 91ac0d011aaa7a7edbcd252c66443e6b1540baa1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 13:25:16 -0800 Subject: [PATCH 195/728] scorch uses segment.id to encode boltdb sub-bucket key fixes #764 --- index/scorch/persister.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 7b1cd837e..61a266adb 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -193,8 +193,8 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { newSegmentPaths := make(map[uint64]string) // first ensure that each segment in this snapshot has been persisted - for i, segmentSnapshot := range snapshot.segment { - 
snapshotSegmentKey := segment.EncodeUvarintAscending(nil, uint64(i)) + for _, segmentSnapshot := range snapshot.segment { + snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id) snapshotSegmentBucket, err2 := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) if err2 != nil { return err2 From dee6a2b1c64802c6a84a830f4a149bf25fbf91c4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 12:33:19 -0800 Subject: [PATCH 196/728] scorch persistSnapshot() consistently uses err to commit vs abort Some codepaths in persistSnapshot() were saving errors into an err2 local variable, which might lead incorrectly to commit during an error situation rather than abort. --- index/scorch/persister.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 61a266adb..07f38b81e 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -196,17 +196,17 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { for _, segmentSnapshot := range snapshot.segment { snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id) snapshotSegmentBucket, err2 := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) - if err2 != nil { - return err2 + if err != nil { + return err } switch seg := segmentSnapshot.segment.(type) { case *zap.SegmentBase: // need to persist this to disk filename := zapFileName(segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename - err2 := zap.PersistSegmentBase(seg, path) - if err2 != nil { - return fmt.Errorf("error persisting segment: %v", err2) + err = zap.PersistSegmentBase(seg, path) + if err != nil { + return fmt.Errorf("error persisting segment: %v", err) } newSegmentPaths[segmentSnapshot.id] = path err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) From 83272a9629509d40e8db4fe382e7b95e1fd3dcff Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 12:47:07 
-0800 Subject: [PATCH 197/728] scorch persistSnapshot() err handling & propagation --- index/scorch/persister.go | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 07f38b81e..dab753d7f 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -145,23 +145,15 @@ OUTER: } } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) (err error) { // start a write transaction tx, err := s.rootBolt.Begin(true) if err != nil { return err } - // defer fsync of the rootbolt + // defer rollback on error defer func() { - if err == nil { - err = s.rootBolt.Sync() - } - }() - // defer commit/rollback transaction - defer func() { - if err == nil { - err = tx.Commit() - } else { + if err != nil { _ = tx.Rollback() } }() @@ -195,7 +187,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { // first ensure that each segment in this snapshot has been persisted for _, segmentSnapshot := range snapshot.segment { snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id) - snapshotSegmentBucket, err2 := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) + snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) if err != nil { return err } @@ -300,7 +292,19 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { _ = rootPrev.DecRef() } } - // allow files to become eligible for removal + + err = tx.Commit() + if err != nil { + return err + } + + err = s.rootBolt.Sync() + if err != nil { + return err + } + + // allow files to become eligible for removal after commit, such + // as file segments from snapshots that came from the merger s.rootLock.Lock() for _, filename := range filenames { delete(s.ineligibleForRemoval, filename) From 6f5f90cd41720665b04e36805605638fef3120f9 Mon Sep 17 00:00:00 2001 From: Steve Yen 
Date: Wed, 7 Feb 2018 16:54:58 -0800 Subject: [PATCH 198/728] scorch zap segment cleanup handling for some edge cases Two cases in this commit... If we're shutting down, the merger might not have handed off its latest merged segment to the introducer yet, so the merger still owns the segment and needs to Close() that segment itself. In persistSnapshot(), there migth be cases where the persister might not be able to swap in its newly persisted segments -- so, the persistSnapshot() needs to Close() those segments itself. --- index/scorch/merge.go | 1 + index/scorch/persister.go | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 41abe0655..fb4e80d20 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -146,6 +146,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { // give it to the introducer select { case <-s.closeCh: + _ = segment.Close() return nil case s.merges <- sm: } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index dab753d7f..83909a880 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -241,14 +241,18 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) (err error) { if len(newSegmentPaths) > 0 { // now try to open all the new snapshots newSegments := make(map[uint64]segment.Segment) + defer func() { + for _, s := range newSegments { + if s != nil { + // cleanup segments that were opened but not + // swapped into the new root + _ = s.Close() + } + } + }() for segmentID, path := range newSegmentPaths { newSegments[segmentID], err = zap.Open(path) if err != nil { - for _, s := range newSegments { - if s != nil { - _ = s.Close() // cleanup segments that were successfully opened - } - } return fmt.Errorf("error opening new segment at %s, %v", path, err) } } @@ -273,6 +277,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) (err error) { cachedDocs: segmentSnapshot.cachedDocs, } 
newIndexSnapshot.segment[i] = newSegmentSnapshot + delete(newSegments, segmentSnapshot.id) // update items persisted incase of a new segment snapshot atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) } else { From f177f07613dcaa6c993c6bcdb4963f6fa58f0a67 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 17:11:35 -0800 Subject: [PATCH 199/728] scorch zap segment merging reuses prealloc'ed PostingsIterator During zap segment merging, a new zap PostingsIterator was allocated for every field X segment X term. This change optimizes by reusing a single PostingsIterator instance per persistMergedRest() invocation. And, also unused fields are removed from the PostingsIterator. --- index/scorch/segment/zap/dict.go | 1 - index/scorch/segment/zap/merge.go | 3 ++- index/scorch/segment/zap/posting.go | 14 ++++++++++---- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 137c35d7a..55796ffa0 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -44,7 +44,6 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti *rv = PostingsList{} // clear the struct } rv.sb = d.sb - rv.term = term rv.except = except if d.fst != nil { diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index b1eed28bc..751fcb582 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -164,6 +164,7 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, var bufLoc []uint64 var postings *PostingsList + var postItr *PostingsIterator rv := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) @@ -247,7 +248,7 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, return nil, 0, err2 } - postItr := postings.Iterator() + postItr = postings.iterator(postItr) next, err2 := postItr.Next() for next != nil && err2 
== nil { hitNewDocNum := newDocNums[dictI][next.Number()] diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 67e08d1ae..d504885d0 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -28,21 +28,27 @@ import ( // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase - term []byte postingsOffset uint64 freqOffset uint64 locOffset uint64 locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap - postingKey []byte } // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { - rv := &PostingsIterator{ - postings: p, + return p.iterator(nil) +} + +func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { + if rv == nil { + rv = &PostingsIterator{} + } else { + *rv = PostingsIterator{} // clear the struct } + rv.postings = p + if p.postings != nil { // prepare the freq chunk details var n uint64 From e37c563c560a7cc35449fb29b7b4d42b6d105b37 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 18:01:23 -0800 Subject: [PATCH 200/728] scorch zap merge move fieldDvLocsOffset var declaration Move the var declaration to nearer where its used. 
--- index/scorch/segment/zap/merge.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 751fcb582..c9e275c58 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -168,7 +168,6 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, rv := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) - fieldDvLocsOffset := uint64(fieldNotUninverted) // docTermMap is keyed by docNum, where the array impl provides // better memory usage behavior than a sparse-friendlier hashmap @@ -401,7 +400,7 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, } } - fieldDvLocsOffset = uint64(w.Count()) + fieldDvLocsOffset := uint64(w.Count()) buf := bufMaxVarintLen64 for _, offset := range fieldDvLocs { From 846235593c08572890a24f312661ca142b5f1f7c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 12 Feb 2018 10:03:43 -0800 Subject: [PATCH 201/728] Update vendor'ed revision for moss to the latest --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 641f276e5..9a86629ed 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -37,7 +37,7 @@ "importpath": "github.com/couchbase/moss", "repository": "https://github.com/couchbase/moss", "vcs": "git", - "revision": "fc637b3f82ec5b8139b0d295f6588c6a2bea5a16", + "revision": "013a19c55df3e689a66b632c7c8074e37162217d", "branch": "master", "notests": true }, From 95a4f37e5c3d66feae18c6f1d3aa200e5764b95e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Feb 2018 14:37:40 -0800 Subject: [PATCH 202/728] scorch zap enumerator impl that joins multiple vellum iterators Unlike vellum's MergeIterator, the enumerator introduced in this commit doesn't merge when there are matching keys across iterators. 
Instead, the enumerator implementation provides a traversal of all the tuples of (key, iteratorIndex, val) from the underlying vellum iterators, ordered by key ASC, iteratorIndex ASC. --- index/scorch/segment/zap/enumerator.go | 124 +++++++++++ index/scorch/segment/zap/enumerator_test.go | 233 ++++++++++++++++++++ 2 files changed, 357 insertions(+) create mode 100644 index/scorch/segment/zap/enumerator.go create mode 100644 index/scorch/segment/zap/enumerator_test.go diff --git a/index/scorch/segment/zap/enumerator.go b/index/scorch/segment/zap/enumerator.go new file mode 100644 index 000000000..3c708dd57 --- /dev/null +++ b/index/scorch/segment/zap/enumerator.go @@ -0,0 +1,124 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package zap + +import ( + "bytes" + + "github.com/couchbase/vellum" +) + +// enumerator provides an ordered traversal of multiple vellum +// iterators. Like JOIN of iterators, the enumerator produces a +// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, +// then iteratorIndex ASC, where the same key might be seen or +// repeated across multiple child iterators. 
+type enumerator struct { + itrs []vellum.Iterator + currKs [][]byte + currVs []uint64 + + lowK []byte + lowIdxs []int + lowCurr int +} + +// newEnumerator returns a new enumerator over the vellum Iterators +func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { + rv := &enumerator{ + itrs: itrs, + currKs: make([][]byte, len(itrs)), + currVs: make([]uint64, len(itrs)), + lowIdxs: make([]int, 0, len(itrs)), + } + for i, itr := range rv.itrs { + rv.currKs[i], rv.currVs[i] = itr.Current() + } + rv.updateMatches() + if rv.lowK == nil { + return rv, vellum.ErrIteratorDone + } + return rv, nil +} + +// updateMatches maintains the low key matches based on the currKs +func (m *enumerator) updateMatches() { + m.lowK = nil + m.lowIdxs = m.lowIdxs[:0] + m.lowCurr = 0 + + for i, key := range m.currKs { + if key == nil { + continue + } + + cmp := bytes.Compare(key, m.lowK) + if cmp < 0 || m.lowK == nil { + // reached a new low + m.lowK = key + m.lowIdxs = m.lowIdxs[:0] + m.lowIdxs = append(m.lowIdxs, i) + } else if cmp == 0 { + m.lowIdxs = append(m.lowIdxs, i) + } + } +} + +// Current returns the enumerator's current key, iterator-index, and +// value. If the enumerator is not pointing at a valid value (because +// Next returned an error previously), Current will return nil,0,0. +func (m *enumerator) Current() ([]byte, int, uint64) { + var i int + var v uint64 + if m.lowCurr < len(m.lowIdxs) { + i = m.lowIdxs[m.lowCurr] + v = m.currVs[i] + } + return m.lowK, i, v +} + +// Next advances the enumerator to the next key/iterator/value result, +// else vellum.ErrIteratorDone is returned. 
+func (m *enumerator) Next() error { + m.lowCurr += 1 + if m.lowCurr >= len(m.lowIdxs) { + // move all the current low iterators forwards + for _, vi := range m.lowIdxs { + err := m.itrs[vi].Next() + if err != nil && err != vellum.ErrIteratorDone { + return err + } + m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() + } + m.updateMatches() + } + if m.lowK == nil { + return vellum.ErrIteratorDone + } + return nil +} + +// Close all the underlying Iterators. The first error, if any, will +// be returned. +func (m *enumerator) Close() error { + var rv error + for _, itr := range m.itrs { + err := itr.Close() + if rv == nil { + rv = err + } + } + return rv +} diff --git a/index/scorch/segment/zap/enumerator_test.go b/index/scorch/segment/zap/enumerator_test.go new file mode 100644 index 000000000..b27788923 --- /dev/null +++ b/index/scorch/segment/zap/enumerator_test.go @@ -0,0 +1,233 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or +// implied. See the License for the specific language governing +// permissions and limitations under the License. 
+ +package zap + +import ( + "fmt" + "testing" + + "github.com/couchbase/vellum" +) + +type enumTestEntry struct { + key string + val uint64 +} + +type enumTestWant struct { + key string + idx int + val uint64 +} + +func TestEnumerator(t *testing.T) { + tests := []struct { + desc string + in [][]enumTestEntry + want []enumTestWant + }{ + { + desc: "two non-empty enumerators with no duplicate keys", + in: [][]enumTestEntry{ + []enumTestEntry{ + {"a", 1}, + {"c", 3}, + {"e", 5}, + }, + []enumTestEntry{ + {"b", 2}, + {"d", 4}, + {"f", 6}, + }, + }, + want: []enumTestWant{ + {"a", 0, 1}, + {"b", 1, 2}, + {"c", 0, 3}, + {"d", 1, 4}, + {"e", 0, 5}, + {"f", 1, 6}, + }, + }, + { + desc: "two non-empty enumerators with duplicate keys", + in: [][]enumTestEntry{ + []enumTestEntry{ + {"a", 1}, + {"c", 3}, + {"e", 5}, + }, + []enumTestEntry{ + {"a", 2}, + {"c", 4}, + {"e", 6}, + }, + }, + want: []enumTestWant{ + {"a", 0, 1}, + {"a", 1, 2}, + {"c", 0, 3}, + {"c", 1, 4}, + {"e", 0, 5}, + {"e", 1, 6}, + }, + }, + { + desc: "first iterator is empty", + in: [][]enumTestEntry{ + []enumTestEntry{}, + []enumTestEntry{ + {"a", 2}, + {"c", 4}, + {"e", 6}, + }, + }, + want: []enumTestWant{ + {"a", 1, 2}, + {"c", 1, 4}, + {"e", 1, 6}, + }, + }, + { + desc: "last iterator is empty", + in: [][]enumTestEntry{ + []enumTestEntry{ + {"a", 1}, + {"c", 3}, + {"e", 5}, + }, + []enumTestEntry{}, + }, + want: []enumTestWant{ + {"a", 0, 1}, + {"c", 0, 3}, + {"e", 0, 5}, + }, + }, + { + desc: "two different length enumerators with duplicate keys", + in: [][]enumTestEntry{ + []enumTestEntry{ + {"a", 1}, + {"c", 3}, + {"e", 5}, + }, + []enumTestEntry{ + {"a", 2}, + {"b", 4}, + {"d", 1000}, + {"e", 6}, + }, + }, + want: []enumTestWant{ + {"a", 0, 1}, + {"a", 1, 2}, + {"b", 1, 4}, + {"c", 0, 3}, + {"d", 1, 1000}, + {"e", 0, 5}, + {"e", 1, 6}, + }, + }, + } + + for _, test := range tests { + var itrs []vellum.Iterator + for _, entries := range test.in { + itrs = append(itrs, &testIterator{entries: entries}) 
+ } + + enumerator, err := newEnumerator(itrs) + if err != nil { + t.Fatalf("%s - expected no err on newNumerator, got: %v", test.desc, err) + } + + wanti := 0 + for wanti < len(test.want) { + if err != nil { + t.Fatalf("%s - wanted no err, got: %v", test.desc, err) + } + + currK, currIdx, currV := enumerator.Current() + + want := test.want[wanti] + if want.key != string(currK) { + t.Fatalf("%s - wrong key, wanted: %#v, got: %q, %d, %d", test.desc, + want, currK, currIdx, currV) + } + if want.idx != currIdx { + t.Fatalf("%s - wrong idx, wanted: %#v, got: %q, %d, %d", test.desc, + want, currK, currIdx, currV) + } + if want.val != currV { + t.Fatalf("%s - wrong val, wanted: %#v, got: %q, %d, %d", test.desc, + want, currK, currIdx, currV) + } + + wanti += 1 + + err = enumerator.Next() + } + + if err != vellum.ErrIteratorDone { + t.Fatalf("%s - expected ErrIteratorDone, got: %v", test.desc, err) + } + + err = enumerator.Close() + if err != nil { + t.Fatalf("%s - expected nil err on close, got: %v", test.desc, err) + } + + for _, itr := range itrs { + if itr.(*testIterator).curr != 654321 { + t.Fatalf("%s - expected child iter to be closed", test.desc) + } + } + } +} + +type testIterator struct { + entries []enumTestEntry + curr int +} + +func (m *testIterator) Current() ([]byte, uint64) { + if m.curr >= len(m.entries) { + return nil, 0 + } + return []byte(m.entries[m.curr].key), m.entries[m.curr].val +} + +func (m *testIterator) Next() error { + m.curr++ + if m.curr >= len(m.entries) { + return vellum.ErrIteratorDone + } + return nil +} + +func (m *testIterator) Seek(key []byte) error { + return fmt.Errorf("not implemented for enumerator unit tests") +} + +func (m *testIterator) Reset(f *vellum.FST, + startKeyInclusive, endKeyExclusive []byte, aut vellum.Automaton) error { + return fmt.Errorf("not implemented for enumerator unit tests") +} + +func (m *testIterator) Close() error { + m.curr = 654321 + return nil +} From 2158e06c40dd40f14685c066531ec91352a57686 Mon Sep 
17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Feb 2018 17:29:50 -0800 Subject: [PATCH 203/728] scorch zap merge collects dicts & itrs in lock-step The theory with this change is that the dicts and itrs should be positionally in "lock-step" with paired entries. And, since later code also uses the same array indexing to access the drops and newDocNums, those also need to be positionally in pair-wise lock-step, too. --- index/scorch/segment/zap/merge.go | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index c9e275c58..0457fc82a 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -154,8 +154,8 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 return newDocCount } -func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, newDocNums [][]uint64, +func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, + fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, uint64, error) { @@ -187,15 +187,17 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, return nil, 0, err } - // collect FST iterators from all segments for this field + // collect FST iterators from all active segments for this field + var newDocNums [][]uint64 + var drops []*roaring.Bitmap var dicts []*Dictionary var itrs []vellum.Iterator - for _, segment := range segments { + + for segmentI, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { return nil, 0, err2 } - dicts = append(dicts, dict) if dict != nil && dict.fst != nil { itr, err2 := dict.fst.Iterator(nil, nil) @@ -203,6 +205,9 @@ func persistMergedRest(segments []*SegmentBase, drops []*roaring.Bitmap, return nil, 0, err2 } if itr != nil { + 
newDocNums = append(newDocNums, newDocNumsIn[segmentI]) + drops = append(drops, dropsIn[segmentI]) + dicts = append(dicts, dict) itrs = append(itrs, itr) } } From a073424e5ac51ee96053f09ccabac32521625aab Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Feb 2018 17:47:28 -0800 Subject: [PATCH 204/728] scorch zap dict.postingsListFromOffset() method A helper method that can create a PostingsList if the caller already knows the postingsOffset. --- index/scorch/segment/zap/dict.go | 43 +++++++++++++++++++++----------- 1 file changed, 28 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 55796ffa0..e5d712686 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -38,6 +38,33 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment. } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + if d.fst == nil { + return d.postingsListInit(rv, except), nil + } + + postingsOffset, exists, err := d.fst.Get(term) + if err != nil { + return nil, fmt.Errorf("vellum err: %v", err) + } + if !exists { + return d.postingsListInit(rv, except), nil + } + + return d.postingsListFromOffset(postingsOffset, except, rv) +} + +func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { + rv = d.postingsListInit(rv, except) + + err := rv.read(postingsOffset, d) + if err != nil { + return nil, err + } + + return rv, nil +} + +func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { if rv == nil { rv = &PostingsList{} } else { @@ -45,21 +72,7 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti } rv.sb = d.sb rv.except = except - - if d.fst != nil { - postingsOffset, exists, err := d.fst.Get(term) - if err != nil { - return nil, fmt.Errorf("vellum err: %v", err) - } 
- if exists { - err = rv.read(postingsOffset, d) - if err != nil { - return nil, err - } - } - } - - return rv, nil + return rv } // Iterator returns an iterator for this dictionary From fe544f33522ea1f89df03f4722da37d4e4ad9a40 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Feb 2018 17:48:49 -0800 Subject: [PATCH 205/728] scorch zap merge uses enumerator for vellum.Iterator's --- index/scorch/segment/zap/merge.go | 199 ++++++++++++++++-------------- 1 file changed, 108 insertions(+), 91 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 0457fc82a..525b7f93d 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -198,7 +198,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if err2 != nil { return nil, 0, err2 } - if dict != nil && dict.fst != nil { itr, err2 := dict.fst.Iterator(nil, nil) if err2 != nil && err2 != vellum.ErrIteratorDone { @@ -213,15 +212,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } } - // create merging iterator - mergeItr, err := vellum.NewMergeIterator(itrs, func(postingOffsets []uint64) uint64 { - // we don't actually use the merged value - return 0 - }) - - tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - if uint64(cap(docTermMap)) < newSegDocCount { docTermMap = make([][]byte, newSegDocCount) } else { @@ -231,71 +221,17 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } } - for err == nil { - term, _ := mergeItr.Current() + var prevTerm []byte - newRoaring := roaring.NewBitmap() - newRoaringLocs := roaring.NewBitmap() + newRoaring := roaring.NewBitmap() + newRoaringLocs := roaring.NewBitmap() - tfEncoder.Reset() - locEncoder.Reset() - - // now go back and get posting list for this term - // but pass in the deleted docs for that segment - for dictI, dict := range 
dicts { - if dict == nil { - continue - } - var err2 error - postings, err2 = dict.postingsList(term, drops[dictI], postings) - if err2 != nil { - return nil, 0, err2 - } - - postItr = postings.iterator(postItr) - next, err2 := postItr.Next() - for next != nil && err2 == nil { - hitNewDocNum := newDocNums[dictI][next.Number()] - if hitNewDocNum == docDropped { - return nil, 0, fmt.Errorf("see hit with dropped doc num") - } - newRoaring.Add(uint32(hitNewDocNum)) - // encode norm bits - norm := next.Norm() - normBits := math.Float32bits(float32(norm)) - err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) - if err != nil { - return nil, 0, err - } - locs := next.Locations() - if len(locs) > 0 { - newRoaringLocs.Add(uint32(hitNewDocNum)) - for _, loc := range locs { - if cap(bufLoc) < 5+len(loc.ArrayPositions()) { - bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) - } - args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()]) - args[1] = loc.Pos() - args[2] = loc.Start() - args[3] = loc.End() - args[4] = uint64(len(loc.ArrayPositions())) - args = append(args, loc.ArrayPositions()...) - err = locEncoder.Add(hitNewDocNum, args...) 
- if err != nil { - return nil, 0, err - } - } - } - - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) + tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - next, err2 = postItr.Next() - } - if err2 != nil { - return nil, 0, err2 - } + finishTerm := func(term []byte) error { + if term == nil { + return nil } tfEncoder.Close() @@ -304,59 +240,140 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if newRoaring.GetCardinality() > 0 { // this field/term actually has hits in the new segment, lets write it down freqOffset := uint64(w.Count()) - _, err = tfEncoder.Write(w) + _, err := tfEncoder.Write(w) if err != nil { - return nil, 0, err + return err } locOffset := uint64(w.Count()) _, err = locEncoder.Write(w) if err != nil { - return nil, 0, err + return err } postingLocOffset := uint64(w.Count()) _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) if err != nil { - return nil, 0, err + return err } postingOffset := uint64(w.Count()) + // write out the start of the term info - buf := bufMaxVarintLen64 - n := binary.PutUvarint(buf, freqOffset) - _, err = w.Write(buf[:n]) + n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) if err != nil { - return nil, 0, err + return err } - // write out the start of the loc info - n = binary.PutUvarint(buf, locOffset) - _, err = w.Write(buf[:n]) + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) if err != nil { - return nil, 0, err + return err } - - // write out the start of the loc posting list - n = binary.PutUvarint(buf, postingLocOffset) - _, err = w.Write(buf[:n]) + // write out the start of the posting locs + n = binary.PutUvarint(bufMaxVarintLen64, postingLocOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) if err != nil { - return nil, 0, err + 
return err } _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) if err != nil { - return nil, 0, err + return err } err = newVellum.Insert(term, postingOffset) + if err != nil { + return err + } + } + + newRoaring = roaring.NewBitmap() + newRoaringLocs = roaring.NewBitmap() + + tfEncoder.Reset() + locEncoder.Reset() + + return nil + } + + enumerator, err := newEnumerator(itrs) + + for err == nil { + term, itrI, postingsOffset := enumerator.Current() + + if !bytes.Equal(prevTerm, term) { + // if the term changed, write out the info collected + // for the previous term + err2 := finishTerm(prevTerm) + if err2 != nil { + return nil, 0, err2 + } + } + + var err2 error + postings, err2 = dicts[itrI].postingsListFromOffset( + postingsOffset, drops[itrI], postings) + if err2 != nil { + return nil, 0, err2 + } + + postItr = postings.iterator(postItr) + next, err2 := postItr.Next() + for next != nil && err2 == nil { + hitNewDocNum := newDocNums[itrI][next.Number()] + if hitNewDocNum == docDropped { + return nil, 0, fmt.Errorf("see hit with dropped doc num") + } + newRoaring.Add(uint32(hitNewDocNum)) + // encode norm bits + norm := next.Norm() + normBits := math.Float32bits(float32(norm)) + err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) if err != nil { return nil, 0, err } + locs := next.Locations() + if len(locs) > 0 { + newRoaringLocs.Add(uint32(hitNewDocNum)) + for _, loc := range locs { + if cap(bufLoc) < 5+len(loc.ArrayPositions()) { + bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()]) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(loc.ArrayPositions())) + args = append(args, loc.ArrayPositions()...) + err = locEncoder.Add(hitNewDocNum, args...) 
+ if err != nil { + return nil, 0, err + } + } + } + + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + + next, err2 = postItr.Next() } + if err2 != nil { + return nil, 0, err2 + } + + prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem + prevTerm = append(prevTerm, term...) - err = mergeItr.Next() + err = enumerator.Next() } if err != nil && err != vellum.ErrIteratorDone { return nil, 0, err } + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err + } + dictOffset := uint64(w.Count()) err = newVellum.Close() From 57fc03258e9ab6507ed01525b061080ff759cdd9 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Feb 2018 14:32:55 -0800 Subject: [PATCH 206/728] scorch rollback ignores unsafeBatch flag See also: https://github.com/blevesearch/bleve/issues/760 --- index/scorch/snapshot_rollback.go | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index c265b4c31..247003311 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -149,10 +149,7 @@ func (s *Scorch) Rollback(to *RollbackPoint) error { revert.snapshot = indexSnapshot revert.applied = make(chan error) - - if !s.unsafeBatch { - revert.persisted = make(chan error) - } + revert.persisted = make(chan error) return nil }) @@ -172,9 +169,5 @@ func (s *Scorch) Rollback(to *RollbackPoint) error { return fmt.Errorf("Rollback: failed with err: %v", err) } - if revert.persisted != nil { - err = <-revert.persisted - } - - return err + return <-revert.persisted } From 6f2797bec3720b32cdbd2df0b2d81f8cf69a783d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 14 Feb 2018 16:39:26 +0530 Subject: [PATCH 207/728] Adding a pause to persister until the merger catches up --- index/scorch/persister.go | 84 ++++++++++++++++++++++++++++++--------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git 
a/index/scorch/persister.go b/index/scorch/persister.go index 61a266adb..b19c1205d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -34,36 +34,40 @@ import ( var DefaultChunkFactor uint32 = 1024 +// Arbitrary number, need to make it configurable. +// Lower values like 10/making persister really slow +// doesn't work well as it is creating more files to +// persist for in next persist iteration and spikes the # FDs. +// Ideal value should let persister also proceed at +// an optimum pace so that the merger can skip +// many intermediate snapshots. +// This needs to be based on empirical data. +// With high segment count with snapshots, +// doubtful on the effectiveness of this approach. +var epochDistance = uint64(100) + type notificationChan chan struct{} func (s *Scorch) persisterLoop() { defer s.asyncTasks.Done() var persistWatchers []*epochWatcher - var lastPersistedEpoch uint64 - - notifyWatchers := func() { - var watchersNext []*epochWatcher - for _, w := range persistWatchers { - if w.epoch < lastPersistedEpoch { - close(w.notifyCh) - } else { - watchersNext = append(watchersNext, w) - } - } - persistWatchers = watchersNext - } - + var lastPersistedEpoch, lastMergedEpoch uint64 + var ew *epochWatcher OUTER: for { select { case <-s.closeCh: break OUTER - case ew := <-s.persisterNotifier: + case ew = <-s.persisterNotifier: persistWatchers = append(persistWatchers, ew) - notifyWatchers() default: } + if ew != nil { + lastMergedEpoch = ew.epoch + persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, + &lastMergedEpoch, persistWatchers) + } var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -136,16 +140,58 @@ OUTER: case <-w.notifyCh: // woken up, next loop should pick up work continue OUTER - case ew := <-s.persisterNotifier: + case ew = <-s.persisterNotifier: // if the watchers are already caught up then let them wait, // else let them continue to do the catch up persistWatchers = append(persistWatchers, ew) 
- notifyWatchers() } } } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { +func notifyMergeWatchers(lastPersistedEpoch uint64, + persistWatchers []*epochWatcher) []*epochWatcher { + var watchersNext []*epochWatcher + for _, w := range persistWatchers { + if w.epoch < lastPersistedEpoch { + close(w.notifyCh) + } else { + watchersNext = append(watchersNext, w) + } + } + return watchersNext +} + +func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, + persistWatchers []*epochWatcher) []*epochWatcher { + +OUTER: + for { + + // first, let the watchers proceed if they lag behind + persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + + // check for slow merger and pause persister until merger catch up + if lastPersistedEpoch > *lastMergedEpoch && + lastPersistedEpoch-*lastMergedEpoch > epochDistance { + + select { + case <-s.closeCh: + break OUTER + case ew := <-s.persisterNotifier: + persistWatchers = append(persistWatchers, ew) + *lastMergedEpoch = ew.epoch + log.Printf("persister waiting as lastPersistedEpoch->%d merger epoch->%d", lastPersistedEpoch, *lastMergedEpoch) + continue OUTER + } + } else { + break OUTER + } + } + + return persistWatchers +} + +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) (err error) { // start a write transaction tx, err := s.rootBolt.Begin(true) if err != nil { From 606a270669be7ca3064110020f0155845a070b96 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 12 Feb 2018 21:54:33 +0530 Subject: [PATCH 208/728] Fix for empty segment merge handling Avoid creating new files with emtpy segments tasks during the merge operation, skips the incorrect appending of a newer segment during merge. 
--- index/scorch/introducer.go | 26 +++++++++++++------- index/scorch/merge.go | 50 ++++++++++++++++++++++++-------------- 2 files changed, 49 insertions(+), 27 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 4499fa41b..f75801910 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -193,6 +193,12 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // prepare new index snapshot currSize := len(s.root.segment) newSize := currSize + 1 - len(nextMerge.old) + + // empty segments deletion + if nextMerge.new == nil { + newSize-- + } + newSnapshot := &IndexSnapshot{ parent: s, segment: make([]*SegmentSnapshot, 0, newSize), @@ -210,7 +216,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { segmentID := s.root.segment[i].id if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { // this segment is going away, see if anything else was deleted since we started the merge - if s.root.segment[i].deleted != nil { + if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { // assume all these deletes are new deletedSince := s.root.segment[i].deleted // if we already knew about some of them, remove @@ -238,14 +244,16 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } - // put new segment at end - newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: nextMerge.id, - segment: nextMerge.new, // take ownership for nextMerge.new's ref-count - deleted: newSegmentDeleted, - cachedDocs: &cachedDocs{cache: nil}, - }) - newSnapshot.offsets = append(newSnapshot.offsets, running) + if nextMerge.new != nil { + // put new segment at end + newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ + id: nextMerge.id, + segment: nextMerge.new, // take ownership for nextMerge.new's ref-count + deleted: newSegmentDeleted, + cachedDocs: &cachedDocs{cache: nil}, + }) + newSnapshot.offsets = append(newSnapshot.offsets, running) + } // swap in new segment rootPrev := 
s.root diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 5ded29b5a..540976f41 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -124,6 +124,10 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { // process tasks in serial for now var notifications []notificationChan for _, task := range resultMergePlan.Tasks { + if len(task.Segments) == 0 { + continue + } + oldMap := make(map[uint64]*SegmentSnapshot) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) @@ -132,36 +136,46 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { - segmentsToMerge = append(segmentsToMerge, zapSeg) - docsToDrop = append(docsToDrop, segSnapshot.deleted) + if segSnapshot.LiveSize() == 0 { + oldMap[segSnapshot.id] = nil + } else { + segmentsToMerge = append(segmentsToMerge, zapSeg) + docsToDrop = append(docsToDrop, segSnapshot.deleted) + } } } } - filename := zapFileName(newSegmentID) - s.markIneligibleForRemoval(filename) - path := s.path + string(os.PathSeparator) + filename - newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor) - if err != nil { - s.unmarkIneligibleForRemoval(filename) - return fmt.Errorf("merging failed: %v", err) - } - segment, err := zap.Open(path) - if err != nil { - s.unmarkIneligibleForRemoval(filename) - return err + var oldNewDocNums map[uint64][]uint64 + var segment segment.Segment + if len(segmentsToMerge) > 0 { + filename := zapFileName(newSegmentID) + s.markIneligibleForRemoval(filename) + path := s.path + string(os.PathSeparator) + filename + newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + if err != nil { + s.unmarkIneligibleForRemoval(filename) + return fmt.Errorf("merging failed: %v", err) + } + segment, err = 
zap.Open(path) + if err != nil { + s.unmarkIneligibleForRemoval(filename) + return err + } + oldNewDocNums = make(map[uint64][]uint64) + for i, segNewDocNums := range newDocNums { + oldNewDocNums[task.Segments[i].Id()] = segNewDocNums + } } + sm := &segmentMerge{ id: newSegmentID, old: oldMap, - oldNewDocNums: make(map[uint64][]uint64), + oldNewDocNums: oldNewDocNums, new: segment, notify: make(notificationChan), } notifications = append(notifications, sm.notify) - for i, segNewDocNums := range newDocNums { - sm.oldNewDocNums[task.Segments[i].Id()] = segNewDocNums - } // give it to the introducer select { From 720010783ec5b3a3a2ceffbf256d863196aad6a3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 14 Feb 2018 14:50:30 -0800 Subject: [PATCH 209/728] scorch zap InitSegmentBase() helper func Refactored out a zap.InitSegmentBase() func so that non-zap packages can create SegmentBase instances. --- index/scorch/segment/zap/build.go | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index b3bbbab52..72357ae7d 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -633,12 +633,21 @@ func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, return nil, err } + return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, + memSegment.FieldsMap, memSegment.FieldsInv, numDocs, + storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) +} + +func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, + fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, + storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, + dictLocs []uint64) (*SegmentBase, error) { sb := &SegmentBase{ - mem: br.Bytes(), - memCRC: cr.Sum32(), + mem: mem, + memCRC: memCRC, chunkFactor: chunkFactor, - fieldsMap: memSegment.FieldsMap, - fieldsInv: memSegment.FieldsInv, + fieldsMap: fieldsMap, + 
fieldsInv: fieldsInv, numDocs: numDocs, storedIndexOffset: storedIndexOffset, fieldsIndexOffset: fieldsIndexOffset, @@ -647,7 +656,7 @@ func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, fieldDvIterMap: make(map[uint16]*docValueIterator), } - err = sb.loadDvIterators() + err := sb.loadDvIterators() if err != nil { return nil, err } From a0b7508da7e3effe66e7f4ac325446d57f643122 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 14 Feb 2018 14:53:28 -0800 Subject: [PATCH 210/728] scorch zap mergeSegmentBases() func As part of this, zap.MergeToWriter() now returns more information -- enough so that callers can now create their own SegmentBase instances. Also, the fieldsMap maintained and returned by zap.MergeToWriter() is now a mapping from fieldName ==> fieldID+1 (instead of the previous mapping from fieldName ==> fieldID). This makes it similar to how fieldsMap are handled in other parts of zap to avoid "zero value" issues. --- index/scorch/merge.go | 55 +++++++++++++++++++++++++++++++ index/scorch/segment/zap/merge.go | 35 ++++++++++---------- 2 files changed, 73 insertions(+), 17 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index fb4e80d20..7c7eedac5 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -15,6 +15,7 @@ package scorch import ( + "bytes" "fmt" "os" "sync/atomic" @@ -168,3 +169,57 @@ type segmentMerge struct { new segment.Segment notify notificationChan } + +// perform in-memory merging of the given SegmentBase instances, and +// synchronously introduce the merged segment into the root +func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, + sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, + chunkFactor uint32) (uint64, error) { + var br bytes.Buffer + + cr := zap.NewCountHashWriter(&br) + + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, + docValueOffset, dictLocs, fieldsInv, fieldsMap, err := + zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) 
+ if err != nil { + return 0, nil + } + + segment, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, + fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, + docValueOffset, dictLocs) + if err != nil { + return 0, nil + } + + newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) + + sm := &segmentMerge{ + id: newSegmentID, + old: make(map[uint64]*SegmentSnapshot), + oldNewDocNums: make(map[uint64][]uint64), + new: segment, + notify: make(notificationChan), + } + + for i, idx := range sbsIndexes { + ss := snapshot.segment[idx] + sm.old[ss.id] = ss + sm.oldNewDocNums[ss.id] = newDocNums[i] + } + + select { // send to introducer + case <-s.closeCh: + return 0, nil // TODO: instead return some ErrInterruptedClosed? + case s.merges <- sm: + } + + select { // wait for introduction to complete + case <-s.closeCh: + return 0, nil // TODO: instead return some ErrInterruptedClosed? + case <-sm.notify: + } + + return numDocs, nil +} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 525b7f93d..808b16b75 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -60,7 +60,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, // wrap it for counting (tracking offsets) cr := NewCountHashWriter(br) - newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, err := + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := MergeToWriter(segmentBases, drops, chunkFactor, cr) if err != nil { cleanup() @@ -99,26 +99,26 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, chunkFactor uint32, cr *CountHashWriter) ( newDocNums [][]uint64, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, + dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, err error) { docValueOffset = uint64(fieldNotUninverted) - var dictLocs []uint64 - - fieldsSame, fieldsInv := 
mergeFields(segments) - fieldsMap := mapFields(fieldsInv) + var fieldsSame bool + fieldsSame, fieldsInv = mergeFields(segments) + fieldsMap = mapFields(fieldsInv) numDocs = computeNewDocCount(segments, drops) if numDocs > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, fieldsMap, fieldsInv, fieldsSame, numDocs, cr) if err != nil { - return nil, 0, 0, 0, 0, err + return nil, 0, 0, 0, 0, nil, nil, nil, err } dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, newDocNums, numDocs, chunkFactor, cr) if err != nil { - return nil, 0, 0, 0, 0, err + return nil, 0, 0, 0, 0, nil, nil, nil, err } } else { dictLocs = make([]uint64, len(fieldsInv)) @@ -126,17 +126,18 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) if err != nil { - return nil, 0, 0, 0, 0, err + return nil, 0, 0, 0, 0, nil, nil, nil, err } - return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, nil + return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil } -// mapFields takes the fieldsInv list and builds the map +// mapFields takes the fieldsInv list and returns a map of fieldName +// to fieldID+1 func mapFields(fields []string) map[string]uint16 { rv := make(map[string]uint16, len(fields)) for i, fieldName := range fields { - rv[fieldName] = uint16(i) + rv[fieldName] = uint16(i) + 1 } return rv } @@ -338,7 +339,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) } args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()]) + args[0] = uint64(fieldsMap[loc.Field()] - 1) args[1] = loc.Pos() args[2] = loc.Start() args[3] = loc.End() @@ -499,7 +500,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, poss[i] = poss[i][:0] } err := segment.VisitDocument(docNum, 
func(field string, typ byte, value []byte, pos []uint64) bool { - fieldID := int(fieldsMap[field]) + fieldID := int(fieldsMap[field]) - 1 vals[fieldID] = append(vals[fieldID], value) typs[fieldID] = append(typs[fieldID], typ) poss[fieldID] = append(poss[fieldID], pos) @@ -615,21 +616,21 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { segment0Fields = segments[0].Fields() } - fieldsMap := map[string]struct{}{} + fieldsExist := map[string]struct{}{} for _, segment := range segments { fields := segment.Fields() for fieldi, field := range fields { - fieldsMap[field] = struct{}{} + fieldsExist[field] = struct{}{} if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { fieldsSame = false } } } - rv := make([]string, 0, len(fieldsMap)) + rv := make([]string, 0, len(fieldsExist)) // ensure _id stays first rv = append(rv, "_id") - for k := range fieldsMap { + for k := range fieldsExist { if k != "_id" { rv = append(rv, k) } From a8ebf2a553c21012733a24bc950d88fb2830f955 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 21 Feb 2018 17:25:14 +0530 Subject: [PATCH 211/728] lowering epochDistance to 5, fixing the lastMergedEpoch value updates --- index/scorch/persister.go | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 4ba0ca8e8..24988c9c6 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -44,7 +44,7 @@ var DefaultChunkFactor uint32 = 1024 // This needs to be based on empirical data. // With high segment count with snapshots, // doubtful on the effectiveness of this approach. 
-var epochDistance = uint64(100) +var epochDistance = uint64(5) type notificationChan chan struct{} @@ -63,11 +63,12 @@ OUTER: persistWatchers = append(persistWatchers, ew) default: } - if ew != nil { + if ew != nil && ew.epoch > lastMergedEpoch { lastMergedEpoch = ew.epoch - persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - &lastMergedEpoch, persistWatchers) } + persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, + &lastMergedEpoch, persistWatchers) + var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -171,8 +172,7 @@ OUTER: persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) // check for slow merger and pause persister until merger catch up - if lastPersistedEpoch > *lastMergedEpoch && - lastPersistedEpoch-*lastMergedEpoch > epochDistance { + if lastPersistedEpoch > *lastMergedEpoch+epochDistance { select { case <-s.closeCh: @@ -180,7 +180,6 @@ OUTER: case ew := <-s.persisterNotifier: persistWatchers = append(persistWatchers, ew) *lastMergedEpoch = ew.epoch - log.Printf("persister waiting as lastPersistedEpoch->%d merger epoch->%d", lastPersistedEpoch, *lastMergedEpoch) continue OUTER } } else { From a1db057656c6ab92fafbc3da14a218ea686b3928 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 23 Feb 2018 15:35:58 +0530 Subject: [PATCH 212/728] configurable mergePlanner options mergePlanner options are parsed from the scorch configs parameters --- index/scorch/merge.go | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index fb4e80d20..77368b369 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -28,6 +28,7 @@ import ( func (s *Scorch) mergerLoop() { var lastEpochMergePlanned uint64 + mergePlannerOptions := s.parseMergePlannerOptions() OUTER: for { select { @@ -45,7 +46,7 @@ OUTER: startTime := time.Now() // lets get started - err := 
s.planMergeAtSnapshot(ourSnapshot) + err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) if err != nil { s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() @@ -82,7 +83,36 @@ OUTER: s.asyncTasks.Done() } -func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { +func (s *Scorch) parseMergePlannerOptions() *mergeplan.MergePlanOptions { + mergePlannerOptions := &mergeplan.DefaultMergePlanOptions + scorchOptions := map[string]interface{}{} + if v, ok := s.config["scorchOptions"]; ok { + if scorchOptions, ok = v.(map[string]interface{}); ok { + if v, ok := scorchOptions["maxSegmentsPerTier"].(float64); ok { + mergePlannerOptions.MaxSegmentsPerTier = int(v) + } + if v, ok := scorchOptions["maxSegmentSize"].(float64); ok { + mergePlannerOptions.MaxSegmentSize = int64(v) + } + if v, ok := scorchOptions["tierGrowth"].(float64); ok { + mergePlannerOptions.TierGrowth = v + } + if v, ok := scorchOptions["segmentsPerMergeTask"].(float64); ok { + mergePlannerOptions.SegmentsPerMergeTask = int(v) + } + if v, ok := scorchOptions["floorSegmentSize"].(float64); ok { + mergePlannerOptions.FloorSegmentSize = int64(v) + } + if v, ok := scorchOptions["reclaimDeletesWeight"].(float64); ok { + mergePlannerOptions.ReclaimDeletesWeight = v + } + } + } + return mergePlannerOptions +} + +func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, + options *mergeplan.MergePlanOptions) error { // build list of zap segments in this snapshot var onlyZapSnapshots []mergeplan.Segment for _, segmentSnapshot := range ourSnapshot.segment { @@ -92,7 +122,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { } // give this list to the planner - resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, nil) + resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) if err != nil { return fmt.Errorf("merge planning err: %v", err) } From c50d9b4023943f89514f2de842dfd77b1c93f585 Mon Sep 17 00:00:00 2001 From: Steve 
Yen Date: Wed, 14 Feb 2018 14:57:46 -0800 Subject: [PATCH 213/728] scorch conditional merging during persistSnapshot() As part of this change, there are nw helper methods -- persistSnapshotMaybeMerge() and persistSnapshotDirect(). --- index/scorch/introducer.go | 5 +- index/scorch/merge.go | 48 ++++++++++++------- index/scorch/persister.go | 95 +++++++++++++++++++++++++++++++++++++- 3 files changed, 130 insertions(+), 18 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 4499fa41b..e0cdf44a0 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -247,6 +247,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { }) newSnapshot.offsets = append(newSnapshot.offsets, running) + newSnapshot.AddRef() // 1 ref for the nextMerge.notify response + // swap in new segment rootPrev := s.root s.root = newSnapshot @@ -257,7 +259,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { _ = rootPrev.DecRef() } - // notify merger we incorporated this + // notify requester that we incorporated this + nextMerge.notify <- newSnapshot close(nextMerge.notify) } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 7c7eedac5..434841757 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -103,7 +103,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { } // process tasks in serial for now - var notifications []notificationChan + var notifications []chan *IndexSnapshot for _, task := range resultMergePlan.Tasks { oldMap := make(map[uint64]*SegmentSnapshot) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) @@ -137,7 +137,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { old: oldMap, oldNewDocNums: make(map[uint64][]uint64), new: segment, - notify: make(notificationChan), + notify: make(chan *IndexSnapshot, 1), } notifications = append(notifications, sm.notify) for i, segNewDocNums := range newDocNums { @@ -156,7 +156,10 @@ func (s 
*Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot) error { select { case <-s.closeCh: return nil - case <-notification: + case newSnapshot := <-notification: + if newSnapshot != nil { + _ = newSnapshot.DecRef() + } } } return nil @@ -167,14 +170,15 @@ type segmentMerge struct { old map[uint64]*SegmentSnapshot oldNewDocNums map[uint64][]uint64 new segment.Segment - notify notificationChan + notify chan *IndexSnapshot } -// perform in-memory merging of the given SegmentBase instances, and -// synchronously introduce the merged segment into the root +// perform a merging of the given SegmentBase instances into a new, +// persisted segment, and synchronously introduce that new segment +// into the root func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, - chunkFactor uint32) (uint64, error) { + chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { var br bytes.Buffer cr := zap.NewCountHashWriter(&br) @@ -183,24 +187,36 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, docValueOffset, dictLocs, fieldsInv, fieldsMap, err := zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) if err != nil { - return 0, nil + return 0, nil, 0, err } - segment, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, + sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) if err != nil { - return 0, nil + return 0, nil, 0, err } newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) + filename := zapFileName(newSegmentID) + path := s.path + string(os.PathSeparator) + filename + err = zap.PersistSegmentBase(sb, path) + if err != nil { + return 0, nil, 0, err + } + + segment, err := zap.Open(path) + if err != nil { + return 0, nil, 0, err + } + sm := &segmentMerge{ id: newSegmentID, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), new: segment, 
- notify: make(notificationChan), + notify: make(chan *IndexSnapshot, 1), } for i, idx := range sbsIndexes { @@ -211,15 +227,15 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, select { // send to introducer case <-s.closeCh: - return 0, nil // TODO: instead return some ErrInterruptedClosed? + _ = segment.DecRef() + return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? case s.merges <- sm: } select { // wait for introduction to complete case <-s.closeCh: - return 0, nil // TODO: instead return some ErrInterruptedClosed? - case <-sm.notify: + return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? + case newSnapshot := <-sm.notify: + return numDocs, newSnapshot, newSegmentID, nil } - - return numDocs, nil } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 83909a880..1b7c1b5eb 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -145,7 +145,100 @@ OUTER: } } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) (err error) { +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { + persisted, err := s.persistSnapshotMaybeMerge(snapshot) + if err != nil { + return err + } + if persisted { + return nil + } + + return s.persistSnapshotDirect(snapshot) +} + +// DefaultMinSegmentsForInMemoryMerge represents the default number of +// in-memory zap segments that persistSnapshotMaybeMerge() needs to +// see in an IndexSnapshot before it decides to merge and persist +// those segments +var DefaultMinSegmentsForInMemoryMerge = 2 + +// persistSnapshotMaybeMerge examines the snapshot and might merge and +// persist the in-memory zap segments if there are enough of them +func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( + bool, error) { + // collect the in-memory zap segments (SegmentBase instances) + var sbs []*zap.SegmentBase + var sbsDrops []*roaring.Bitmap + var sbsIndexes []int + + for i, segmentSnapshot := range snapshot.segment { + if sb, ok := 
segmentSnapshot.segment.(*zap.SegmentBase); ok { + sbs = append(sbs, sb) + sbsDrops = append(sbsDrops, segmentSnapshot.deleted) + sbsIndexes = append(sbsIndexes, i) + } + } + + if len(sbs) < DefaultMinSegmentsForInMemoryMerge { + return false, nil + } + + _, newSnapshot, newSegmentID, err := s.mergeSegmentBases( + snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) + if err != nil { + return false, err + } + if newSnapshot == nil { + return false, nil + } + + defer func() { + _ = newSnapshot.DecRef() + }() + + mergedSegmentIDs := map[uint64]struct{}{} + for _, idx := range sbsIndexes { + mergedSegmentIDs[snapshot.segment[idx].id] = struct{}{} + } + + // construct a snapshot that's logically equivalent to the input + // snapshot, but with merged segments replaced by the new segment + equiv := &IndexSnapshot{ + parent: snapshot.parent, + segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), + internal: snapshot.internal, + epoch: snapshot.epoch, + } + + // copy to the equiv the segments that weren't replaced + for _, segment := range snapshot.segment { + if _, wasMerged := mergedSegmentIDs[segment.id]; !wasMerged { + equiv.segment = append(equiv.segment, segment) + } + } + + // append to the equiv the new segment + for _, segment := range newSnapshot.segment { + if segment.id == newSegmentID { + equiv.segment = append(equiv.segment, &SegmentSnapshot{ + id: newSegmentID, + segment: segment.segment, + deleted: nil, // nil since merging handled deletions + }) + break + } + } + + err = s.persistSnapshotDirect(equiv) + if err != nil { + return false, err + } + + return true, nil +} + +func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { // start a write transaction tx, err := s.rootBolt.Begin(true) if err != nil { From da70758635de8f574c3dc2082f162dac8aaa4e9a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 23 Feb 2018 14:43:59 -0800 Subject: [PATCH 214/728] Handle case where store snapshot isn't closed in upsidedown's Batch() 
API --- index/upsidedown/upsidedown.go | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 1243375b7..70e6e457f 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -837,6 +837,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { docBackIndexRowErr = err return } + defer func() { + if cerr := kvreader.Close(); err == nil && cerr != nil { + docBackIndexRowErr = cerr + } + }() for docID, doc := range batch.IndexOps { backIndexRow, err := backIndexRowForDoc(kvreader, index.IndexInternalID(docID)) @@ -847,12 +852,6 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { docBackIndexRowCh <- &docBackIndexRow{docID, doc, backIndexRow} } - - err = kvreader.Close() - if err != nil { - docBackIndexRowErr = err - return - } }() // wait for analysis result From 683e195ac4b55700696ba0334ed8d79f8fa53ece Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 16 Feb 2018 13:56:57 +0530 Subject: [PATCH 215/728] adding empty segment handling during introduction cleaning up the segment live size check --- index/scorch/introducer.go | 54 +++++++++++++++++++++++++++++--------- 1 file changed, 41 insertions(+), 13 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index f75801910..1f2de98b7 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -100,8 +100,8 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // prepare new index snapshot newSnapshot := &IndexSnapshot{ parent: s, - segment: make([]*SegmentSnapshot, nsegs, nsegs+1), - offsets: make([]uint64, nsegs, nsegs+1), + segment: make([]*SegmentSnapshot, 0, nsegs+1), + offsets: make([]uint64, 0, nsegs+1), internal: make(map[string][]byte, len(s.root.internal)), epoch: s.nextSnapshotEpoch, refs: 1, @@ -124,24 +124,29 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { 
return err } } - newSnapshot.segment[i] = &SegmentSnapshot{ + + newss := &SegmentSnapshot{ id: s.root.segment[i].id, segment: s.root.segment[i].segment, cachedDocs: s.root.segment[i].cachedDocs, } - s.root.segment[i].segment.AddRef() - + // apply new obsoletions if s.root.segment[i].deleted == nil { - newSnapshot.segment[i].deleted = delta + newss.deleted = delta } else { - newSnapshot.segment[i].deleted = roaring.Or(s.root.segment[i].deleted, delta) + newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) + } + + // check for live size before copying + if newss.LiveSize() > 0 { + newSnapshot.segment = append(newSnapshot.segment, newss) + s.root.segment[i].segment.AddRef() + newSnapshot.offsets = append(newSnapshot.offsets, running) + running += s.root.segment[i].Count() } - - newSnapshot.offsets[i] = running - running += s.root.segment[i].Count() - } + // append new segment, if any, to end of the new index snapshot if next.data != nil { newSegmentSnapshot := &SegmentSnapshot{ @@ -230,7 +235,13 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSegmentDeleted.Add(uint32(newDocNum)) } } - } else { + // clean up the old segment map to figure out the + // obsolete segments wrt root in meantime, whatever + // segments left behind in old map after processing + // the root segments would be the obsolete segment set + delete(nextMerge.old, segmentID) + + } else if s.root.segment[i].LiveSize() > 0 { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: s.root.segment[i].id, @@ -244,7 +255,24 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } - if nextMerge.new != nil { + // before the newMerge introduction, need to clean the newly + // merged segment wrt the current root segments, hence + // applying the obsolete segment contents to newly merged segment + for segID, ss := range nextMerge.old { + obsoleted := ss.DocNumbersLive() + if obsoleted != nil { + obsoletedIter := obsoleted.Iterator() + 
for obsoletedIter.HasNext() { + oldDocNum := obsoletedIter.Next() + newDocNum := nextMerge.oldNewDocNums[segID][oldDocNum] + newSegmentDeleted.Add(uint32(newDocNum)) + } + } + } + // In case where all the docs in the newly merged segment getting + // deleted by the time we reach here, can skip the introduction. + if nextMerge.new != nil && + nextMerge.new.Count() > newSegmentDeleted.GetCardinality() { // put new segment at end newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: nextMerge.id, From 874829759b1b22981092b32a2942744851de4b6b Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 24 Feb 2018 23:53:49 +0530 Subject: [PATCH 216/728] cleaning up the wait loop --- index/scorch/persister.go | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 24988c9c6..8069ae4e6 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -42,8 +42,7 @@ var DefaultChunkFactor uint32 = 1024 // an optimum pace so that the merger can skip // many intermediate snapshots. // This needs to be based on empirical data. -// With high segment count with snapshots, -// doubtful on the effectiveness of this approach. +// TODO - may need to revisit this approach/value. 
var epochDistance = uint64(5) type notificationChan chan struct{} @@ -172,19 +171,17 @@ OUTER: persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) // check for slow merger and pause persister until merger catch up - if lastPersistedEpoch > *lastMergedEpoch+epochDistance { - - select { - case <-s.closeCh: - break OUTER - case ew := <-s.persisterNotifier: - persistWatchers = append(persistWatchers, ew) - *lastMergedEpoch = ew.epoch - continue OUTER - } - } else { + if lastPersistedEpoch <= *lastMergedEpoch+epochDistance { break OUTER } + + select { + case <-s.closeCh: + break OUTER + case ew := <-s.persisterNotifier: + persistWatchers = append(persistWatchers, ew) + *lastMergedEpoch = ew.epoch + } } return persistWatchers From f0a65f041d4c53f8fbb9c5956154e2bfb8603939 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sun, 25 Feb 2018 20:58:53 +0530 Subject: [PATCH 217/728] cleaning up the wait loop --- index/scorch/persister.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 55b09d168..40333aaa1 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -164,16 +164,12 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, persistWatchers []*epochWatcher) []*epochWatcher { -OUTER: - for { + // first, let the watchers proceed if they lag behind + persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - // first, let the watchers proceed if they lag behind - persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - - // check for slow merger and pause persister until merger catch up - if lastPersistedEpoch <= *lastMergedEpoch+epochDistance { - break OUTER - } +OUTER: + // check for slow merger and await until the merger catch up + for lastPersistedEpoch > *lastMergedEpoch+epochDistance { 
select { case <-s.closeCh: @@ -182,6 +178,9 @@ OUTER: persistWatchers = append(persistWatchers, ew) *lastMergedEpoch = ew.epoch } + + // let the watchers proceed if they lag behind + persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) } return persistWatchers From e4cc79a9ad6125d8d6ef45efb02735894afd7918 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 26 Feb 2018 15:56:30 +0530 Subject: [PATCH 218/728] adopting json parsing on options, fixed the inadvertant option modification --- index/scorch/merge.go | 46 ++++++++++++++++++++----------------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 77368b369..fbc4385eb 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -15,6 +15,7 @@ package scorch import ( + "encoding/json" "fmt" "os" "sync/atomic" @@ -28,7 +29,13 @@ import ( func (s *Scorch) mergerLoop() { var lastEpochMergePlanned uint64 - mergePlannerOptions := s.parseMergePlannerOptions() + mergePlannerOptions, err := s.parseMergePlannerOptions() + if err != nil { + s.fireAsyncError(fmt.Errorf("mergePlannerOption json parsing err: %v", err)) + s.asyncTasks.Done() + return + } + OUTER: for { select { @@ -83,32 +90,21 @@ OUTER: s.asyncTasks.Done() } -func (s *Scorch) parseMergePlannerOptions() *mergeplan.MergePlanOptions { - mergePlannerOptions := &mergeplan.DefaultMergePlanOptions - scorchOptions := map[string]interface{}{} - if v, ok := s.config["scorchOptions"]; ok { - if scorchOptions, ok = v.(map[string]interface{}); ok { - if v, ok := scorchOptions["maxSegmentsPerTier"].(float64); ok { - mergePlannerOptions.MaxSegmentsPerTier = int(v) - } - if v, ok := scorchOptions["maxSegmentSize"].(float64); ok { - mergePlannerOptions.MaxSegmentSize = int64(v) - } - if v, ok := scorchOptions["tierGrowth"].(float64); ok { - mergePlannerOptions.TierGrowth = v - } - if v, ok := scorchOptions["segmentsPerMergeTask"].(float64); ok { - 
mergePlannerOptions.SegmentsPerMergeTask = int(v) - } - if v, ok := scorchOptions["floorSegmentSize"].(float64); ok { - mergePlannerOptions.FloorSegmentSize = int64(v) - } - if v, ok := scorchOptions["reclaimDeletesWeight"].(float64); ok { - mergePlannerOptions.ReclaimDeletesWeight = v - } +func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, + error) { + mergePlannerOptions := mergeplan.DefaultMergePlanOptions + if v, ok := s.config["scorchMergePlanOptions"]; ok { + b, err := json.Marshal(v) + if err != nil { + return &mergePlannerOptions, err + } + + err = json.Unmarshal(b, &mergePlannerOptions) + if err != nil { + return &mergePlannerOptions, err } } - return mergePlannerOptions + return &mergePlannerOptions, nil } func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, From e02849fcdae060357951381bda6e10c552a414ee Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 26 Feb 2018 16:21:33 +0530 Subject: [PATCH 219/728] fix the indentation --- index/scorch/merge.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 48fff013c..61059b9b9 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -16,7 +16,7 @@ package scorch import ( "bytes" - "encoding/json" + "encoding/json" "fmt" "os" From ce2332e111bb56bd9a4317a821dc977ef5427784 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 24 Feb 2018 09:35:03 -0800 Subject: [PATCH 220/728] scorch zap merge reuses tf/locEncoder across terms The finishTerm() helper func that's invoked on every outer loop resets the tf/locEncoders so they can be safely reused. 
--- index/scorch/segment/zap/merge.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 808b16b75..1c050efe9 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -170,6 +170,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, rv := make([]uint64, len(fieldsInv)) fieldDvLocs := make([]uint64, len(fieldsInv)) + tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + // docTermMap is keyed by docNum, where the array impl provides // better memory usage behavior than a sparse-friendlier hashmap // for when docs have much structural similarity (i.e., every doc @@ -227,9 +230,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, newRoaring := roaring.NewBitmap() newRoaringLocs := roaring.NewBitmap() - tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - finishTerm := func(term []byte) error { if term == nil { return nil From 98d5d7bd81fbe0714800cae23b7b13a9601dd81a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 24 Feb 2018 09:38:45 -0800 Subject: [PATCH 221/728] scorch zap chunkedIntCoder optimizations The optimizations / changes include... - reuse of a memory buf when serializing varint's. - reuse of a govarint.U64Base128Encoder instance, as it's a thin, wrapper around an underlying chunkBuf, so Reset()'s on the chunkBuf is enough for encoder reuse. - chunkedIntcoder.Write() method was changed to invoke w.Write() less often by forming a larger, reused buf. Profiling and analysis showed w.Write() was getting called a lot, often with tiny 1 or 2 byte inputs. The theory is w.Write() and its underlying memmove() can be more efficient when provided with larger bufs. 
- some repeated code removal, by reusing the Close() method. --- index/scorch/segment/zap/intcoder.go | 37 +++++++++++++--------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index e9f295023..b505fec94 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -30,6 +30,8 @@ type chunkedIntCoder struct { encoder *govarint.Base128Encoder chunkLens []uint64 currChunk uint64 + + buf []byte } // newChunkedIntCoder returns a new chunk int coder which packs data into @@ -67,12 +69,8 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { // starting a new chunk if c.encoder != nil { // close out last - c.encoder.Close() - encodingBytes := c.chunkBuf.Bytes() - c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) - c.final = append(c.final, encodingBytes...) + c.Close() c.chunkBuf.Reset() - c.encoder = govarint.NewU64Base128Encoder(&c.chunkBuf) } c.currChunk = chunk } @@ -98,26 +96,25 @@ func (c *chunkedIntCoder) Close() { // Write commits all the encoded chunked integers to the provided writer. 
func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { - var tw int - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks + bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) + if len(c.buf) < bufNeeded { + c.buf = make([]byte, bufNeeded) + } + buf := c.buf + + // write out the number of chunks & each chunkLen n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - nw, err := w.Write(buf[:n]) - tw += nw + for _, chunkLen := range c.chunkLens { + n += binary.PutUvarint(buf[n:], uint64(chunkLen)) + } + + tw, err := w.Write(buf[:n]) if err != nil { return tw, err } - // write out the chunk lens - for _, chunkLen := range c.chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) - nw, err = w.Write(buf[:n]) - tw += nw - if err != nil { - return tw, err - } - } + // write out the data - nw, err = w.Write(c.final) + nw, err := w.Write(c.final) tw += nw if err != nil { return tw, err From 99ed12717633062862e33d0ab774dc9b36df6d62 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 26 Feb 2018 14:23:53 -0800 Subject: [PATCH 222/728] scorch zap merge optimize newDocNums lookup to outside of loop And, also a "go fmt". 
--- index/scorch/introducer.go | 8 ++++---- index/scorch/merge.go | 2 +- index/scorch/persister.go | 1 - index/scorch/segment/zap/merge.go | 4 +++- 4 files changed, 8 insertions(+), 7 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 18f8438f8..1a7d656ca 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -130,14 +130,14 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { segment: s.root.segment[i].segment, cachedDocs: s.root.segment[i].cachedDocs, } - + // apply new obsoletions if s.root.segment[i].deleted == nil { newss.deleted = delta } else { newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) } - + // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) @@ -241,7 +241,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // the root segments would be the obsolete segment set delete(nextMerge.old, segmentID) - } else if s.root.segment[i].LiveSize() > 0 { + } else if s.root.segment[i].LiveSize() > 0 { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ id: s.root.segment[i].id, @@ -269,7 +269,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } - // In case where all the docs in the newly merged segment getting + // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. 
if nextMerge.new != nil && nextMerge.new.Count() > newSegmentDeleted.GetCardinality() { diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 61059b9b9..ad756588a 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -16,7 +16,7 @@ package scorch import ( "bytes" - "encoding/json" + "encoding/json" "fmt" "os" diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 40333aaa1..c21bb1439 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -68,7 +68,6 @@ OUTER: persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, &lastMergedEpoch, persistWatchers) - var ourSnapshot *IndexSnapshot var ourPersisted []chan error diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1c050efe9..77ae173b5 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -316,10 +316,12 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err2 } + newDocNumsI := newDocNums[itrI] + postItr = postings.iterator(postItr) next, err2 := postItr.Next() for next != nil && err2 == nil { - hitNewDocNum := newDocNums[itrI][next.Number()] + hitNewDocNum := newDocNumsI[next.Number()] if hitNewDocNum == docDropped { return nil, 0, fmt.Errorf("see hit with dropped doc num") } From 3f1dcb60781b5f57375d5ed93700f96db9ff8804 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 27 Feb 2018 09:23:23 -0800 Subject: [PATCH 223/728] scorch zap merge optimize drops lookup to outside of loop --- index/scorch/segment/zap/merge.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 77ae173b5..ae8c5b197 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -462,10 +462,12 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, for segI, segment := range segments { segNewDocNums := make([]uint64, 
segment.numDocs) + dropsI := drops[segI] + // optimize when the field mapping is the same across all // segments and there are no deletions, via byte-copying // of stored docs bytes directly to the writer - if fieldsSame && (drops[segI] == nil || drops[segI].GetCardinality() == 0) { + if fieldsSame && (dropsI == nil || dropsI.GetCardinality() == 0) { err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) if err != nil { return 0, nil, err @@ -483,7 +485,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, // for each doc num for docNum := uint64(0); docNum < segment.numDocs; docNum++ { // TODO: roaring's API limits docNums to 32-bits? - if drops[segI] != nil && drops[segI].Contains(uint32(docNum)) { + if dropsI != nil && dropsI.Contains(uint32(docNum)) { segNewDocNums[docNum] = docDropped continue } From f58a205ae87086e9494f25e209afeedb99a8fdbb Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 27 Feb 2018 11:29:16 -0800 Subject: [PATCH 224/728] remove 1.6 from travis, add "1.10" --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 70bc093f7..775fed3a9 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,10 +3,10 @@ sudo: false language: go go: - - 1.6.x - 1.7.x - 1.8.x - 1.9.x + - "1.10" script: - go get golang.org/x/tools/cmd/cover From c74e08f039e56cef576e4336382b2a2d12d9e026 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 27 Feb 2018 11:33:12 -0800 Subject: [PATCH 225/728] BREAKING API CHANGE - use stdlib context pkg update all references to context to use std lib pkg --- index.go | 3 ++- index_alias_impl.go | 3 +-- index_alias_impl_test.go | 3 +-- index_impl.go | 3 +-- index_test.go | 3 +-- search/collector.go | 3 +-- search/collector/bench_test.go | 2 +- search/collector/topn.go | 2 +- search/collector/topn_test.go | 3 +-- vendor/manifest | 9 --------- 10 files changed, 10 insertions(+), 24 deletions(-) diff --git a/index.go b/index.go index e85652d96..ea7b3832a 
100644 --- a/index.go +++ b/index.go @@ -15,11 +15,12 @@ package bleve import ( + "context" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/mapping" - "golang.org/x/net/context" ) // A Batch groups together multiple Index and Delete diff --git a/index_alias_impl.go b/index_alias_impl.go index 9e9a3594f..f678a059b 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -15,12 +15,11 @@ package bleve import ( + "context" "sort" "sync" "time" - "golang.org/x/net/context" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index a59406647..2ee64991f 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -15,13 +15,12 @@ package bleve import ( + "context" "fmt" "reflect" "testing" "time" - "golang.org/x/net/context" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" diff --git a/index_impl.go b/index_impl.go index 799b582a0..caea1b8e0 100644 --- a/index_impl.go +++ b/index_impl.go @@ -15,6 +15,7 @@ package bleve import ( + "context" "encoding/json" "fmt" "os" @@ -22,8 +23,6 @@ import ( "sync/atomic" "time" - "golang.org/x/net/context" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" diff --git a/index_test.go b/index_test.go index 762e3838e..f16a8f637 100644 --- a/index_test.go +++ b/index_test.go @@ -15,6 +15,7 @@ package bleve import ( + "context" "fmt" "io/ioutil" "log" @@ -28,8 +29,6 @@ import ( "testing" "time" - "golang.org/x/net/context" - "github.com/blevesearch/bleve/analysis/analyzer/keyword" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" diff --git a/search/collector.go b/search/collector.go index cba4829d4..0d163a9d9 100644 --- 
a/search/collector.go +++ b/search/collector.go @@ -15,11 +15,10 @@ package search import ( + "context" "time" "github.com/blevesearch/bleve/index" - - "golang.org/x/net/context" ) type Collector interface { diff --git a/search/collector/bench_test.go b/search/collector/bench_test.go index e75613c36..e6a786f41 100644 --- a/search/collector/bench_test.go +++ b/search/collector/bench_test.go @@ -15,13 +15,13 @@ package collector import ( + "context" "math/rand" "strconv" "testing" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" - "golang.org/x/net/context" ) type createCollector func() search.Collector diff --git a/search/collector/topn.go b/search/collector/topn.go index 2c7c6752d..388370e7e 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -15,11 +15,11 @@ package collector import ( + "context" "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" - "golang.org/x/net/context" ) type collectorStore interface { diff --git a/search/collector/topn_test.go b/search/collector/topn_test.go index b8c331ae6..d50e38a0c 100644 --- a/search/collector/topn_test.go +++ b/search/collector/topn_test.go @@ -15,10 +15,9 @@ package collector import ( + "context" "testing" - "golang.org/x/net/context" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) diff --git a/vendor/manifest b/vendor/manifest index 9a86629ed..0837684fa 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -107,15 +107,6 @@ "branch": "master", "notests": true }, - { - "importpath": "golang.org/x/net/context", - "repository": "https://go.googlesource.com/net", - "vcs": "", - "revision": "e45385e9b226f570b1f086bf287b25d3d4117776", - "branch": "master", - "path": "/context", - "notests": true - }, { "importpath": "golang.org/x/text/transform", "repository": "https://go.googlesource.com/text", From 806313276698e176d9824dfefbbed730eef259ad Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 27 Feb 2018 11:57:21 
-0800 Subject: [PATCH 226/728] fix new issues found by go vet when using stdlib context pkg --- index_alias_impl_test.go | 17 +++++++++++------ index_test.go | 8 +++++--- 2 files changed, 16 insertions(+), 9 deletions(-) diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 2ee64991f..9599b89d6 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -782,7 +782,9 @@ func TestMultiSearchTimeout(t *testing.T) { }} // first run with absurdly long time out, should succeed - ctx, _ = context.WithTimeout(context.Background(), 10*time.Second) + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() query := NewTermQuery("test") sr := NewSearchRequest(query) res, err := MultiSearch(ctx, sr, ei1, ei2) @@ -803,7 +805,8 @@ func TestMultiSearchTimeout(t *testing.T) { } // now run a search again with an absurdly low timeout (should timeout) - ctx, _ = context.WithTimeout(context.Background(), 1*time.Microsecond) + ctx, cancel = context.WithTimeout(context.Background(), 1*time.Microsecond) + defer cancel() res, err = MultiSearch(ctx, sr, ei1, ei2) if err != nil { t.Errorf("expected no error, got %v", err) @@ -829,7 +832,6 @@ func TestMultiSearchTimeout(t *testing.T) { } // now run a search again with a normal timeout, but cancel it first - var cancel context.CancelFunc ctx, cancel = context.WithTimeout(context.Background(), 5*time.Second) cancel() res, err = MultiSearch(ctx, sr, ei1, ei2) @@ -936,7 +938,9 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { // ei3 is set to take >50ms, so run search with timeout less than // this, this should return partial results - ctx, _ = context.WithTimeout(context.Background(), 25*time.Millisecond) + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(context.Background(), 25*time.Millisecond) + defer cancel() query := NewTermQuery("test") sr := NewSearchRequest(query) expected := &SearchResult{ @@ -1089,8 +1093,9 @@ func 
TestIndexAliasMultipleLayer(t *testing.T) { // ei2 and ei3 have 50ms delay // search across aliasTop should still get results from ei1 and ei4 // total should still be 4 - - ctx, _ = context.WithTimeout(context.Background(), 25*time.Millisecond) + var cancel context.CancelFunc + ctx, cancel = context.WithTimeout(context.Background(), 25*time.Millisecond) + defer cancel() query := NewTermQuery("test") sr := NewSearchRequest(query) expected := &SearchResult{ diff --git a/index_test.go b/index_test.go index f16a8f637..a69357bf6 100644 --- a/index_test.go +++ b/index_test.go @@ -1507,7 +1507,8 @@ func TestSearchTimeout(t *testing.T) { }() // first run a search with an absurdly long timeout (should succeeed) - ctx, _ := context.WithTimeout(context.Background(), 10*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() query := NewTermQuery("water") req := NewSearchRequest(query) _, err = index.SearchInContext(ctx, req) @@ -1516,7 +1517,8 @@ func TestSearchTimeout(t *testing.T) { } // now run a search again with an absurdly low timeout (should timeout) - ctx, _ = context.WithTimeout(context.Background(), 1*time.Microsecond) + ctx, cancel = context.WithTimeout(context.Background(), 1*time.Microsecond) + defer cancel() sq := &slowQuery{ actual: query, delay: 50 * time.Millisecond, // on Windows timer resolution is 15ms @@ -1528,7 +1530,7 @@ func TestSearchTimeout(t *testing.T) { } // now run a search with a long timeout, but with a long query, and cancel it - ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second) sq = &slowQuery{ actual: query, delay: 100 * time.Millisecond, // on Windows timer resolution is 15ms From a32362ba2eee879782c6a142b87c1c7fe3081311 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 27 Feb 2018 15:09:22 -0800 Subject: [PATCH 227/728] MB-28403: scorch introduceMerge doesn't prealloc segments capacity There's 
now multiple competing merge activities (file-merging and in-memory merging during persistence), so the simple math to precalculate capacity for the slice of segments in introduceMerge() no longer works for all cases and might have negative capacity. This change removes that (sometimes wrong) precalculation, and instead depends on append() to grow the slice correctly. --- index/scorch/introducer.go | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 1a7d656ca..af0181cab 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -195,19 +195,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // acquire lock s.rootLock.Lock() - // prepare new index snapshot - currSize := len(s.root.segment) - newSize := currSize + 1 - len(nextMerge.old) - - // empty segments deletion - if nextMerge.new == nil { - newSize-- - } - newSnapshot := &IndexSnapshot{ parent: s, - segment: make([]*SegmentSnapshot, 0, newSize), - offsets: make([]uint64, 0, newSize), internal: s.root.internal, epoch: s.nextSnapshotEpoch, refs: 1, From 4dbb4b14956d221964147971e08f4e5401869f96 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 13 Feb 2018 07:44:14 -0800 Subject: [PATCH 228/728] scorch zap posting reuses freqNorm & loc reader and decoder --- index/scorch/segment/zap/posting.go | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index d504885d0..b54899bea 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -159,7 +159,9 @@ type PostingsIterator struct { currChunkFreqNorm []byte currChunkLoc []byte freqNormDecoder *govarint.Base128Decoder + freqNormReader *bytes.Reader locDecoder *govarint.Base128Decoder + locReader *bytes.Reader freqChunkLens []uint64 freqChunkStart uint64 @@ -183,7 +185,12 @@ func (i *PostingsIterator) loadChunk(chunk int) error { } 
end := start + i.freqChunkLens[chunk] i.currChunkFreqNorm = i.postings.sb.mem[start:end] - i.freqNormDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkFreqNorm)) + if i.freqNormReader == nil { + i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) + i.freqNormDecoder = govarint.NewU64Base128Decoder(i.freqNormReader) + } else { + i.freqNormReader.Reset(i.currChunkFreqNorm) + } start = i.locChunkStart for j := 0; j < chunk; j++ { @@ -191,7 +198,12 @@ func (i *PostingsIterator) loadChunk(chunk int) error { } end = start + i.locChunkLens[chunk] i.currChunkLoc = i.postings.sb.mem[start:end] - i.locDecoder = govarint.NewU64Base128Decoder(bytes.NewReader(i.currChunkLoc)) + if i.locReader == nil { + i.locReader = bytes.NewReader(i.currChunkLoc) + i.locDecoder = govarint.NewU64Base128Decoder(i.locReader) + } else { + i.locReader.Reset(i.currChunkLoc) + } i.currChunk = uint32(chunk) return nil } From dd7d93ee5ebc99a755298f3eff7539f86f7c3c60 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 13 Feb 2018 09:16:20 -0800 Subject: [PATCH 229/728] scorch zap loadChunk reuses Location slices --- index/scorch/segment/zap/posting.go | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index b54899bea..ada39b434 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -171,7 +171,8 @@ type PostingsIterator struct { locBitmap *roaring.Bitmap - next Posting + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls } func (i *PostingsIterator) loadChunk(chunk int) error { @@ -333,7 +334,8 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { } } - i.next = Posting{} // clear the struct. 
+ reuseLocs := i.next.locs // hold for reuse before struct clearing + i.next = Posting{} // clear the struct rv := &i.next rv.iterator = i rv.docNum = uint64(n) @@ -346,15 +348,23 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { } rv.norm = math.Float32frombits(uint32(normBits)) if i.locBitmap.Contains(n) { - // read off 'freq' locations - rv.locs = make([]segment.Location, rv.freq) - locs := make([]Location, rv.freq) + // read off 'freq' locations, into reused slices + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] + } else { + i.nextLocs = make([]Location, rv.freq) + } + if cap(reuseLocs) >= int(rv.freq) { + rv.locs = reuseLocs[0:rv.freq] + } else { + rv.locs = make([]segment.Location, rv.freq) + } for j := 0; j < int(rv.freq); j++ { - err := i.readLocation(&locs[j]) + err := i.readLocation(&i.nextLocs[j]) if err != nil { return nil, err } - rv.locs[j] = &locs[j] + rv.locs[j] = &i.nextLocs[j] } } From 4b742505aae0905a869ddc116c4b0e7cab3c06e6 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 28 Feb 2018 15:31:55 +0530 Subject: [PATCH 230/728] adding stats for scorch --- index/scorch/introducer.go | 6 +- index/scorch/merge.go | 32 ++++++++++- index/scorch/persister.go | 5 +- index/scorch/scorch.go | 15 +++-- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_index_tfr.go | 2 +- index/scorch/stats.go | 91 ++++++++++++++++++++++-------- 7 files changed, 120 insertions(+), 33 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index af0181cab..0d270b171 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -159,7 +159,8 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // increment numItemsIntroduced which tracks the number of items // queued for persistence. 
- atomic.AddUint64(&s.stats.numItemsIntroduced, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotIntroducedBatchSegments, 1) } // copy old values for key, oldVal := range s.root.internal { @@ -270,6 +271,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { cachedDocs: &cachedDocs{cache: nil}, }) newSnapshot.offsets = append(newSnapshot.offsets, running) + atomic.AddUint64(&s.stats.TotIntroducedMergeSegments, 1) } newSnapshot.AddRef() // 1 ref for the nextMerge.notify response @@ -335,6 +337,8 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { // release lock s.rootLock.Unlock() + atomic.AddUint64(&s.stats.TotRollbackOpsDone, 1) + if rootPrev != nil { _ = rootPrev.DecRef() } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index ad756588a..6562353b7 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -129,6 +129,9 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, return nil } + mip := uint64(len(resultMergePlan.Tasks)) + atomic.AddUint64(&s.stats.CurInProgressFileMerges, mip) + // process tasks in serial for now var notifications []chan *IndexSnapshot for _, task := range resultMergePlan.Tasks { @@ -165,6 +168,10 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, s.unmarkIneligibleForRemoval(filename) return fmt.Errorf("merging failed: %v", err) } + + // update the count of file segments merged + atomic.AddUint64(&s.stats.TotMergedFileSegments, uint64(len(segmentsToMerge))) + segment, err = zap.Open(path) if err != nil { s.unmarkIneligibleForRemoval(filename) @@ -193,16 +200,26 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, case s.merges <- sm: } } + + atomic.AddUint64(&s.stats.CurInProgressFileMerges, ^uint64(mip-1)) + + var newSnapshot *IndexSnapshot for _, notification := range notifications { select { case <-s.closeCh: return nil - case newSnapshot := 
<-notification: + case newSnapshot = <-notification: if newSnapshot != nil { _ = newSnapshot.DecRef() } } } + + // merge operation completed and the introduction is complete + if newSnapshot != nil { + atomic.AddUint64(&s.stats.TotFileMergeOpsDone, 1) + } + return nil } @@ -224,6 +241,8 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, cr := zap.NewCountHashWriter(&br) + atomic.AddUint64(&s.stats.CurInProgressMemoryMerges, 1) + newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, err := zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) @@ -238,6 +257,9 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, return 0, nil, 0, err } + // update the count of in-memory merged segments + atomic.AddUint64(&s.stats.TotMergedMemorySegments, uint64(len(sbs))) + newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) filename := zapFileName(newSegmentID) @@ -252,6 +274,10 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, return 0, nil, 0, err } + // update persisted stats + atomic.AddUint64(&s.stats.TotPersistedItems, segment.Count()) + atomic.AddUint64(&s.stats.TotPersistedSegments, 1) + sm := &segmentMerge{ id: newSegmentID, old: make(map[uint64]*SegmentSnapshot), @@ -273,10 +299,14 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, case s.merges <- sm: } + atomic.AddUint64(&s.stats.CurInProgressMemoryMerges, ^uint64(0)) + select { // wait for introduction to complete case <-s.closeCh: return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? 
case newSnapshot := <-sm.notify: + // update counters on success + atomic.AddUint64(&s.stats.TotMemoryMergeOpsDone, 1) return numDocs, newSnapshot, newSegmentID, nil } } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index c21bb1439..7186e7c34 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -169,6 +169,8 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastM OUTER: // check for slow merger and await until the merger catch up for lastPersistedEpoch > *lastMergedEpoch+epochDistance { + // update the stat on each pause cycle + atomic.AddUint64(&s.stats.TotPersisterPause, 1) select { case <-s.closeCh: @@ -412,7 +414,8 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { newIndexSnapshot.segment[i] = newSegmentSnapshot delete(newSegments, segmentSnapshot.id) // update items persisted incase of a new segment snapshot - atomic.AddUint64(&s.stats.numItemsPersisted, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotPersistedSegments, 1) } else { newIndexSnapshot.segment[i] = s.root.segment[i] newIndexSnapshot.segment[i].segment.AddRef() diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index f539313d1..2b9096655 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -111,6 +111,7 @@ func (s *Scorch) fireAsyncError(err error) { if s.onAsyncError != nil { s.onAsyncError(err) } + atomic.AddUint64(&s.stats.TotOnErrors, 1) } func (s *Scorch) Open() error { @@ -275,7 +276,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } close(resultChan) - atomic.AddUint64(&s.stats.analysisTime, uint64(time.Since(start))) + atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) // notify handlers that we're about to introduce a segment s.fireEvent(EventKindBatchIntroductionStart, 0) @@ -286,6 +287,8 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { 
if err != nil { return err } + } else { + atomic.AddUint64(&s.stats.TotEmptyBatches, 1) } err = s.prepareSegment(newSegment, ids, batch.InternalOps) @@ -293,12 +296,12 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { if newSegment != nil { _ = newSegment.Close() } - atomic.AddUint64(&s.stats.errors, 1) + atomic.AddUint64(&s.stats.TotOnErrors, 1) } else { - atomic.AddUint64(&s.stats.updates, numUpdates) - atomic.AddUint64(&s.stats.deletes, numDeletes) - atomic.AddUint64(&s.stats.batches, 1) - atomic.AddUint64(&s.stats.numPlainTextBytesIndexed, numPlainTextBytes) + atomic.AddUint64(&s.stats.TotUpdates, numUpdates) + atomic.AddUint64(&s.stats.TotDeletes, numDeletes) + atomic.AddUint64(&s.stats.TotBatches, 1) + atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) } return err } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index bb9975768..7236eeb7e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -372,7 +372,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, rv.postings[i] = pl rv.iterators[i] = pl.Iterator() } - atomic.AddUint64(&i.parent.stats.termSearchersStarted, uint64(1)) + atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 87fd0d14f..d1f23b272 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -126,7 +126,7 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Close() error { if i.snapshot != nil { - atomic.AddUint64(&i.snapshot.parent.stats.termSearchersFinished, uint64(1)) + atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) } return nil } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index c44a977bf..5edfc42c2 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -20,31 
+20,78 @@ import ( "sync/atomic" ) -// Stats tracks statistics about the index +// Stats tracks statistics about the index, fields that are +// prefixed like CurXxxx are gauges (can go up and down), +// and fields that are prefixed like TotXxxx are monotonically +// increasing counters. type Stats struct { - updates, deletes, batches, errors uint64 - analysisTime, indexTime uint64 - termSearchersStarted uint64 - termSearchersFinished uint64 - numPlainTextBytesIndexed uint64 - numItemsIntroduced uint64 - numItemsPersisted uint64 - i *Scorch + TotUpdates uint64 + TotDeletes uint64 + TotBatches uint64 + TotEmptyBatches uint64 + TotOnErrors uint64 + TotAnalysisTime uint64 + TotIndexTime uint64 + TotIndexedPlainTextBytes uint64 + + TotIndexSnapshotBeg uint64 + TotIndexSnapshotEnd uint64 + + TotIntroducedBatchSegments uint64 + TotIntroducedMergeSegments uint64 + TotIntroducedItems uint64 + + TotTermSearchersStarted uint64 + TotTermSearchersFinished uint64 + + TotPersistedItems uint64 + TotPersistedSegments uint64 + TotPersisterPause uint64 + + TotMemoryMergeOpsDone uint64 + TotMergedMemorySegments uint64 + TotMergedFileSegments uint64 + TotFileMergeOpsDone uint64 + + TotRollbackOpsDone uint64 + + CurInProgressMemoryMerges uint64 + CurInProgressFileMerges uint64 + + CurMemoryBytes uint64 + + i *Scorch } func (s *Stats) statsMap() (map[string]interface{}, error) { m := map[string]interface{}{} - m["updates"] = atomic.LoadUint64(&s.updates) - m["deletes"] = atomic.LoadUint64(&s.deletes) - m["batches"] = atomic.LoadUint64(&s.batches) - m["errors"] = atomic.LoadUint64(&s.errors) - m["analysis_time"] = atomic.LoadUint64(&s.analysisTime) - m["index_time"] = atomic.LoadUint64(&s.indexTime) - m["term_searchers_started"] = atomic.LoadUint64(&s.termSearchersStarted) - m["term_searchers_finished"] = atomic.LoadUint64(&s.termSearchersFinished) - m["num_plain_text_bytes_indexed"] = atomic.LoadUint64(&s.numPlainTextBytesIndexed) - m["num_items_introduced"] = 
atomic.LoadUint64(&s.numItemsIntroduced) - m["num_items_persisted"] = atomic.LoadUint64(&s.numItemsPersisted) + m["TotUpdates"] = atomic.LoadUint64(&s.TotUpdates) + m["TotDeletes"] = atomic.LoadUint64(&s.TotDeletes) + m["TotBatches"] = atomic.LoadUint64(&s.TotBatches) + m["TotEmptyBatches"] = atomic.LoadUint64(&s.TotEmptyBatches) + m["TotOnErrors"] = atomic.LoadUint64(&s.TotOnErrors) + m["TotAnalysisTime"] = atomic.LoadUint64(&s.TotAnalysisTime) + m["TotIndexSnapshotBeg"] = atomic.LoadUint64(&s.TotIndexSnapshotBeg) + m["TotIndexSnapshotEnd"] = atomic.LoadUint64(&s.TotIndexSnapshotEnd) + + m["TotTermSearchersStarted"] = atomic.LoadUint64(&s.TotTermSearchersStarted) + m["TotTermSearchersFinished"] = atomic.LoadUint64(&s.TotTermSearchersFinished) + m["TotIndexedPlainTextBytes"] = atomic.LoadUint64(&s.TotIndexedPlainTextBytes) + m["TotIntroducedItems"] = atomic.LoadUint64(&s.TotIntroducedItems) + m["TotPersistedItems"] = atomic.LoadUint64(&s.TotPersistedItems) + + m["TotMemoryMergeOpsDone"] = atomic.LoadUint64(&s.TotMemoryMergeOpsDone) + m["TotFileMergeOpsDone"] = atomic.LoadUint64(&s.TotFileMergeOpsDone) + m["TotIntroducedBatchSegments"] = atomic.LoadUint64(&s.TotIntroducedBatchSegments) + m["TotIntroducedMergeSegments"] = atomic.LoadUint64(&s.TotIntroducedMergeSegments) + m["TotPersistedSegments"] = atomic.LoadUint64(&s.TotPersistedSegments) + m["TotRollbackOpsDone"] = atomic.LoadUint64(&s.TotRollbackOpsDone) + m["CurInProgressFileMerges"] = atomic.LoadUint64(&s.CurInProgressFileMerges) + m["CurInProgressMemoryMerges"] = atomic.LoadUint64(&s.CurInProgressMemoryMerges) + m["TotPersisterPause"] = atomic.LoadUint64(&s.TotPersisterPause) + m["TotMergedMemorySegments"] = atomic.LoadUint64(&s.TotMergedMemorySegments) + m["TotMergedFileSegments"] = atomic.LoadUint64(&s.TotMergedFileSegments) + m["CurMemoryBytes"] = s.i.MemoryUsed() if s.i.path != "" { finfos, err := ioutil.ReadDir(s.i.path) @@ -61,8 +108,8 @@ func (s *Stats) statsMap() (map[string]interface{}, error) { } } 
- m["num_bytes_used_disk"] = numBytesUsedDisk - m["num_files_on_disk"] = numFilesOnDisk + m["TotOnDiskBytes"] = numBytesUsedDisk + m["TotOnDiskFiles"] = numFilesOnDisk } return m, nil From 7d46d2c7ae17f121132cd956d3c4fb576d403b70 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 28 Feb 2018 10:09:12 -0800 Subject: [PATCH 231/728] scorch zap intcoder encoder is never nil --- index/scorch/segment/zap/intcoder.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index b505fec94..247e36fbc 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -67,11 +67,8 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { chunk := docNum / c.chunkSize if chunk != c.currChunk { // starting a new chunk - if c.encoder != nil { - // close out last - c.Close() - c.chunkBuf.Reset() - } + c.Close() + c.chunkBuf.Reset() c.currChunk = chunk } @@ -92,6 +89,7 @@ func (c *chunkedIntCoder) Close() { encodingBytes := c.chunkBuf.Bytes() c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) c.final = append(c.final, encodingBytes...) + c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close } // Write commits all the encoded chunked integers to the provided writer. 
From 1b661ef844a5e6d369c2af4d27ab537ca603e2de Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 28 Feb 2018 11:36:35 -0800 Subject: [PATCH 232/728] stats cleanup, renaming, gauges replaced with counters --- index/scorch/introducer.go | 17 ++++- index/scorch/merge.go | 62 ++++++++++------ index/scorch/persister.go | 30 +++++--- index/scorch/scorch.go | 43 +++++++++-- index/scorch/stats.go | 147 ++++++++++++++++++------------------- 5 files changed, 180 insertions(+), 119 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 0d270b171..b6007b4ac 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -48,6 +48,8 @@ func (s *Scorch) mainLoop() { var epochWatchers []*epochWatcher OUTER: for { + atomic.AddUint64(&s.stats.TotIntroduceLoop, 1) + select { case <-s.closeCh: break OUTER @@ -92,6 +94,9 @@ OUTER: } func (s *Scorch) introduceSegment(next *segmentIntroduction) error { + atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1) + // acquire lock s.rootLock.Lock() @@ -160,7 +165,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // increment numItemsIntroduced which tracks the number of items // queued for persistence. 
atomic.AddUint64(&s.stats.TotIntroducedItems, newSegmentSnapshot.Count()) - atomic.AddUint64(&s.stats.TotIntroducedBatchSegments, 1) + atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) } // copy old values for key, oldVal := range s.root.internal { @@ -193,6 +198,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { + atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) + // acquire lock s.rootLock.Lock() @@ -271,7 +279,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { cachedDocs: &cachedDocs{cache: nil}, }) newSnapshot.offsets = append(newSnapshot.offsets, running) - atomic.AddUint64(&s.stats.TotIntroducedMergeSegments, 1) + atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) } newSnapshot.AddRef() // 1 ref for the nextMerge.notify response @@ -292,6 +300,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { + atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) + defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) + if revertTo.snapshot == nil { err := fmt.Errorf("Cannot revert to a nil snapshot") revertTo.applied <- err @@ -337,8 +348,6 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { // release lock s.rootLock.Unlock() - atomic.AddUint64(&s.stats.TotRollbackOpsDone, 1) - if rootPrev != nil { _ = rootPrev.DecRef() } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 6562353b7..005d4f41e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -40,6 +40,8 @@ func (s *Scorch) mergerLoop() { OUTER: for { + atomic.AddUint64(&s.stats.TotFileMergeLoopBeg, 1) + select { case <-s.closeCh: break OUTER @@ -59,6 +61,7 @@ OUTER: if err != nil { s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() + atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) 
continue OUTER } lastEpochMergePlanned = ourSnapshot.epoch @@ -88,7 +91,10 @@ OUTER: case <-ew.notifyCh: } } + + atomic.AddUint64(&s.stats.TotFileMergeLoopEnd, 1) } + s.asyncTasks.Done() } @@ -119,35 +125,45 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } } + atomic.AddUint64(&s.stats.TotFileMergePlan, 1) + // give this list to the planner resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) if err != nil { + atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) return fmt.Errorf("merge planning err: %v", err) } if resultMergePlan == nil { // nothing to do + atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) return nil } - mip := uint64(len(resultMergePlan.Tasks)) - atomic.AddUint64(&s.stats.CurInProgressFileMerges, mip) + atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) + + atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) // process tasks in serial for now var notifications []chan *IndexSnapshot for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { + atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) continue } + atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegments, uint64(len(task.Segments))) + oldMap := make(map[uint64]*SegmentSnapshot) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) + for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { if segSnapshot.LiveSize() == 0 { + atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) oldMap[segSnapshot.id] = nil } else { segmentsToMerge = append(segmentsToMerge, zapSeg) @@ -163,24 +179,27 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, filename := zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + 
string(os.PathSeparator) + filename + atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) if err != nil { s.unmarkIneligibleForRemoval(filename) + atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return fmt.Errorf("merging failed: %v", err) } - // update the count of file segments merged - atomic.AddUint64(&s.stats.TotMergedFileSegments, uint64(len(segmentsToMerge))) - segment, err = zap.Open(path) if err != nil { s.unmarkIneligibleForRemoval(filename) + atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return err } oldNewDocNums = make(map[uint64][]uint64) for i, segNewDocNums := range newDocNums { oldNewDocNums[task.Segments[i].Id()] = segNewDocNums } + + atomic.AddUint64(&s.stats.TotFileMergeSegments, uint64(len(segmentsToMerge))) } sm := &segmentMerge{ @@ -198,28 +217,24 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, _ = segment.Close() return nil case s.merges <- sm: + atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } - } - atomic.AddUint64(&s.stats.CurInProgressFileMerges, ^uint64(mip-1)) + atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) + } - var newSnapshot *IndexSnapshot for _, notification := range notifications { select { case <-s.closeCh: return nil - case newSnapshot = <-notification: + case newSnapshot := <-notification: + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) if newSnapshot != nil { _ = newSnapshot.DecRef() } } } - // merge operation completed and the introduction is complete - if newSnapshot != nil { - atomic.AddUint64(&s.stats.TotFileMergeOpsDone, 1) - } - return nil } @@ -237,16 +252,19 @@ type segmentMerge struct { func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { + atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) + var br 
bytes.Buffer cr := zap.NewCountHashWriter(&br) - atomic.AddUint64(&s.stats.CurInProgressMemoryMerges, 1) - + atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, err := zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) + atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) if err != nil { + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err } @@ -254,23 +272,23 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) if err != nil { + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err } - // update the count of in-memory merged segments - atomic.AddUint64(&s.stats.TotMergedMemorySegments, uint64(len(sbs))) - newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) filename := zapFileName(newSegmentID) path := s.path + string(os.PathSeparator) + filename err = zap.PersistSegmentBase(sb, path) if err != nil { + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err } segment, err := zap.Open(path) if err != nil { + atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err } @@ -299,14 +317,12 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, case s.merges <- sm: } - atomic.AddUint64(&s.stats.CurInProgressMemoryMerges, ^uint64(0)) - select { // wait for introduction to complete case <-s.closeCh: return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? 
case newSnapshot := <-sm.notify: - // update counters on success - atomic.AddUint64(&s.stats.TotMemoryMergeOpsDone, 1) + atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) + atomic.AddUint64(&s.stats.TotMemMergeDone, 1) return numDocs, newSnapshot, newSegmentID, nil } } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 7186e7c34..6ffbd44d4 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -55,6 +55,8 @@ func (s *Scorch) persisterLoop() { var ew *epochWatcher OUTER: for { + atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) + select { case <-s.closeCh: break OUTER @@ -65,8 +67,8 @@ OUTER: if ew != nil && ew.epoch > lastMergedEpoch { lastMergedEpoch = ew.epoch } - persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - &lastMergedEpoch, persistWatchers) + lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, + lastMergedEpoch, persistWatchers) var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -94,6 +96,7 @@ OUTER: if err != nil { s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() + atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) continue OUTER } @@ -115,6 +118,7 @@ OUTER: s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) if changed { + atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) continue OUTER } } @@ -133,17 +137,21 @@ OUTER: s.removeOldData() // might as well cleanup while waiting + atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) + select { case <-s.closeCh: break OUTER case <-w.notifyCh: // woken up, next loop should pick up work - continue OUTER + atomic.AddUint64(&s.stats.TotPersistLoopWaitNotified, 1) case ew = <-s.persisterNotifier: // if the watchers are already caught up then let them wait, // else let them continue to do the catch up persistWatchers = append(persistWatchers, ew) } + + atomic.AddUint64(&s.stats.TotPersistLoopEnd, 1) } } @@ -160,31 +168,32 @@ func 
notifyMergeWatchers(lastPersistedEpoch uint64, return watchersNext } -func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch *uint64, - persistWatchers []*epochWatcher) []*epochWatcher { +func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, + persistWatchers []*epochWatcher) (uint64, []*epochWatcher) { // first, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) OUTER: // check for slow merger and await until the merger catch up - for lastPersistedEpoch > *lastMergedEpoch+epochDistance { - // update the stat on each pause cycle - atomic.AddUint64(&s.stats.TotPersisterPause, 1) + for lastPersistedEpoch > lastMergedEpoch+epochDistance { + atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) select { case <-s.closeCh: break OUTER case ew := <-s.persisterNotifier: persistWatchers = append(persistWatchers, ew) - *lastMergedEpoch = ew.epoch + lastMergedEpoch = ew.epoch } + atomic.AddUint64(&s.stats.TotPersisterSlowMergerResume, 1) + // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) } - return persistWatchers + return lastMergedEpoch, persistWatchers } func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { @@ -413,6 +422,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { } newIndexSnapshot.segment[i] = newSegmentSnapshot delete(newSegments, segmentSnapshot.id) + // update items persisted incase of a new segment snapshot atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) atomic.AddUint64(&s.stats.TotPersistedSegments, 1) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 2b9096655..d3b9b36a1 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -17,6 +17,7 @@ package scorch import ( "encoding/json" "fmt" + "io/ioutil" "os" "sync" "sync/atomic" @@ -43,7 
+44,7 @@ type Scorch struct { version uint8 config map[string]interface{} analysisQueue *index.AnalysisQueue - stats *Stats + stats Stats nextSegmentID uint64 path string @@ -80,7 +81,6 @@ func NewScorch(storeName string, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, } - rv.stats = &Stats{i: rv} rv.root = &IndexSnapshot{parent: rv, refs: 1} ro, ok := config["read_only"].(bool) if ok { @@ -288,7 +288,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { return err } } else { - atomic.AddUint64(&s.stats.TotEmptyBatches, 1) + atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } err = s.prepareSegment(newSegment, ids, batch.InternalOps) @@ -377,10 +377,43 @@ func (s *Scorch) Reader() (index.IndexReader, error) { } func (s *Scorch) Stats() json.Marshaler { - return s.stats + return &s.stats } func (s *Scorch) StatsMap() map[string]interface{} { - m, _ := s.stats.statsMap() + m := s.stats.ToMap() + + if s.path != "" { + finfos, err := ioutil.ReadDir(s.path) + if err == nil { + var numFilesOnDisk, numBytesUsedDisk uint64 + for _, finfo := range finfos { + if !finfo.IsDir() { + numBytesUsedDisk += uint64(finfo.Size()) + numFilesOnDisk++ + } + } + + m["TotOnDiskBytes"] = numBytesUsedDisk + m["TotOnDiskFiles"] = numFilesOnDisk + } + } + + // TODO: consider one day removing these backwards compatible + // names for apps using the old names + m["updates"] = m["TotUpdates"] + m["deletes"] = m["TotDeletes"] + m["batches"] = m["TotBatches"] + m["errors"] = m["TotOnErrors"] + m["analysis_time"] = m["TotAnalysisTime"] + m["index_time"] = m["TotIndexTime"] + m["term_searchers_started"] = m["TotTermSearchersStarted"] + m["term_searchers_finished"] = m["TotTermSearchersFinished"] + m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] + m["num_items_introduced"] = m["TotIntroducedItems"] + m["num_items_persisted"] = m["TotPersistedItems"] + m["num_bytes_used_disk"] = m["TotOnDiskBytes"] + m["num_files_on_disk"] = m["TotOnDiskFiles"] + return m } 
diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 5edfc42c2..d239e62ec 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -16,7 +16,7 @@ package scorch import ( "encoding/json" - "io/ioutil" + "reflect" "sync/atomic" ) @@ -25,101 +25,94 @@ import ( // and fields that are prefixed like TotXxxx are monotonically // increasing counters. type Stats struct { - TotUpdates uint64 - TotDeletes uint64 - TotBatches uint64 - TotEmptyBatches uint64 - TotOnErrors uint64 - TotAnalysisTime uint64 - TotIndexTime uint64 - TotIndexedPlainTextBytes uint64 + TotUpdates uint64 + TotDeletes uint64 + TotBatches uint64 + TotBatchesEmpty uint64 + TotOnErrors uint64 - TotIndexSnapshotBeg uint64 - TotIndexSnapshotEnd uint64 + TotAnalysisTime uint64 + TotIndexTime uint64 - TotIntroducedBatchSegments uint64 - TotIntroducedMergeSegments uint64 - TotIntroducedItems uint64 + TotIndexedPlainTextBytes uint64 TotTermSearchersStarted uint64 TotTermSearchersFinished uint64 + TotIntroduceLoop uint64 + TotIntroduceSegmentBeg uint64 + TotIntroduceSegmentEnd uint64 + TotIntroduceMergeBeg uint64 + TotIntroduceMergeEnd uint64 + TotIntroduceRevertBeg uint64 + TotIntroduceRevertEnd uint64 + + TotIntroducedItems uint64 + TotIntroducedSegmentsBatch uint64 + TotIntroducedSegmentsMerge uint64 + + TotPersistLoopBeg uint64 + TotPersistLoopErr uint64 + TotPersistLoopProgress uint64 + TotPersistLoopWait uint64 + TotPersistLoopWaitNotified uint64 + TotPersistLoopEnd uint64 + TotPersistedItems uint64 TotPersistedSegments uint64 - TotPersisterPause uint64 - TotMemoryMergeOpsDone uint64 - TotMergedMemorySegments uint64 - TotMergedFileSegments uint64 - TotFileMergeOpsDone uint64 + TotPersisterSlowMergerPause uint64 + TotPersisterSlowMergerResume uint64 - TotRollbackOpsDone uint64 + TotFileMergeLoopBeg uint64 + TotFileMergeLoopErr uint64 + TotFileMergeLoopEnd uint64 - CurInProgressMemoryMerges uint64 - CurInProgressFileMerges uint64 + TotFileMergePlan uint64 + TotFileMergePlanErr uint64 + 
TotFileMergePlanNone uint64 + TotFileMergePlanOk uint64 - CurMemoryBytes uint64 + TotFileMergePlanTasks uint64 + TotFileMergePlanTasksDone uint64 + TotFileMergePlanTasksErr uint64 + TotFileMergePlanTasksSegments uint64 + TotFileMergePlanTasksSegmentsEmpty uint64 - i *Scorch -} + TotFileMergeSegmentsEmpty uint64 + TotFileMergeSegments uint64 -func (s *Stats) statsMap() (map[string]interface{}, error) { - m := map[string]interface{}{} - m["TotUpdates"] = atomic.LoadUint64(&s.TotUpdates) - m["TotDeletes"] = atomic.LoadUint64(&s.TotDeletes) - m["TotBatches"] = atomic.LoadUint64(&s.TotBatches) - m["TotEmptyBatches"] = atomic.LoadUint64(&s.TotEmptyBatches) - m["TotOnErrors"] = atomic.LoadUint64(&s.TotOnErrors) - m["TotAnalysisTime"] = atomic.LoadUint64(&s.TotAnalysisTime) - m["TotIndexSnapshotBeg"] = atomic.LoadUint64(&s.TotIndexSnapshotBeg) - m["TotIndexSnapshotEnd"] = atomic.LoadUint64(&s.TotIndexSnapshotEnd) - - m["TotTermSearchersStarted"] = atomic.LoadUint64(&s.TotTermSearchersStarted) - m["TotTermSearchersFinished"] = atomic.LoadUint64(&s.TotTermSearchersFinished) - m["TotIndexedPlainTextBytes"] = atomic.LoadUint64(&s.TotIndexedPlainTextBytes) - m["TotIntroducedItems"] = atomic.LoadUint64(&s.TotIntroducedItems) - m["TotPersistedItems"] = atomic.LoadUint64(&s.TotPersistedItems) - - m["TotMemoryMergeOpsDone"] = atomic.LoadUint64(&s.TotMemoryMergeOpsDone) - m["TotFileMergeOpsDone"] = atomic.LoadUint64(&s.TotFileMergeOpsDone) - m["TotIntroducedBatchSegments"] = atomic.LoadUint64(&s.TotIntroducedBatchSegments) - m["TotIntroducedMergeSegments"] = atomic.LoadUint64(&s.TotIntroducedMergeSegments) - m["TotPersistedSegments"] = atomic.LoadUint64(&s.TotPersistedSegments) - m["TotRollbackOpsDone"] = atomic.LoadUint64(&s.TotRollbackOpsDone) - m["CurInProgressFileMerges"] = atomic.LoadUint64(&s.CurInProgressFileMerges) - m["CurInProgressMemoryMerges"] = atomic.LoadUint64(&s.CurInProgressMemoryMerges) - m["TotPersisterPause"] = atomic.LoadUint64(&s.TotPersisterPause) - 
m["TotMergedMemorySegments"] = atomic.LoadUint64(&s.TotMergedMemorySegments) - m["TotMergedFileSegments"] = atomic.LoadUint64(&s.TotMergedFileSegments) - m["CurMemoryBytes"] = s.i.MemoryUsed() - - if s.i.path != "" { - finfos, err := ioutil.ReadDir(s.i.path) - if err != nil { - return nil, err - } + TotFileMergeZapBeg uint64 + TotFileMergeZapEnd uint64 - var numFilesOnDisk, numBytesUsedDisk uint64 + TotFileMergeIntroductions uint64 + TotFileMergeIntroductionsDone uint64 - for _, finfo := range finfos { - if !finfo.IsDir() { - numBytesUsedDisk += uint64(finfo.Size()) - numFilesOnDisk++ - } - } + TotMemMergeBeg uint64 + TotMemMergeErr uint64 + TotMemMergeDone uint64 + TotMemMergeZapBeg uint64 + TotMemMergeZapEnd uint64 + TotMemMergeSegments uint64 +} - m["TotOnDiskBytes"] = numBytesUsedDisk - m["TotOnDiskFiles"] = numFilesOnDisk +// atomically populates the returned map +func (s *Stats) ToMap() map[string]interface{} { + m := map[string]interface{}{} + sve := reflect.ValueOf(s).Elem() + svet := sve.Type() + for i := 0; i < svet.NumField(); i++ { + svef := sve.Field(i) + if svef.CanAddr() { + svefp := svef.Addr().Interface() + m[svet.Field(i).Name] = atomic.LoadUint64(svefp.(*uint64)) + } } - - return m, nil + return m } -// MarshalJSON implements json.Marshaler +// MarshalJSON implements json.Marshaler, and in contrast to standard +// json marshaling provides atomic safety func (s *Stats) MarshalJSON() ([]byte, error) { - m, err := s.statsMap() - if err != nil { - return nil, err - } - return json.Marshal(m) + return json.Marshal(s.ToMap()) } From 0363b24dd4e16d2546f55a94d076174232983dec Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 1 Mar 2018 07:31:53 -0800 Subject: [PATCH 233/728] update to use new vellum Reset API --- index/scorch/segment/zap/build.go | 20 +++++++++++--------- index/scorch/segment/zap/merge.go | 18 +++++++++++------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/index/scorch/segment/zap/build.go 
b/index/scorch/segment/zap/build.go index 72357ae7d..77f18b05c 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -471,16 +471,11 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs varintBuf := make([]byte, binary.MaxVarintLen64) var buffer bytes.Buffer + builder, err := vellum.New(&buffer, nil) + if err != nil { + return nil, err + } for fieldID, fieldTerms := range memSegment.DictKeys { - if fieldID != 0 { - buffer.Reset() - } - - // start a new vellum for this field - builder, err := vellum.New(&buffer, nil) - if err != nil { - return nil, err - } dict := memSegment.Dicts[fieldID] // now walk the dictionary in order of fieldTerms (already sorted) @@ -514,6 +509,13 @@ func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs if err != nil { return nil, err } + + // reset buffer and vellum builder + buffer.Reset() + err = builder.Reset(&buffer) + if err != nil { + return nil, err + } } return rv, nil diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index ae8c5b197..33ce16c59 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -180,16 +180,13 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var docTermMap [][]byte var vellumBuf bytes.Buffer + newVellum, err := vellum.New(&vellumBuf, nil) + if err != nil { + return nil, 0, err + } // for each field for fieldID, fieldName := range fieldsInv { - if fieldID != 0 { - vellumBuf.Reset() - } - newVellum, err := vellum.New(&vellumBuf, nil) - if err != nil { - return nil, 0, err - } // collect FST iterators from all active segments for this field var newDocNums [][]uint64 @@ -423,6 +420,13 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if err != nil { return nil, 0, err } + + // reset vellum buffer and vellum builder + vellumBuf.Reset() + err = newVellum.Reset(&vellumBuf) + if err != nil { + return 
nil, 0, err + } } fieldDvLocsOffset := uint64(w.Count()) From 7e5bb0bd8d5ca2ab81fb52f2e65baf265d499698 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 1 Mar 2018 14:13:36 -0800 Subject: [PATCH 234/728] renamed to CurOnDiskBytes/Files as those are gauges --- index/scorch/scorch.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index d3b9b36a1..226c4647e 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -393,8 +393,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { } } - m["TotOnDiskBytes"] = numBytesUsedDisk - m["TotOnDiskFiles"] = numFilesOnDisk + m["CurOnDiskBytes"] = numBytesUsedDisk + m["CurOnDiskFiles"] = numFilesOnDisk } } @@ -411,8 +411,8 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] m["num_items_introduced"] = m["TotIntroducedItems"] m["num_items_persisted"] = m["TotPersistedItems"] - m["num_bytes_used_disk"] = m["TotOnDiskBytes"] - m["num_files_on_disk"] = m["TotOnDiskFiles"] + m["num_bytes_used_disk"] = m["CurOnDiskBytes"] + m["num_files_on_disk"] = m["CurOnDiskFiles"] return m } From 868a66279e41cd55284257df747d1cc3386a3b6f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 2 Mar 2018 11:07:37 -0800 Subject: [PATCH 235/728] scorch indexing time stat Looks like this was forgotten along the way -- the stat for analysis time was tracked correctly, but indexing time wasn't. 
--- index/scorch/scorch.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 226c4647e..86039a7e4 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -278,6 +278,8 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) + indexStart := time.Now() + // notify handlers that we're about to introduce a segment s.fireEvent(EventKindBatchIntroductionStart, 0) @@ -303,6 +305,9 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotBatches, 1) atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) } + + atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart))) + return err } From d61d9e4cf6efbf7baa77649d4e1cedf160f9867a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 2 Mar 2018 11:17:16 -0800 Subject: [PATCH 236/728] scorch stats MaxBatchIntroTime and TotBatchIntroTime --- index/scorch/scorch.go | 8 ++++++++ index/scorch/stats.go | 14 +++++++++----- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 86039a7e4..2b85b69dd 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -344,6 +344,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, _ = root.DecRef() + introStartTime := time.Now() + s.introductions <- introduction // block until this segment is applied @@ -356,6 +358,12 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, err = <-introduction.persisted } + introTime := uint64(time.Since(introStartTime)) + atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) + if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { + atomic.AddUint64(&s.stats.MaxBatchIntroTime, introTime) + } + return err } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index d239e62ec..21e6d0188 100644 --- a/index/scorch/stats.go +++ 
b/index/scorch/stats.go @@ -25,11 +25,15 @@ import ( // and fields that are prefixed like TotXxxx are monotonically // increasing counters. type Stats struct { - TotUpdates uint64 - TotDeletes uint64 - TotBatches uint64 - TotBatchesEmpty uint64 - TotOnErrors uint64 + TotUpdates uint64 + TotDeletes uint64 + + TotBatches uint64 + TotBatchesEmpty uint64 + TotBatchIntroTime uint64 + MaxBatchIntroTime uint64 + + TotOnErrors uint64 TotAnalysisTime uint64 TotIndexTime uint64 From 30acc55d05b12dc05453dfb5c8b3cf98fcfbc3dc Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 2 Mar 2018 14:03:54 -0800 Subject: [PATCH 237/728] remove unnecessary scorch reader wrapper we now use *IndexSnapshot directly --- index/scorch/reader.go | 110 --------------------------------- index/scorch/scorch.go | 4 +- index/scorch/scorch_test.go | 8 +-- index/scorch/snapshot_index.go | 28 +++++++++ 4 files changed, 34 insertions(+), 116 deletions(-) delete mode 100644 index/scorch/reader.go diff --git a/index/scorch/reader.go b/index/scorch/reader.go deleted file mode 100644 index 365ecb670..000000000 --- a/index/scorch/reader.go +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package scorch - -import ( - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -type Reader struct { - root *IndexSnapshot // Owns 1 ref-count on the index snapshot. 
-} - -func (r *Reader) TermFieldReader(term []byte, field string, includeFreq, - includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { - return r.root.TermFieldReader(term, field, includeFreq, includeNorm, includeTermVectors) -} - -// DocIDReader returns an iterator over all doc ids -// The caller must close returned instance to release associated resources. -func (r *Reader) DocIDReaderAll() (index.DocIDReader, error) { - return r.root.DocIDReaderAll() -} - -func (r *Reader) DocIDReaderOnly(ids []string) (index.DocIDReader, error) { - return r.root.DocIDReaderOnly(ids) -} - -func (r *Reader) FieldDict(field string) (index.FieldDict, error) { - return r.root.FieldDict(field) -} - -// FieldDictRange is currently defined to include the start and end terms -func (r *Reader) FieldDictRange(field string, startTerm []byte, - endTerm []byte) (index.FieldDict, error) { - return r.root.FieldDictRange(field, startTerm, endTerm) -} - -func (r *Reader) FieldDictPrefix(field string, - termPrefix []byte) (index.FieldDict, error) { - return r.root.FieldDictPrefix(field, termPrefix) -} - -func (r *Reader) Document(id string) (*document.Document, error) { - return r.root.Document(id) -} -func (r *Reader) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, - visitor index.DocumentFieldTermVisitor) error { - return r.root.DocumentVisitFieldTerms(id, fields, visitor) -} - -func (r *Reader) Fields() ([]string, error) { - return r.root.Fields() -} - -func (r *Reader) GetInternal(key []byte) ([]byte, error) { - return r.root.GetInternal(key) -} - -func (r *Reader) DocCount() (uint64, error) { - return r.root.DocCount() -} - -func (r *Reader) ExternalID(id index.IndexInternalID) (string, error) { - return r.root.ExternalID(id) -} - -func (r *Reader) InternalID(id string) (index.IndexInternalID, error) { - return r.root.InternalID(id) -} - -func (r *Reader) DumpAll() chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return 
rv -} - -func (r *Reader) DumpDoc(id string) chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return rv -} - -func (r *Reader) DumpFields() chan interface{} { - rv := make(chan interface{}) - go func() { - close(rv) - }() - return rv -} - -func (r *Reader) Close() error { - return r.root.DecRef() -} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 2b85b69dd..fd2376777 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -383,8 +383,8 @@ func (s *Scorch) DeleteInternal(key []byte) error { // release associated resources. func (s *Scorch) Reader() (index.IndexReader, error) { s.rootLock.RLock() - rv := &Reader{root: s.root} - rv.root.AddRef() + rv := s.root + rv.AddRef() s.rootLock.RUnlock() return rv, nil } diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 87e9bdb21..3be52cb1f 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -680,7 +680,7 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } - if len(indexReader.(*Reader).root.segment) != 0 { + if len(indexReader.(*IndexSnapshot).segment) != 0 { t.Errorf("expected 0 segments") } @@ -709,7 +709,7 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } - if len(indexReader2.(*Reader).root.segment) != 0 { + if len(indexReader2.(*IndexSnapshot).segment) != 0 { t.Errorf("expected 0 segments") } @@ -738,7 +738,7 @@ func TestIndexInternalCRUD(t *testing.T) { t.Error(err) } - if len(indexReader3.(*Reader).root.segment) != 0 { + if len(indexReader3.(*IndexSnapshot).segment) != 0 { t.Errorf("expected 0 segments") } @@ -831,7 +831,7 @@ func TestIndexBatch(t *testing.T) { } }() - numSegments := len(indexReader.(*Reader).root.segment) + numSegments := len(indexReader.(*IndexSnapshot).segment) if numSegments <= 0 { t.Errorf("expected some segments, got: %d", numSegments) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7236eeb7e..5289b1434 100644 --- 
a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -85,6 +85,10 @@ func (i *IndexSnapshot) DecRef() (err error) { return err } +func (i *IndexSnapshot) Close() error { + return i.DecRef() +} + func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) @@ -501,3 +505,27 @@ func extractDvPendingFields(requestedFields, persistedFields []string) []string } return rv } + +func (i *IndexSnapshot) DumpAll() chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv +} + +func (i *IndexSnapshot) DumpDoc(id string) chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv +} + +func (i *IndexSnapshot) DumpFields() chan interface{} { + rv := make(chan interface{}) + go func() { + close(rv) + }() + return rv +} From a5253bfe2b47c91dd735f065d47d5f535e20ca67 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 2 Mar 2018 14:30:17 -0800 Subject: [PATCH 238/728] scorch persister goes through introducer to affect root This change allows the introducer to become the only goroutine to modify the root, which in turn allows the introducer to greatly reduce its root lock holding surface area. 
--- index/scorch/introducer.go | 141 +++++++++++++++++++++++++++---------- index/scorch/persister.go | 51 ++++---------- index/scorch/scorch.go | 4 ++ index/scorch/stats.go | 2 + 4 files changed, 124 insertions(+), 74 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index b6007b4ac..7f1d0073e 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -33,6 +33,11 @@ type segmentIntroduction struct { persisted chan error } +type persistIntroduction struct { + persisted map[uint64]segment.Segment + applied notificationChan +} + type epochWatcher struct { epoch uint64 notifyCh notificationChan @@ -66,6 +71,9 @@ OUTER: continue OUTER } + case persist := <-s.persists: + s.introducePersist(persist) + case revertTo := <-s.revertToSnapshots: err := s.revertToSnapshot(revertTo) if err != nil { @@ -97,32 +105,30 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { atomic.AddUint64(&s.stats.TotIntroduceSegmentBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroduceSegmentEnd, 1) - // acquire lock - s.rootLock.Lock() + s.rootLock.RLock() + root := s.root + s.rootLock.RUnlock() - nsegs := len(s.root.segment) + nsegs := len(root.segment) // prepare new index snapshot newSnapshot := &IndexSnapshot{ parent: s, segment: make([]*SegmentSnapshot, 0, nsegs+1), offsets: make([]uint64, 0, nsegs+1), - internal: make(map[string][]byte, len(s.root.internal)), - epoch: s.nextSnapshotEpoch, + internal: make(map[string][]byte, len(root.internal)), refs: 1, } - s.nextSnapshotEpoch++ // iterate through current segments var running uint64 - for i := range s.root.segment { + for i := range root.segment { // see if optimistic work included this segment - delta, ok := next.obsoletes[s.root.segment[i].id] + delta, ok := next.obsoletes[root.segment[i].id] if !ok { var err error - delta, err = s.root.segment[i].segment.DocNumbers(next.ids) + delta, err = root.segment[i].segment.DocNumbers(next.ids) if err != nil { - s.rootLock.Unlock() 
next.applied <- fmt.Errorf("error computing doc numbers: %v", err) close(next.applied) _ = newSnapshot.DecRef() @@ -131,24 +137,24 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } newss := &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - cachedDocs: s.root.segment[i].cachedDocs, + id: root.segment[i].id, + segment: root.segment[i].segment, + cachedDocs: root.segment[i].cachedDocs, } // apply new obsoletions - if s.root.segment[i].deleted == nil { + if root.segment[i].deleted == nil { newss.deleted = delta } else { - newss.deleted = roaring.Or(s.root.segment[i].deleted, delta) + newss.deleted = roaring.Or(root.segment[i].deleted, delta) } // check for live size before copying if newss.LiveSize() > 0 { newSnapshot.segment = append(newSnapshot.segment, newss) - s.root.segment[i].segment.AddRef() + root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += s.root.segment[i].Count() + running += root.segment[i].Count() } } @@ -168,7 +174,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { atomic.AddUint64(&s.stats.TotIntroducedSegmentsBatch, 1) } // copy old values - for key, oldVal := range s.root.internal { + for key, oldVal := range root.internal { newSnapshot.internal[key] = oldVal } // set new values and apply deletes @@ -179,10 +185,14 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { delete(newSnapshot.internal, key) } } + + s.rootLock.Lock() if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) } // swap in new index snapshot + newSnapshot.epoch = s.nextSnapshotEpoch + s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot // release lock @@ -197,34 +207,89 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { return nil } +func (s *Scorch) introducePersist(persist *persistIntroduction) { + atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1) + defer 
atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1) + + s.rootLock.RLock() + root := s.root + s.rootLock.RUnlock() + + newIndexSnapshot := &IndexSnapshot{ + parent: s, + epoch: s.nextSnapshotEpoch, + segment: make([]*SegmentSnapshot, len(root.segment)), + offsets: make([]uint64, len(root.offsets)), + internal: make(map[string][]byte, len(root.internal)), + refs: 1, + } + s.nextSnapshotEpoch++ + + for i, segmentSnapshot := range root.segment { + // see if this segment has been replaced + if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { + newSegmentSnapshot := &SegmentSnapshot{ + id: segmentSnapshot.id, + segment: replacement, + deleted: segmentSnapshot.deleted, + cachedDocs: segmentSnapshot.cachedDocs, + } + newIndexSnapshot.segment[i] = newSegmentSnapshot + delete(persist.persisted, segmentSnapshot.id) + + // update items persisted incase of a new segment snapshot + atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) + atomic.AddUint64(&s.stats.TotPersistedSegments, 1) + } else { + newIndexSnapshot.segment[i] = root.segment[i] + newIndexSnapshot.segment[i].segment.AddRef() + } + newIndexSnapshot.offsets[i] = root.offsets[i] + } + + for k, v := range root.internal { + newIndexSnapshot.internal[k] = v + } + + s.rootLock.Lock() + rootPrev := s.root + s.root = newIndexSnapshot + s.rootLock.Unlock() + + if rootPrev != nil { + _ = rootPrev.DecRef() + } + + close(persist.applied) +} + func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) - // acquire lock - s.rootLock.Lock() + s.rootLock.RLock() + root := s.root + s.rootLock.RUnlock() newSnapshot := &IndexSnapshot{ parent: s, - internal: s.root.internal, - epoch: s.nextSnapshotEpoch, + internal: root.internal, refs: 1, } - s.nextSnapshotEpoch++ // iterate through current segments newSegmentDeleted := roaring.NewBitmap() var running uint64 - for i := range s.root.segment { 
- segmentID := s.root.segment[i].id + for i := range root.segment { + segmentID := root.segment[i].id if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { // this segment is going away, see if anything else was deleted since we started the merge - if segSnapAtMerge != nil && s.root.segment[i].deleted != nil { + if segSnapAtMerge != nil && root.segment[i].deleted != nil { // assume all these deletes are new - deletedSince := s.root.segment[i].deleted + deletedSince := root.segment[i].deleted // if we already knew about some of them, remove if segSnapAtMerge.deleted != nil { - deletedSince = roaring.AndNot(s.root.segment[i].deleted, segSnapAtMerge.deleted) + deletedSince = roaring.AndNot(root.segment[i].deleted, segSnapAtMerge.deleted) } deletedSinceItr := deletedSince.Iterator() for deletedSinceItr.HasNext() { @@ -238,18 +303,17 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // segments left behind in old map after processing // the root segments would be the obsolete segment set delete(nextMerge.old, segmentID) - - } else if s.root.segment[i].LiveSize() > 0 { + } else if root.segment[i].LiveSize() > 0 { // this segment is staying newSnapshot.segment = append(newSnapshot.segment, &SegmentSnapshot{ - id: s.root.segment[i].id, - segment: s.root.segment[i].segment, - deleted: s.root.segment[i].deleted, - cachedDocs: s.root.segment[i].cachedDocs, + id: root.segment[i].id, + segment: root.segment[i].segment, + deleted: root.segment[i].deleted, + cachedDocs: root.segment[i].cachedDocs, }) - s.root.segment[i].segment.AddRef() + root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += s.root.segment[i].Count() + running += root.segment[i].Count() } } @@ -284,7 +348,10 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSnapshot.AddRef() // 1 ref for the nextMerge.notify response - // swap in new segment + s.rootLock.Lock() + // swap in new index snapshot + newSnapshot.epoch = s.nextSnapshotEpoch 
+ s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot // release lock diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 6ffbd44d4..f1a372e72 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -401,46 +401,23 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { } } - s.rootLock.Lock() - newIndexSnapshot := &IndexSnapshot{ - parent: s, - epoch: s.nextSnapshotEpoch, - segment: make([]*SegmentSnapshot, len(s.root.segment)), - offsets: make([]uint64, len(s.root.offsets)), - internal: make(map[string][]byte, len(s.root.internal)), - refs: 1, - } - s.nextSnapshotEpoch++ - for i, segmentSnapshot := range s.root.segment { - // see if this segment has been replaced - if replacement, ok := newSegments[segmentSnapshot.id]; ok { - newSegmentSnapshot := &SegmentSnapshot{ - id: segmentSnapshot.id, - segment: replacement, - deleted: segmentSnapshot.deleted, - cachedDocs: segmentSnapshot.cachedDocs, - } - newIndexSnapshot.segment[i] = newSegmentSnapshot - delete(newSegments, segmentSnapshot.id) - - // update items persisted incase of a new segment snapshot - atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) - atomic.AddUint64(&s.stats.TotPersistedSegments, 1) - } else { - newIndexSnapshot.segment[i] = s.root.segment[i] - newIndexSnapshot.segment[i].segment.AddRef() - } - newIndexSnapshot.offsets[i] = s.root.offsets[i] + persist := &persistIntroduction{ + persisted: newSegments, + applied: make(notificationChan), } - for k, v := range s.root.internal { - newIndexSnapshot.internal[k] = v + + select { + case <-s.closeCh: + err = ErrClosed + return err + case s.persists <- persist: } - rootPrev := s.root - s.root = newIndexSnapshot - s.rootLock.Unlock() - if rootPrev != nil { - _ = rootPrev.DecRef() + select { + case <-s.closeCh: + err = ErrClosed + return err + case <-persist.applied: } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index fd2376777..87372a326 
100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -39,6 +39,8 @@ const Name = "scorch" const Version uint8 = 1 +var ErrClosed = fmt.Errorf("scorch closed") + type Scorch struct { readOnly bool version uint8 @@ -59,6 +61,7 @@ type Scorch struct { closeCh chan struct{} introductions chan *segmentIntroduction + persists chan *persistIntroduction merges chan *segmentMerge introducerNotifier chan *epochWatcher revertToSnapshots chan *snapshotReversion @@ -174,6 +177,7 @@ func (s *Scorch) openBolt() error { } s.introductions = make(chan *segmentIntroduction) + s.persists = make(chan *persistIntroduction) s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan *epochWatcher, 1) s.revertToSnapshots = make(chan *snapshotReversion) diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 21e6d0188..cd416a7c0 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -46,6 +46,8 @@ type Stats struct { TotIntroduceLoop uint64 TotIntroduceSegmentBeg uint64 TotIntroduceSegmentEnd uint64 + TotIntroducePersistBeg uint64 + TotIntroducePersistEnd uint64 TotIntroduceMergeBeg uint64 TotIntroduceMergeEnd uint64 TotIntroduceRevertBeg uint64 From 88c740095b7a5224c98e7538a0ee3c7bca574ee2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 10:59:53 -0800 Subject: [PATCH 239/728] scorch optimizations for mem.PostingsIterator.Next() & docTermMap Due to the usage rules of iterators, mem.PostingsIterator.Next() can reuse its returned Postings instance. Also, there's a micro optimization in persistDocValues() for one fewer access to the docTermMap in the inner-loop. 
--- index/scorch/segment/mem/posting.go | 6 +++--- index/scorch/segment/zap/build.go | 5 ++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index d91a00561..25cbeb458 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -73,6 +73,7 @@ type PostingsIterator struct { offset int locoffset int actual roaring.IntIterable + reuse Posting } // Next returns the next posting on the postings list, or nil at the end @@ -92,17 +93,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { i.offset++ allN = i.all.Next() } - rv := &Posting{ + i.reuse = Posting{ iterator: i, docNum: uint64(n), offset: i.offset, locoffset: i.locoffset, hasLoc: i.locations.Contains(n), } - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) i.offset++ - return rv, nil + return &i.reuse, nil } // Posting is a single entry in a postings list diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 77f18b05c..b075496cf 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -552,8 +552,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() - docTermMap[docNum] = append(docTermMap[docNum], []byte(next.Term)...) 
- docTermMap[docNum] = append(docTermMap[docNum], termSeparator) + docTermMap[docNum] = append(append(docTermMap[docNum], []byte(next.Term)...), termSeparator) nextPosting, err2 = postingsItr.Next() } if err2 != nil { @@ -562,10 +561,10 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, next, err = dictItr.Next() } - if err != nil { return nil, err } + // sort wrt to docIDs var docNumbers docIDRange for k := range docTermMap { From b7cfef81c9a86d590b2a9ccdaed184c3e1bbcdef Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:43:22 -0800 Subject: [PATCH 240/728] scorch optimize mem processDocument() dict access This change moves the dict lookup to outside of the loop. --- index/scorch/segment/mem/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 57d60dc89..2a2683dca 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -252,8 +252,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { + dict := s.Dicts[fieldID] for term, tokenFreq := range tokenFrequencies { - pid := s.Dicts[fieldID][term] - 1 + pid := dict[term] - 1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) From 6ae799052a8eaf945a769f526c209bd1385d3fbf Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:52:27 -0800 Subject: [PATCH 241/728] scorch mem optimize processDocument() stored field --- index/scorch/segment/mem/build.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 2a2683dca..3a892f9ac 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -222,12 +222,6 @@ func (s *Segment) processDocument(result 
*index.AnalysisResult) { } } - storeField := func(docNum uint64, field uint16, typ byte, val []byte, pos []uint64) { - s.Stored[docNum][field] = append(s.Stored[docNum][field], val) - s.StoredTypes[docNum][field] = append(s.StoredTypes[docNum][field], typ) - s.StoredPos[docNum][field] = append(s.StoredPos[docNum][field], pos) - } - // walk each composite field for _, field := range result.Document.CompositeFields { fieldID := uint16(s.getOrDefineField(field.Name())) @@ -235,6 +229,10 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { processField(fieldID, field.Name(), l, tf) } + docStored := s.Stored[docNum] + docStoredTypes := s.StoredTypes[docNum] + docStoredPos := s.StoredPos[docNum] + // walk each field for i, field := range result.Document.Fields { fieldID := uint16(s.getOrDefineField(field.Name())) @@ -242,7 +240,9 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { tf := result.Analyzed[i] processField(fieldID, field.Name(), l, tf) if field.Options().IsStored() { - storeField(docNum, fieldID, encodeFieldType(field), field.Value(), field.ArrayPositions()) + docStored[fieldID] = append(docStored[fieldID], field.Value()) + docStoredTypes[fieldID] = append(docStoredTypes[fieldID], encodeFieldType(field)) + docStoredPos[fieldID] = append(docStoredPos[fieldID], field.ArrayPositions()) } if field.Options().IncludeDocValues() { From 884da6f93a3c0b9a4567501ad4ddf96eea739227 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 11:58:28 -0800 Subject: [PATCH 242/728] scorch optimize mem processDocument() norm calculation This change moves the norm calculation outside of the inner loop. 
--- index/scorch/segment/mem/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 3a892f9ac..643ae36e4 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -253,12 +253,13 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { dict := s.Dicts[fieldID] + norm := float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))) for term, tokenFreq := range tokenFrequencies { pid := dict[term] - 1 bs := s.Postings[pid] bs.AddInt(int(docNum)) s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) - s.Norms[pid] = append(s.Norms[pid], float32(1.0/math.Sqrt(float64(fieldLens[fieldID])))) + s.Norms[pid] = append(s.Norms[pid], norm) locationBS := s.PostingsLocs[pid] if len(tokenFreq.Locations) > 0 { locationBS.AddInt(int(docNum)) From dec265c4810fbe0c6584638eb6ac6521ce7112ba Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 5 Mar 2018 16:32:57 +0530 Subject: [PATCH 243/728] adding compaction_written_bytes/sec stats to scorch --- index/scorch/merge.go | 2 +- index/scorch/scorch.go | 1 + index/scorch/segment/zap/merge.go | 9 ++++++++- index/scorch/stats.go | 6 ++++++ 4 files changed, 16 insertions(+), 2 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 005d4f41e..67e1590ac 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -180,7 +180,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024, &s.stats) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) if err != nil { 
s.unmarkIneligibleForRemoval(filename) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 87372a326..8a2b1ec3b 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -430,6 +430,7 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_items_persisted"] = m["TotPersistedItems"] m["num_bytes_used_disk"] = m["CurOnDiskBytes"] m["num_files_on_disk"] = m["CurOnDiskFiles"] + m["total_compaction_written_bytes"] = m["TotCompactionWrittenBytes"] return m } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 33ce16c59..a9383f207 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -31,12 +31,17 @@ import ( const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc +// StatsReporter interface represents stats reporting methods. +type StatsReporter interface { + ReportBytesWritten(numBytesWritten uint64) +} + // Merge takes a slice of zap segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32) ([][]uint64, error) { + chunkFactor uint32, stats StatsReporter) ([][]uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) @@ -92,6 +97,8 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return nil, err } + stats.ReportBytesWritten(uint64(cr.Count())) + return newDocNums, nil } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index cd416a7c0..4f8c8b99e 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -100,6 +100,8 @@ type Stats struct { TotMemMergeZapBeg uint64 TotMemMergeZapEnd uint64 TotMemMergeSegments uint64 + + TotCompactionWrittenBytes uint64 } // atomically populates the returned map @@ -122,3 +124,7 @@ func (s *Stats) ToMap() map[string]interface{} { func (s *Stats) MarshalJSON() ([]byte, error) { return json.Marshal(s.ToMap()) } + +func (s *Stats) ReportBytesWritten(numBytesWritten uint64) { + atomic.AddUint64(&s.TotCompactionWrittenBytes, numBytesWritten) +} From 395b0a312dd078b115cfdeaa0888a8cf9afa0e95 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 5 Mar 2018 17:02:58 +0530 Subject: [PATCH 244/728] adding UTs --- index/scorch/segment/zap/merge.go | 4 +- index/scorch/segment/zap/merge_test.go | 96 ++++++++++++++++++++++++-- 2 files changed, 95 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index a9383f207..9399046b0 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -97,7 +97,9 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, return nil, err } - stats.ReportBytesWritten(uint64(cr.Count())) + if stats != nil { + stats.ReportBytesWritten(uint64(cr.Count())) + } return newDocNums, nil } diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index bb09f8314..2ba0b373a 100644 --- 
a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -20,6 +20,7 @@ import ( "reflect" "sort" "strings" + "sync/atomic" "testing" "github.com/RoaringBitmap/roaring" @@ -72,7 +73,7 @@ func TestMerge(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -176,7 +177,7 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) drops := make([]*roaring.Bitmap, len(segsToMerge)) - _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024) + _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -218,7 +219,7 @@ func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { segsToMerge := make([]*Segment, 1) segsToMerge[0] = segCur - _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024) + _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil) if err != nil { t.Fatal(err) } @@ -590,7 +591,7 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { _ = os.RemoveAll("/tmp/scorch-merged.zap") - _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024) + _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -782,3 +783,90 @@ func buildMemSegmentMultiHelper(docIds []string) *mem.Segment { return segment } + +type statTest struct { + totalWrittenBytes uint64 +} + +func (s *statTest) ReportBytesWritten(numBytesWritten uint64) { + atomic.AddUint64(&s.totalWrittenBytes, numBytesWritten) +} + +func TestMergeBytesWritten(t *testing.T) { + _ = 
os.RemoveAll("/tmp/scorch.zap") + _ = os.RemoveAll("/tmp/scorch2.zap") + _ = os.RemoveAll("/tmp/scorch3.zap") + + memSegment := buildMemSegmentMulti() + err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + if err != nil { + t.Fatal(err) + } + + memSegment2 := buildMemSegmentMulti2() + err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + if err != nil { + t.Fatal(err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segment2, err := Open("/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment2.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + segsToMerge := make([]*Segment, 2) + segsToMerge[0] = segment.(*Segment) + segsToMerge[1] = segment2.(*Segment) + + reporter := &statTest{} + + _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, reporter) + if err != nil { + t.Fatal(err) + } + + if reporter.totalWrittenBytes == 0 { + t.Fatalf("expected a non zero total_compaction_written_bytes") + } + + segm, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + seg3 := segm.(*Segment) + defer func() { + cerr := seg3.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + if seg3.Path() != "/tmp/scorch3.zap" { + t.Fatalf("wrong path") + } + if seg3.Count() != 4 { + t.Fatalf("wrong count") + } + if len(seg3.Fields()) != 5 { + t.Fatalf("wrong # fields: %#v\n", seg3.Fields()) + } + + testMergeWithSelf(t, seg3, 4) +} From d44c5ad5682450baff48ead3345d69f9ec5399cc Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 19:32:39 -0800 Subject: [PATCH 245/728] scorch stats MaxBatchIntroTime bug fix and more timing stats Added timing stats for in-mem zap merging and 
file-based zap merging. --- index/scorch/merge.go | 19 +++++++++++++++++++ index/scorch/scorch.go | 2 +- index/scorch/stats.go | 8 ++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 005d4f41e..ee3ec46c5 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -179,9 +179,19 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, filename := zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename + + fileMergeZapStartTime := time.Now() + atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) + + fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { + atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) + } + if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -258,11 +268,20 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, cr := zap.NewCountHashWriter(&br) + memMergeZapStartTime := time.Now() + atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, err := zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) + + memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) + atomic.AddUint64(&s.stats.TotMemMergeZapTime, memMergeZapTime) + if atomic.LoadUint64(&s.stats.MaxMemMergeZapTime) < memMergeZapTime { + atomic.StoreUint64(&s.stats.MaxMemMergeZapTime, memMergeZapTime) + } + if err != nil { atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return 0, nil, 0, err diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 
87372a326..a40f374ac 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -365,7 +365,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, introTime := uint64(time.Since(introStartTime)) atomic.AddUint64(&s.stats.TotBatchIntroTime, introTime) if atomic.LoadUint64(&s.stats.MaxBatchIntroTime) < introTime { - atomic.AddUint64(&s.stats.MaxBatchIntroTime, introTime) + atomic.StoreUint64(&s.stats.MaxBatchIntroTime, introTime) } return err diff --git a/index/scorch/stats.go b/index/scorch/stats.go index cd416a7c0..3c978af77 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -88,8 +88,10 @@ type Stats struct { TotFileMergeSegmentsEmpty uint64 TotFileMergeSegments uint64 - TotFileMergeZapBeg uint64 - TotFileMergeZapEnd uint64 + TotFileMergeZapBeg uint64 + TotFileMergeZapEnd uint64 + TotFileMergeZapTime uint64 + MaxFileMergeZapTime uint64 TotFileMergeIntroductions uint64 TotFileMergeIntroductionsDone uint64 @@ -99,6 +101,8 @@ type Stats struct { TotMemMergeDone uint64 TotMemMergeZapBeg uint64 TotMemMergeZapEnd uint64 + TotMemMergeZapTime uint64 + MaxMemMergeZapTime uint64 TotMemMergeSegments uint64 } From 85761c6a57987ece0b193042ddf953f3f360bad1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 3 Mar 2018 19:39:21 -0800 Subject: [PATCH 246/728] go fmt --- index/scorch/segment/mem/build.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 643ae36e4..57971aae5 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -253,7 +253,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { // now that its been rolled up into docMap, walk that for fieldID, tokenFrequencies := range docMap { dict := s.Dicts[fieldID] - norm := float32(1.0/math.Sqrt(float64(fieldLens[fieldID]))) + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) for term, tokenFreq := range tokenFrequencies { pid 
:= dict[term] - 1 bs := s.Postings[pid] From 8c0881eab2caef301835eb9b0660e0b727d06687 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:03:02 -0800 Subject: [PATCH 247/728] scorch zap build reuses mem postingsList/Iterator structs --- index/scorch/segment/mem/dict.go | 20 ++++++++++++++------ index/scorch/segment/mem/posting.go | 11 +++++++++-- index/scorch/segment/zap/build.go | 8 ++++++-- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index cf92ef71f..b564ed1fb 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -33,12 +33,20 @@ type Dictionary struct { // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return &PostingsList{ - dictionary: d, - term: term, - postingsID: d.segment.Dicts[d.fieldID][term], - except: except, - }, nil + return d.InitPostingsList(term, except, nil) +} + +func (d *Dictionary) InitPostingsList(term string, except *roaring.Bitmap, + prealloc *PostingsList) (*PostingsList, error) { + rv := prealloc + if rv == nil { + rv = &PostingsList{} + } + rv.dictionary = d + rv.term = term + rv.postingsID = d.segment.Dicts[d.fieldID][term] + rv.except = except + return rv, nil } // Iterator returns an iterator for this dictionary diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 25cbeb458..2554333a2 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -46,9 +46,16 @@ func (p *PostingsList) Count() uint64 { // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { - rv := &PostingsIterator{ - postings: p, + return p.InitIterator(nil) +} +func (p *PostingsList) InitIterator(prealloc *PostingsIterator) *PostingsIterator { + rv := prealloc + if rv == nil { + rv = 
&PostingsIterator{postings: p} + } else { + *rv = PostingsIterator{postings: p} } + if p.postingsID > 0 { allbits := p.dictionary.segment.Postings[p.postingsID-1] rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index b075496cf..7fbc995fe 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -532,6 +532,9 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) + var postings *mem.PostingsList + var postingsItr *mem.PostingsIterator + for fieldID := range memSegment.DocValueFields { field := memSegment.FieldsInv[fieldID] docTermMap := make(map[uint64][]byte, 0) @@ -543,12 +546,13 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, dictItr := dict.Iterator() next, err := dictItr.Next() for err == nil && next != nil { - postings, err1 := dict.PostingsList(next.Term, nil) + var err1 error + postings, err1 = dict.(*mem.Dictionary).InitPostingsList(next.Term, nil, postings) if err1 != nil { return nil, err } - postingsItr := postings.Iterator() + postingsItr = postings.InitIterator(postingsItr) nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() From 856778ad7bb4d4c8433a504de1302360ac4a5dbc Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:06:45 -0800 Subject: [PATCH 248/728] scorch zap build prealloc docNumbers capacity --- index/scorch/segment/zap/build.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 7fbc995fe..4edd277d0 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -570,7 +570,7 @@ func persistDocValues(memSegment *mem.Segment, w 
*CountHashWriter, } // sort wrt to docIDs - var docNumbers docIDRange + docNumbers := make(docIDRange, 0, len(docTermMap)) for k := range docTermMap { docNumbers = append(docNumbers, k) } From a338386a038594f37d5b7901c4d3a73e53424787 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 12:56:33 -0800 Subject: [PATCH 249/728] scorch build optimize freq/loc slice capacity --- index/scorch/segment/zap/build.go | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 4edd277d0..eec9998bd 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -308,7 +308,7 @@ func persistStoredFieldValues(fieldID int, } func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { - var freqOffsets, locOfffsets []uint64 + freqOffsets := make([]uint64, 0, len(memSegment.Postings)) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for postingID := range memSegment.Postings { if postingID != 0 { @@ -351,6 +351,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // now do it again for the locations + locOffsets := make([]uint64, 0, len(memSegment.Postings)) locEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) for postingID := range memSegment.Postings { if postingID != 0 { @@ -414,14 +415,15 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac } // record where this postings loc info starts - locOfffsets = append(locOfffsets, uint64(w.Count())) + locOffsets = append(locOffsets, uint64(w.Count())) locEncoder.Close() _, err := locEncoder.Write(w) if err != nil { return nil, nil, err } } - return freqOffsets, locOfffsets, nil + + return freqOffsets, locOffsets, nil } func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { 
From 8f8fd511b7d2a6f221a5cc51641017b4be9c2a40 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 4 Mar 2018 13:01:22 -0800 Subject: [PATCH 250/728] scorch zap access freqs[offset] outside loop --- index/scorch/segment/zap/build.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index eec9998bd..237cc5f3d 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -368,7 +368,8 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac var locOffset int for postingsListItr.HasNext() { docNum := uint64(postingsListItr.Next()) - for i := 0; i < int(freqs[offset]); i++ { + n := int(freqs[offset]) + for i := 0; i < n; i++ { if len(locfields) > 0 { // put field err := locEncoder.Add(docNum, uint64(locfields[locOffset])) From 502e64c2567c363d3cf0f2b1b542b9c1db74e973 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 5 Mar 2018 16:33:13 -0800 Subject: [PATCH 251/728] scorch zap Posting doesn't use iterator field --- index/scorch/segment/zap/posting.go | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index ada39b434..27d90f2b5 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -337,7 +337,6 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { reuseLocs := i.next.locs // hold for reuse before struct clearing i.next = Posting{} // clear the struct rv := &i.next - rv.iterator = i rv.docNum = uint64(n) var err error @@ -373,12 +372,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { // Posting is a single entry in a postings list type Posting struct { - iterator *PostingsIterator - docNum uint64 - - freq uint64 - norm float32 - locs []segment.Location + docNum uint64 + freq uint64 + norm float32 + locs []segment.Location } // Number returns the document number of 
this posting in this segment From fa5de8e09aae8ac706182333a2d9488111663936 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 6 Mar 2018 16:22:11 +0530 Subject: [PATCH 252/728] making NumSnapshotsToKeep configurable --- index/scorch/persister.go | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index f1a372e72..cab2d035d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -633,14 +633,19 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { return 0, err } - if len(persistedEpochs) <= NumSnapshotsToKeep { + numSnapshotsToKeep := NumSnapshotsToKeep + if val, ok := s.config["numSnapshotsToKeep"].(float64); ok && val > 0 { + numSnapshotsToKeep = int(val) + } + + if len(persistedEpochs) <= numSnapshotsToKeep { // we need to keep everything return 0, nil } // make a map of epochs to protect from deletion - protectedEpochs := make(map[uint64]struct{}, NumSnapshotsToKeep) - for _, epoch := range persistedEpochs[0:NumSnapshotsToKeep] { + protectedEpochs := make(map[uint64]struct{}, numSnapshotsToKeep) + for _, epoch := range persistedEpochs[0:numSnapshotsToKeep] { protectedEpochs[epoch] = struct{}{} } From 655268bec821e10be03d23734d60db6804f7cb82 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 07:55:26 -0800 Subject: [PATCH 253/728] scorch zap postings iterator nextDocNum() helper method Refactored out a nextDocNum() helper method from Next() that future optimizations can use. 
--- index/scorch/segment/zap/posting.go | 94 ++++++++++++++++------------- 1 file changed, 53 insertions(+), 41 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 27d90f2b5..adc399eaa 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -279,9 +279,56 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { + docNum, exists, err := i.nextDocNum() + if err != nil { + return nil, err + } + if !exists { return nil, nil } + + reuseLocs := i.next.locs // hold for reuse before struct clearing + i.next = Posting{} // clear the struct + rv := &i.next + rv.docNum = docNum + + var normBits uint64 + rv.freq, normBits, err = i.readFreqNorm() + if err != nil { + return nil, err + } + rv.norm = math.Float32frombits(uint32(normBits)) + if i.locBitmap.Contains(uint32(docNum)) { + // read off 'freq' locations, into reused slices + if cap(i.nextLocs) >= int(rv.freq) { + i.nextLocs = i.nextLocs[0:rv.freq] + } else { + i.nextLocs = make([]Location, rv.freq) + } + if cap(reuseLocs) >= int(rv.freq) { + rv.locs = reuseLocs[0:rv.freq] + } else { + rv.locs = make([]segment.Location, rv.freq) + } + for j := 0; j < int(rv.freq); j++ { + err := i.readLocation(&i.nextLocs[j]) + if err != nil { + return nil, err + } + rv.locs[j] = &i.nextLocs[j] + } + } + + return rv, nil +} + +// nextDocNum returns the next docNum on the postings list, and also +// sets up the currChunk / loc related fields of the iterator. 
+func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { + if i.actual == nil || !i.actual.HasNext() { + return 0, false, nil + } + n := i.actual.Next() nChunk := n / i.postings.sb.chunkFactor allN := i.all.Next() @@ -292,30 +339,28 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { // if they don't match, adjust offsets to factor in item we're skipping over // incr the all iterator, and check again for allN != n { - // in different chunks, reset offsets if allNChunk != nChunk { i.locoffset = 0 i.offset = 0 } else { - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) + return 0, false, fmt.Errorf("error loading chunk: %v", err) } } // read off freq/offsets even though we don't care about them freq, _, err := i.readFreqNorm() if err != nil { - return nil, err + return 0, false, err } if i.locBitmap.Contains(allN) { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { - return nil, err + return 0, false, err } } } @@ -330,44 +375,11 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { - return nil, fmt.Errorf("error loading chunk: %v", err) + return 0, false, fmt.Errorf("error loading chunk: %v", err) } } - reuseLocs := i.next.locs // hold for reuse before struct clearing - i.next = Posting{} // clear the struct - rv := &i.next - rv.docNum = uint64(n) - - var err error - var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() - if err != nil { - return nil, err - } - rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap.Contains(n) { - // read off 'freq' locations, into reused slices - if cap(i.nextLocs) >= int(rv.freq) { - i.nextLocs = i.nextLocs[0:rv.freq] - } else { - i.nextLocs = make([]Location, rv.freq) - } - if cap(reuseLocs) >= int(rv.freq) { - rv.locs = 
reuseLocs[0:rv.freq] - } else { - rv.locs = make([]segment.Location, rv.freq) - } - for j := 0; j < int(rv.freq); j++ { - err := i.readLocation(&i.nextLocs[j]) - if err != nil { - return nil, err - } - rv.locs[j] = &i.nextLocs[j] - } - } - - return rv, nil + return uint64(n), true, nil } // Posting is a single entry in a postings list From 530a3d24cf0768f4c7a82e9a61dd9a0eff3ec8a2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 07:58:42 -0800 Subject: [PATCH 254/728] scorch zap optimize merge by byte copying freq/norm/loc's This change adds a zap PostingsIterator.nextBytes() method, which is similar to Next(), but instead of returning a Posting instance, nextBytes() returns the encoded freq/norm and location byte slices. The zap merge code then provides those byte slices directly to the intCoder's via a new method, intCoder.AddBytes(), thereby avoiding having to encode many uvarint's. --- index/scorch/segment/zap/intcoder.go | 13 ++++++++ index/scorch/segment/zap/merge.go | 42 +++++++++----------------- index/scorch/segment/zap/posting.go | 44 +++++++++++++++++++++++++--- 3 files changed, 67 insertions(+), 32 deletions(-) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 247e36fbc..8d1f94536 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -82,6 +82,19 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { return nil } +func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { + chunk := docNum / c.chunkSize + if chunk != c.currChunk { + // starting a new chunk + c.Close() + c.chunkBuf.Reset() + c.currChunk = chunk + } + + _, err := c.chunkBuf.Write(buf) + return err +} + // Close indicates you are done calling Add() this allows the final chunk // to be encoded. 
func (c *chunkedIntCoder) Close() { diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 33ce16c59..5066dfb9e 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -162,7 +162,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var bufReuse bytes.Buffer var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) - var bufLoc []uint64 var postings *PostingsList var postItr *PostingsIterator @@ -316,45 +315,32 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, newDocNumsI := newDocNums[itrI] postItr = postings.iterator(postItr) - next, err2 := postItr.Next() - for next != nil && err2 == nil { - hitNewDocNum := newDocNumsI[next.Number()] + + nextDocNum, nextFreqNormBytes, nextLocBytes, err2 := postItr.nextBytes() + for err2 == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNumsI[nextDocNum] if hitNewDocNum == docDropped { return nil, 0, fmt.Errorf("see hit with dropped doc num") } + newRoaring.Add(uint32(hitNewDocNum)) - // encode norm bits - norm := next.Norm() - normBits := math.Float32bits(float32(norm)) - err = tfEncoder.Add(hitNewDocNum, next.Frequency(), uint64(normBits)) - if err != nil { - return nil, 0, err + err2 = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if err2 != nil { + return nil, 0, err2 } - locs := next.Locations() - if len(locs) > 0 { + + if len(nextLocBytes) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) - for _, loc := range locs { - if cap(bufLoc) < 5+len(loc.ArrayPositions()) { - bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) - } - args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()] - 1) - args[1] = loc.Pos() - args[2] = loc.Start() - args[3] = loc.End() - args[4] = uint64(len(loc.ArrayPositions())) - args = append(args, loc.ArrayPositions()...) - err = locEncoder.Add(hitNewDocNum, args...) 
- if err != nil { - return nil, 0, err - } + err2 = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err2 != nil { + return nil, 0, err2 } } docTermMap[hitNewDocNum] = append(append(docTermMap[hitNewDocNum], term...), termSeparator) - next, err2 = postItr.Next() + nextDocNum, nextFreqNormBytes, nextLocBytes, err2 = postItr.nextBytes() } if err2 != nil { return nil, 0, err2 diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index adc399eaa..2dab41669 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -280,12 +280,9 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { docNum, exists, err := i.nextDocNum() - if err != nil { + if err != nil || !exists { return nil, err } - if !exists { - return nil, nil - } reuseLocs := i.next.locs // hold for reuse before struct clearing i.next = Posting{} // clear the struct @@ -322,6 +319,45 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +// nextBytes returns the docNum and the encoded freq & loc bytes for +// the next posting +func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) { + docNum, exists, err := i.nextDocNum() + if err != nil { + return 0, nil, nil, err + } + if !exists { + return 0, nil, nil, nil + } + + startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + + freq, _, err := i.readFreqNorm() + if err != nil { + return 0, nil, nil, err + } + + endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() + bytesFreqNorm := i.currChunkFreqNorm[startFreqNorm:endFreqNorm] + + var bytesLoc []byte + if i.locBitmap.Contains(uint32(docNum)) { + startLoc := len(i.currChunkLoc) - i.locReader.Len() + + for j := uint64(0); j < freq; j++ { + err := i.readLocation(nil) + if err != nil { + return 0, nil, nil, err + } + } + + endLoc := 
len(i.currChunkLoc) - i.locReader.Len() + bytesLoc = i.currChunkLoc[startLoc:endLoc] + } + + return docNum, bytesFreqNorm, bytesLoc, nil +} + // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { From 5b86da85f358199d80436005d104607a7fed867d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 08:06:12 -0800 Subject: [PATCH 255/728] scorch zap optimize postings itr with tf/loc reader/decoder reuse --- index/scorch/segment/zap/posting.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 2dab41669..589c7cb85 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -45,7 +45,25 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} } else { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.Reset([]byte(nil)) + } + freqNormDecoder := rv.freqNormDecoder + + locReader := rv.locReader + if locReader != nil { + locReader.Reset([]byte(nil)) + } + locDecoder := rv.locDecoder + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.freqNormDecoder = freqNormDecoder + + rv.locReader = locReader + rv.locDecoder = locDecoder } rv.postings = p From 7e36109b3c83f2e454805e709bd850e859574244 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 1 Mar 2018 17:12:16 -0800 Subject: [PATCH 256/728] MB-28162: Provide API to estimate memory needed to run a search query MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This API (unexported) will estimate the amount of memory needed to execute a search query over an index before the collector begins data collection. 
Sample estimates for certain queries: {Size: 10, BenchmarkUpsidedownSearchOverhead} ESTIMATE BENCHMEM TermQuery 4616 4796 MatchQuery 5210 5405 DisjunctionQuery (Match queries) 7700 8447 DisjunctionQuery (Term queries) 6514 6591 ConjunctionQuery (Match queries) 7524 8175 Nested disjunction query (disjunction of disjunctions) 10306 10708 … --- document/document.go | 21 +++++- document/field_composite.go | 9 +++ index/index.go | 35 ++++++++++ index/scorch/scorch.go | 10 +-- index/scorch/segment/empty.go | 12 ++++ index/scorch/segment/mem/build.go | 2 +- index/scorch/segment/mem/dict.go | 20 ++++++ index/scorch/segment/mem/posting.go | 62 ++++++++++++++++++ index/scorch/segment/mem/segment.go | 83 ++++++++++++------------ index/scorch/segment/mem/segment_test.go | 2 +- index/scorch/segment/segment.go | 15 +++-- index/scorch/segment/zap/contentcoder.go | 8 +++ index/scorch/segment/zap/docvalues.go | 31 +++++---- index/scorch/segment/zap/posting.go | 79 ++++++++++++++++++++++ index/scorch/segment/zap/segment.go | 78 +++++++++++----------- index/scorch/snapshot_index.go | 7 ++ index/scorch/snapshot_index_doc.go | 13 ++++ index/scorch/snapshot_index_tfr.go | 30 +++++++++ index/scorch/snapshot_segment.go | 4 +- index/upsidedown/index_reader.go | 14 ++++ index/upsidedown/reader.go | 39 ++++++++++- index/upsidedown/row.go | 29 +++++++++ index_impl.go | 53 ++++++++++++++- index_test.go | 55 ++++++++++++++++ search.go | 30 +++++++++ search/collector/search_test.go | 18 +++++ search/collector/topn.go | 25 +++++++ search/explanation.go | 21 ++++++ search/facet/facet_builder_datetime.go | 29 +++++++++ search/facet/facet_builder_numeric.go | 29 +++++++++ search/facet/facet_builder_terms.go | 21 ++++++ search/facets_builder.go | 47 ++++++++++++++ search/pool.go | 11 ++++ search/scorer/scorer_conjunction.go | 14 ++++ search/scorer/scorer_constant.go | 19 ++++++ search/scorer/scorer_disjunction.go | 13 ++++ search/scorer/scorer_term.go | 24 +++++++ search/search.go | 82 
+++++++++++++++++++++++ search/searcher/search_boolean.go | 36 ++++++++++ search/searcher/search_conjunction.go | 26 ++++++++ search/searcher/search_disjunction.go | 35 ++++++++++ search/searcher/search_docid.go | 16 +++++ search/searcher/search_filter.go | 15 +++++ search/searcher/search_match_all.go | 17 +++++ search/searcher/search_match_none.go | 15 +++++ search/searcher/search_phrase.go | 31 +++++++++ search/searcher/search_term.go | 18 +++++ size/sizes.go | 57 ++++++++++++++++ 48 files changed, 1242 insertions(+), 118 deletions(-) create mode 100644 size/sizes.go diff --git a/document/document.go b/document/document.go index c37585c66..921098b0b 100644 --- a/document/document.go +++ b/document/document.go @@ -14,7 +14,19 @@ package document -import "fmt" +import ( + "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDocument int + +func init() { + var d Document + reflectStaticSizeDocument = int(reflect.TypeOf(d).Size()) +} type Document struct { ID string `json:"id"` @@ -30,6 +42,13 @@ func NewDocument(id string) *Document { } } +func (d *Document) Size() int { + return reflectStaticSizeDocument + size.SizeOfPtr + + len(d.ID) + + len(d.Fields)*size.SizeOfPtr + + len(d.CompositeFields)*(size.SizeOfPtr+reflectStaticSizeCompositeField) +} + func (d *Document) AddField(f Field) *Document { switch f := f.(type) { case *CompositeField: diff --git a/document/field_composite.go b/document/field_composite.go index b41b1b8ed..e53cd4566 100644 --- a/document/field_composite.go +++ b/document/field_composite.go @@ -15,9 +15,18 @@ package document import ( + "reflect" + "github.com/blevesearch/bleve/analysis" ) +var reflectStaticSizeCompositeField int + +func init() { + var cf CompositeField + reflectStaticSizeCompositeField = int(reflect.TypeOf(cf).Size()) +} + const DefaultCompositeIndexingOptions = IndexField type CompositeField struct { diff --git a/index/index.go b/index/index.go index 9870b4172..c25d7fa46 100644 --- 
a/index/index.go +++ b/index/index.go @@ -18,11 +18,23 @@ import ( "bytes" "encoding/json" "fmt" + "reflect" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermFieldDoc int +var reflectStaticSizeTermFieldVector int + +func init() { + var tfd TermFieldDoc + reflectStaticSizeTermFieldDoc = int(reflect.TypeOf(tfd).Size()) + var tfv TermFieldVector + reflectStaticSizeTermFieldVector = int(reflect.TypeOf(tfv).Size()) +} + var ErrorUnknownStorageType = fmt.Errorf("unknown storage type") type Index interface { @@ -82,6 +94,8 @@ type IndexReader interface { DumpFields() chan interface{} Close() error + + Size() int } // FieldTerms contains the terms used by a document, keyed by field @@ -115,6 +129,11 @@ type TermFieldVector struct { End uint64 } +func (tfv *TermFieldVector) Size() int { + return reflectStaticSizeTermFieldVector + size.SizeOfPtr + + len(tfv.Field) + len(tfv.ArrayPositions)*size.SizeOfUint64 +} + // IndexInternalID is an opaque document identifier interal to the index impl type IndexInternalID []byte @@ -134,6 +153,17 @@ type TermFieldDoc struct { Vectors []*TermFieldVector } +func (tfd *TermFieldDoc) Size() int { + sizeInBytes := reflectStaticSizeTermFieldDoc + size.SizeOfPtr + + len(tfd.Term) + len(tfd.ID) + + for _, entry := range tfd.Vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + // Reset allows an already allocated TermFieldDoc to be reused func (tfd *TermFieldDoc) Reset() *TermFieldDoc { // remember the []byte used for the ID @@ -161,6 +191,8 @@ type TermFieldReader interface { // Count returns the number of documents contains the term in this field. Count() uint64 Close() error + + Size() int } type DictEntry struct { @@ -185,6 +217,9 @@ type DocIDReader interface { // will start there instead. If ID is greater than or equal to the end of // the range, Next() call will return io.EOF. 
Advance(ID IndexInternalID) (IndexInternalID, error) + + Size() int + Close() error } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index a40f374ac..2a9eb6341 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -472,20 +472,20 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { } func (s *Scorch) MemoryUsed() uint64 { - var memUsed uint64 + var memUsed int s.rootLock.RLock() if s.root != nil { for _, segmentSnapshot := range s.root.segment { memUsed += 8 /* size of id -> uint64 */ + - segmentSnapshot.segment.SizeInBytes() + segmentSnapshot.segment.Size() if segmentSnapshot.deleted != nil { - memUsed += segmentSnapshot.deleted.GetSizeInBytes() + memUsed += int(segmentSnapshot.deleted.GetSizeInBytes()) } - memUsed += segmentSnapshot.cachedDocs.sizeInBytes() + memUsed += segmentSnapshot.cachedDocs.size() } } s.rootLock.RUnlock() - return memUsed + return uint64(memUsed) } func (s *Scorch) markIneligibleForRemoval(filename string) { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 83454644d..6c19f60f9 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -46,6 +46,10 @@ func (e *EmptySegment) Close() error { return nil } +func (e *EmptySegment) Size() uint64 { + return 0 +} + func (e *EmptySegment) AddRef() { } @@ -84,6 +88,10 @@ func (e *EmptyPostingsList) Iterator() PostingsIterator { return &EmptyPostingsIterator{} } +func (e *EmptyPostingsList) Size() int { + return 0 +} + func (e *EmptyPostingsList) Count() uint64 { return 0 } @@ -93,3 +101,7 @@ type EmptyPostingsIterator struct{} func (e *EmptyPostingsIterator) Next() (Posting, error) { return nil, nil } + +func (e *EmptyPostingsIterator) Size() int { + return 0 +} diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 57971aae5..264c94d1c 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -45,7 +45,7 @@ func NewFromAnalyzedDocs(results 
[]*index.AnalysisResult) *Segment { } // compute memory usage of segment - s.updateSizeInBytes() + s.updateSize() // professional debugging // diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index b564ed1fb..9f5a873ae 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -15,14 +15,23 @@ package mem import ( + "reflect" "sort" "strings" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDictionary int + +func init() { + var d Dictionary + reflectStaticSizeDictionary = int(reflect.TypeOf(d).Size()) +} + // Dictionary is the in-memory representation of the term dictionary type Dictionary struct { segment *Segment @@ -30,6 +39,17 @@ type Dictionary struct { fieldID uint16 } +func (d *Dictionary) Size() int { + sizeInBytes := reflectStaticSizeDictionary + size.SizeOfPtr + + len(d.field) + + if d.segment != nil { + sizeInBytes += int(d.segment.Size()) + } + + return sizeInBytes +} + // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 2554333a2..4203acbe5 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -15,10 +15,29 @@ package mem import ( + "reflect" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + 
var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { dictionary *Dictionary @@ -27,6 +46,20 @@ type PostingsList struct { except *roaring.Bitmap } +func (p *PostingsList) Size() int { + sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.dictionary != nil { + sizeInBytes += p.dictionary.Size() + } + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { var rv uint64 @@ -83,6 +116,16 @@ type PostingsIterator struct { reuse Posting } +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + if i.locations != nil { + sizeInBytes += int(i.locations.GetSizeInBytes()) + } + + return sizeInBytes +} + // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { if i.actual == nil || !i.actual.HasNext() { @@ -121,6 +164,16 @@ type Posting struct { hasLoc bool } +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + size.SizeOfPtr + + if p.iterator != nil { + sizeInBytes += p.iterator.Size() + } + + return sizeInBytes +} + // Number returns the document number of this posting in this segment func (p *Posting) Number() uint64 { return p.docNum @@ -158,6 +211,15 @@ type Location struct { offset int } +func (l *Location) Size() int { + sizeInBytes := reflectStaticSizeLocation + if l.p != nil { + sizeInBytes += l.p.Size() + } + + return sizeInBytes +} + // Field returns the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { diff --git a/index/scorch/segment/mem/segment.go 
b/index/scorch/segment/mem/segment.go index 04bdb368a..e9c4a2730 100644 --- a/index/scorch/segment/mem/segment.go +++ b/index/scorch/segment/mem/segment.go @@ -16,11 +16,20 @@ package mem import ( "fmt" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeSegment int + +func init() { + var s Segment + reflectStaticSizeSegment = int(reflect.TypeOf(s).Size()) +} + // _id field is always guaranteed to have fieldID of 0 const idFieldID uint16 = 0 @@ -96,7 +105,7 @@ type Segment struct { // Footprint of the segment, updated when analyzed document mutations // are added into the segment - sizeInBytes uint64 + sizeInBytes int } // New builds a new empty Segment @@ -107,99 +116,87 @@ func New() *Segment { } } -func (s *Segment) updateSizeInBytes() { - var sizeInBytes uint64 +func (s *Segment) updateSize() { + sizeInBytes := reflectStaticSizeSegment // FieldsMap, FieldsInv for k, _ := range s.FieldsMap { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 2 /* size of uint16 */) + sizeInBytes += (len(k)+size.SizeOfString)*2 + + size.SizeOfUint16 } - // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) // Dicts, DictKeys for _, entry := range s.Dicts { for k, _ := range entry { - sizeInBytes += uint64((len(k)+int(segment.SizeOfString))*2 + - 8 /* size of uint64 */) + sizeInBytes += (len(k)+size.SizeOfString)*2 + + size.SizeOfUint64 } // overhead from the data structures - sizeInBytes += (segment.SizeOfMap + segment.SizeOfSlice) + sizeInBytes += (size.SizeOfMap + size.SizeOfSlice) } - sizeInBytes += (segment.SizeOfSlice * 2) // Postings, PostingsLocs for i := 0; i < len(s.Postings); i++ { - sizeInBytes += (s.Postings[i].GetSizeInBytes() + segment.SizeOfPointer) + - (s.PostingsLocs[i].GetSizeInBytes() + segment.SizeOfPointer) + sizeInBytes += (int(s.Postings[i].GetSizeInBytes()) + size.SizeOfPtr) + + 
(int(s.PostingsLocs[i].GetSizeInBytes()) + size.SizeOfPtr) } - sizeInBytes += (segment.SizeOfSlice * 2) // Freqs, Norms for i := 0; i < len(s.Freqs); i++ { - sizeInBytes += uint64(len(s.Freqs[i])*8 /* size of uint64 */ + - len(s.Norms[i])*4 /* size of float32 */) + - (segment.SizeOfSlice * 2) + sizeInBytes += (len(s.Freqs[i])*size.SizeOfUint64 + + len(s.Norms[i])*size.SizeOfFloat32) + + (size.SizeOfSlice * 2) } - sizeInBytes += (segment.SizeOfSlice * 2) // Location data for i := 0; i < len(s.Locfields); i++ { - sizeInBytes += uint64(len(s.Locfields[i])*2 /* size of uint16 */ + - len(s.Locstarts[i])*8 /* size of uint64 */ + - len(s.Locends[i])*8 /* size of uint64 */ + - len(s.Locpos[i])*8 /* size of uint64 */) + sizeInBytes += len(s.Locfields[i])*size.SizeOfUint16 + + len(s.Locstarts[i])*size.SizeOfUint64 + + len(s.Locends[i])*size.SizeOfUint64 + + len(s.Locpos[i])*size.SizeOfUint64 for j := 0; j < len(s.Locarraypos[i]); j++ { - sizeInBytes += uint64(len(s.Locarraypos[i][j])*8 /* size of uint64 */) + - segment.SizeOfSlice + sizeInBytes += len(s.Locarraypos[i][j])*size.SizeOfUint64 + + size.SizeOfSlice } - sizeInBytes += (segment.SizeOfSlice * 5) + sizeInBytes += (size.SizeOfSlice * 5) } - sizeInBytes += (segment.SizeOfSlice * 5) // Stored data for i := 0; i < len(s.Stored); i++ { for _, v := range s.Stored[i] { - sizeInBytes += uint64(2 /* size of uint16 */) + sizeInBytes += size.SizeOfUint16 for _, arr := range v { - sizeInBytes += uint64(len(arr)) + segment.SizeOfSlice + sizeInBytes += len(arr) + size.SizeOfSlice } - sizeInBytes += segment.SizeOfSlice + sizeInBytes += size.SizeOfSlice } for _, v := range s.StoredTypes[i] { - sizeInBytes += uint64(2 /* size of uint16 */ +len(v)) + segment.SizeOfSlice + sizeInBytes += size.SizeOfUint16 + len(v) + size.SizeOfSlice } for _, v := range s.StoredPos[i] { - sizeInBytes += uint64(2 /* size of uint16 */) + sizeInBytes += size.SizeOfUint16 for _, arr := range v { - sizeInBytes += uint64(len(arr)*8 /* size of uint64 */) + - 
segment.SizeOfSlice + sizeInBytes += len(arr)*size.SizeOfUint64 + + size.SizeOfSlice } - sizeInBytes += segment.SizeOfSlice + sizeInBytes += size.SizeOfSlice } // overhead from map(s) within Stored, StoredTypes, StoredPos - sizeInBytes += (segment.SizeOfMap * 3) + sizeInBytes += (size.SizeOfMap * 3) } - // overhead from data structures: Stored, StoredTypes, StoredPos - sizeInBytes += (segment.SizeOfSlice * 3) // DocValueFields - sizeInBytes += uint64(len(s.DocValueFields)*3 /* size of uint16 + bool */) + - segment.SizeOfMap - - // SizeInBytes - sizeInBytes += uint64(8) + sizeInBytes += len(s.DocValueFields) * (size.SizeOfUint16 + size.SizeOfBool) s.sizeInBytes = sizeInBytes } -func (s *Segment) SizeInBytes() uint64 { +func (s *Segment) Size() int { return s.sizeInBytes } diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 5e3818c24..6c5625d86 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -169,7 +169,7 @@ func TestSingle(t *testing.T) { t.Fatalf("segment nil, not expected") } - if segment.SizeInBytes() <= 0 { + if segment.Size() <= 0 { t.Fatalf("segment size not updated") } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index d5435ab96..8eee5f75f 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -19,12 +19,6 @@ import ( "github.com/blevesearch/bleve/index" ) -// Overhead from go data structures when deployed on a 64-bit system. -const SizeOfMap uint64 = 8 -const SizeOfPointer uint64 = 8 -const SizeOfSlice uint64 = 24 -const SizeOfString uint64 = 16 - // DocumentFieldValueVisitor defines a callback to be visited for each // stored field value. The return value determines if the visitor // should keep going. Returning true continues visiting, false stops. 
@@ -42,7 +36,7 @@ type Segment interface { Close() error - SizeInBytes() uint64 + Size() int AddRef() DecRef() error @@ -63,6 +57,8 @@ type DictionaryIterator interface { type PostingsList interface { Iterator() PostingsIterator + Size() int + Count() uint64 // NOTE deferred for future work @@ -77,6 +73,8 @@ type PostingsIterator interface { // implementations may return a shared instance to reduce memory // allocations. Next() (Posting, error) + + Size() int } type Posting interface { @@ -86,6 +84,8 @@ type Posting interface { Norm() float64 Locations() []Location + + Size() int } type Location interface { @@ -94,6 +94,7 @@ type Location interface { End() uint64 Pos() uint64 ArrayPositions() []uint64 + Size() int } // DocumentFieldTermVisitable is implemented by various scorch segment diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 83457146e..933f10a1e 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -18,10 +18,18 @@ import ( "bytes" "encoding/binary" "io" + "reflect" "github.com/golang/snappy" ) +var reflectStaticSizeMetaData int + +func init() { + var md MetaData + reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) +} + var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 0514bd307..13635c57e 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -19,13 +19,21 @@ import ( "encoding/binary" "fmt" "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/golang/snappy" ) +var reflectStaticSizedocValueIterator int + +func init() { + var dvi docValueIterator + reflectStaticSizedocValueIterator = int(reflect.TypeOf(dvi).Size()) +} + type docValueIterator struct { field string 
curChunkNum uint64 @@ -36,21 +44,12 @@ type docValueIterator struct { curChunkData []byte // compressed data cache } -func (di *docValueIterator) sizeInBytes() uint64 { - // curChunkNum, numChunks, dvDataLoc --> uint64 - sizeInBytes := 24 - - // field - sizeInBytes += (len(di.field) + int(segment.SizeOfString)) - - // chunkLens, curChunkHeader - sizeInBytes += len(di.chunkLens)*8 + - len(di.curChunkHeader)*24 + - int(segment.SizeOfSlice*2) /* overhead from slices */ - - // curChunkData is mmap'ed, not included - - return uint64(sizeInBytes) +func (di *docValueIterator) size() int { + return reflectStaticSizedocValueIterator + size.SizeOfPtr + + len(di.field) + + len(di.chunkLens)*size.SizeOfUint64 + + len(di.curChunkHeader)*reflectStaticSizeMetaData + + len(di.curChunkData) } func (di *docValueIterator) fieldName() string { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 589c7cb85..e9c68cbae 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -19,12 +19,30 @@ import ( "encoding/binary" "fmt" "math" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePostingsList int +var reflectStaticSizePostingsIterator int +var reflectStaticSizePosting int +var reflectStaticSizeLocation int + +func init() { + var pl PostingsList + reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) + var pi PostingsIterator + reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) + var p Posting + reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase @@ -36,6 +54,28 @@ type PostingsList struct { except *roaring.Bitmap } +func (p *PostingsList) Size() int { + 
sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr + + if p.sb != nil { + sizeInBytes += (p.sb.Size() - len(p.sb.mem)) // do not include the mmap'ed part + } + + if p.locBitmap != nil { + sizeInBytes += int(p.locBitmap.GetSizeInBytes()) + } + + if p.postings != nil { + sizeInBytes += int(p.postings.GetSizeInBytes()) + } + + if p.except != nil { + sizeInBytes += int(p.except.GetSizeInBytes()) + } + + return sizeInBytes +} + // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { return p.iterator(nil) @@ -193,6 +233,25 @@ type PostingsIterator struct { nextLocs []Location // reused across Next() calls } +func (i *PostingsIterator) Size() int { + sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + + len(i.currChunkFreqNorm) + + len(i.currChunkLoc) + + len(i.freqChunkLens)*size.SizeOfUint64 + + len(i.locChunkLens)*size.SizeOfUint64 + + i.next.Size() + + if i.locBitmap != nil { + sizeInBytes += int(i.locBitmap.GetSizeInBytes()) + } + + for _, entry := range i.nextLocs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (i *PostingsIterator) loadChunk(chunk int) error { if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) { return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) @@ -444,6 +503,20 @@ type Posting struct { locs []segment.Location } +func (p *Posting) Size() int { + sizeInBytes := reflectStaticSizePosting + + if p.iterator != nil { + sizeInBytes += p.iterator.Size() + } + + for _, entry := range p.locs { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + // Number returns the document number of this posting in this segment func (p *Posting) Number() uint64 { return p.docNum @@ -473,6 +546,12 @@ type Location struct { ap []uint64 } +func (l *Location) Size() int { + return reflectStaticSizeLocation + + len(l.field) + + len(l.ap)*size.SizeOfUint64 +} + // Field returns 
the name of the field (useful in composite fields to know // which original field the value came from) func (l *Location) Field() string { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 40c0af274..972b7578e 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -20,16 +20,25 @@ import ( "fmt" "io" "os" + "reflect" "sync" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" "github.com/couchbase/vellum" mmap "github.com/edsrzf/mmap-go" "github.com/golang/snappy" ) +var reflectStaticSizeSegmentBase int + +func init() { + var sb SegmentBase + reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) +} + // Open returns a zap impl of a segment func Open(path string) (segment.Segment, error) { f, err := os.Open(path) @@ -92,6 +101,32 @@ type SegmentBase struct { fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field } +func (sb *SegmentBase) Size() int { + sizeInBytes := reflectStaticSizeSegmentBase + + len(sb.mem) + + // fieldsMap + for k, _ := range sb.fieldsMap { + sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 + } + + // fieldsInv, dictLocs + for _, entry := range sb.fieldsInv { + sizeInBytes += len(entry) + size.SizeOfString + } + sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 + + // fieldDvIterMap + for _, v := range sb.fieldDvIterMap { + sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr + if v != nil { + sizeInBytes += v.size() + } + } + + return sizeInBytes +} + func (sb *SegmentBase) AddRef() {} func (sb *SegmentBase) DecRef() (err error) { return nil } func (sb *SegmentBase) Close() (err error) { return nil } @@ -111,56 +146,19 @@ type Segment struct { refs int64 } -func (s *Segment) SizeInBytes() uint64 { +func (s *Segment) Size() int { // 8 /* size of file pointer */ // 4 /* size of version -> uint32 */ // 4 /* size of crc -> 
uint32 */ sizeOfUints := 16 - sizeInBytes := (len(s.path) + int(segment.SizeOfString)) + sizeOfUints + sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints // mutex, refs -> int64 sizeInBytes += 16 // do not include the mmap'ed part - return uint64(sizeInBytes) + s.SegmentBase.SizeInBytes() - uint64(len(s.mem)) -} - -func (s *SegmentBase) SizeInBytes() uint64 { - // 4 /* size of memCRC -> uint32 */ - // 4 /* size of chunkFactor -> uint32 */ - // 8 /* size of numDocs -> uint64 */ - // 8 /* size of storedIndexOffset -> uint64 */ - // 8 /* size of fieldsIndexOffset -> uint64 */ - // 8 /* size of docValueOffset -> uint64 */ - sizeInBytes := 40 - - sizeInBytes += len(s.mem) + int(segment.SizeOfSlice) - - // fieldsMap - for k, _ := range s.fieldsMap { - sizeInBytes += (len(k) + int(segment.SizeOfString)) + 2 /* size of uint16 */ - } - sizeInBytes += int(segment.SizeOfMap) /* overhead from map */ - - // fieldsInv, dictLocs - for _, entry := range s.fieldsInv { - sizeInBytes += (len(entry) + int(segment.SizeOfString)) - } - sizeInBytes += len(s.dictLocs) * 8 /* size of uint64 */ - sizeInBytes += int(segment.SizeOfSlice) * 3 /* overhead from slices */ - - // fieldDvIterMap - sizeInBytes += len(s.fieldDvIterMap) * - int(segment.SizeOfPointer+2 /* size of uint16 */) - for _, entry := range s.fieldDvIterMap { - if entry != nil { - sizeInBytes += int(entry.sizeInBytes()) - } - } - sizeInBytes += int(segment.SizeOfMap) - - return uint64(sizeInBytes) + return sizeInBytes + s.SegmentBase.Size() - len(s.mem) } func (s *Segment) AddRef() { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 5289b1434..9394f391e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -27,6 +27,7 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) type asynchSegmentResult struct { @@ -89,6 +90,12 @@ func (i 
*IndexSnapshot) Close() error { return i.DecRef() } +func (i *IndexSnapshot) Size() int { + // Just return the size of the pointer for estimating the overhead + // during Search, a reference of the IndexSnapshot serves as the reader. + return size.SizeOfPtr +} + func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) diff --git a/index/scorch/snapshot_index_doc.go b/index/scorch/snapshot_index_doc.go index d1205ff8e..27da20865 100644 --- a/index/scorch/snapshot_index_doc.go +++ b/index/scorch/snapshot_index_doc.go @@ -16,17 +16,30 @@ package scorch import ( "bytes" + "reflect" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotDocIDReader int + +func init() { + var isdr IndexSnapshotDocIDReader + reflectStaticSizeIndexSnapshotDocIDReader = int(reflect.TypeOf(isdr).Size()) +} + type IndexSnapshotDocIDReader struct { snapshot *IndexSnapshot iterators []roaring.IntIterable segmentOffset int } +func (i *IndexSnapshotDocIDReader) Size() int { + return reflectStaticSizeIndexSnapshotDocIDReader + size.SizeOfPtr +} + func (i *IndexSnapshotDocIDReader) Next() (index.IndexInternalID, error) { for i.segmentOffset < len(i.iterators) { if !i.iterators[i.segmentOffset].HasNext() { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index d1f23b272..e1a0e9a59 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -16,12 +16,21 @@ package scorch import ( "bytes" + "reflect" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexSnapshotTermFieldReader int + +func init() { + var istfr IndexSnapshotTermFieldReader + 
reflectStaticSizeIndexSnapshotTermFieldReader = int(reflect.TypeOf(istfr).Size()) +} + type IndexSnapshotTermFieldReader struct { term []byte field string @@ -36,6 +45,27 @@ type IndexSnapshotTermFieldReader struct { currID index.IndexInternalID } +func (i *IndexSnapshotTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeIndexSnapshotTermFieldReader + size.SizeOfPtr + + len(i.term) + + len(i.field) + + len(i.currID) + + for _, entry := range i.postings { + sizeInBytes += entry.Size() + } + + for _, entry := range i.iterators { + sizeInBytes += entry.Size() + } + + if i.currPosting != nil { + sizeInBytes += i.currPosting.Size() + } + + return sizeInBytes +} + func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { rv := preAlloced if rv == nil { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 5e64cb1f2..cdfe317fe 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -213,7 +213,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e return nil } -func (c *cachedDocs) sizeInBytes() uint64 { +func (c *cachedDocs) size() int { sizeInBytes := 0 c.m.Lock() for k, v := range c.cache { // cachedFieldDocs @@ -225,5 +225,5 @@ func (c *cachedDocs) sizeInBytes() uint64 { } } c.m.Unlock() - return uint64(sizeInBytes) + return sizeInBytes } diff --git a/index/upsidedown/index_reader.go b/index/upsidedown/index_reader.go index 77d523c30..4e5755219 100644 --- a/index/upsidedown/index_reader.go +++ b/index/upsidedown/index_reader.go @@ -15,17 +15,31 @@ package upsidedown import ( + "reflect" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeIndexReader int + +func init() { + var ir IndexReader + reflectStaticSizeIndexReader = int(reflect.TypeOf(ir).Size()) +} + type IndexReader struct { 
index *UpsideDownCouch kvreader store.KVReader docCount uint64 } +func (i *IndexReader) Size() int { + return reflectStaticSizeIndexReader + size.SizeOfPtr +} + func (i *IndexReader) TermFieldReader(term []byte, fieldName string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false) if fieldExists { diff --git a/index/upsidedown/reader.go b/index/upsidedown/reader.go index 1f40c02de..646d4d8ac 100644 --- a/index/upsidedown/reader.go +++ b/index/upsidedown/reader.go @@ -16,13 +16,27 @@ package upsidedown import ( "bytes" + "reflect" "sort" "sync/atomic" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeUpsideDownCouchTermFieldReader int +var reflectStaticSizeUpsideDownCouchDocIDReader int + +func init() { + var tfr UpsideDownCouchTermFieldReader + reflectStaticSizeUpsideDownCouchTermFieldReader = + int(reflect.TypeOf(tfr).Size()) + var cdr UpsideDownCouchDocIDReader + reflectStaticSizeUpsideDownCouchDocIDReader = + int(reflect.TypeOf(cdr).Size()) +} + type UpsideDownCouchTermFieldReader struct { count uint64 indexReader *IndexReader @@ -35,6 +49,19 @@ type UpsideDownCouchTermFieldReader struct { includeTermVectors bool } +func (r *UpsideDownCouchTermFieldReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchTermFieldReader + size.SizeOfPtr + + len(r.term) + + r.tfrPrealloc.Size() + + len(r.keyBuf) + + if r.tfrNext != nil { + sizeInBytes += r.tfrNext.Size() + } + + return sizeInBytes +} + func newUpsideDownCouchTermFieldReader(indexReader *IndexReader, term []byte, field uint16, includeFreq, includeNorm, includeTermVectors bool) (*UpsideDownCouchTermFieldReader, error) { bufNeeded := termFrequencyRowKeySize(term, nil) if bufNeeded < dictionaryRowKeySize(term) { @@ -174,8 +201,18 @@ type UpsideDownCouchDocIDReader struct { onlyMode bool } -func 
newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { +func (r *UpsideDownCouchDocIDReader) Size() int { + sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + + r.indexReader.Size() + + for _, entry := range r.only { + sizeInBytes += size.SizeOfString + len(entry) + } + return sizeInBytes +} + +func newUpsideDownCouchDocIDReader(indexReader *IndexReader) (*UpsideDownCouchDocIDReader, error) { startBytes := []byte{0x0} endBytes := []byte{0xff} diff --git a/index/upsidedown/row.go b/index/upsidedown/row.go index 7e503ae05..ba50314cd 100644 --- a/index/upsidedown/row.go +++ b/index/upsidedown/row.go @@ -20,10 +20,22 @@ import ( "fmt" "io" "math" + "reflect" + "github.com/blevesearch/bleve/size" "github.com/golang/protobuf/proto" ) +var reflectStaticSizeTermFrequencyRow int +var reflectStaticSizeTermVector int + +func init() { + var tfr TermFrequencyRow + reflectStaticSizeTermFrequencyRow = int(reflect.TypeOf(tfr).Size()) + var tv TermVector + reflectStaticSizeTermVector = int(reflect.TypeOf(tv).Size()) +} + const ByteSeparator byte = 0xff type UpsideDownCouchRowStream chan UpsideDownCouchRow @@ -358,6 +370,11 @@ type TermVector struct { end uint64 } +func (tv *TermVector) Size() int { + return reflectStaticSizeTermVector + size.SizeOfPtr + + len(tv.arrayPositions)*size.SizeOfUint64 +} + func (tv *TermVector) String() string { return fmt.Sprintf("Field: %d Pos: %d Start: %d End %d ArrayPositions: %#v", tv.field, tv.pos, tv.start, tv.end, tv.arrayPositions) } @@ -371,6 +388,18 @@ type TermFrequencyRow struct { field uint16 } +func (tfr *TermFrequencyRow) Size() int { + sizeInBytes := reflectStaticSizeTermFrequencyRow + + len(tfr.term) + + len(tfr.doc) + + for _, entry := range tfr.vectors { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} + func (tfr *TermFrequencyRow) Term() []byte { return tfr.term } diff --git a/index_impl.go b/index_impl.go index caea1b8e0..df6e748d9 100644 --- a/index_impl.go +++ 
b/index_impl.go @@ -362,8 +362,59 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { return i.SearchInContext(context.Background(), req) } +// memNeededForSearch is a helper function that returns an estimate of RAM +// needed to execute a search request. +func memNeededForSearch(req *SearchRequest, + searcher search.Searcher, + topnCollector *collector.TopNCollector) uint64 { + + backingSize := req.Size + req.From + 1 + if req.Size+req.From > collector.PreAllocSizeSkipCap { + backingSize = collector.PreAllocSizeSkipCap + 1 + } + numDocMatches := backingSize + searcher.DocumentMatchPoolSize() + + estimate := 0 + + // overhead, size in bytes from collector + estimate += topnCollector.Size() + + var dm search.DocumentMatch + sizeOfDocumentMatch := dm.Size() + + // pre-allocing DocumentMatchPool + var sc search.SearchContext + estimate += sc.Size() + numDocMatches*sizeOfDocumentMatch + + // searcher overhead + estimate += searcher.Size() + + // overhead from results, lowestMatchOutsideResults + estimate += (numDocMatches + 1) * sizeOfDocumentMatch + + // additional overhead from SearchResult + var sr SearchResult + estimate += sr.Size() + + // overhead from facet results + if req.Facets != nil { + var fr search.FacetResult + estimate += len(req.Facets) * fr.Size() + } + + // highlighting, store + var d document.Document + if len(req.Fields) > 0 || req.Highlight != nil { + for i := 0; i < (req.Size + req.From); i++ { // size + from => number of hits + estimate += (req.Size + req.From) * d.Size() + } + } + + return uint64(estimate) +} + // SearchInContext executes a search request operation within the provided -// Context. Returns a SearchResult object or an error. +// Context. Returns a SearchResult object or an error. 
func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr *SearchResult, err error) { i.mutex.RLock() defer i.mutex.RUnlock() diff --git a/index_test.go b/index_test.go index a69357bf6..f1e53647d 100644 --- a/index_test.go +++ b/index_test.go @@ -36,6 +36,9 @@ import ( "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/query" + + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/index/upsidedown" ) func TestCrud(t *testing.T) { @@ -1815,3 +1818,55 @@ func TestIndexAdvancedCountMatchSearch(t *testing.T) { t.Fatal(err) } } + +func benchmarkSearchOverhead(indexType string, b *testing.B) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + b.Fatal(err) + } + }() + + index, err := NewUsing("testidx", NewIndexMapping(), + indexType, Config.DefaultKVStore, nil) + if err != nil { + b.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + b.Fatal(err) + } + }() + + elements := []string{"air", "water", "fire", "earth"} + for j := 0; j < 10000; j++ { + err = index.Index(fmt.Sprintf("%d", j), + map[string]interface{}{"name": elements[j%len(elements)]}) + if err != nil { + b.Fatal(err) + } + } + + query1 := NewTermQuery("water") + query2 := NewTermQuery("fire") + query := NewDisjunctionQuery(query1, query2) + req := NewSearchRequest(query) + + b.ResetTimer() + + for n := 0; n < b.N; n++ { + _, err = index.Search(req) + if err != nil { + b.Fatal(err) + } + } +} + +func BenchmarkUpsidedownSearchOverhead(b *testing.B) { + benchmarkSearchOverhead(upsidedown.Name, b) +} + +func BenchmarkScorchSearchOverhead(b *testing.B) { + benchmarkSearchOverhead(scorch.Name, b) +} diff --git a/search.go b/search.go index 46d849c1b..e324262e5 100644 --- a/search.go +++ b/search.go @@ -17,6 +17,7 @@ package bleve import ( "encoding/json" "fmt" + "reflect" "time" "github.com/blevesearch/bleve/analysis" @@ -24,8 +25,19 @@ import ( 
"github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/query" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeSearchResult int +var reflectStaticSizeSearchStatus int + +func init() { + var sr SearchResult + reflectStaticSizeSearchResult = int(reflect.TypeOf(sr).Size()) + var ss SearchStatus + reflectStaticSizeSearchStatus = int(reflect.TypeOf(ss).Size()) +} + var cache = registry.NewCache() const defaultDateTimeParser = optional.Name @@ -432,6 +444,24 @@ type SearchResult struct { Facets search.FacetResults `json:"facets"` } +func (sr *SearchResult) Size() int { + sizeInBytes := reflectStaticSizeSearchResult + size.SizeOfPtr + + reflectStaticSizeSearchStatus + + for _, entry := range sr.Hits { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for k, v := range sr.Facets { + sizeInBytes += size.SizeOfString + len(k) + + v.Size() + } + + return sizeInBytes +} + func (sr *SearchResult) String() string { rv := "" if sr.Total > 0 { diff --git a/search/collector/search_test.go b/search/collector/search_test.go index 8457fb989..3ba71c1d1 100644 --- a/search/collector/search_test.go +++ b/search/collector/search_test.go @@ -15,6 +15,8 @@ package collector import ( + "reflect" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -25,6 +27,18 @@ type stubSearcher struct { matches []*search.DocumentMatch } +func (ss *stubSearcher) Size() int { + sizeInBytes := int(reflect.TypeOf(*ss).Size()) + + for _, entry := range ss.matches { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (ss *stubSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { if ss.index < len(ss.matches) { rv := ctx.DocumentMatchPool.Get() @@ -76,6 +90,10 @@ func (ss *stubSearcher) DocumentMatchPoolSize() int { type stubReader struct{} +func (sr *stubReader) Size() int { + return 0 +} + func (sr 
*stubReader) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { return nil, nil } diff --git a/search/collector/topn.go b/search/collector/topn.go index 388370e7e..d684868cc 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -16,12 +16,21 @@ package collector import ( "context" + "reflect" "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTopNCollector int + +func init() { + var coll TopNCollector + reflectStaticSizeTopNCollector = int(reflect.TypeOf(coll).Size()) +} + type collectorStore interface { // Add the document, and if the new store size exceeds the provided size // the last element is removed and returned. If the size has not been @@ -98,6 +107,22 @@ func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector return hc } +func (hc *TopNCollector) Size() int { + sizeInBytes := reflectStaticSizeTopNCollector + size.SizeOfPtr + + if hc.facetsBuilder != nil { + sizeInBytes += hc.facetsBuilder.Size() + } + + for _, entry := range hc.neededFields { + sizeInBytes += len(entry) + size.SizeOfString + } + + sizeInBytes += len(hc.cachedScoring) + len(hc.cachedDesc) + + return sizeInBytes +} + // Collect goes to the index to find the matching documents func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, reader index.IndexReader) error { startTime := time.Now() diff --git a/search/explanation.go b/search/explanation.go index 766367d77..3b81737b5 100644 --- a/search/explanation.go +++ b/search/explanation.go @@ -17,8 +17,18 @@ package search import ( "encoding/json" "fmt" + "reflect" + + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeExplanation int + +func init() { + var e Explanation + reflectStaticSizeExplanation = int(reflect.TypeOf(e).Size()) +} + type Explanation struct { Value float64 `json:"value"` Message string 
`json:"message"` @@ -32,3 +42,14 @@ func (expl *Explanation) String() string { } return string(js) } + +func (expl *Explanation) Size() int { + sizeInBytes := reflectStaticSizeExplanation + size.SizeOfPtr + + len(expl.Message) + + for _, entry := range expl.Children { + sizeInBytes += entry.Size() + } + + return sizeInBytes +} diff --git a/search/facet/facet_builder_datetime.go b/search/facet/facet_builder_datetime.go index 8657a553a..c45442e4d 100644 --- a/search/facet/facet_builder_datetime.go +++ b/search/facet/facet_builder_datetime.go @@ -15,13 +15,25 @@ package facet import ( + "reflect" "sort" "time" "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDateTimeFacetBuilder int +var reflectStaticSizedateTimeRange int + +func init() { + var dtfb DateTimeFacetBuilder + reflectStaticSizeDateTimeFacetBuilder = int(reflect.TypeOf(dtfb).Size()) + var dtr dateTimeRange + reflectStaticSizedateTimeRange = int(reflect.TypeOf(dtr).Size()) +} + type dateTimeRange struct { start time.Time end time.Time @@ -46,6 +58,23 @@ func NewDateTimeFacetBuilder(field string, size int) *DateTimeFacetBuilder { } } +func (fb *DateTimeFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeDateTimeFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizedateTimeRange + } + + return sizeInBytes +} + func (fb *DateTimeFacetBuilder) AddRange(name string, start, end time.Time) { r := dateTimeRange{ start: start, diff --git a/search/facet/facet_builder_numeric.go b/search/facet/facet_builder_numeric.go index 2ab5f2789..c1692b549 100644 --- a/search/facet/facet_builder_numeric.go +++ b/search/facet/facet_builder_numeric.go @@ -15,12 +15,24 @@ package facet import ( + "reflect" "sort" 
"github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeNumericFacetBuilder int +var reflectStaticSizenumericRange int + +func init() { + var nfb NumericFacetBuilder + reflectStaticSizeNumericFacetBuilder = int(reflect.TypeOf(nfb).Size()) + var nr numericRange + reflectStaticSizenumericRange = int(reflect.TypeOf(nr).Size()) +} + type numericRange struct { min *float64 max *float64 @@ -45,6 +57,23 @@ func NewNumericFacetBuilder(field string, size int) *NumericFacetBuilder { } } +func (fb *NumericFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeNumericFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + for k, _ := range fb.ranges { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + reflectStaticSizenumericRange + } + + return sizeInBytes +} + func (fb *NumericFacetBuilder) AddRange(name string, min, max *float64) { r := numericRange{ min: min, diff --git a/search/facet/facet_builder_terms.go b/search/facet/facet_builder_terms.go index a41e475a9..5b5901e01 100644 --- a/search/facet/facet_builder_terms.go +++ b/search/facet/facet_builder_terms.go @@ -15,11 +15,20 @@ package facet import ( + "reflect" "sort" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermsFacetBuilder int + +func init() { + var tfb TermsFacetBuilder + reflectStaticSizeTermsFacetBuilder = int(reflect.TypeOf(tfb).Size()) +} + type TermsFacetBuilder struct { size int field string @@ -37,6 +46,18 @@ func NewTermsFacetBuilder(field string, size int) *TermsFacetBuilder { } } +func (fb *TermsFacetBuilder) Size() int { + sizeInBytes := reflectStaticSizeTermsFacetBuilder + size.SizeOfPtr + + len(fb.field) + + for k, _ := range fb.termsCount { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfInt + } + + return sizeInBytes +} + func (fb 
*TermsFacetBuilder) Field() string { return fb.field } diff --git a/search/facets_builder.go b/search/facets_builder.go index 05e270413..34e45af84 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -15,11 +15,32 @@ package search import ( + "reflect" "sort" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFacetsBuilder int +var reflectStaticSizeFacetResult int +var reflectStaticSizeTermFacet int +var reflectStaticSizeNumericRangeFacet int +var reflectStaticSizeDateRangeFacet int + +func init() { + var fb FacetsBuilder + reflectStaticSizeFacetsBuilder = int(reflect.TypeOf(fb).Size()) + var fr FacetResult + reflectStaticSizeFacetResult = int(reflect.TypeOf(fr).Size()) + var tf TermFacet + reflectStaticSizeTermFacet = int(reflect.TypeOf(tf).Size()) + var nrf NumericRangeFacet + reflectStaticSizeNumericRangeFacet = int(reflect.TypeOf(nrf).Size()) + var drf DateRangeFacet + reflectStaticSizeDateRangeFacet = int(reflect.TypeOf(drf).Size()) +} + type FacetBuilder interface { StartDoc() UpdateVisitor(field string, term []byte) @@ -27,6 +48,8 @@ type FacetBuilder interface { Result() *FacetResult Field() string + + Size() int } type FacetsBuilder struct { @@ -42,6 +65,22 @@ func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { } } +func (fb *FacetsBuilder) Size() int { + sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr + + fb.indexReader.Size() + + for k, v := range fb.facets { + sizeInBytes += size.SizeOfString + len(k) + + v.Size() + } + + for _, entry := range fb.fields { + sizeInBytes += size.SizeOfString + len(entry) + } + + return sizeInBytes +} + func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { fb.facets[name] = facetBuilder fb.fields = append(fb.fields, facetBuilder.Field()) @@ -213,6 +252,14 @@ type FacetResult struct { DateRanges DateRangeFacets `json:"date_ranges,omitempty"` } +func (fr *FacetResult) Size() int { + return 
reflectStaticSizeFacetResult + size.SizeOfPtr + + len(fr.Field) + + len(fr.Terms)*(reflectStaticSizeTermFacet+size.SizeOfPtr) + + len(fr.NumericRanges)*(reflectStaticSizeNumericRangeFacet+size.SizeOfPtr) + + len(fr.DateRanges)*(reflectStaticSizeDateRangeFacet+size.SizeOfPtr) +} + func (fr *FacetResult) Merge(other *FacetResult) { fr.Total += other.Total fr.Missing += other.Missing diff --git a/search/pool.go b/search/pool.go index b9b52a613..ba8be8fc2 100644 --- a/search/pool.go +++ b/search/pool.go @@ -14,6 +14,17 @@ package search +import ( + "reflect" +) + +var reflectStaticSizeDocumentMatchPool int + +func init() { + var dmp DocumentMatchPool + reflectStaticSizeDocumentMatchPool = int(reflect.TypeOf(dmp).Size()) +} + // DocumentMatchPoolTooSmall is a callback function that can be executed // when the DocumentMatchPool does not have sufficient capacity // By default we just perform just-in-time allocation, but you could log diff --git a/search/scorer/scorer_conjunction.go b/search/scorer/scorer_conjunction.go index aad6f9c16..b866293e0 100644 --- a/search/scorer/scorer_conjunction.go +++ b/search/scorer/scorer_conjunction.go @@ -15,13 +15,27 @@ package scorer import ( + "reflect" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConjunctionQueryScorer int + +func init() { + var cqs ConjunctionQueryScorer + reflectStaticSizeConjunctionQueryScorer = int(reflect.TypeOf(cqs).Size()) +} + type ConjunctionQueryScorer struct { options search.SearcherOptions } +func (s *ConjunctionQueryScorer) Size() int { + return reflectStaticSizeConjunctionQueryScorer + size.SizeOfPtr +} + func NewConjunctionQueryScorer(options search.SearcherOptions) *ConjunctionQueryScorer { return &ConjunctionQueryScorer{ options: options, diff --git a/search/scorer/scorer_constant.go b/search/scorer/scorer_constant.go index a65a826f2..dc10fdaa4 100644 --- a/search/scorer/scorer_constant.go +++ b/search/scorer/scorer_constant.go @@ -16,11 +16,20 
@@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConstantScorer int + +func init() { + var cs ConstantScorer + reflectStaticSizeConstantScorer = int(reflect.TypeOf(cs).Size()) +} + type ConstantScorer struct { constant float64 boost float64 @@ -30,6 +39,16 @@ type ConstantScorer struct { queryWeightExplanation *search.Explanation } +func (s *ConstantScorer) Size() int { + sizeInBytes := reflectStaticSizeConstantScorer + size.SizeOfPtr + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewConstantScorer(constant float64, boost float64, options search.SearcherOptions) *ConstantScorer { rv := ConstantScorer{ options: options, diff --git a/search/scorer/scorer_disjunction.go b/search/scorer/scorer_disjunction.go index 184a15d27..36a601c72 100644 --- a/search/scorer/scorer_disjunction.go +++ b/search/scorer/scorer_disjunction.go @@ -16,14 +16,27 @@ package scorer import ( "fmt" + "reflect" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDisjunctionQueryScorer int + +func init() { + var dqs DisjunctionQueryScorer + reflectStaticSizeDisjunctionQueryScorer = int(reflect.TypeOf(dqs).Size()) +} + type DisjunctionQueryScorer struct { options search.SearcherOptions } +func (s *DisjunctionQueryScorer) Size() int { + return reflectStaticSizeDisjunctionQueryScorer + size.SizeOfPtr +} + func NewDisjunctionQueryScorer(options search.SearcherOptions) *DisjunctionQueryScorer { return &DisjunctionQueryScorer{ options: options, diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index b5f46322c..077e38e0f 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -17,11 +17,20 @@ package scorer import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" 
"github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermQueryScorer int + +func init() { + var tqs TermQueryScorer + reflectStaticSizeTermQueryScorer = int(reflect.TypeOf(tqs).Size()) +} + type TermQueryScorer struct { queryTerm []byte queryField string @@ -36,6 +45,21 @@ type TermQueryScorer struct { queryWeightExplanation *search.Explanation } +func (s *TermQueryScorer) Size() int { + sizeInBytes := reflectStaticSizeTermQueryScorer + size.SizeOfPtr + + len(s.queryTerm) + len(s.queryField) + + if s.idfExplanation != nil { + sizeInBytes += s.idfExplanation.Size() + } + + if s.queryWeightExplanation != nil { + sizeInBytes += s.queryWeightExplanation.Size() + } + + return sizeInBytes +} + func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { rv := TermQueryScorer{ queryTerm: queryTerm, diff --git a/search/search.go b/search/search.go index f9a92783b..ca030df4b 100644 --- a/search/search.go +++ b/search/search.go @@ -16,11 +16,26 @@ package search import ( "fmt" + "reflect" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDocumentMatch int +var reflectStaticSizeSearchContext int +var reflectStaticSizeLocation int + +func init() { + var dm DocumentMatch + reflectStaticSizeDocumentMatch = int(reflect.TypeOf(dm).Size()) + var sc SearchContext + reflectStaticSizeSearchContext = int(reflect.TypeOf(sc).Size()) + var l Location + reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) +} + type ArrayPositions []uint64 func (ap ArrayPositions) Equals(other ArrayPositions) bool { @@ -47,6 +62,11 @@ type Location struct { ArrayPositions ArrayPositions `json:"array_positions"` } +func (l *Location) Size() int { + return reflectStaticSizeLocation + size.SizeOfPtr + + len(l.ArrayPositions)*size.SizeOfUint64 +} + type Locations []*Location 
type TermLocationMap map[string]Locations @@ -117,6 +137,52 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { return dm } +func (dm *DocumentMatch) Size() int { + sizeInBytes := reflectStaticSizeDocumentMatch + size.SizeOfPtr + + len(dm.Index) + + len(dm.ID) + + len(dm.IndexInternalID) + + if dm.Expl != nil { + sizeInBytes += dm.Expl.Size() + } + + for k, v := range dm.Locations { + sizeInBytes += size.SizeOfString + len(k) + for k1, v1 := range v { + sizeInBytes += size.SizeOfString + len(k1) + + size.SizeOfSlice + for _, entry := range v1 { + sizeInBytes += entry.Size() + } + } + } + + for k, v := range dm.Fragments { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfSlice + + for _, entry := range v { + sizeInBytes += size.SizeOfString + len(entry) + } + } + + for _, entry := range dm.Sort { + sizeInBytes += size.SizeOfString + len(entry) + } + + for k, _ := range dm.Fields { + sizeInBytes += size.SizeOfString + len(k) + + size.SizeOfPtr + } + + if dm.Document != nil { + sizeInBytes += dm.Document.Size() + } + + return sizeInBytes +} + func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) } @@ -135,6 +201,7 @@ type Searcher interface { SetQueryNorm(float64) Count() uint64 Min() int + Size() int DocumentMatchPoolSize() int } @@ -148,3 +215,18 @@ type SearcherOptions struct { type SearchContext struct { DocumentMatchPool *DocumentMatchPool } + +func (sc *SearchContext) Size() int { + sizeInBytes := reflectStaticSizeSearchContext + size.SizeOfPtr + + reflectStaticSizeDocumentMatchPool + size.SizeOfPtr + + if sc.DocumentMatchPool != nil { + for _, entry := range sc.DocumentMatchPool.avail { + if entry != nil { + sizeInBytes += entry.Size() + } + } + } + + return sizeInBytes +} diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index a905c29e5..b87337e1e 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -16,12 +16,21 @@ package 
searcher import ( "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeBooleanSearcher int + +func init() { + var bs BooleanSearcher + reflectStaticSizeBooleanSearcher = int(reflect.TypeOf(bs).Size()) +} + type BooleanSearcher struct { indexReader index.IndexReader mustSearcher search.Searcher @@ -52,6 +61,33 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc return &rv, nil } +func (s *BooleanSearcher) Size() int { + sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr + + s.indexReader.Size() + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.shouldSearcher != nil { + sizeInBytes += s.shouldSearcher.Size() + } + + if s.mustNotSearcher != nil { + sizeInBytes += s.mustNotSearcher.Size() + } + + sizeInBytes += s.scorer.Size() + + for _, entry := range s.matches { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (s *BooleanSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index 73fba19cd..da65f3981 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -16,13 +16,22 @@ package searcher import ( "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeConjunctionSearcher int + +func init() { + var cs ConjunctionSearcher + reflectStaticSizeConjunctionSearcher = int(reflect.TypeOf(cs).Size()) +} + type ConjunctionSearcher struct { indexReader index.IndexReader searchers OrderedSearcherList @@ -54,6 +63,23 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers 
[]search.S return &rv, nil } +func (s *ConjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeConjunctionSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + return sizeInBytes +} + func (s *ConjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index b6910ddb6..119bac970 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -17,13 +17,22 @@ package searcher import ( "fmt" "math" + "reflect" "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDisjunctionSearcher int + +func init() { + var ds DisjunctionSearcher + reflectStaticSizeDisjunctionSearcher = int(reflect.TypeOf(ds).Size()) +} + // DisjunctionMaxClauseCount is a compile time setting that applications can // adjust to non-zero value to cause the DisjunctionSearcher to return an // error instead of exeucting searches when the size exceeds this value. 
@@ -90,6 +99,32 @@ func newDisjunctionSearcher(indexReader index.IndexReader, return &rv, nil } +func (s *DisjunctionSearcher) Size() int { + sizeInBytes := reflectStaticSizeDisjunctionSearcher + size.SizeOfPtr + + s.indexReader.Size() + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for _, entry := range s.matching { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt + + return sizeInBytes +} + func (s *DisjunctionSearcher) computeQueryNorm() { // first calculate sum of squared weights sumOfSquaredWeights := 0.0 diff --git a/search/searcher/search_docid.go b/search/searcher/search_docid.go index 06351b4a0..3b258a580 100644 --- a/search/searcher/search_docid.go +++ b/search/searcher/search_docid.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDocIDSearcher int + +func init() { + var ds DocIDSearcher + reflectStaticSizeDocIDSearcher = int(reflect.TypeOf(ds).Size()) +} + // DocIDSearcher returns documents matching a predefined set of identifiers. 
type DocIDSearcher struct { reader index.DocIDReader @@ -42,6 +52,12 @@ func NewDocIDSearcher(indexReader index.IndexReader, ids []string, boost float64 }, nil } +func (s *DocIDSearcher) Size() int { + return reflectStaticSizeDocIDSearcher + size.SizeOfPtr + + s.reader.Size() + + s.scorer.Size() +} + func (s *DocIDSearcher) Count() uint64 { return uint64(s.count) } diff --git a/search/searcher/search_filter.go b/search/searcher/search_filter.go index 219f2ee7e..7c95fb41c 100644 --- a/search/searcher/search_filter.go +++ b/search/searcher/search_filter.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeFilteringSearcher int + +func init() { + var fs FilteringSearcher + reflectStaticSizeFilteringSearcher = int(reflect.TypeOf(fs).Size()) +} + // FilterFunc defines a function which can filter documents // returning true means keep the document // returning false means do not keep the document @@ -38,6 +48,11 @@ func NewFilteringSearcher(s search.Searcher, filter FilterFunc) *FilteringSearch } } +func (f *FilteringSearcher) Size() int { + return reflectStaticSizeFilteringSearcher + size.SizeOfPtr + + f.child.Size() +} + func (f *FilteringSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { next, err := f.child.Next(ctx) for next != nil && err == nil { diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 822db2ea0..3f34e5918 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchAllSearcher int + +func init() { + var mas MatchAllSearcher + reflectStaticSizeMatchAllSearcher = 
int(reflect.TypeOf(mas).Size()) +} + type MatchAllSearcher struct { indexReader index.IndexReader reader index.DocIDReader @@ -46,6 +56,13 @@ func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, options s }, nil } +func (s *MatchAllSearcher) Size() int { + return reflectStaticSizeMatchAllSearcher + size.SizeOfPtr + + s.indexReader.Size() + + s.reader.Size() + + s.scorer.Size() +} + func (s *MatchAllSearcher) Count() uint64 { return s.count } diff --git a/search/searcher/search_match_none.go b/search/searcher/search_match_none.go index 947596714..6b50b3222 100644 --- a/search/searcher/search_match_none.go +++ b/search/searcher/search_match_none.go @@ -15,10 +15,20 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeMatchNoneSearcher int + +func init() { + var mns MatchNoneSearcher + reflectStaticSizeMatchNoneSearcher = int(reflect.TypeOf(mns).Size()) +} + type MatchNoneSearcher struct { indexReader index.IndexReader } @@ -29,6 +39,11 @@ func NewMatchNoneSearcher(indexReader index.IndexReader) (*MatchNoneSearcher, er }, nil } +func (s *MatchNoneSearcher) Size() int { + return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr + + s.indexReader.Size() +} + func (s *MatchNoneSearcher) Count() uint64 { return uint64(0) } diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 6237cecfd..23a359bd7 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -17,11 +17,20 @@ package searcher import ( "fmt" "math" + "reflect" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizePhraseSearcher int + +func init() { + var ps PhraseSearcher + reflectStaticSizePhraseSearcher = int(reflect.TypeOf(ps).Size()) +} + type PhraseSearcher struct { indexReader index.IndexReader mustSearcher 
*ConjunctionSearcher @@ -32,6 +41,28 @@ type PhraseSearcher struct { initialized bool } +func (s *PhraseSearcher) Size() int { + sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr + + s.indexReader.Size() + + if s.mustSearcher != nil { + sizeInBytes += s.mustSearcher.Size() + } + + if s.currMust != nil { + sizeInBytes += s.currMust.Size() + } + + for _, entry := range s.terms { + sizeInBytes += size.SizeOfSlice + for _, entry1 := range entry { + sizeInBytes += size.SizeOfString + len(entry1) + } + } + + return sizeInBytes +} + func NewPhraseSearcher(indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) { // turn flat terms []string into [][]string mterms := make([][]string, len(terms)) diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 6fae6ae5a..576d6643a 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -15,11 +15,21 @@ package searcher import ( + "reflect" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTermSearcher int + +func init() { + var ts TermSearcher + reflectStaticSizeTermSearcher = int(reflect.TypeOf(ts).Size()) +} + type TermSearcher struct { indexReader index.IndexReader reader index.TermFieldReader @@ -63,6 +73,14 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri }, nil } +func (s *TermSearcher) Size() int { + return reflectStaticSizeTermSearcher + size.SizeOfPtr + + s.indexReader.Size() + + s.reader.Size() + + s.tfd.Size() + + s.scorer.Size() +} + func (s *TermSearcher) Count() uint64 { return s.reader.Count() } diff --git a/size/sizes.go b/size/sizes.go new file mode 100644 index 000000000..4ba544a71 --- /dev/null +++ b/size/sizes.go @@ -0,0 +1,57 @@ +// Copyright (c) 2018 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package size + +import ( + "reflect" +) + +func init() { + var a bool + SizeOfBool = int(reflect.TypeOf(a).Size()) + var b float32 + SizeOfFloat32 = int(reflect.TypeOf(b).Size()) + var c float64 + SizeOfFloat64 = int(reflect.TypeOf(c).Size()) + var d map[int]int + SizeOfMap = int(reflect.TypeOf(d).Size()) + var e *int + SizeOfPtr = int(reflect.TypeOf(e).Size()) + var f []int + SizeOfSlice = int(reflect.TypeOf(f).Size()) + var g string + SizeOfString = int(reflect.TypeOf(g).Size()) + var h uint8 + SizeOfUint8 = int(reflect.TypeOf(h).Size()) + var i uint16 + SizeOfUint16 = int(reflect.TypeOf(i).Size()) + var j uint32 + SizeOfUint32 = int(reflect.TypeOf(j).Size()) + var k uint64 + SizeOfUint64 = int(reflect.TypeOf(k).Size()) +} + +var SizeOfBool int +var SizeOfFloat32 int +var SizeOfFloat64 int +var SizeOfInt int +var SizeOfMap int +var SizeOfPtr int +var SizeOfSlice int +var SizeOfString int +var SizeOfUint8 int +var SizeOfUint16 int +var SizeOfUint32 int +var SizeOfUint64 int From 96071c085cbdf0beb7d7a74f483b1b2c53f4bd97 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 5 Mar 2018 16:49:55 -0800 Subject: [PATCH 257/728] MB-28163: Register a callback with context to estimate RAM for search This callback if registered with context will invoke the api to estimate the memory needed to execute a search query. 
The callback defined at the client side will be responsible for determining whether to continue with the search or abort based on the threshold settings. --- index_impl.go | 13 +++++++++++++ index_test.go | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+) diff --git a/index_impl.go b/index_impl.go index df6e748d9..1036aef2a 100644 --- a/index_impl.go +++ b/index_impl.go @@ -50,6 +50,10 @@ const storePath = "store" var mappingInternalKey = []byte("_mapping") +const SearchMemCheckCallbackKey = "_search_mem_callback_key" + +type SearchMemCheckCallbackFn func(size uint64) error + func indexStorePath(path string) string { return path + string(os.PathSeparator) + storePath } @@ -479,6 +483,15 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr collector.SetFacetsBuilder(facetsBuilder) } + if memCb := ctx.Value(SearchMemCheckCallbackKey); memCb != nil { + if memCbFn, ok := memCb.(SearchMemCheckCallbackFn); ok { + err = memCbFn(memNeededForSearch(req, searcher, collector)) + } + } + if err != nil { + return nil, err + } + err = collector.Collect(ctx, searcher, indexReader) if err != nil { return nil, err diff --git a/index_test.go b/index_test.go index f1e53647d..57429dcb2 100644 --- a/index_test.go +++ b/index_test.go @@ -1870,3 +1870,50 @@ func BenchmarkUpsidedownSearchOverhead(b *testing.B) { func BenchmarkScorchSearchOverhead(b *testing.B) { benchmarkSearchOverhead(scorch.Name, b) } + +func TestSearchMemCheckCallback(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + index, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } + }() + + elements := []string{"air", "water", "fire", "earth"} + for j := 0; j < 10000; j++ { + err = index.Index(fmt.Sprintf("%d", j), + map[string]interface{}{"name": elements[j%len(elements)]}) + if err != nil { + 
t.Fatal(err) + } + } + + query := NewTermQuery("water") + req := NewSearchRequest(query) + + expErr := fmt.Errorf("MEM_LIMIT_EXCEEDED") + f := func(size uint64) error { + if size > 1000 { + return expErr + } + return nil + } + + ctx := context.WithValue(context.Background(), SearchMemCheckCallbackKey, + SearchMemCheckCallbackFn(f)) + _, err = index.SearchInContext(ctx, req) + if err != expErr { + t.Fatalf("Expected: %v, Got: %v", expErr, err) + } +} From 38b6c522b0a8a77f775b8f340909a7517766d67a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 6 Mar 2018 14:00:54 -0800 Subject: [PATCH 258/728] Address build breakage after rebase Removed attribute: iterator of type Posting --- index/scorch/segment/zap/posting.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index e9c68cbae..ef21df8d5 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -506,10 +506,6 @@ type Posting struct { func (p *Posting) Size() int { sizeInBytes := reflectStaticSizePosting - if p.iterator != nil { - sizeInBytes += p.iterator.Size() - } - for _, entry := range p.locs { sizeInBytes += entry.Size() } From b62ca996f63e64dad76bdd95c7d8b4f31d4d6f0a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 13:30:44 -0800 Subject: [PATCH 259/728] scorch zap optimize chunkedIntCoder.Add() calls to use multiple vals This change leverages the ability for the chunkedIntCoder.Add() method to accept multiple input param values (via the '...' param signature), meaning there are fewer Add() invocations. 
--- index/scorch/segment/zap/build.go | 51 +++++-------------------------- 1 file changed, 8 insertions(+), 43 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 237cc5f3d..361e56e51 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -319,19 +319,10 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac postingsListItr := memSegment.Postings[postingID].Iterator() var offset int for postingsListItr.HasNext() { - docNum := uint64(postingsListItr.Next()) - // put freq - err := tfEncoder.Add(docNum, freqs[offset]) - if err != nil { - return nil, nil, err - } - - // put norm - norm := norms[offset] - normBits := math.Float32bits(norm) - err = tfEncoder.Add(docNum, uint64(normBits)) + // put freq & norm + err := tfEncoder.Add(docNum, freqs[offset], uint64(math.Float32bits(norms[offset]))) if err != nil { return nil, nil, err } @@ -347,7 +338,6 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac if err != nil { return nil, nil, err } - } // now do it again for the locations @@ -371,44 +361,18 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac n := int(freqs[offset]) for i := 0; i < n; i++ { if len(locfields) > 0 { - // put field - err := locEncoder.Add(docNum, uint64(locfields[locOffset])) + err := locEncoder.Add(docNum, uint64(locfields[locOffset]), + locpos[locOffset], locstarts[locOffset], locends[locOffset], + uint64(len(locarraypos[locOffset]))) if err != nil { return nil, nil, err } - // put pos - err = locEncoder.Add(docNum, locpos[locOffset]) - if err != nil { - return nil, nil, err - } - - // put start - err = locEncoder.Add(docNum, locstarts[locOffset]) - if err != nil { - return nil, nil, err - } - - // put end - err = locEncoder.Add(docNum, locends[locOffset]) - if err != nil { - return nil, nil, err - } - - // put the number of array positions to follow - num := 
len(locarraypos[locOffset]) - err = locEncoder.Add(docNum, uint64(num)) + // put each array position + err = locEncoder.Add(docNum, locarraypos[locOffset]...) if err != nil { return nil, nil, err } - - // put each array position - for _, pos := range locarraypos[locOffset] { - err = locEncoder.Add(docNum, pos) - if err != nil { - return nil, nil, err - } - } } locOffset++ } @@ -417,6 +381,7 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac // record where this postings loc info starts locOffsets = append(locOffsets, uint64(w.Count())) + locEncoder.Close() _, err := locEncoder.Write(w) if err != nil { From dde6c2e01b3b33828f33f28ad43af2147713f7c1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 14:59:20 -0800 Subject: [PATCH 260/728] scorch zap optimize writeRoaringWithLen() Before this change, writeRoaringWithLen() would leverage a reused bytes.Buffer (#A) and invoke the roaring.WriteTo() API. But, it turns out the roaring.WriteTo() API has a suboptimal implementation, in that underneath-the-hood it converts the roaring bitmap to a byte buffer (using roaring.ToBytes()), and then calls Write(). But, that Write() turns out to be an additional memcpy into the provided bytes.Buffer (#A). By directly invoking roaring.ToBytes(), this change to writeRoaringWithLen() avoids the extra memory allocation and memcpy. 
--- index/scorch/segment/zap/build.go | 6 ++---- index/scorch/segment/zap/merge.go | 5 ++--- index/scorch/segment/zap/write.go | 20 ++++++++++---------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 361e56e51..404ec8694 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -394,13 +394,12 @@ func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFac func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { rv = make([]uint64, 0, len(memSegment.PostingsLocs)) - var reuseBuf bytes.Buffer reuseBufVarint := make([]byte, binary.MaxVarintLen64) for postingID := range memSegment.PostingsLocs { // record where we start this posting loc rv = append(rv, uint64(w.Count())) // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, &reuseBuf, reuseBufVarint) + _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, reuseBufVarint) if err != nil { return nil, err } @@ -411,7 +410,6 @@ func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err error) { rv = make([]uint64, 0, len(memSegment.Postings)) - var reuseBuf bytes.Buffer reuseBufVarint := make([]byte, binary.MaxVarintLen64) for postingID := range memSegment.Postings { // record where we start this posting list @@ -425,7 +423,7 @@ func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, } // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, &reuseBuf, reuseBufVarint) + _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, reuseBufVarint) if err != nil { return nil, err } diff --git a/index/scorch/segment/zap/merge.go 
b/index/scorch/segment/zap/merge.go index 5066dfb9e..3a0577cdd 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -160,7 +160,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, uint64, error) { - var bufReuse bytes.Buffer var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var postings *PostingsList @@ -247,7 +246,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return err } postingLocOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(newRoaringLocs, w, &bufReuse, bufMaxVarintLen64) + _, err = writeRoaringWithLen(newRoaringLocs, w, bufMaxVarintLen64) if err != nil { return err } @@ -271,7 +270,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if err != nil { return err } - _, err = writeRoaringWithLen(newRoaring, w, &bufReuse, bufMaxVarintLen64) + _, err = writeRoaringWithLen(newRoaring, w, bufMaxVarintLen64) if err != nil { return err } diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index c5316a99f..7f4f5a88b 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -15,7 +15,6 @@ package zap import ( - "bytes" "encoding/binary" "io" @@ -25,28 +24,29 @@ import ( // writes out the length of the roaring bitmap in bytes as varint // then writes out the roaring bitmap itself func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, - reuseBuf *bytes.Buffer, reuseBufVarint []byte) (int, error) { - reuseBuf.Reset() - - // write out postings list to memory so we know the len - postingsListLen, err := r.WriteTo(reuseBuf) + reuseBufVarint []byte) (int, error) { + buf, err := r.ToBytes() if err != nil { return 0, err } + var tw int - // write out the length of this postings list - n := binary.PutUvarint(reuseBufVarint, uint64(postingsListLen)) + + // write out the length + n := 
binary.PutUvarint(reuseBufVarint, uint64(len(buf))) nw, err := w.Write(reuseBufVarint[:n]) tw += nw if err != nil { return tw, err } - // write out the postings list itself - nw, err = w.Write(reuseBuf.Bytes()) + + // write out the roaring bytes + nw, err = w.Write(buf) tw += nw if err != nil { return tw, err } + return tw, nil } From 8841d79d26aa76dcd88df1e1f7bf39742e694364 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 15:26:54 -0800 Subject: [PATCH 261/728] scorch optimize mem processField inner-loop --- index/scorch/segment/mem/build.go | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index 264c94d1c..a064fcba0 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -111,13 +111,15 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { } processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + dict := s.Dicts[fieldID] + dictKeys := s.DictKeys[fieldID] for term, tf := range tfs { - pidPlus1, exists := s.Dicts[fieldID][term] + pidPlus1, exists := dict[term] if !exists { numPostingsLists++ pidPlus1 = uint64(numPostingsLists) - s.Dicts[fieldID][term] = pidPlus1 - s.DictKeys[fieldID] = append(s.DictKeys[fieldID], term) + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) numTermsPerPostingsList = append(numTermsPerPostingsList, 0) numLocsPerPostingsList = append(numLocsPerPostingsList, 0) } @@ -127,6 +129,7 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { totLocs += len(tf.Locations) } numTokenFrequencies += len(tfs) + s.DictKeys[fieldID] = dictKeys } for _, result := range results { From 5c721226cfab83b0cbd366aef6919ccc0df20364 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 6 Mar 2018 15:53:14 -0800 Subject: [PATCH 262/728] Fixing the scorch search request memory estimate Do not re-account for certain referenced data in the zap structures. 
New estimates: ESTIMATE BENCHMEM TermQuery 11396 12437 MatchQuery 12244 12951 DisjunctionQuery (Term queries) 20644 20709 --- index/scorch/segment/zap/posting.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index ef21df8d5..8106ebcc0 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -57,18 +57,6 @@ type PostingsList struct { func (p *PostingsList) Size() int { sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr - if p.sb != nil { - sizeInBytes += (p.sb.Size() - len(p.sb.mem)) // do not include the mmap'ed part - } - - if p.locBitmap != nil { - sizeInBytes += int(p.locBitmap.GetSizeInBytes()) - } - - if p.postings != nil { - sizeInBytes += int(p.postings.GetSizeInBytes()) - } - if p.except != nil { sizeInBytes += int(p.except.GetSizeInBytes()) } From 2a9739ee1b438c0a187353b65e4b52890a4a2045 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 7 Mar 2018 14:37:33 +0530 Subject: [PATCH 263/728] naming change, interface removal --- index/scorch/merge.go | 3 ++- index/scorch/scorch.go | 2 +- index/scorch/segment/zap/merge.go | 25 ++++++++----------------- index/scorch/segment/zap/merge_test.go | 23 ++++++----------------- index/scorch/stats.go | 7 +------ 5 files changed, 18 insertions(+), 42 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 67e1590ac..d7edfd402 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -180,8 +180,9 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024, &s.stats) + newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) + 
atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 8a2b1ec3b..4da336e5e 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -430,7 +430,7 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_items_persisted"] = m["TotPersistedItems"] m["num_bytes_used_disk"] = m["CurOnDiskBytes"] m["num_files_on_disk"] = m["CurOnDiskFiles"] - m["total_compaction_written_bytes"] = m["TotCompactionWrittenBytes"] + m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] return m } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 9399046b0..7abef1886 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -31,22 +31,17 @@ import ( const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc -// StatsReporter interface represents stats reporting methods. -type StatsReporter interface { - ReportBytesWritten(numBytesWritten uint64) -} - // Merge takes a slice of zap segments and bit masks describing which // documents may be dropped, and creates a new segment containing the // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32, stats StatsReporter) ([][]uint64, error) { + chunkFactor uint32) ([][]uint64, uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) if err != nil { - return nil, err + return nil, 0, err } cleanup := func() { @@ -69,39 +64,35 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, MergeToWriter(segmentBases, drops, chunkFactor, cr) if err != nil { cleanup() - return nil, err + return nil, 0, err } err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, chunkFactor, cr.Sum32(), cr) if err != nil { cleanup() - return nil, err + return nil, 0, err } err = br.Flush() if err != nil { cleanup() - return nil, err + return nil, 0, err } err = f.Sync() if err != nil { cleanup() - return nil, err + return nil, 0, err } err = f.Close() if err != nil { cleanup() - return nil, err - } - - if stats != nil { - stats.ReportBytesWritten(uint64(cr.Count())) + return nil, 0, err } - return newDocNums, nil + return newDocNums, uint64(cr.Count()), nil } func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 2ba0b373a..501947f96 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -20,7 +20,6 @@ import ( "reflect" "sort" "strings" - "sync/atomic" "testing" "github.com/RoaringBitmap/roaring" @@ -73,7 +72,7 @@ func TestMerge(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) + _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } @@ -177,7 +176,7 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) drops := make([]*roaring.Bitmap, 
len(segsToMerge)) - _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil) + _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } @@ -219,7 +218,7 @@ func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { segsToMerge := make([]*Segment, 1) segsToMerge[0] = segCur - _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil) + _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024) if err != nil { t.Fatal(err) } @@ -591,7 +590,7 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { _ = os.RemoveAll("/tmp/scorch-merged.zap") - _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil) + _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024) if err != nil { t.Fatal(err) } @@ -784,14 +783,6 @@ func buildMemSegmentMultiHelper(docIds []string) *mem.Segment { return segment } -type statTest struct { - totalWrittenBytes uint64 -} - -func (s *statTest) ReportBytesWritten(numBytesWritten uint64) { - atomic.AddUint64(&s.totalWrittenBytes, numBytesWritten) -} - func TestMergeBytesWritten(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") _ = os.RemoveAll("/tmp/scorch2.zap") @@ -835,14 +826,12 @@ func TestMergeBytesWritten(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - reporter := &statTest{} - - _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, reporter) + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) if err != nil { t.Fatal(err) } - if reporter.totalWrittenBytes == 0 { + if nBytes == 0 { t.Fatalf("expected a non zero total_compaction_written_bytes") } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 4f8c8b99e..36245a8e8 
100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -87,6 +87,7 @@ type Stats struct { TotFileMergeSegmentsEmpty uint64 TotFileMergeSegments uint64 + TotFileMergeWrittenBytes uint64 TotFileMergeZapBeg uint64 TotFileMergeZapEnd uint64 @@ -100,8 +101,6 @@ type Stats struct { TotMemMergeZapBeg uint64 TotMemMergeZapEnd uint64 TotMemMergeSegments uint64 - - TotCompactionWrittenBytes uint64 } // atomically populates the returned map @@ -124,7 +123,3 @@ func (s *Stats) ToMap() map[string]interface{} { func (s *Stats) MarshalJSON() ([]byte, error) { return json.Marshal(s.ToMap()) } - -func (s *Stats) ReportBytesWritten(numBytesWritten uint64) { - atomic.AddUint64(&s.TotCompactionWrittenBytes, numBytesWritten) -} From 73ed8e248d340ceb2ca3380f2f91bc1fd316576b Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 7 Mar 2018 18:34:54 +0530 Subject: [PATCH 264/728] fixing the indentation issues. looks like it happened during the web based conflict resolution.. --- index/scorch/merge.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 89d6ec141..2b0e734c7 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -185,7 +185,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) - atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) + atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) @@ -193,7 +193,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) } - if err != nil { + if err != nil { s.unmarkIneligibleForRemoval(filename) 
atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return fmt.Errorf("merging failed: %v", err) From 8c0f402d4b06a9c345f5049bd0f72b474db8ffbb Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 15:48:22 -0800 Subject: [PATCH 265/728] scorch zap optimize processDocument() loc inner loop --- index/scorch/segment/mem/build.go | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index a064fcba0..a4b69013e 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -266,21 +266,34 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { locationBS := s.PostingsLocs[pid] if len(tokenFreq.Locations) > 0 { locationBS.AddInt(int(docNum)) + + locfields := s.Locfields[pid] + locstarts := s.Locstarts[pid] + locends := s.Locends[pid] + locpos := s.Locpos[pid] + locarraypos := s.Locarraypos[pid] + for _, loc := range tokenFreq.Locations { var locf = fieldID if loc.Field != "" { locf = uint16(s.getOrDefineField(loc.Field)) } - s.Locfields[pid] = append(s.Locfields[pid], locf) - s.Locstarts[pid] = append(s.Locstarts[pid], uint64(loc.Start)) - s.Locends[pid] = append(s.Locends[pid], uint64(loc.End)) - s.Locpos[pid] = append(s.Locpos[pid], uint64(loc.Position)) + locfields = append(locfields, locf) + locstarts = append(locstarts, uint64(loc.Start)) + locends = append(locends, uint64(loc.End)) + locpos = append(locpos, uint64(loc.Position)) if len(loc.ArrayPositions) > 0 { - s.Locarraypos[pid] = append(s.Locarraypos[pid], loc.ArrayPositions) + locarraypos = append(locarraypos, loc.ArrayPositions) } else { - s.Locarraypos[pid] = append(s.Locarraypos[pid], nil) + locarraypos = append(locarraypos, nil) } } + + s.Locfields[pid] = locfields + s.Locstarts[pid] = locstarts + s.Locends[pid] = locends + s.Locpos[pid] = locpos + s.Locarraypos[pid] = locarraypos } } } From 79f28b7c93b99934bf37241454d77fa940de3863 Mon Sep 17 
00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 16:29:33 -0800 Subject: [PATCH 266/728] scorch fix persistDocValues() err return --- index/scorch/segment/zap/build.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 404ec8694..f1698e39f 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -515,7 +515,7 @@ func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, var err1 error postings, err1 = dict.(*mem.Dictionary).InitPostingsList(next.Term, nil, postings) if err1 != nil { - return nil, err + return nil, err1 } postingsItr = postings.InitIterator(postingsItr) From 59eb70d0200dedb7710406280a27d4f75ff6ca97 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 6 Mar 2018 17:47:25 -0800 Subject: [PATCH 267/728] scorch zap remove unused chunkedIntCoder field --- index/scorch/segment/zap/intcoder.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 8d1f94536..ea0330181 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -24,7 +24,6 @@ import ( type chunkedIntCoder struct { final []byte - maxDocNum uint64 chunkSize uint64 chunkBuf bytes.Buffer encoder *govarint.Base128Encoder @@ -41,7 +40,6 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { total := maxDocNum/chunkSize + 1 rv := &chunkedIntCoder{ chunkSize: chunkSize, - maxDocNum: maxDocNum, chunkLens: make([]uint64, total), final: make([]byte, 0, 64), } From 2b5da7a8192821442761af98d689afdd74a1272e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Mar 2018 09:12:55 -0800 Subject: [PATCH 268/728] go fmt --- index/scorch/merge.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 2b0e734c7..ec2c8d4b3 100644 --- a/index/scorch/merge.go +++ 
b/index/scorch/merge.go @@ -186,14 +186,14 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) - + fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) if atomic.LoadUint64(&s.stats.MaxFileMergeZapTime) < fileMergeZapTime { atomic.StoreUint64(&s.stats.MaxFileMergeZapTime, fileMergeZapTime) } - - if err != nil { + + if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return fmt.Errorf("merging failed: %v", err) From 1e2bb14f13ad14dc435fe6cd5c2bc09375df651f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Mar 2018 09:49:08 -0800 Subject: [PATCH 269/728] added TestRoaringSizes() --- index/scorch/segment/zap/write_test.go | 86 ++++++++++++++++++++++++++ 1 file changed, 86 insertions(+) create mode 100644 index/scorch/segment/zap/write_test.go diff --git a/index/scorch/segment/zap/write_test.go b/index/scorch/segment/zap/write_test.go new file mode 100644 index 000000000..2e72d4b82 --- /dev/null +++ b/index/scorch/segment/zap/write_test.go @@ -0,0 +1,86 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "math" + "testing" + + "github.com/RoaringBitmap/roaring" +) + +func TestRoaringSizes(t *testing.T) { + tests := []struct { + vals []uint32 + expectedSize int // expected serialized # bytes + optimizedSize int // after calling roaring's RunOptimize() API + }{ + {[]uint32{}, 8, 8}, // empty roaring is 8 bytes + + {[]uint32{0}, 18, 18}, // single entry roaring is 18 bytes + {[]uint32{1}, 18, 18}, + {[]uint32{4}, 18, 18}, + {[]uint32{4000}, 18, 18}, + {[]uint32{40000000}, 18, 18}, + {[]uint32{math.MaxUint32}, 18, 18}, + {[]uint32{math.MaxUint32 - 1}, 18, 18}, + + {[]uint32{0, 1}, 20, 20}, + {[]uint32{0, 10000000}, 28, 28}, + + {[]uint32{0, 1, 2}, 22, 15}, + {[]uint32{0, 1, 20000000}, 30, 30}, + + {[]uint32{0, 1, 2, 3}, 24, 15}, + {[]uint32{0, 1, 2, 30000000}, 32, 21}, + } + + for _, test := range tests { + bm := roaring.New() + for _, val := range test.vals { + bm.Add(val) + } + + b, err := bm.ToBytes() + if err != nil { + t.Errorf("expected no ToBytes() err, got: %v", err) + } + if len(b) != test.expectedSize { + t.Errorf("size did not match,"+ + " got: %d, test: %#v", len(b), test) + } + if int(bm.GetSerializedSizeInBytes()) != test.expectedSize { + t.Errorf("GetSerializedSizeInBytes did not match,"+ + " got: %d, test: %#v", + bm.GetSerializedSizeInBytes(), test) + } + + bm.RunOptimize() + + b, err = bm.ToBytes() + if err != nil { + t.Errorf("expected no ToBytes() err, got: %v", err) + } + if len(b) != test.optimizedSize { + t.Errorf("optimized size did not match,"+ + " got: %d, test: %#v", len(b), test) + } + if int(bm.GetSerializedSizeInBytes()) != test.optimizedSize { + t.Errorf("optimized GetSerializedSizeInBytes did not match,"+ + " got: %d, test: %#v", + bm.GetSerializedSizeInBytes(), test) + } + } +} From 33ef4ce35e22452ddde0a05e77a90afba5206d5f Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 7 Mar 2018 12:05:39 -0800 Subject: [PATCH 270/728] MB-28163: Provide an API to estimate the RAM needed for SearchResult exported 
API: MemoryNeededForSearchResult(req *SearchRequest) --- search.go | 43 +++++++++++++++++++++++++++++++++++++++++++ search_test.go | 15 +++++++++++++++ 2 files changed, 58 insertions(+) diff --git a/search.go b/search.go index e324262e5..86ea4193a 100644 --- a/search.go +++ b/search.go @@ -22,8 +22,10 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/datetime/optional" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/collector" "github.com/blevesearch/bleve/search/query" "github.com/blevesearch/bleve/size" ) @@ -518,3 +520,44 @@ func (sr *SearchResult) Merge(other *SearchResult) { sr.Facets.Merge(other.Facets) } + +// MemoryNeededForSearchResult is an exported helper function to determine the RAM +// needed to accommodate the results for a given search request. +func MemoryNeededForSearchResult(req *SearchRequest) uint64 { + if req == nil { + return 0 + } + + numDocMatches := req.Size + req.From + if req.Size+req.From > collector.PreAllocSizeSkipCap { + numDocMatches = collector.PreAllocSizeSkipCap + } + + estimate := 0 + + // overhead from the SearchResult structure + var sr SearchResult + estimate += sr.Size() + + var dm search.DocumentMatch + sizeOfDocumentMatch := dm.Size() + + // overhead from results + estimate += numDocMatches * sizeOfDocumentMatch + + // overhead from facet results + if req.Facets != nil { + var fr search.FacetResult + estimate += len(req.Facets) * fr.Size() + } + + // highlighting, store + var d document.Document + if len(req.Fields) > 0 || req.Highlight != nil { + for i := 0; i < (req.Size + req.From); i++ { + estimate += (req.Size + req.From) * d.Size() + } + } + + return uint64(estimate) +} diff --git a/search_test.go b/search_test.go index 242494132..87a718285 100644 --- a/search_test.go +++ b/search_test.go @@ -399,3 +399,18 @@ func TestSearchResultFacetsMerge(t *testing.T) { 
t.Errorf("expected %#v, got %#v", expected, l) } } + +func TestMemoryNeededForSearchResult(t *testing.T) { + query := NewTermQuery("blah") + req := NewSearchRequest(query) + + var sr SearchResult + expect := sr.Size() + var dm search.DocumentMatch + expect += 10 * dm.Size() + + estimate := MemoryNeededForSearchResult(req) + if estimate != uint64(expect) { + t.Errorf("estimate not what is expected: %v != %v", estimate, expect) + } +} From eac98089907947b66dce3e8848fee62fa0717971 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 7 Mar 2018 15:00:38 -0800 Subject: [PATCH 271/728] scorch zap optimize FST val encoding for terms with 1 hit NOTE: this is a scorch zap file format change / bump to version 4. In this optimization, the uint64 val stored in the vellum FST (term dictionary) now may either be a uint64 postingsOffset (same as before this change) or a uint64 encoding of the docNum + norm (in the case where a term appears in just a single doc). --- cmd/bleve/cmd/zap/explore.go | 16 ++- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/intcoder.go | 4 + index/scorch/segment/zap/merge.go | 42 ++++++- index/scorch/segment/zap/merge_test.go | 9 ++ index/scorch/segment/zap/posting.go | 158 +++++++++++++++++++++---- 6 files changed, 201 insertions(+), 30 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index de05c63e7..543b572fd 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -18,7 +18,9 @@ import ( "encoding/binary" "fmt" "log" + "math" + "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/couchbase/vellum" "github.com/spf13/cobra" ) @@ -57,7 +59,19 @@ var exploreCmd = &cobra.Command{ return fmt.Errorf("error looking for term : %v", err) } if exists { - fmt.Printf("postings list begins at %d (%x)\n", postingsAddr, postingsAddr) + fmt.Printf("fst val is %d (%x)\n", postingsAddr, postingsAddr) + + if postingsAddr&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit { + 
docNum, normBits := zap.FSTValDecode1Hit(postingsAddr) + norm := math.Float32frombits(uint32(normBits)) + fmt.Printf("Posting List is 1-hit encoded, docNum: %d, norm: %f\n", + docNum, norm) + return nil + } + + if postingsAddr&zap.FSTValEncodingMask != zap.FSTValEncodingGeneral { + return fmt.Errorf("unknown fst val encoding") + } var n uint64 freqAddr, read := binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64]) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index f1698e39f..8ec610953 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -28,7 +28,7 @@ import ( "github.com/golang/snappy" ) -const version uint32 = 3 +const version uint32 = 4 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index ea0330181..6680e608e 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -130,3 +130,7 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } return tw, nil } + +func (c *chunkedIntCoder) FinalSize() int { + return len(c.final) +} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 454213af9..383fedbf3 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -225,6 +225,21 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, newRoaring := roaring.NewBitmap() newRoaringLocs := roaring.NewBitmap() + var lastDocNum, lastFreq, lastNorm uint64 + + // determines whether to use "1-hit" encoding optimization + // when a term appears in only 1 doc, with no loc info, + // has freq of 1, and the docNum fits into 31-bits + use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { + if termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { + docNum := uint64(newRoaring.Minimum()) + if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { + 
return true, docNum, lastNorm + } + } + return false, 0, 0 + } + finishTerm := func(term []byte) error { if term == nil { return nil @@ -233,8 +248,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - if newRoaring.GetCardinality() > 0 { - // this field/term actually has hits in the new segment, lets write it down + termCardinality := newRoaring.GetCardinality() + + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + err = newVellum.Insert(term, FSTValEncode1Hit(docNum1Hit, normBits1Hit)) + if err != nil { + return err + } + } else if termCardinality > 0 { + // this field/term has hits in the new segment freqOffset := uint64(w.Count()) _, err := tfEncoder.Write(w) if err != nil { @@ -251,7 +274,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return err } postingOffset := uint64(w.Count()) - // write out the start of the term info n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) _, err = w.Write(bufMaxVarintLen64[:n]) @@ -287,6 +309,10 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Reset() locEncoder.Reset() + lastDocNum = 0 + lastFreq = 0 + lastNorm = 0 + return nil } @@ -315,7 +341,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, postItr = postings.iterator(postItr) - nextDocNum, nextFreqNormBytes, nextLocBytes, err2 := postItr.nextBytes() + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err2 := + postItr.nextBytes() for err2 == nil && len(nextFreqNormBytes) > 0 { hitNewDocNum := newDocNumsI[nextDocNum] if hitNewDocNum == docDropped { @@ -339,7 +366,12 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, docTermMap[hitNewDocNum] = append(append(docTermMap[hitNewDocNum], term...), termSeparator) - nextDocNum, nextFreqNormBytes, nextLocBytes, err2 = postItr.nextBytes() + lastDocNum = hitNewDocNum + lastFreq 
= nextFreq + lastNorm = nextNorm + + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err2 = + postItr.nextBytes() } if err2 != nil { return nil, 0, err2 diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 501947f96..d80b26086 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -859,3 +859,12 @@ func TestMergeBytesWritten(t *testing.T) { testMergeWithSelf(t, seg3, 4) } + +func TestUnder32Bits(t *testing.T) { + if !under32Bits(0) || !under32Bits(uint64(0x7fffffff)) { + t.Errorf("under32Bits bad") + } + if under32Bits(uint64(0x80000000)) || under32Bits(uint64(0x80000001)) { + t.Errorf("under32Bits wrong") + } +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 8106ebcc0..7ae36120b 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -43,6 +43,55 @@ func init() { reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) } +// FST or vellum value (uint64) encoding is determined by the top two +// highest-order or most significant bits... +// +// encoding : MSB +// name : 63 62 61...to...bit #0 (LSB) +// ----------+---+---+--------------------------------------------------- +// general : 0 | 0 | 62-bits of postingsOffset. +// ~ : 0 | 1 | reserved for future. +// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. +// ~ : 1 | 1 | reserved for future. +// +// Encoding "general" is able to handle all cases, where the +// postingsOffset points to more information about the postings for +// the term. +// +// Encoding "1-hit" is used to optimize a commonly seen case when a +// term has only a single hit. For example, a term in the _id field +// will have only 1 hit. The "1-hit" encoding is used for a term +// in a field when... 
+// +// - term vector info is disabled for that field; +// - and, the term appears in only a single doc for that field; +// - and, the term's freq is exactly 1 in that single doc for that field; +// - and, the docNum must fit into 31-bits; +// +// Otherwise, the "general" encoding is used instead. +// +// In the "1-hit" encoding, the field in that single doc may have +// other terms, which is supported in the "1-hit" encoding by the +// positive float31 norm. + +const FSTValEncodingMask = uint64(0xc000000000000000) +const FSTValEncodingGeneral = uint64(0x0000000000000000) +const FSTValEncoding1Hit = uint64(0x8000000000000000) + +func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { + return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) +} + +func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { + return (mask31Bits & v), (mask31Bits & (v >> 31)) +} + +const mask31Bits = uint64(0x000000007fffffff) + +func under32Bits(x uint64) bool { + return x <= mask31Bits +} + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase @@ -52,6 +101,10 @@ type PostingsList struct { locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap + + // when postingsOffset == freqOffset == 0, then the postings list + // represents a "1-hit" encoding, and has the following norm + normBits1Hit uint64 } func (p *PostingsList) Size() int { @@ -85,6 +138,8 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } locDecoder := rv.locDecoder + buf := rv.buf + *rv = PostingsIterator{} // clear the struct rv.freqNormReader = freqNormReader @@ -92,11 +147,17 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { rv.locReader = locReader rv.locDecoder = locDecoder + + rv.buf = buf } rv.postings = p - if p.postings != nil { - // prepare the freq chunk details + if p.postings == nil { + return rv + } + + if p.freqOffset > 0 && p.locOffset > 0 
{ + // "general" encoding, so prepare the freq chunk details var n uint64 var read int var numFreqChunks uint64 @@ -120,15 +181,19 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { n += uint64(read) } rv.locChunkStart = p.locOffset + n - rv.locBitmap = p.locBitmap + } else { + // "1-hit" encoding + rv.normBits1Hit = p.normBits1Hit + } - rv.all = p.postings.Iterator() - if p.except != nil { - allExcept := roaring.AndNot(p.postings, p.except) - rv.actual = allExcept.Iterator() - } else { - rv.actual = p.postings.Iterator() - } + rv.locBitmap = p.locBitmap + + rv.all = p.postings.Iterator() + if p.except != nil { + allExcept := roaring.AndNot(p.postings, p.except) + rv.actual = allExcept.Iterator() + } else { + rv.actual = p.postings.Iterator() } return rv @@ -153,6 +218,11 @@ func (p *PostingsList) Count() uint64 { func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.postingsOffset = postingsOffset + // handle "1-hit" encoding special case + if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { + return rv.init1Hit(postingsOffset) + } + // read the location of the freq/norm details var n uint64 var read int @@ -193,6 +263,24 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { return nil } +var emptyRoaring = roaring.NewBitmap() + +func (rv *PostingsList) init1Hit(fstVal uint64) error { + docNum, normBits := FSTValDecode1Hit(fstVal) + + rv.locBitmap = emptyRoaring + + rv.postings = roaring.NewBitmap() + rv.postings.Add(uint32(docNum)) + + // TODO: we can likely do better than allocating a roaring bitmap + // with just 1 entry, but for now reuse existing machinery + + rv.normBits1Hit = normBits + + return nil +} + // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList @@ -219,6 +307,10 @@ type PostingsIterator struct { next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls + 
+ normBits1Hit uint64 + + buf []byte } func (i *PostingsIterator) Size() int { @@ -244,7 +336,8 @@ func (i *PostingsIterator) loadChunk(chunk int) error { if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) { return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) } - // load correct chunk bytes + + // load freq chunk bytes start := i.freqChunkStart for j := 0; j < chunk; j++ { start += i.freqChunkLens[j] @@ -258,6 +351,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error { i.freqNormReader.Reset(i.currChunkFreqNorm) } + // load loc chunk bytes start = i.locChunkStart for j := 0; j < chunk; j++ { start += i.locChunkLens[j] @@ -270,11 +364,16 @@ func (i *PostingsIterator) loadChunk(chunk int) error { } else { i.locReader.Reset(i.currChunkLoc) } + i.currChunk = uint32(chunk) return nil } func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { + if i.normBits1Hit != 0 { + return 1, i.normBits1Hit, nil + } + freq, err := i.freqNormDecoder.GetU64() if err != nil { return 0, 0, fmt.Errorf("error reading frequency: %v", err) @@ -360,6 +459,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return nil, err } rv.norm = math.Float32frombits(uint32(normBits)) + if i.locBitmap.Contains(uint32(docNum)) { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { @@ -386,33 +486,40 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting -func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) { +func (i *PostingsIterator) nextBytes() ( + docNumOut uint64, freq uint64, normBits uint64, + bytesFreqNorm []byte, bytesLoc []byte, err error) { docNum, exists, err := i.nextDocNum() - if err != nil { - return 0, nil, nil, err + if err != nil || !exists { + return 0, 0, 0, nil, nil, err } - if !exists { - return 0, nil, nil, nil + + if 
i.normBits1Hit != 0 { + if i.buf == nil { + i.buf = make([]byte, binary.MaxVarintLen64*2) + } + n := binary.PutUvarint(i.buf, uint64(1)) + n += binary.PutUvarint(i.buf, i.normBits1Hit) + return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - freq, _, err := i.readFreqNorm() + freq, normBits, err = i.readFreqNorm() if err != nil { - return 0, nil, nil, err + return 0, 0, 0, nil, nil, err } endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - bytesFreqNorm := i.currChunkFreqNorm[startFreqNorm:endFreqNorm] + bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - var bytesLoc []byte if i.locBitmap.Contains(uint32(docNum)) { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { err := i.readLocation(nil) if err != nil { - return 0, nil, nil, err + return 0, 0, 0, nil, nil, err } } @@ -420,7 +527,7 @@ func (i *PostingsIterator) nextBytes() (uint64, []byte, []byte, error) { bytesLoc = i.currChunkLoc[startLoc:endLoc] } - return docNum, bytesFreqNorm, bytesLoc, nil + return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil } // nextDocNum returns the next docNum on the postings list, and also @@ -431,8 +538,13 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } n := i.actual.Next() - nChunk := n / i.postings.sb.chunkFactor allN := i.all.Next() + + if i.normBits1Hit != 0 { + return uint64(n), true, nil + } + + nChunk := n / i.postings.sb.chunkFactor allNChunk := allN / i.postings.sb.chunkFactor // n is the next actual hit (excluding some postings) From 25beba615d5105b6933553b8af3f04559bc2ecf5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Mar 2018 09:36:36 -0800 Subject: [PATCH 272/728] scorch mem processDocument reuses fieldLens/docMap arrays This change produces less garbage by switching from a map[uint16]'s to array's for the fieldLens and docMap, and then reusing those arrays across multiple processDocument() 
calls. --- index/scorch/segment/mem/build.go | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go index a4b69013e..0b329704a 100644 --- a/index/scorch/segment/mem/build.go +++ b/index/scorch/segment/mem/build.go @@ -35,8 +35,10 @@ func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { s.initializeDict(results) // walk each doc + fieldLensReuse := make([]int, len(s.FieldsMap)) + docMapReuse := make([]analysis.TokenFrequencies, len(s.FieldsMap)) for _, result := range results { - s.processDocument(result) + s.processDocument(result, fieldLensReuse, docMapReuse) } // go back and sort the dictKeys @@ -209,19 +211,25 @@ func (s *Segment) initializeDict(results []*index.AnalysisResult) { } } -func (s *Segment) processDocument(result *index.AnalysisResult) { - // used to collate information across fields - docMap := make(map[uint16]analysis.TokenFrequencies, len(s.FieldsMap)) - fieldLens := make(map[uint16]int, len(s.FieldsMap)) +func (s *Segment) processDocument(result *index.AnalysisResult, + fieldLens []int, docMap []analysis.TokenFrequencies) { + // clear the fieldLens and docMap for reuse + n := len(s.FieldsMap) + for i := 0; i < n; i++ { + fieldLens[i] = 0 + docMap[i] = nil + } docNum := uint64(s.addDocument()) - processField := func(field uint16, name string, l int, tf analysis.TokenFrequencies) { - fieldLens[field] += l - if existingFreqs, ok := docMap[field]; ok { + processField := func(fieldID uint16, name string, l int, tf analysis.TokenFrequencies) { + fieldLens[fieldID] += l + + existingFreqs := docMap[fieldID] + if existingFreqs != nil { existingFreqs.MergeAll(name, tf) } else { - docMap[field] = tf + docMap[fieldID] = tf } } @@ -274,7 +282,7 @@ func (s *Segment) processDocument(result *index.AnalysisResult) { locarraypos := s.Locarraypos[pid] for _, loc := range tokenFreq.Locations { - var locf = fieldID + var locf = 
uint16(fieldID) if loc.Field != "" { locf = uint16(s.getOrDefineField(loc.Field)) } From 40f63baeb9812fc6f757b60245ce516ed20572bc Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 8 Mar 2018 10:38:09 -0800 Subject: [PATCH 273/728] MB-28562: Support search query callbacks before and after execution + SearchQueryStartCallback + SearchQueryEndCallback --- index_impl.go | 21 ++++++++++++++++----- index_test.go | 6 +++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/index_impl.go b/index_impl.go index 1036aef2a..68777f072 100644 --- a/index_impl.go +++ b/index_impl.go @@ -50,9 +50,11 @@ const storePath = "store" var mappingInternalKey = []byte("_mapping") -const SearchMemCheckCallbackKey = "_search_mem_callback_key" +const SearchQueryStartCallbackKey = "_search_query_start_callback_key" +const SearchQueryEndCallbackKey = "_search_query_end_callback_key" -type SearchMemCheckCallbackFn func(size uint64) error +type SearchQueryStartCallbackFn func(size uint64) error +type SearchQueryEndCallbackFn func(size uint64) error func indexStorePath(path string) string { return path + string(os.PathSeparator) + storePath @@ -483,15 +485,24 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr collector.SetFacetsBuilder(facetsBuilder) } - if memCb := ctx.Value(SearchMemCheckCallbackKey); memCb != nil { - if memCbFn, ok := memCb.(SearchMemCheckCallbackFn); ok { - err = memCbFn(memNeededForSearch(req, searcher, collector)) + memNeeded := memNeededForSearch(req, searcher, collector) + if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil { + if cbF, ok := cb.(SearchQueryStartCallbackFn); ok { + err = cbF(memNeeded) } } if err != nil { return nil, err } + if cb := ctx.Value(SearchQueryEndCallbackKey); cb != nil { + if cbF, ok := cb.(SearchQueryEndCallbackFn); ok { + defer func() { + _ = cbF(memNeeded) + }() + } + } + err = collector.Collect(ctx, searcher, indexReader) if err != nil { return nil, err diff --git a/index_test.go 
b/index_test.go index 57429dcb2..69ca61a98 100644 --- a/index_test.go +++ b/index_test.go @@ -1871,7 +1871,7 @@ func BenchmarkScorchSearchOverhead(b *testing.B) { benchmarkSearchOverhead(scorch.Name, b) } -func TestSearchMemCheckCallback(t *testing.T) { +func TestSearchQueryCallback(t *testing.T) { defer func() { err := os.RemoveAll("testidx") if err != nil { @@ -1910,8 +1910,8 @@ func TestSearchMemCheckCallback(t *testing.T) { return nil } - ctx := context.WithValue(context.Background(), SearchMemCheckCallbackKey, - SearchMemCheckCallbackFn(f)) + ctx := context.WithValue(context.Background(), SearchQueryStartCallbackKey, + SearchQueryStartCallbackFn(f)) _, err = index.SearchInContext(ctx, req) if err != expErr { t.Fatalf("Expected: %v, Got: %v", expErr, err) From b04909d3ee67e964e3decb4db745ce63106a7280 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 7 Mar 2018 13:12:06 +0530 Subject: [PATCH 274/728] adding the integer parser utility --- index/scorch/persister.go | 11 +++------- index/scorch/scorch.go | 43 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 46 insertions(+), 8 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index cab2d035d..ccb0c1f21 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -633,19 +633,14 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { return 0, err } - numSnapshotsToKeep := NumSnapshotsToKeep - if val, ok := s.config["numSnapshotsToKeep"].(float64); ok && val > 0 { - numSnapshotsToKeep = int(val) - } - - if len(persistedEpochs) <= numSnapshotsToKeep { + if len(persistedEpochs) <= s.numSnapshotsToKeep { // we need to keep everything return 0, nil } // make a map of epochs to protect from deletion - protectedEpochs := make(map[uint64]struct{}, numSnapshotsToKeep) - for _, epoch := range persistedEpochs[0:numSnapshotsToKeep] { + protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) + for _, epoch := range 
persistedEpochs[0:s.numSnapshotsToKeep] { protectedEpochs[epoch] = struct{}{} } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 87372a326..7a33fb7f0 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -58,6 +58,7 @@ type Scorch struct { nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. + numSnapshotsToKeep int closeCh chan struct{} introductions chan *segmentIntroduction @@ -191,6 +192,17 @@ func (s *Scorch) openBolt() error { } } + s.numSnapshotsToKeep = NumSnapshotsToKeep + if v, ok := s.config["numSnapshotsToKeep"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("numSnapshotsToKeep parse err: %v", err) + } + if t > 0 { + s.numSnapshotsToKeep = t + } + } + return nil } @@ -503,3 +515,34 @@ func (s *Scorch) unmarkIneligibleForRemoval(filename string) { func init() { registry.RegisterIndexType(Name, NewScorch) } + +func parseToInteger(v interface{}) (int, error) { + switch v.(type) { + case float32: + return int(v.(float32)), nil + case float64: + return int(v.(float64)), nil + case int: + return v.(int), nil + case int8: + return int(v.(int8)), nil + case int16: + return int(v.(int16)), nil + case int32: + return int(v.(int32)), nil + case int64: + return int(v.(int64)), nil + case uint: + return int(v.(uint)), nil + case uint8: + return int(v.(uint8)), nil + case uint16: + return int(v.(uint16)), nil + case uint32: + return int(v.(uint32)), nil + case uint64: + return int(v.(uint64)), nil + default: + return 0, fmt.Errorf("expects a numeric value") + } +} From d6522e7e17812bcb6795751abb1cad7acd7af6b1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 9 Mar 2018 16:01:37 +0530 Subject: [PATCH 275/728] minor optimisation to loadChunk method --- index/scorch/segment/zap/contentcoder.go | 7 +- index/scorch/segment/zap/docvalues.go | 23 ++- 
index/scorch/segment/zap/intcoder.go | 46 ++++- index/scorch/segment/zap/intcoder_test.go | 212 ++++++++++++++++++++++ index/scorch/segment/zap/posting.go | 44 ++--- 5 files changed, 294 insertions(+), 38 deletions(-) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 933f10a1e..c731f52c4 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -156,7 +156,12 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { if err != nil { return tw, err } - // write out the chunk lens + + if len(c.chunkLens) > 1 { + chunkLengthsToOffsets(c.chunkLens) + } + + // write out the chunk starting offsets for _, chunkLen := range c.chunkLens { n := binary.PutUvarint(buf, uint64(chunkLen)) nw, err = w.Write(buf[:n]) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 13635c57e..882ff43dd 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -38,7 +38,7 @@ type docValueIterator struct { field string curChunkNum uint64 numChunks uint64 - chunkLens []uint64 + chunkOffsets []uint64 dvDataLoc uint64 curChunkHeader []MetaData curChunkData []byte // compressed data cache @@ -47,7 +47,7 @@ type docValueIterator struct { func (di *docValueIterator) size() int { return reflectStaticSizedocValueIterator + size.SizeOfPtr + len(di.field) + - len(di.chunkLens)*size.SizeOfUint64 + + len(di.chunkOffsets)*size.SizeOfUint64 + len(di.curChunkHeader)*reflectStaticSizeMetaData + len(di.curChunkData) } @@ -78,16 +78,16 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, offset += uint64(read) fdvIter := &docValueIterator{ - curChunkNum: math.MaxUint64, - field: field, - chunkLens: make([]uint64, int(numChunks)), + curChunkNum: math.MaxUint64, + field: field, + chunkOffsets: make([]uint64, int(numChunks)), } for i := 0; i < int(numChunks); i++ { clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset 
: fieldDvLoc+offset+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("corrupted chunk length during segment load") } - fdvIter.chunkLens[i] = clen + fdvIter.chunkOffsets[i] = clen offset += uint64(read) } @@ -99,12 +99,11 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, localDocNum uint64, s *SegmentBase) error { // advance to the chunk where the docValues // reside for the given docNum - destChunkDataLoc := di.dvDataLoc - for i := 0; i < int(chunkNumber); i++ { - destChunkDataLoc += di.chunkLens[i] - } + destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc + start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + destChunkDataLoc += start + curChunkEnd += end - curChunkSize := di.chunkLens[chunkNumber] // read the number of docs reside in the chunk numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) if read <= 0 { @@ -124,7 +123,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, } compressedDataLoc := chunkMetaLoc + offset - dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc + dataLength := curChunkEnd - compressedDataLoc di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkNum = chunkNumber return nil diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 6680e608e..79fe5156e 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -111,7 +111,12 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } buf := c.buf - // write out the number of chunks & each chunkLen + // convert the chunk lengths into starting chunk offsets + if len(c.chunkLens) > 1 { + chunkLengthsToOffsets(c.chunkLens) + } + + // write out the number of chunks & each chunk starting offsets n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) for _, chunkLen := range c.chunkLens { n += binary.PutUvarint(buf[n:], uint64(chunkLen)) @@ -134,3 +139,42 @@ func (c 
*chunkedIntCoder) Write(w io.Writer) (int, error) { func (c *chunkedIntCoder) FinalSize() int { return len(c.final) } + +// chunkLengthsToOffsets converts the chunk length array +// to a chunk starting offset array. The readChunkBoundary +// will figure out the start and end of every chunk from +// these offsets. The starting offset of the first/single +// array element will always be zero and this position is +// used for storing the size of the current last item in +// the array at any given point. +// For eg: +// Lens -> 5 5 5 5 => 5 5 10 15 +// Lens -> 0 5 0 5 => 5 0 5 5 +// Lens -> 0 0 0 5 => 5 0 0 0 +// Lens -> 5 0 0 0 => 0 5 5 5 +// Lens -> 0 5 0 0 => 0 0 5 5 +// Lens -> 0 0 5 0 => 0 0 0 5 +func chunkLengthsToOffsets(lengths []uint64) { + lengths[1], lengths[0] = lengths[0], lengths[1] + for i := 2; i < len(lengths); i++ { + cur := lengths[i] + lengths[i] = lengths[i-1] + lengths[0] + lengths[0] = cur + } +} + +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start, end uint64 + if chunk > 0 { + start = offsets[chunk] + } + // single element case + if chunk == 0 && len(offsets) == 1 { + end = offsets[chunk] + } else if chunk < len(offsets)-1 { + end = offsets[chunk+1] + } else { // for last element + end = start + offsets[0] + } + return start, end +} diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go index 85d2c5a76..8c77eab61 100644 --- a/index/scorch/segment/zap/intcoder_test.go +++ b/index/scorch/segment/zap/intcoder_test.go @@ -71,3 +71,215 @@ func TestChunkIntCoder(t *testing.T) { } } } + +func TestChunkLengthToOffsets(t *testing.T) { + + tests := []struct { + lengths []uint64 + expectedOffsets []uint64 + }{ + { + lengths: []uint64{5, 5, 5, 5, 5}, + expectedOffsets: []uint64{5, 5, 10, 15, 20}, + }, + { + lengths: []uint64{0, 5, 0, 5, 0}, + expectedOffsets: []uint64{0, 0, 5, 5, 10}, + }, + { + lengths: []uint64{0, 0, 0, 0, 5}, + expectedOffsets: []uint64{5, 0, 0, 0, 0}, + }, + { 
+ lengths: []uint64{5, 0, 0, 0, 0}, + expectedOffsets: []uint64{0, 5, 5, 5, 5}, + }, + { + lengths: []uint64{0, 5, 0, 0, 0}, + expectedOffsets: []uint64{0, 0, 5, 5, 5}, + }, + { + lengths: []uint64{0, 0, 0, 5, 0}, + expectedOffsets: []uint64{0, 0, 0, 0, 5}, + }, + { + lengths: []uint64{0, 0, 0, 5, 5}, + expectedOffsets: []uint64{5, 0, 0, 0, 5}, + }, + { + lengths: []uint64{5, 5, 5, 0, 0}, + expectedOffsets: []uint64{0, 5, 10, 15, 15}, + }, + } + + for i, test := range tests { + chunkLengthsToOffsets(test.lengths) + if !reflect.DeepEqual(test.expectedOffsets, test.lengths) { + t.Errorf("Test: %d failed, got %+v, expected %+v", i, test.lengths, test.expectedOffsets) + } + } +} + +func TestChunkReadBoundaryFromOffsets(t *testing.T) { + + tests := []struct { + chunkNumber int + offsets []uint64 + expectedStart uint64 + expectedEnd uint64 + }{ + { + offsets: []uint64{5, 5, 10, 15, 20}, + chunkNumber: 4, + expectedStart: 20, + expectedEnd: 25, + }, + { + offsets: []uint64{5, 5, 10, 15, 20}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{5, 5, 10, 15, 20}, + chunkNumber: 2, + expectedStart: 10, + expectedEnd: 15, + }, + { + offsets: []uint64{0, 0, 5, 5, 10}, + chunkNumber: 4, + expectedStart: 10, + expectedEnd: 10, + }, + { + offsets: []uint64{0, 0, 5, 5, 10}, + chunkNumber: 1, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{5, 0, 0, 0, 0}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{5, 0, 0, 0, 0}, + chunkNumber: 4, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{5, 0, 0, 0, 0}, + chunkNumber: 1, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{0, 5, 5, 5, 5}, + chunkNumber: 1, + expectedStart: 5, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 5, 5, 5, 5}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 0, 5, 5, 5}, + chunkNumber: 2, + expectedStart: 5, + expectedEnd: 5, + }, + { + offsets: 
[]uint64{0, 0, 5, 5, 5}, + chunkNumber: 1, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 0, 0, 0, 5}, + chunkNumber: 4, + expectedStart: 5, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 0, 0, 0, 5}, + chunkNumber: 3, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 0, 0, 0, 5}, + chunkNumber: 2, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{5, 0, 0, 0, 5}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{5, 0, 0, 0, 5}, + chunkNumber: 1, + expectedStart: 0, + expectedEnd: 0, + }, + { + offsets: []uint64{5, 0, 0, 0, 5}, + chunkNumber: 3, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{5, 0, 0, 0, 5}, + chunkNumber: 4, + expectedStart: 5, + expectedEnd: 10, + }, + { + offsets: []uint64{0, 5, 10, 15, 15}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 5, + }, + { + offsets: []uint64{0, 5, 10, 15, 15}, + chunkNumber: 1, + expectedStart: 5, + expectedEnd: 10, + }, + { + offsets: []uint64{0, 5, 10, 15, 15}, + chunkNumber: 2, + expectedStart: 10, + expectedEnd: 15, + }, + { + offsets: []uint64{0, 5, 10, 15, 15}, + chunkNumber: 3, + expectedStart: 15, + expectedEnd: 15, + }, + { + offsets: []uint64{0, 5, 10, 15, 15}, + chunkNumber: 4, + expectedStart: 15, + expectedEnd: 15, + }, + } + + for i, test := range tests { + s, e := readChunkBoundary(test.chunkNumber, test.offsets) + if test.expectedStart != s || test.expectedEnd != e { + t.Errorf("Test: %d failed for chunkNumber: %d got start: %d end: %d,"+ + " expected start: %d end: %d", i, test.chunkNumber, s, e, + test.expectedStart, test.expectedEnd) + } + } +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 7ae36120b..c47648cda 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -163,9 +163,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numFreqChunks uint64 numFreqChunks, 
read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkLens = make([]uint64, int(numFreqChunks)) + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n @@ -175,9 +175,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkLens = make([]uint64, int(numLocChunks)) + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n @@ -297,11 +297,11 @@ type PostingsIterator struct { locDecoder *govarint.Base128Decoder locReader *bytes.Reader - freqChunkLens []uint64 - freqChunkStart uint64 + freqChunkOffsets []uint64 + freqChunkStart uint64 - locChunkLens []uint64 - locChunkStart uint64 + locChunkOffsets []uint64 + locChunkStart uint64 locBitmap *roaring.Bitmap @@ -317,8 +317,8 @@ func (i *PostingsIterator) Size() int { sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + len(i.currChunkFreqNorm) + len(i.currChunkLoc) + - len(i.freqChunkLens)*size.SizeOfUint64 + - len(i.locChunkLens)*size.SizeOfUint64 + + len(i.freqChunkOffsets)*size.SizeOfUint64 + + len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() if i.locBitmap != nil { @@ -333,16 +333,14 @@ func (i *PostingsIterator) Size() int { } func 
(i *PostingsIterator) loadChunk(chunk int) error { - if chunk >= len(i.freqChunkLens) || chunk >= len(i.locChunkLens) { - return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkLens), len(i.locChunkLens)) + if chunk >= len(i.freqChunkOffsets) || chunk >= len(i.locChunkOffsets) { + return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkOffsets), len(i.locChunkOffsets)) } - // load freq chunk bytes - start := i.freqChunkStart - for j := 0; j < chunk; j++ { - start += i.freqChunkLens[j] - } - end := start + i.freqChunkLens[chunk] + end, start := i.freqChunkStart, i.freqChunkStart + s, e := readChunkBoundary(chunk, i.freqChunkOffsets) + start += s + end += e i.currChunkFreqNorm = i.postings.sb.mem[start:end] if i.freqNormReader == nil { i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) @@ -351,12 +349,10 @@ func (i *PostingsIterator) loadChunk(chunk int) error { i.freqNormReader.Reset(i.currChunkFreqNorm) } - // load loc chunk bytes - start = i.locChunkStart - for j := 0; j < chunk; j++ { - start += i.locChunkLens[j] - } - end = start + i.locChunkLens[chunk] + end, start = i.locChunkStart, i.locChunkStart + s, e = readChunkBoundary(chunk, i.locChunkOffsets) + start += s + end += e i.currChunkLoc = i.postings.sb.mem[start:end] if i.locReader == nil { i.locReader = bytes.NewReader(i.currChunkLoc) From 8e8c3ee8c92446a6d48b651a2ede68764f12ca8e Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 8 Mar 2018 15:29:01 -0800 Subject: [PATCH 276/728] Adding RoaringBitmap/roaring to the bleve vendor manifest --- vendor/manifest | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vendor/manifest b/vendor/manifest index 0837684fa..1883de76e 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -74,6 +74,14 @@ "branch": "master", "notests": true }, + { + "importpath": "github.com/RoaringBitmap/roaring", + "repository": "https://github.com/RoaringBitmap/roaring", + "vcs": "", + "revision": 
"01d244c43a7e8d1191a4f369f5908ea9eb9bc9ac", + "branch": "master", + "notests": true + }, { "importpath": "github.com/seiflotfy/cuckoofilter", "repository": "https://github.com/seiflotfy/cuckoofilter", From 3884cf4d1216457fe46fbe617cf4e73c3a516250 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 8 Mar 2018 21:36:19 -0800 Subject: [PATCH 277/728] scorch zap writePostings() helper func refactored out --- index/scorch/segment/zap/merge.go | 120 ++++++++++++++++++------------ 1 file changed, 71 insertions(+), 49 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 383fedbf3..a934dfc35 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -248,56 +248,15 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - termCardinality := newRoaring.GetCardinality() - - encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) - if encodeAs1Hit { - err = newVellum.Insert(term, FSTValEncode1Hit(docNum1Hit, normBits1Hit)) - if err != nil { - return err - } - } else if termCardinality > 0 { - // this field/term has hits in the new segment - freqOffset := uint64(w.Count()) - _, err := tfEncoder.Write(w) - if err != nil { - return err - } - locOffset := uint64(w.Count()) - _, err = locEncoder.Write(w) - if err != nil { - return err - } - postingLocOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(newRoaringLocs, w, bufMaxVarintLen64) - if err != nil { - return err - } - postingOffset := uint64(w.Count()) - // write out the start of the term info - n := binary.PutUvarint(bufMaxVarintLen64, freqOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - // write out the start of the loc info - n = binary.PutUvarint(bufMaxVarintLen64, locOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - // write out the start of the posting locs - n = 
binary.PutUvarint(bufMaxVarintLen64, postingLocOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return err - } - _, err = writeRoaringWithLen(newRoaring, w, bufMaxVarintLen64) - if err != nil { - return err - } + postingsOffset, err := writePostings( + newRoaring, newRoaringLocs, tfEncoder, locEncoder, + use1HitEncoding, w, bufMaxVarintLen64) + if err != nil { + return err + } - err = newVellum.Insert(term, postingOffset) + if postingsOffset > 0 { + err = newVellum.Insert(term, postingsOffset) if err != nil { return err } @@ -460,6 +419,69 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return rv, fieldDvLocsOffset, nil } +func writePostings(postings, postingLocs *roaring.Bitmap, + tfEncoder, locEncoder *chunkedIntCoder, + use1HitEncoding func(uint64) (bool, uint64, uint64), + w *CountHashWriter, bufMaxVarintLen64 []byte) ( + offset uint64, err error) { + termCardinality := postings.GetCardinality() + if termCardinality <= 0 { + return 0, nil + } + + if use1HitEncoding != nil { + encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) + if encodeAs1Hit { + return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil + } + } + + tfOffset := uint64(w.Count()) + _, err = tfEncoder.Write(w) + if err != nil { + return 0, err + } + + locOffset := uint64(w.Count()) + _, err = locEncoder.Write(w) + if err != nil { + return 0, err + } + + postingLocsOffset := uint64(w.Count()) + _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + postingsOffset := uint64(w.Count()) + + n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, locOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + + n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + 
return 0, err + } + + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + + return postingsOffset, nil +} + func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { From e82774ad20cef894996733f144a7f8c09c3ae7f1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 9 Mar 2018 00:16:28 -0800 Subject: [PATCH 278/728] scorch zap AnalysisResultsToSegmentBase() AnalysisResultsToSegmentBase() allows analysis results to be directly converted into a zap-encoded SegmentBase, which can then be introduced onto the root, avoiding the creation of mem.Segment data structures. This leads to some reduction of garbage memory allocations. The grouping and sorting and shaping of the postings list information is taken from the mem.Segment codepaths. The encoding of stored fields reuses functions from zap's merger, which has the largest savings of garbage memory avoidance. And, the encoding of tf/loc chunks, postings & dictionary information also follows the approach used by zap's merger, which also has some savings of garbage memory avoidance. In future changes, the mem.Segment dependencies will be removed from zap, which should result in a smaller codebase. 
--- index/scorch/scorch.go | 3 +- index/scorch/segment/zap/new.go | 659 ++++++++++++++++++++++++++++++++ 2 files changed, 660 insertions(+), 2 deletions(-) create mode 100644 index/scorch/segment/zap/new.go diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 6d0bcd1e3..e16a146f7 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -28,7 +28,6 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/mem" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" @@ -289,7 +288,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { var newSegment segment.Segment if len(analysisResults) > 0 { - newSegment, err = zap.NewSegmentBase(mem.NewFromAnalyzedDocs(analysisResults), DefaultChunkFactor) + newSegment, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) if err != nil { return err } diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go new file mode 100644 index 000000000..3a8b2012d --- /dev/null +++ b/index/scorch/segment/zap/new.go @@ -0,0 +1,659 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "bytes" + "encoding/binary" + "math" + "sort" + + "github.com/RoaringBitmap/roaring" + "github.com/Smerity/govarint" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" + "github.com/golang/snappy" +) + +// AnalysisResultsToSegmentBase produces an in-memory zap-encoded +// SegmentBase from analysis results +func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, + chunkFactor uint32) (*SegmentBase, error) { + var br bytes.Buffer + + s := interim{ + results: results, + chunkFactor: chunkFactor, + w: NewCountHashWriter(&br), + FieldsMap: map[string]uint16{}, + } + + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, + err := s.convert() + if err != nil { + return nil, err + } + + sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, + s.FieldsMap, s.FieldsInv, uint64(len(results)), + storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + + return sb, err +} + +// interim holds temporary working data used while converting from +// analysis results to a zap-encoded segment +type interim struct { + results []*index.AnalysisResult + + chunkFactor uint32 + + w *CountHashWriter + + // FieldsMap adds 1 to field id to avoid zero value issues + // name -> field id + 1 + FieldsMap map[string]uint16 + + // FieldsInv is the inverse of FieldsMap + // field id -> name + FieldsInv []string + + // Term dictionaries for each field + // field id -> term -> postings list id + 1 + Dicts []map[string]uint64 + + // Terms for each field, where terms are sorted ascending + // field id -> []term + DictKeys [][]string + + // Fields whose IncludeDocValues is true + // field id -> bool + IncludeDocValues []bool + + // postings id -> bitmap of docNums + Postings []*roaring.Bitmap + + // postings id -> bitmap of docNums that have locations + PostingsLocs []*roaring.Bitmap + + // postings id -> freq/norm's, one for 
each docNum in postings + FreqNorms [][]interimFreqNorm + + // postings id -> locs, one for each freq + Locs [][]interimLoc + + buf0 bytes.Buffer + tmp0 []byte + tmp1 []byte +} + +func (s *interim) grabBuf(size int) []byte { + buf := s.tmp0 + if cap(buf) < size { + buf = make([]byte, size) + s.tmp0 = buf + } + return buf[0:size] +} + +type interimStoredField struct { + vals [][]byte + typs []byte + arrayposs [][]uint64 // array positions +} + +type interimFreqNorm struct { + freq uint64 + norm float32 +} + +type interimLoc struct { + fieldID uint16 + pos uint64 + start uint64 + end uint64 + arrayposs []uint64 +} + +func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.getOrDefineField("_id") // _id field is fieldID 0 + + for _, result := range s.results { + for _, field := range result.Document.CompositeFields { + s.getOrDefineField(field.Name()) + } + for _, field := range result.Document.Fields { + s.getOrDefineField(field.Name()) + } + } + + sort.Strings(s.FieldsInv[1:]) // keep _id as first field + + s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) + for fieldID, fieldName := range s.FieldsInv { + s.FieldsMap[fieldName] = uint16(fieldID + 1) + } + + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + + s.prepareDicts() + + for _, dict := range s.DictKeys { + sort.Strings(dict) + } + + s.processDocuments() + + storedIndexOffset, err := s.writeStoredFields() + if err != nil { + return 0, 0, 0, nil, err + } + + var fdvIndexOffset uint64 + var dictOffsets []uint64 + + if len(s.results) > 0 { + fdvIndexOffset, dictOffsets, err = s.writeDicts() + if err != nil { + return 0, 0, 0, nil, err + } + } else { + dictOffsets = make([]uint64, len(s.FieldsInv)) + } + + fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) + if err != nil { + return 0, 0, 0, nil, err + } + + return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil +} + +func (s *interim) getOrDefineField(fieldName string) int { + fieldIDPlus1, 
exists := s.FieldsMap[fieldName] + if !exists { + fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) + s.FieldsMap[fieldName] = fieldIDPlus1 + s.FieldsInv = append(s.FieldsInv, fieldName) + s.Dicts = append(s.Dicts, make(map[string]uint64)) + s.DictKeys = append(s.DictKeys, make([]string, 0)) + } + return int(fieldIDPlus1 - 1) +} + +// fill Dicts and DictKeys from analysis results +func (s *interim) prepareDicts() { + var pidNext int + + numTermsPerPostingsList := make([]int, 0, 64) // key is postings list id + numLocsPerPostingsList := make([]int, 0, 64) // key is postings list id + + var totTFs int + var totLocs int + + visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { + dict := s.Dicts[fieldID] + dictKeys := s.DictKeys[fieldID] + + for term, tf := range tfs { + pidPlus1, exists := dict[term] + if !exists { + pidNext++ + pidPlus1 = uint64(pidNext) + + dict[term] = pidPlus1 + dictKeys = append(dictKeys, term) + + numTermsPerPostingsList = append(numTermsPerPostingsList, 0) + numLocsPerPostingsList = append(numLocsPerPostingsList, 0) + } + + pid := pidPlus1 - 1 + + numTermsPerPostingsList[pid] += 1 + numLocsPerPostingsList[pid] += len(tf.Locations) + + totLocs += len(tf.Locations) + } + + totTFs += len(tfs) + + s.DictKeys[fieldID] = dictKeys + } + + for _, result := range s.results { + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + _, tf := field.Analyze() + visitField(fieldID, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + tf := result.Analyzed[i] + visitField(fieldID, tf) + } + } + + numPostingsLists := pidNext + + s.Postings = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.Postings[i] = roaring.New() + } + + s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.PostingsLocs[i] = 
roaring.New() + } + + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + + freqNormsBacking := make([]interimFreqNorm, totTFs) + for pid, numTerms := range numTermsPerPostingsList { + s.FreqNorms[pid] = freqNormsBacking[0:0] + freqNormsBacking = freqNormsBacking[numTerms:] + } + + s.Locs = make([][]interimLoc, numPostingsLists) + + locsBacking := make([]interimLoc, totLocs) + for pid, numLocs := range numLocsPerPostingsList { + s.Locs[pid] = locsBacking[0:0] + locsBacking = locsBacking[numLocs:] + } +} + +func (s *interim) processDocuments() { + numFields := len(s.FieldsInv) + reuseFieldLens := make([]int, numFields) + reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) + + for docNum, result := range s.results { + for i := 0; i < numFields; i++ { // clear these for reuse + reuseFieldLens[i] = 0 + reuseFieldTFs[i] = nil + } + + s.processDocument(uint64(docNum), result, + reuseFieldLens, reuseFieldTFs) + } +} + +func (s *interim) processDocument(docNum uint64, + result *index.AnalysisResult, + fieldLens []int, fieldTFs []analysis.TokenFrequencies) { + visitField := func(fieldID uint16, fieldName string, + ln int, tf analysis.TokenFrequencies) { + fieldLens[fieldID] += ln + + existingFreqs := fieldTFs[fieldID] + if existingFreqs != nil { + existingFreqs.MergeAll(fieldName, tf) + } else { + fieldTFs[fieldID] = tf + } + } + + // walk each composite field + for _, field := range result.Document.CompositeFields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln, tf := field.Analyze() + visitField(fieldID, field.Name(), ln, tf) + } + + // walk each field + for i, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + ln := result.Length[i] + tf := result.Analyzed[i] + visitField(fieldID, field.Name(), ln, tf) + } + + // now that it's been rolled up into fieldTFs, walk that + for fieldID, tfs := range fieldTFs { + dict := s.Dicts[fieldID] + norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) + 
+ for term, tf := range tfs { + pid := dict[term] - 1 + bs := s.Postings[pid] + bs.AddInt(int(docNum)) + + s.FreqNorms[pid] = append(s.FreqNorms[pid], + interimFreqNorm{ + freq: uint64(tf.Frequency()), + norm: norm, + }) + + if len(tf.Locations) > 0 { + locBS := s.PostingsLocs[pid] + locBS.AddInt(int(docNum)) + + locs := s.Locs[pid] + + for _, loc := range tf.Locations { + var locf = uint16(fieldID) + if loc.Field != "" { + locf = uint16(s.getOrDefineField(loc.Field)) + } + var arrayposs []uint64 + if len(loc.ArrayPositions) > 0 { + arrayposs = loc.ArrayPositions + } + locs = append(locs, interimLoc{ + fieldID: locf, + pos: uint64(loc.Position), + start: uint64(loc.Start), + end: uint64(loc.End), + arrayposs: arrayposs, + }) + } + + s.Locs[pid] = locs + } + } + } +} + +func (s *interim) writeStoredFields() ( + storedIndexOffset uint64, err error) { + metaBuf := &s.buf0 + metaEncoder := govarint.NewU64Base128Encoder(metaBuf) + + data, compressed := s.tmp0[:0], s.tmp1[:0] + defer func() { s.tmp0, s.tmp1 = data, compressed }() + + // keyed by docNum + docStoredOffsets := make([]uint64, len(s.results)) + + // keyed by fieldID, for the current doc in the loop + docStoredFields := map[uint16]interimStoredField{} + + for docNum, result := range s.results { + for fieldID := range docStoredFields { // reset for next doc + delete(docStoredFields, fieldID) + } + + for _, field := range result.Document.Fields { + fieldID := uint16(s.getOrDefineField(field.Name())) + + opts := field.Options() + + if opts.IsStored() { + isf := docStoredFields[fieldID] + isf.vals = append(isf.vals, field.Value()) + isf.typs = append(isf.typs, encodeFieldType(field)) + isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) + docStoredFields[fieldID] = isf + } + + if opts.IncludeDocValues() { + s.IncludeDocValues[fieldID] = true + } + } + + var curr int + + metaBuf.Reset() + data = data[:0] + compressed = compressed[:0] + + for fieldID := range s.FieldsInv { + isf, exists := 
docStoredFields[uint16(fieldID)] + if exists { + curr, data, err = persistStoredFieldValues( + fieldID, isf.vals, isf.typs, isf.arrayposs, + curr, metaEncoder, data) + if err != nil { + return 0, err + } + } + } + + metaEncoder.Close() + metaBytes := metaBuf.Bytes() + + compressed = snappy.Encode(compressed, data) + + docStoredOffsets[docNum] = uint64(s.w.Count()) + + _, err := writeUvarints(s.w, + uint64(len(metaBytes)), + uint64(len(compressed))) + if err != nil { + return 0, err + } + + _, err = s.w.Write(metaBytes) + if err != nil { + return 0, err + } + + _, err = s.w.Write(compressed) + if err != nil { + return 0, err + } + } + + storedIndexOffset = uint64(s.w.Count()) + + for _, docStoredOffset := range docStoredOffsets { + err = binary.Write(s.w, binary.BigEndian, docStoredOffset) + if err != nil { + return 0, err + } + } + + return storedIndexOffset, nil +} + +func (s *interim) writeDicts() (uint64, []uint64, error) { + dictOffsets := make([]uint64, len(s.FieldsInv)) + + fdvOffsets := make([]uint64, len(s.FieldsInv)) + + buf := s.grabBuf(binary.MaxVarintLen64) + + tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + + var docTermMap [][]byte + + s.buf0.Reset() + builder, err := vellum.New(&s.buf0, nil) + if err != nil { + return 0, nil, err + } + + for fieldID, terms := range s.DictKeys { + if cap(docTermMap) < len(s.results) { + docTermMap = make([][]byte, len(s.results)) + } else { + docTermMap = docTermMap[0:len(s.results)] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + + dict := s.Dicts[fieldID] + + for _, term := range terms { // terms are already sorted + pid := dict[term] - 1 + + postingsBS := s.Postings[pid] + postingsLocsBS := s.PostingsLocs[pid] + + freqNorms := s.FreqNorms[pid] + 
freqNormOffset := 0 + + locs := s.Locs[pid] + locOffset := 0 + + postingsItr := postingsBS.Iterator() + for postingsItr.HasNext() { + docNum := uint64(postingsItr.Next()) + + freqNorm := freqNorms[freqNormOffset] + + err = tfEncoder.Add(docNum, freqNorm.freq, + uint64(math.Float32bits(freqNorm.norm))) + if err != nil { + return 0, nil, err + } + + for i := uint64(0); i < freqNorm.freq; i++ { + if len(locs) > 0 { + loc := locs[locOffset] + + err = locEncoder.Add(docNum, uint64(loc.fieldID), + loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs))) + if err != nil { + return 0, nil, err + } + + err = locEncoder.Add(docNum, loc.arrayposs...) + if err != nil { + return 0, nil, err + } + } + + locOffset++ + } + + freqNormOffset++ + + docTermMap[docNum] = append( + append(docTermMap[docNum], term...), + termSeparator) + } + + tfEncoder.Close() + locEncoder.Close() + + postingsOffset, err := writePostings( + postingsBS, postingsLocsBS, tfEncoder, locEncoder, + nil, s.w, buf) + if err != nil { + return 0, nil, err + } + + if postingsOffset > uint64(0) { + err = builder.Insert([]byte(term), postingsOffset) + if err != nil { + return 0, nil, err + } + } + + tfEncoder.Reset() + locEncoder.Reset() + } + + err = builder.Close() + if err != nil { + return 0, nil, err + } + + // record where this dictionary starts + dictOffsets[fieldID] = uint64(s.w.Count()) + + vellumData := s.buf0.Bytes() + + // write out the length of the vellum data + n := binary.PutUvarint(buf, uint64(len(vellumData))) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + + // write this vellum to disk + _, err = s.w.Write(vellumData) + if err != nil { + return 0, nil, err + } + + // reset vellum for reuse + s.buf0.Reset() + + err = builder.Reset(&s.buf0) + if err != nil { + return 0, nil, err + } + + // write the field doc values + if s.IncludeDocValues[fieldID] { + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) + 
if err != nil { + return 0, nil, err + } + } + } + err = fdvEncoder.Close() + if err != nil { + return 0, nil, err + } + + fdvOffsets[fieldID] = uint64(s.w.Count()) + + _, err = fdvEncoder.Write(s.w) + if err != nil { + return 0, nil, err + } + + fdvEncoder.Reset() + } else { + fdvOffsets[fieldID] = fieldNotUninverted + } + } + + fdvIndexOffset := uint64(s.w.Count()) + + for _, fdvOffset := range fdvOffsets { + n := binary.PutUvarint(buf, fdvOffset) + _, err := s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } + } + + return fdvIndexOffset, dictOffsets, nil +} + +func encodeFieldType(f document.Field) byte { + fieldType := byte('x') + switch f.(type) { + case *document.TextField: + fieldType = 't' + case *document.NumericField: + fieldType = 'n' + case *document.DateTimeField: + fieldType = 'd' + case *document.BooleanField: + fieldType = 'b' + case *document.GeoPointField: + fieldType = 'g' + case *document.CompositeField: + fieldType = 'c' + } + return fieldType +} From eade78be2f2026b82a66e4eff98ca1567c3d88c3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 9 Mar 2018 14:05:17 -0800 Subject: [PATCH 279/728] scorch zap unit tests no longer use mem.Segment --- index/scorch/segment/zap/build_test.go | 37 +++++++++++++------ index/scorch/segment/zap/dict_test.go | 11 ++---- index/scorch/segment/zap/merge_test.go | 47 +++++++++++------------- index/scorch/segment/zap/segment_test.go | 20 +++++----- 4 files changed, 61 insertions(+), 54 deletions(-) diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index 9063980b7..65de7931d 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -21,20 +21,22 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment/mem" ) func TestBuild(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := 
buildMemSegment() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + sb, err := buildTestSegment() + if err != nil { + t.Fatal(err) + } + err = PersistSegmentBase(sb, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } } -func buildMemSegment() *mem.Segment { +func buildTestSegment() (*SegmentBase, error) { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -120,11 +122,22 @@ func buildMemSegment() *mem.Segment { } } - return mem.NewFromAnalyzedDocs(results) + return AnalysisResultsToSegmentBase(results, 1024) +} + +func buildTestSegmentMulti() (*SegmentBase, error) { + results := buildTestAnalysisResultsMulti() + + return AnalysisResultsToSegmentBase(results, 1024) } -func buildMemSegmentMulti() *mem.Segment { +func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, error) { + results := buildTestAnalysisResultsMulti() + return AnalysisResultsToSegmentBase(results, chunkFactor) +} + +func buildTestAnalysisResultsMulti() []*index.AnalysisResult { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -282,13 +295,11 @@ func buildMemSegmentMulti() *mem.Segment { } } - segment := mem.NewFromAnalyzedDocs(results) - - return segment + return results } -func buildMemSegmentWithDefaultFieldMapping() (*mem.Segment, []string) { - +func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( + *SegmentBase, []string, error) { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -371,5 +382,7 @@ func buildMemSegmentWithDefaultFieldMapping() (*mem.Segment, []string) { } } - return mem.NewFromAnalyzedDocs(results), fields + sb, err := AnalysisResultsToSegmentBase(results, chunkFactor) + + return sb, fields, err } diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index 336fb37ca..b70f2adf7 100644 --- a/index/scorch/segment/zap/dict_test.go +++ b/index/scorch/segment/zap/dict_test.go @@ -22,10 +22,9 @@ import ( "github.com/blevesearch/bleve/analysis" 
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment/mem" ) -func buildMemSegmentForDict() *mem.Segment { +func buildTestSegmentForDict() (*SegmentBase, error) { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -99,17 +98,15 @@ func buildMemSegmentForDict() *mem.Segment { }, } - segment := mem.NewFromAnalyzedDocs(results) - - return segment + return AnalysisResultsToSegmentBase(results, 1024) } func TestDictionary(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegmentForDict() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentForDict() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index d80b26086..d931f6c23 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -26,7 +26,6 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment/mem" ) func TestMerge(t *testing.T) { @@ -34,14 +33,14 @@ func TestMerge(t *testing.T) { _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentMulti() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } - memSegment2 := buildMemSegmentMulti2() - err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + testSeg2, _ := buildTestSegmentMulti2() + err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatal(err) } @@ -121,8 +120,8 @@ func TestMergeWithEmptySegmentsFirst(t *testing.T) { func testMergeWithEmptySegments(t *testing.T, before bool, 
numEmptySegments int) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentMulti() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } @@ -148,8 +147,8 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) _ = os.RemoveAll("/tmp/" + fname) - emptySegment := mem.NewFromAnalyzedDocs([]*index.AnalysisResult{}) - err = PersistSegment(emptySegment, "/tmp/"+fname, 1024) + emptySegment, _ := AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) + err = PersistSegmentBase(emptySegment, "/tmp/"+fname) if err != nil { t.Fatal(err) } @@ -462,8 +461,8 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { _ = os.RemoveAll("/tmp/scorch.zap") _ = os.RemoveAll("/tmp/scorch2.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentMulti() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } @@ -478,8 +477,8 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { } }() - memSegment2 := buildMemSegmentMulti2() - err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + testSeg2, _ := buildTestSegmentMulti2() + err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatal(err) } @@ -565,8 +564,8 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* _ = os.RemoveAll("/tmp/" + fname) - memSegment := buildMemSegmentMultiHelper(docIds) - err := PersistSegment(memSegment, "/tmp/"+fname, 1024) + testSeg, _ := buildTestSegmentMultiHelper(docIds) + err := PersistSegmentBase(testSeg, "/tmp/"+fname) if err != nil { t.Fatal(err) } @@ -616,11 +615,11 @@ func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop [ testMergeWithSelf(t, segm.(*Segment), expectedNumDocs) } -func 
buildMemSegmentMulti2() *mem.Segment { - return buildMemSegmentMultiHelper([]string{"c", "d"}) +func buildTestSegmentMulti2() (*SegmentBase, error) { + return buildTestSegmentMultiHelper([]string{"c", "d"}) } -func buildMemSegmentMultiHelper(docIds []string) *mem.Segment { +func buildTestSegmentMultiHelper(docIds []string) (*SegmentBase, error) { doc := &document.Document{ ID: "c", Fields: []document.Field{ @@ -778,9 +777,7 @@ func buildMemSegmentMultiHelper(docIds []string) *mem.Segment { } } - segment := mem.NewFromAnalyzedDocs(results) - - return segment + return AnalysisResultsToSegmentBase(results, 1024) } func TestMergeBytesWritten(t *testing.T) { @@ -788,14 +785,14 @@ func TestMergeBytesWritten(t *testing.T) { _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentMulti() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } - memSegment2 := buildMemSegmentMulti2() - err = PersistSegment(memSegment2, "/tmp/scorch2.zap", 1024) + testSeg2, _ := buildTestSegmentMulti2() + err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatal(err) } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 9ce354ce3..50d5dbd7f 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -28,8 +28,8 @@ import ( func TestOpen(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegment() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegment() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -328,8 +328,8 @@ func TestOpen(t *testing.T) { func TestOpenMulti(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegmentMulti() - err := 
PersistSegment(memSegment, "/tmp/scorch.zap", 1024) + testSeg, _ := buildTestSegmentMulti() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -428,8 +428,8 @@ func TestOpenMulti(t *testing.T) { func TestOpenMultiWithTwoChunks(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1) + testSeg, _ := buildTestSegmentMultiWithChunkFactor(1) + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -523,8 +523,8 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { func TestSegmentVisitableDocValueFieldsList(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - memSegment := buildMemSegmentMulti() - err := PersistSegment(memSegment, "/tmp/scorch.zap", 1) + testSeg, _ := buildTestSegmentMultiWithChunkFactor(1) + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } @@ -551,8 +551,8 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } _ = os.RemoveAll("/tmp/scorch.zap") - memSegment, expectedFields := buildMemSegmentWithDefaultFieldMapping() - err = PersistSegment(memSegment, "/tmp/scorch.zap", 1) + testSeg, expectedFields, _ := buildTestSegmentWithDefaultFieldMapping(1) + err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } From 5abf7b7a19071d24123e1add6443ebd91005fc68 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 9 Mar 2018 14:11:42 -0800 Subject: [PATCH 280/728] scorch zap remove mem.Segment usage from persist / build.go --- index/scorch/segment/zap/build.go | 488 ------------------------------ 1 file changed, 488 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 8ec610953..30ae8d774 100644 --- a/index/scorch/segment/zap/build.go +++ 
b/index/scorch/segment/zap/build.go @@ -16,16 +16,10 @@ package zap import ( "bufio" - "bytes" - "encoding/binary" "math" "os" - "sort" "github.com/Smerity/govarint" - "github.com/blevesearch/bleve/index/scorch/segment/mem" - "github.com/couchbase/vellum" - "github.com/golang/snappy" ) const version uint32 = 4 @@ -82,186 +76,6 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { return nil } -// PersistSegment takes the in-memory segment and persists it to -// the specified path in the zap file format. -func PersistSegment(memSegment *mem.Segment, path string, chunkFactor uint32) error { - flag := os.O_RDWR | os.O_CREATE - - f, err := os.OpenFile(path, flag, 0600) - if err != nil { - return err - } - - cleanup := func() { - _ = f.Close() - _ = os.Remove(path) - } - - // buffer the output - br := bufio.NewWriter(f) - - // wrap it for counting (tracking offsets) - cr := NewCountHashWriter(br) - - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, err := - persistBase(memSegment, cr, chunkFactor) - if err != nil { - cleanup() - return err - } - - err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, - chunkFactor, cr.Sum32(), cr) - if err != nil { - cleanup() - return err - } - - err = br.Flush() - if err != nil { - cleanup() - return err - } - - err = f.Sync() - if err != nil { - cleanup() - return err - } - - err = f.Close() - if err != nil { - cleanup() - return err - } - - return nil -} - -func persistBase(memSegment *mem.Segment, cr *CountHashWriter, chunkFactor uint32) ( - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, - dictLocs []uint64, err error) { - docValueOffset = uint64(fieldNotUninverted) - - if len(memSegment.Stored) > 0 { - storedIndexOffset, err = persistStored(memSegment, cr) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - freqOffsets, locOffsets, err := persistPostingDetails(memSegment, cr, chunkFactor) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - 
postingsListLocs, err := persistPostingsLocs(memSegment, cr) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - postingsLocs, err := persistPostingsLists(memSegment, cr, postingsListLocs, freqOffsets, locOffsets) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - dictLocs, err = persistDictionary(memSegment, cr, postingsLocs) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - docValueOffset, err = persistFieldDocValues(memSegment, cr, chunkFactor) - if err != nil { - return 0, 0, 0, 0, nil, err - } - } else { - dictLocs = make([]uint64, len(memSegment.FieldsInv)) - } - - fieldsIndexOffset, err = persistFields(memSegment.FieldsInv, cr, dictLocs) - if err != nil { - return 0, 0, 0, 0, nil, err - } - - return uint64(len(memSegment.Stored)), storedIndexOffset, fieldsIndexOffset, docValueOffset, - dictLocs, nil -} - -func persistStored(memSegment *mem.Segment, w *CountHashWriter) (uint64, error) { - var curr int - var metaBuf bytes.Buffer - var data, compressed []byte - - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) - - docNumOffsets := make(map[int]uint64, len(memSegment.Stored)) - - for docNum, storedValues := range memSegment.Stored { - if docNum != 0 { - // reset buffer if necessary - curr = 0 - metaBuf.Reset() - data = data[:0] - compressed = compressed[:0] - } - - st := memSegment.StoredTypes[docNum] - sp := memSegment.StoredPos[docNum] - - // encode fields in order - for fieldID := range memSegment.FieldsInv { - if storedFieldValues, ok := storedValues[uint16(fieldID)]; ok { - stf := st[uint16(fieldID)] - spf := sp[uint16(fieldID)] - - var err2 error - curr, data, err2 = persistStoredFieldValues(fieldID, - storedFieldValues, stf, spf, curr, metaEncoder, data) - if err2 != nil { - return 0, err2 - } - } - } - - metaEncoder.Close() - metaBytes := metaBuf.Bytes() - - // compress the data - compressed = snappy.Encode(compressed, data) - - // record where we're about to start writing - docNumOffsets[docNum] = uint64(w.Count()) - - // write out 
the meta len and compressed data len - _, err := writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) - if err != nil { - return 0, err - } - - // now write the meta - _, err = w.Write(metaBytes) - if err != nil { - return 0, err - } - // now write the compressed data - _, err = w.Write(compressed) - if err != nil { - return 0, err - } - } - - // return value is the start of the stored index - rv := uint64(w.Count()) - // now write out the stored doc index - for docNum := range memSegment.Stored { - err := binary.Write(w, binary.BigEndian, docNumOffsets[docNum]) - if err != nil { - return 0, err - } - } - - return rv, nil -} - func persistStoredFieldValues(fieldID int, storedFieldValues [][]byte, stf []byte, spf [][]uint64, curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( @@ -307,308 +121,6 @@ func persistStoredFieldValues(fieldID int, return curr, data, nil } -func persistPostingDetails(memSegment *mem.Segment, w *CountHashWriter, chunkFactor uint32) ([]uint64, []uint64, error) { - freqOffsets := make([]uint64, 0, len(memSegment.Postings)) - tfEncoder := newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for postingID := range memSegment.Postings { - if postingID != 0 { - tfEncoder.Reset() - } - freqs := memSegment.Freqs[postingID] - norms := memSegment.Norms[postingID] - postingsListItr := memSegment.Postings[postingID].Iterator() - var offset int - for postingsListItr.HasNext() { - docNum := uint64(postingsListItr.Next()) - - // put freq & norm - err := tfEncoder.Add(docNum, freqs[offset], uint64(math.Float32bits(norms[offset]))) - if err != nil { - return nil, nil, err - } - - offset++ - } - - // record where this postings freq info starts - freqOffsets = append(freqOffsets, uint64(w.Count())) - - tfEncoder.Close() - _, err := tfEncoder.Write(w) - if err != nil { - return nil, nil, err - } - } - - // now do it again for the locations - locOffsets := make([]uint64, 0, len(memSegment.Postings)) - locEncoder := 
newChunkedIntCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - for postingID := range memSegment.Postings { - if postingID != 0 { - locEncoder.Reset() - } - freqs := memSegment.Freqs[postingID] - locfields := memSegment.Locfields[postingID] - locpos := memSegment.Locpos[postingID] - locstarts := memSegment.Locstarts[postingID] - locends := memSegment.Locends[postingID] - locarraypos := memSegment.Locarraypos[postingID] - postingsListItr := memSegment.Postings[postingID].Iterator() - var offset int - var locOffset int - for postingsListItr.HasNext() { - docNum := uint64(postingsListItr.Next()) - n := int(freqs[offset]) - for i := 0; i < n; i++ { - if len(locfields) > 0 { - err := locEncoder.Add(docNum, uint64(locfields[locOffset]), - locpos[locOffset], locstarts[locOffset], locends[locOffset], - uint64(len(locarraypos[locOffset]))) - if err != nil { - return nil, nil, err - } - - // put each array position - err = locEncoder.Add(docNum, locarraypos[locOffset]...) - if err != nil { - return nil, nil, err - } - } - locOffset++ - } - offset++ - } - - // record where this postings loc info starts - locOffsets = append(locOffsets, uint64(w.Count())) - - locEncoder.Close() - _, err := locEncoder.Write(w) - if err != nil { - return nil, nil, err - } - } - - return freqOffsets, locOffsets, nil -} - -func persistPostingsLocs(memSegment *mem.Segment, w *CountHashWriter) (rv []uint64, err error) { - rv = make([]uint64, 0, len(memSegment.PostingsLocs)) - reuseBufVarint := make([]byte, binary.MaxVarintLen64) - for postingID := range memSegment.PostingsLocs { - // record where we start this posting loc - rv = append(rv, uint64(w.Count())) - // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.PostingsLocs[postingID], w, reuseBufVarint) - if err != nil { - return nil, err - } - } - return rv, nil -} - -func persistPostingsLists(memSegment *mem.Segment, w *CountHashWriter, - postingsListLocs, freqOffsets, locOffsets []uint64) (rv []uint64, err 
error) { - rv = make([]uint64, 0, len(memSegment.Postings)) - reuseBufVarint := make([]byte, binary.MaxVarintLen64) - for postingID := range memSegment.Postings { - // record where we start this posting list - rv = append(rv, uint64(w.Count())) - - // write out the term info, loc info, and loc posting list offset - _, err = writeUvarints(w, freqOffsets[postingID], - locOffsets[postingID], postingsListLocs[postingID]) - if err != nil { - return nil, err - } - - // write out the length and bitmap - _, err = writeRoaringWithLen(memSegment.Postings[postingID], w, reuseBufVarint) - if err != nil { - return nil, err - } - } - return rv, nil -} - -func persistDictionary(memSegment *mem.Segment, w *CountHashWriter, postingsLocs []uint64) ([]uint64, error) { - rv := make([]uint64, 0, len(memSegment.DictKeys)) - - varintBuf := make([]byte, binary.MaxVarintLen64) - - var buffer bytes.Buffer - builder, err := vellum.New(&buffer, nil) - if err != nil { - return nil, err - } - for fieldID, fieldTerms := range memSegment.DictKeys { - - dict := memSegment.Dicts[fieldID] - // now walk the dictionary in order of fieldTerms (already sorted) - for _, fieldTerm := range fieldTerms { - postingID := dict[fieldTerm] - 1 - postingsAddr := postingsLocs[postingID] - err = builder.Insert([]byte(fieldTerm), postingsAddr) - if err != nil { - return nil, err - } - } - err = builder.Close() - if err != nil { - return nil, err - } - - // record where this dictionary starts - rv = append(rv, uint64(w.Count())) - - vellumData := buffer.Bytes() - - // write out the length of the vellum data - n := binary.PutUvarint(varintBuf, uint64(len(vellumData))) - _, err = w.Write(varintBuf[:n]) - if err != nil { - return nil, err - } - - // write this vellum to disk - _, err = w.Write(vellumData) - if err != nil { - return nil, err - } - - // reset buffer and vellum builder - buffer.Reset() - err = builder.Reset(&buffer) - if err != nil { - return nil, err - } - } - - return rv, nil -} - -type docIDRange 
[]uint64 - -func (a docIDRange) Len() int { return len(a) } -func (a docIDRange) Swap(i, j int) { a[i], a[j] = a[j], a[i] } -func (a docIDRange) Less(i, j int) bool { return a[i] < a[j] } - -func persistDocValues(memSegment *mem.Segment, w *CountHashWriter, - chunkFactor uint32) (map[uint16]uint64, error) { - fieldChunkOffsets := make(map[uint16]uint64, len(memSegment.FieldsInv)) - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), uint64(len(memSegment.Stored)-1)) - - var postings *mem.PostingsList - var postingsItr *mem.PostingsIterator - - for fieldID := range memSegment.DocValueFields { - field := memSegment.FieldsInv[fieldID] - docTermMap := make(map[uint64][]byte, 0) - dict, err := memSegment.Dictionary(field) - if err != nil { - return nil, err - } - - dictItr := dict.Iterator() - next, err := dictItr.Next() - for err == nil && next != nil { - var err1 error - postings, err1 = dict.(*mem.Dictionary).InitPostingsList(next.Term, nil, postings) - if err1 != nil { - return nil, err1 - } - - postingsItr = postings.InitIterator(postingsItr) - nextPosting, err2 := postingsItr.Next() - for err2 == nil && nextPosting != nil { - docNum := nextPosting.Number() - docTermMap[docNum] = append(append(docTermMap[docNum], []byte(next.Term)...), termSeparator) - nextPosting, err2 = postingsItr.Next() - } - if err2 != nil { - return nil, err2 - } - - next, err = dictItr.Next() - } - if err != nil { - return nil, err - } - - // sort wrt to docIDs - docNumbers := make(docIDRange, 0, len(docTermMap)) - for k := range docTermMap { - docNumbers = append(docNumbers, k) - } - sort.Sort(docNumbers) - - for _, docNum := range docNumbers { - err = fdvEncoder.Add(docNum, docTermMap[docNum]) - if err != nil { - return nil, err - } - } - - fieldChunkOffsets[fieldID] = uint64(w.Count()) - err = fdvEncoder.Close() - if err != nil { - return nil, err - } - // persist the doc value details for this field - _, err = fdvEncoder.Write(w) - if err != nil { - return nil, err - } - // 
reseting encoder for the next field - fdvEncoder.Reset() - } - - return fieldChunkOffsets, nil -} - -func persistFieldDocValues(memSegment *mem.Segment, w *CountHashWriter, - chunkFactor uint32) (uint64, error) { - fieldDvOffsets, err := persistDocValues(memSegment, w, chunkFactor) - if err != nil { - return 0, err - } - - fieldDocValuesOffset := uint64(w.Count()) - buf := make([]byte, binary.MaxVarintLen64) - offset := uint64(0) - ok := true - for fieldID := range memSegment.FieldsInv { - // if the field isn't configured for docValue, then mark - // the offset accordingly - if offset, ok = fieldDvOffsets[uint16(fieldID)]; !ok { - offset = fieldNotUninverted - } - n := binary.PutUvarint(buf, uint64(offset)) - _, err := w.Write(buf[:n]) - if err != nil { - return 0, err - } - } - - return fieldDocValuesOffset, nil -} - -func NewSegmentBase(memSegment *mem.Segment, chunkFactor uint32) (*SegmentBase, error) { - var br bytes.Buffer - - cr := NewCountHashWriter(&br) - - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, err := - persistBase(memSegment, cr, chunkFactor) - if err != nil { - return nil, err - } - - return InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, - memSegment.FieldsMap, memSegment.FieldsInv, numDocs, - storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs) -} - func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, From 2a20a36e154e58b35b37904d59a372eed20ee792 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 9 Mar 2018 23:19:47 -0800 Subject: [PATCH 281/728] scorch zap optimimze to avoid bitmaps for 1-hit posting lists This commit avoids creating roaring.Bitmap's (which would have just a single entry) when a postings list/iterator represents a single "1-hit" encoding. 
--- index/scorch/segment/zap/posting.go | 129 ++++++++++++++++------------ index/scorch/segment/zap/segment.go | 8 +- 2 files changed, 78 insertions(+), 59 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 7ae36120b..f2df32bf7 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -92,6 +92,8 @@ func under32Bits(x uint64) bool { return x <= mask31Bits } +const docNum1HitFinished = math.MaxUint64 + // PostingsList is an in-memory represenation of a postings list type PostingsList struct { sb *SegmentBase @@ -102,8 +104,9 @@ type PostingsList struct { postings *roaring.Bitmap except *roaring.Bitmap - // when postingsOffset == freqOffset == 0, then the postings list - // represents a "1-hit" encoding, and has the following norm + // when normBits1Hit != 0, then this postings list came from a + // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply + docNum1Hit uint64 normBits1Hit uint64 } @@ -117,6 +120,17 @@ func (p *PostingsList) Size() int { return sizeInBytes } +func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { + if p.normBits1Hit != 0 { + receiver.Add(uint32(p.docNum1Hit)) + return + } + + if p.postings != nil { + receiver.Or(p.postings) + } +} + // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator() segment.PostingsIterator { return p.iterator(nil) @@ -152,39 +166,47 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.postings = p + if p.normBits1Hit != 0 { + // "1-hit" encoding + rv.docNum1Hit = p.docNum1Hit + rv.normBits1Hit = p.normBits1Hit + + if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { + rv.docNum1Hit = docNum1HitFinished + } + + return rv + } + + // "general" encoding, check if empty if p.postings == nil { return rv } - if p.freqOffset > 0 && p.locOffset > 0 { - // "general" encoding, so prepare the freq chunk details - var n uint64 - var read int - var 
numFreqChunks uint64 - numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + // prepare the freq chunk details + var n uint64 + var read int + var numFreqChunks uint64 + numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + rv.freqChunkLens = make([]uint64, int(numFreqChunks)) + for i := 0; i < int(numFreqChunks); i++ { + rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkLens = make([]uint64, int(numFreqChunks)) - for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - } - rv.freqChunkStart = p.freqOffset + n + } + rv.freqChunkStart = p.freqOffset + n - // prepare the loc chunk details - n = 0 - var numLocChunks uint64 - numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + // prepare the loc chunk details + n = 0 + var numLocChunks uint64 + numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + rv.locChunkLens = make([]uint64, int(numLocChunks)) + for i := 0; i < int(numLocChunks); i++ { + rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkLens = make([]uint64, int(numLocChunks)) - for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - } - rv.locChunkStart = p.locOffset + n - } else { - // "1-hit" encoding - rv.normBits1Hit = p.normBits1Hit } + rv.locChunkStart = p.locOffset + n rv.locBitmap = p.locBitmap @@ -201,18 +223,20 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { // Count returns the number of items 
on this postings list func (p *PostingsList) Count() uint64 { - if p.postings != nil { - n := p.postings.GetCardinality() - if p.except != nil { - e := p.except.GetCardinality() - if e > n { - e = n - } - return n - e - } - return n + var n uint64 + if p.normBits1Hit != 0 { + n = 1 + } else if p.postings != nil { + n = p.postings.GetCardinality() + } + var e uint64 + if p.except != nil { + e = p.except.GetCardinality() } - return 0 + if n <= e { + return 0 + } + return n - e } func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { @@ -263,19 +287,10 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { return nil } -var emptyRoaring = roaring.NewBitmap() - func (rv *PostingsList) init1Hit(fstVal uint64) error { docNum, normBits := FSTValDecode1Hit(fstVal) - rv.locBitmap = emptyRoaring - - rv.postings = roaring.NewBitmap() - rv.postings.Add(uint32(docNum)) - - // TODO: we can likely do better than allocating a roaring bitmap - // with just 1 entry, but for now reuse existing machinery - + rv.docNum1Hit = docNum rv.normBits1Hit = normBits return nil @@ -308,6 +323,7 @@ type PostingsIterator struct { next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls + docNum1Hit uint64 normBits1Hit uint64 buf []byte @@ -460,7 +476,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { } rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap.Contains(uint32(docNum)) { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -513,7 +529,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if i.locBitmap.Contains(uint32(docNum)) { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { startLoc := len(i.currChunkLoc) - 
i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -533,6 +549,15 @@ func (i *PostingsIterator) nextBytes() ( // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { + if i.normBits1Hit != 0 { + if i.docNum1Hit == docNum1HitFinished { + return 0, false, nil + } + docNum := i.docNum1Hit + i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum + return docNum, true, nil + } + if i.actual == nil || !i.actual.HasNext() { return 0, false, nil } @@ -540,10 +565,6 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { n := i.actual.Next() allN := i.all.Next() - if i.normBits1Hit != 0 { - return uint64(n), true, nil - } - nChunk := n / i.postings.sb.chunkFactor allNChunk := allN / i.postings.sb.chunkFactor diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 972b7578e..e1d2a14f7 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -341,15 +341,13 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { return nil, err } - var postings *PostingsList + var postingsList *PostingsList for _, id := range ids { - postings, err = idDict.postingsList([]byte(id), nil, postings) + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) if err != nil { return nil, err } - if postings.postings != nil { - rv.Or(postings.postings) - } + postingsList.OrInto(rv) } } From aaccf59191dc5bc31fd2bb703e53d6dd4142a269 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 12 Mar 2018 15:36:46 +0530 Subject: [PATCH 282/728] docValue space savings merging the doc value length and loc slices into a single offset slice as that is enough to compute the starting offset and length of the the doc values data for a given document inside a docValue chunk. 
--- cmd/bleve/cmd/zap/docvalue.go | 18 ++++++++++--- index/scorch/segment/zap/contentcoder.go | 26 ++++++++++++------- index/scorch/segment/zap/contentcoder_test.go | 6 ++--- index/scorch/segment/zap/docvalues.go | 24 ++++++++++++----- 4 files changed, 51 insertions(+), 23 deletions(-) diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go index 743974955..234c45e6b 100644 --- a/cmd/bleve/cmd/zap/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -209,9 +209,7 @@ var docvalueCmd = &cobra.Command{ for i := 0; i < int(numDocs); i++ { curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(nread) - curChunkHeader[i].DocDvLoc, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(nread) - curChunkHeader[i].DocDvLen, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + curChunkHeader[i].DocDvOffset, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(nread) } @@ -255,7 +253,19 @@ func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64) return metaHeader[i].DocNum >= docNum }) if i < len(metaHeader) && metaHeader[i].DocNum == docNum { - return metaHeader[i].DocDvLoc, metaHeader[i].DocDvLen + var start, end uint64 + if i > 0 { + start = metaHeader[i].DocDvOffset + } + // single element case + if i == 0 && len(metaHeader) == 1 { + end = metaHeader[i].DocDvOffset + } else if i < len(metaHeader)-1 { + end = metaHeader[i+1].DocDvOffset + } else { // for last element + end = start + metaHeader[0].DocDvOffset + } + return start, end } return math.MaxUint64, math.MaxUint64 } diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 933f10a1e..adaab2fa3 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ 
-47,9 +47,8 @@ type chunkedContentCoder struct { // MetaData represents the data information inside a // chunk. type MetaData struct { - DocNum uint64 // docNum of the data inside the chunk - DocDvLoc uint64 // starting offset for a given docid - DocDvLen uint64 // length of data inside the chunk for the given docid + DocNum uint64 // docNum of the data inside the chunk + DocDvOffset uint64 // offset of data inside the chunk for the given docid } // newChunkedContentCoder returns a new chunk content coder which @@ -94,9 +93,20 @@ func (c *chunkedContentCoder) flushContents() error { return err } + // convert the document data lens to data offsets + if len(c.chunkMeta) > 1 { + c.chunkMeta[1].DocDvOffset, c.chunkMeta[0].DocDvOffset = + c.chunkMeta[0].DocDvOffset, c.chunkMeta[1].DocDvOffset + for i := 2; i < len(c.chunkMeta); i++ { + cur := c.chunkMeta[i].DocDvOffset + c.chunkMeta[i].DocDvOffset = c.chunkMeta[i-1].DocDvOffset + c.chunkMeta[0].DocDvOffset + c.chunkMeta[0].DocDvOffset = cur + } + } + // write out the metaData slice for _, meta := range c.chunkMeta { - _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvLoc, meta.DocDvLen) + _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) if err != nil { return err } @@ -130,17 +140,15 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { c.currChunk = chunk } - // mark the starting offset for this doc - dvOffset := c.chunkBuf.Len() + // mark the data length for this doc dvSize, err := c.chunkBuf.Write(vals) if err != nil { return err } c.chunkMeta = append(c.chunkMeta, MetaData{ - DocNum: docNum, - DocDvLoc: uint64(dvOffset), - DocDvLen: uint64(dvSize), + DocNum: docNum, + DocDvOffset: uint64(dvSize), }) return nil } diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index 0e45b783e..c6b3df82c 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -35,7 
+35,7 @@ func TestChunkContentCoder(t *testing.T) { docNums: []uint64{0}, vals: [][]byte{[]byte("bleve")}, // 1 chunk, chunk-0 length 11(b), value - expected: string([]byte{0x1, 0xb, 0x1, 0x0, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}), + expected: string([]byte{0x1, 0xa, 0x1, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}), }, { maxDocNum: 1, @@ -46,8 +46,8 @@ func TestChunkContentCoder(t *testing.T) { []byte("scorch"), }, - expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, - 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06, + expected: string([]byte{0x02, 0x0b, 0x0b, 0x01, 0x00, 0x06, 0x06, 0x14, + 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x06, 0x06, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}), }, } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 13635c57e..bbefe5a10 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -117,9 +117,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, for i := 0; i < int(numDocs); i++ { di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) - di.curChunkHeader[i].DocDvLoc, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(read) - di.curChunkHeader[i].DocDvLen, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) } @@ -133,8 +131,8 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, func (di *docValueIterator) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum - start, length := di.getDocValueLocs(docNum) - if start == math.MaxUint64 || length == 
math.MaxUint64 { + start, end := di.getDocValueLocs(docNum) + if start == math.MaxUint64 || end == math.MaxUint64 { return nil } // uncompress the already loaded data @@ -144,7 +142,7 @@ func (di *docValueIterator) visitDocValues(docNum uint64, } // pick the terms for the given docNum - uncompressed = uncompressed[start : start+length] + uncompressed = uncompressed[start:end] for { i := bytes.Index(uncompressed, termSeparatorSplitSlice) if i < 0 { @@ -163,7 +161,19 @@ func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { return di.curChunkHeader[i].DocNum >= docNum }) if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { - return di.curChunkHeader[i].DocDvLoc, di.curChunkHeader[i].DocDvLen + var start, end uint64 + if i > 0 { + start = di.curChunkHeader[i].DocDvOffset + } + // single element case + if i == 0 && len(di.curChunkHeader) == 1 { + end = di.curChunkHeader[i].DocDvOffset + } else if i < len(di.curChunkHeader)-1 { + end = di.curChunkHeader[i+1].DocDvOffset + } else { // for last element + end = start + di.curChunkHeader[0].DocDvOffset + } + return start, end } return math.MaxUint64, math.MaxUint64 } From 90aa91105ab3ab5848902a59e161e6a642aa3577 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 9 Mar 2018 11:19:39 +0530 Subject: [PATCH 283/728] handling only int, float64 values --- index/scorch/scorch.go | 31 ++++++------------------------- 1 file changed, 6 insertions(+), 25 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 7a33fb7f0..c171092d6 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -516,33 +516,14 @@ func init() { registry.RegisterIndexType(Name, NewScorch) } -func parseToInteger(v interface{}) (int, error) { - switch v.(type) { - case float32: - return int(v.(float32)), nil +func parseToInteger(i interface{}) (int, error) { + switch v := i.(type) { case float64: - return int(v.(float64)), nil + return int(v), nil case int: - return 
v.(int), nil - case int8: - return int(v.(int8)), nil - case int16: - return int(v.(int16)), nil - case int32: - return int(v.(int32)), nil - case int64: - return int(v.(int64)), nil - case uint: - return int(v.(uint)), nil - case uint8: - return int(v.(uint8)), nil - case uint16: - return int(v.(uint16)), nil - case uint32: - return int(v.(uint32)), nil - case uint64: - return int(v.(uint64)), nil + return v, nil + default: - return 0, fmt.Errorf("expects a numeric value") + return 0, fmt.Errorf("expects int or float64 value") } } From 531800c479915f58ab823f00fe1f5fa112695702 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 16:30:52 -0700 Subject: [PATCH 284/728] scorch zap use roaring Add() instead of AddInt() This change invokes Add() directly as AddInt() is a convenience wrapper around Add(). --- index/scorch/segment/zap/new.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 3a8b2012d..f29711c06 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -334,7 +334,7 @@ func (s *interim) processDocument(docNum uint64, for term, tf := range tfs { pid := dict[term] - 1 bs := s.Postings[pid] - bs.AddInt(int(docNum)) + bs.Add(uint32(docNum)) s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ @@ -344,7 +344,7 @@ func (s *interim) processDocument(docNum uint64, if len(tf.Locations) > 0 { locBS := s.PostingsLocs[pid] - locBS.AddInt(int(docNum)) + locBS.Add(uint32(docNum)) locs := s.Locs[pid] From c4ceffe58430d4f4103d5cca7a0777dceebd8af6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 12:09:21 -0700 Subject: [PATCH 285/728] scorch zap sync Pool for interim data --- index/scorch/segment/zap/new.go | 93 +++++++++++++++++++++++++++------ 1 file changed, 78 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index f29711c06..c7a6b0ce4 100644 --- 
a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -19,6 +19,7 @@ import ( "encoding/binary" "math" "sort" + "sync" "github.com/RoaringBitmap/roaring" "github.com/Smerity/govarint" @@ -35,12 +36,11 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, chunkFactor uint32) (*SegmentBase, error) { var br bytes.Buffer - s := interim{ - results: results, - chunkFactor: chunkFactor, - w: NewCountHashWriter(&br), - FieldsMap: map[string]uint16{}, - } + s := interimPool.Get().(*interim) + + s.results = results + s.chunkFactor = chunkFactor + s.w = NewCountHashWriter(&br) storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, err := s.convert() @@ -52,9 +52,13 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, s.FieldsMap, s.FieldsInv, uint64(len(results)), storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) + interimPool.Put(s.cleanse()) + return sb, err } +var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} + // interim holds temporary working data used while converting from // analysis results to a zap-encoded segment type interim struct { @@ -101,6 +105,41 @@ type interim struct { tmp1 []byte } +func (s *interim) cleanse() *interim { + s.results = nil + s.chunkFactor = 0 + s.w = nil + s.FieldsMap = nil + s.FieldsInv = s.FieldsInv[:0] + for i := range s.Dicts { + s.Dicts[i] = nil + } + s.Dicts = s.Dicts[:0] + for i := range s.DictKeys { + s.DictKeys[i] = s.DictKeys[i][:0] + } + s.DictKeys = s.DictKeys[:0] + for i := range s.IncludeDocValues { + s.IncludeDocValues[i] = false + } + s.IncludeDocValues = s.IncludeDocValues[:0] + for _, idn := range s.Postings { + idn.Clear() + } + s.Postings = s.Postings[:0] + for _, idn := range s.PostingsLocs { + idn.Clear() + } + s.PostingsLocs = s.PostingsLocs[:0] + s.FreqNorms = nil + s.Locs = nil + s.buf0.Reset() + s.tmp0 = s.tmp0[:0] + s.tmp1 = s.tmp1[:0] + + return s +} + func (s *interim) grabBuf(size int) []byte { buf := 
s.tmp0 if cap(buf) < size { @@ -130,6 +169,8 @@ type interimLoc struct { } func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { + s.FieldsMap = map[string]uint16{} + s.getOrDefineField("_id") // _id field is fieldID 0 for _, result := range s.results { @@ -143,12 +184,15 @@ func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { sort.Strings(s.FieldsInv[1:]) // keep _id as first field - s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) for fieldID, fieldName := range s.FieldsInv { s.FieldsMap[fieldName] = uint16(fieldID + 1) } - s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + if cap(s.IncludeDocValues) >= len(s.FieldsInv) { + s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] + } else { + s.IncludeDocValues = make([]bool, len(s.FieldsInv)) + } s.prepareDicts() @@ -189,9 +233,18 @@ func (s *interim) getOrDefineField(fieldName string) int { fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) s.FieldsMap[fieldName] = fieldIDPlus1 s.FieldsInv = append(s.FieldsInv, fieldName) + s.Dicts = append(s.Dicts, make(map[string]uint64)) - s.DictKeys = append(s.DictKeys, make([]string, 0)) + + n := len(s.DictKeys) + if n < cap(s.DictKeys) { + s.DictKeys = s.DictKeys[:n+1] + s.DictKeys[n] = s.DictKeys[n][:0] + } else { + s.DictKeys = append(s.DictKeys, []string(nil)) + } } + return int(fieldIDPlus1 - 1) } @@ -253,16 +306,25 @@ func (s *interim) prepareDicts() { numPostingsLists := pidNext - s.Postings = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() + if cap(s.Postings) >= numPostingsLists { + s.Postings = s.Postings[:numPostingsLists] + } else { + s.Postings = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.Postings[i] = roaring.New() + } } - s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() + if cap(s.PostingsLocs) >= numPostingsLists { + 
s.PostingsLocs = s.PostingsLocs[:numPostingsLists] + } else { + s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + for i := 0; i < numPostingsLists; i++ { + s.PostingsLocs[i] = roaring.New() + } } + // TODO: reuse this. s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) freqNormsBacking := make([]interimFreqNorm, totTFs) @@ -271,6 +333,7 @@ func (s *interim) prepareDicts() { freqNormsBacking = freqNormsBacking[numTerms:] } + // TODO: reuse this. s.Locs = make([][]interimLoc, numPostingsLists) locsBacking := make([]interimLoc, totLocs) From cad88096cacab844536d322312c0a59f150926c3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 16:30:34 -0700 Subject: [PATCH 286/728] scorch zap reuse roaring Bitmap during merge --- index/scorch/segment/zap/merge.go | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index a934dfc35..07de0943c 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -183,6 +183,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err } + newRoaring := roaring.NewBitmap() + newRoaringLocs := roaring.NewBitmap() + // for each field for fieldID, fieldName := range fieldsInv { @@ -222,8 +225,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var prevTerm []byte - newRoaring := roaring.NewBitmap() - newRoaringLocs := roaring.NewBitmap() + newRoaring.Clear() + newRoaringLocs.Clear() var lastDocNum, lastFreq, lastNorm uint64 @@ -262,8 +265,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } } - newRoaring = roaring.NewBitmap() - newRoaringLocs = roaring.NewBitmap() + newRoaring.Clear() + newRoaringLocs.Clear() tfEncoder.Reset() locEncoder.Reset() From b1f39695217295421681d509744e55c0027b5f5d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 20:13:31 -0700 Subject: [PATCH 287/728] 
scorch zap reuse roaring Bitmap in postings lists --- index/scorch/segment/zap/dict.go | 12 ++++++++++++ index/scorch/segment/zap/posting.go | 8 ++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index e5d712686..3b8132f2c 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -68,7 +68,19 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if rv == nil { rv = &PostingsList{} } else { + postings := rv.postings + if postings != nil { + postings.Clear() + } + locBitmap := rv.locBitmap + if locBitmap != nil { + locBitmap.Clear() + } + *rv = PostingsList{} // clear the struct + + rv.postings = postings + rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f2df32bf7..bc533ad14 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -266,7 +266,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - rv.locBitmap = roaring.NewBitmap() + if rv.locBitmap == nil { + rv.locBitmap = roaring.NewBitmap() + } _, err := rv.locBitmap.FromBuffer(locRoaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) @@ -278,7 +280,9 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] - rv.postings = roaring.NewBitmap() + if rv.postings == nil { + rv.postings = roaring.NewBitmap() + } _, err = rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) From 07901910e24dabd67f9e26c88a03a8675e0fd87d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 11 Mar 2018 21:07:14 -0700 
Subject: [PATCH 288/728] scorch zap reuse roaring Bitmap in prepareDicts() slice growth In this change, if the postings/postingsLocs slices need to be grown, then copy over and reuse any of the preallocated roaring Bitmap's from the old slice. --- index/scorch/segment/zap/new.go | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index c7a6b0ce4..5b625b995 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -309,19 +309,27 @@ func (s *interim) prepareDicts() { if cap(s.Postings) >= numPostingsLists { s.Postings = s.Postings[:numPostingsLists] } else { - s.Postings = make([]*roaring.Bitmap, numPostingsLists) + postings := make([]*roaring.Bitmap, numPostingsLists) + copy(postings, s.Postings[:cap(s.Postings)]) for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() + if postings[i] == nil { + postings[i] = roaring.New() + } } + s.Postings = postings } if cap(s.PostingsLocs) >= numPostingsLists { s.PostingsLocs = s.PostingsLocs[:numPostingsLists] } else { - s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) + postingsLocs := make([]*roaring.Bitmap, numPostingsLists) + copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() + if postingsLocs[i] == nil { + postingsLocs[i] = roaring.New() + } } + s.PostingsLocs = postingsLocs } // TODO: reuse this. 
From dbfc5e913027c7dc5a85f9a9f91476babf8322da Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 12 Mar 2018 10:04:11 -0700 Subject: [PATCH 289/728] scorch zap reuse interim freq/norm/loc slices --- index/scorch/segment/zap/new.go | 68 ++++++++++++++++++++++++--------- 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 5b625b995..51971ba51 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -95,10 +95,15 @@ type interim struct { PostingsLocs []*roaring.Bitmap // postings id -> freq/norm's, one for each docNum in postings - FreqNorms [][]interimFreqNorm + FreqNorms [][]interimFreqNorm + freqNormsBacking []interimFreqNorm // postings id -> locs, one for each freq - Locs [][]interimLoc + Locs [][]interimLoc + locsBacking []interimLoc + + numTermsPerPostingsList []int // key is postings list id + numLocsPerPostingsList []int // key is postings list id buf0 bytes.Buffer tmp0 []byte @@ -131,8 +136,18 @@ func (s *interim) cleanse() *interim { idn.Clear() } s.PostingsLocs = s.PostingsLocs[:0] - s.FreqNorms = nil - s.Locs = nil + s.FreqNorms = s.FreqNorms[:0] + for i := range s.freqNormsBacking { + s.freqNormsBacking[i] = interimFreqNorm{} + } + s.freqNormsBacking = s.freqNormsBacking[:0] + s.Locs = s.Locs[:0] + for i := range s.locsBacking { + s.locsBacking[i] = interimLoc{} + } + s.locsBacking = s.locsBacking[:0] + s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] + s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] s.buf0.Reset() s.tmp0 = s.tmp0[:0] s.tmp1 = s.tmp1[:0] @@ -252,9 +267,6 @@ func (s *interim) getOrDefineField(fieldName string) int { func (s *interim) prepareDicts() { var pidNext int - numTermsPerPostingsList := make([]int, 0, 64) // key is postings list id - numLocsPerPostingsList := make([]int, 0, 64) // key is postings list id - var totTFs int var totLocs int @@ -271,14 +283,14 @@ func (s *interim) prepareDicts() { dict[term] 
= pidPlus1 dictKeys = append(dictKeys, term) - numTermsPerPostingsList = append(numTermsPerPostingsList, 0) - numLocsPerPostingsList = append(numLocsPerPostingsList, 0) + s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) + s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) } pid := pidPlus1 - 1 - numTermsPerPostingsList[pid] += 1 - numLocsPerPostingsList[pid] += len(tf.Locations) + s.numTermsPerPostingsList[pid] += 1 + s.numLocsPerPostingsList[pid] += len(tf.Locations) totLocs += len(tf.Locations) } @@ -332,20 +344,38 @@ func (s *interim) prepareDicts() { s.PostingsLocs = postingsLocs } - // TODO: reuse this. - s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + if cap(s.FreqNorms) >= numPostingsLists { + s.FreqNorms = s.FreqNorms[:numPostingsLists] + } else { + s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) + } + + if cap(s.freqNormsBacking) >= totTFs { + s.freqNormsBacking = s.freqNormsBacking[:totTFs] + } else { + s.freqNormsBacking = make([]interimFreqNorm, totTFs) + } - freqNormsBacking := make([]interimFreqNorm, totTFs) - for pid, numTerms := range numTermsPerPostingsList { + freqNormsBacking := s.freqNormsBacking + for pid, numTerms := range s.numTermsPerPostingsList { s.FreqNorms[pid] = freqNormsBacking[0:0] freqNormsBacking = freqNormsBacking[numTerms:] } - // TODO: reuse this. 
- s.Locs = make([][]interimLoc, numPostingsLists) + if cap(s.Locs) >= numPostingsLists { + s.Locs = s.Locs[:numPostingsLists] + } else { + s.Locs = make([][]interimLoc, numPostingsLists) + } + + if cap(s.locsBacking) >= totLocs { + s.locsBacking = s.locsBacking[:totLocs] + } else { + s.locsBacking = make([]interimLoc, totLocs) + } - locsBacking := make([]interimLoc, totLocs) - for pid, numLocs := range numLocsPerPostingsList { + locsBacking := s.locsBacking + for pid, numLocs := range s.numLocsPerPostingsList { s.Locs[pid] = locsBacking[0:0] locsBacking = locsBacking[numLocs:] } From 19318194fa039bf85d6a1261ccfa91101f8f023c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 13 Mar 2018 14:06:48 +0530 Subject: [PATCH 290/728] moving to new offset slice format --- cmd/bleve/cmd/zap/docvalue.go | 14 +-------- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/contentcoder.go | 36 +++++++++++++++++++----- index/scorch/segment/zap/docvalues.go | 14 +-------- 4 files changed, 32 insertions(+), 34 deletions(-) diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go index 234c45e6b..dcfa58de1 100644 --- a/cmd/bleve/cmd/zap/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -253,19 +253,7 @@ func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64) return metaHeader[i].DocNum >= docNum }) if i < len(metaHeader) && metaHeader[i].DocNum == docNum { - var start, end uint64 - if i > 0 { - start = metaHeader[i].DocDvOffset - } - // single element case - if i == 0 && len(metaHeader) == 1 { - end = metaHeader[i].DocDvOffset - } else if i < len(metaHeader)-1 { - end = metaHeader[i+1].DocDvOffset - } else { // for last element - end = start + metaHeader[0].DocDvOffset - } - return start, end + return zap.ReadDocValueBoundary(i, metaHeader) } return math.MaxUint64, math.MaxUint64 } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 8ec610953..28df9bd60 100644 --- 
a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -28,7 +28,7 @@ import ( "github.com/golang/snappy" ) -const version uint32 = 4 +const version uint32 = 6 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index adaab2fa3..3bb904caa 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -95,13 +95,14 @@ func (c *chunkedContentCoder) flushContents() error { // convert the document data lens to data offsets if len(c.chunkMeta) > 1 { - c.chunkMeta[1].DocDvOffset, c.chunkMeta[0].DocDvOffset = - c.chunkMeta[0].DocDvOffset, c.chunkMeta[1].DocDvOffset - for i := 2; i < len(c.chunkMeta); i++ { - cur := c.chunkMeta[i].DocDvOffset - c.chunkMeta[i].DocDvOffset = c.chunkMeta[i-1].DocDvOffset + c.chunkMeta[0].DocDvOffset - c.chunkMeta[0].DocDvOffset = cur + var runningOffset uint64 + var index, i int + for i = 1; i < len(c.chunkMeta); i++ { + runningOffset += c.chunkMeta[i-1].DocDvOffset + c.chunkMeta[index].DocDvOffset = runningOffset + index++ } + c.chunkMeta[index].DocDvOffset = c.chunkMeta[i-1].DocDvOffset } // write out the metaData slice @@ -140,7 +141,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { c.currChunk = chunk } - // mark the data length for this doc + // mark the data size for this doc dvSize, err := c.chunkBuf.Write(vals) if err != nil { return err @@ -181,3 +182,24 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { } return tw, nil } + +// ReadDocValueBoundary elicits the start, end offsets from a +// starting offset based metaData header slice +func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { + var start, end uint64 + if chunk > len(metaHeaders) { + return start, end + } + + if chunk > 0 { + start = metaHeaders[chunk-1].DocDvOffset + } + + if chunk < len(metaHeaders)-1 { + end = metaHeaders[chunk].DocDvOffset + } else { + 
end = start + metaHeaders[chunk].DocDvOffset + } + + return start, end +} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index bbefe5a10..2399801a1 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -161,19 +161,7 @@ func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { return di.curChunkHeader[i].DocNum >= docNum }) if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { - var start, end uint64 - if i > 0 { - start = di.curChunkHeader[i].DocDvOffset - } - // single element case - if i == 0 && len(di.curChunkHeader) == 1 { - end = di.curChunkHeader[i].DocDvOffset - } else if i < len(di.curChunkHeader)-1 { - end = di.curChunkHeader[i+1].DocDvOffset - } else { // for last element - end = start + di.curChunkHeader[0].DocDvOffset - } - return start, end + return ReadDocValueBoundary(i, di.curChunkHeader) } return math.MaxUint64, math.MaxUint64 } From debbcd7d47715c2dc78d5007fca7a5e8e63184f6 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 13 Mar 2018 17:29:05 +0530 Subject: [PATCH 291/728] adding maxsegment size limit checks --- index/scorch/merge.go | 5 ++ index/scorch/mergeplan/merge_plan.go | 17 +++++++ index/scorch/mergeplan/merge_plan_test.go | 58 +++++++++++++++++++++++ 3 files changed, 80 insertions(+) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index ec2c8d4b3..41086ad3d 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -111,6 +111,11 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, if err != nil { return &mergePlannerOptions, err } + + err = mergeplan.ValidateMergePlannerOptions(&mergePlannerOptions) + if err != nil { + return nil, err + } } return &mergePlannerOptions, nil } diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index 62f643f43..f0d6f162d 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ 
b/index/scorch/mergeplan/merge_plan.go @@ -18,6 +18,7 @@ package mergeplan import ( + "errors" "fmt" "math" "sort" @@ -115,6 +116,14 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { return o.FloorSegmentSize } +// MaxSegmentSizeLimit represents the maximum size of a segment, +// this limit comes as the roaring lib supports uint32. +const MaxSegmentSizeLimit = 1<<32 - 1 + +// ErrMaxSegmentSizeTooLarge is returned when the size of the segment +// exceeds the MaxSegmentSizeLimit +var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit") + // Suggested default options. var DefaultMergePlanOptions = MergePlanOptions{ MaxSegmentsPerTier: 10, @@ -367,3 +376,11 @@ func ToBarChart(prefix string, barMax int, segments []Segment, plan *MergePlan) return strings.Join(rv, "\n") } + +// ValidateMergePlannerOptions validates the merge planner options +func ValidateMergePlannerOptions(options *MergePlanOptions) error { + if options.MaxSegmentSize > MaxSegmentSizeLimit { + return ErrMaxSegmentSizeTooLarge + } + return nil +} diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 419ab8253..3adc1f4b8 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -17,10 +17,12 @@ package mergeplan import ( "encoding/json" "fmt" + "math/rand" "os" "reflect" "sort" "testing" + "time" ) // Implements the Segment interface for testing, @@ -401,6 +403,62 @@ func TestManySameSizedSegmentsWithDeletesBetweenMerges(t *testing.T) { } } +func TestValidateMergePlannerOptions(t *testing.T) { + o := &MergePlanOptions{ + MaxSegmentSize: 1 << 32, + MaxSegmentsPerTier: 3, + TierGrowth: 3.0, + SegmentsPerMergeTask: 3, + } + err := ValidateMergePlannerOptions(o) + if err != ErrMaxSegmentSizeTooLarge { + t.Error("Validation expected to fail as the MaxSegmentSize exceeds limit") + } +} + +func TestPlanMaxSegmentSizeLimit(t *testing.T) { + o := &MergePlanOptions{ + 
MaxSegmentSize: 20, + MaxSegmentsPerTier: 5, + TierGrowth: 3.0, + SegmentsPerMergeTask: 5, + FloorSegmentSize: 5, + } + segments := makeLinearSegments(20) + + s := rand.NewSource(time.Now().UnixNano()) + r := rand.New(s) + + max := 20 + min := 5 + randomInRange := func() int64 { + return int64(r.Intn(max-min) + min) + } + for i := 1; i < 20; i++ { + o.MaxSegmentSize = randomInRange() + plans, err := Plan(segments, o) + if err != nil { + t.Errorf("Plan failed, err: %v", err) + } + if len(plans.Tasks) == 0 { + t.Errorf("expected some plans with tasks") + } + + for _, task := range plans.Tasks { + var totalLiveSize int64 + for _, segs := range task.Segments { + totalLiveSize += segs.LiveSize() + + } + if totalLiveSize >= o.MaxSegmentSize { + t.Errorf("merged segments size: %d exceeding the MaxSegmentSize"+ + "limit: %d", totalLiveSize, o.MaxSegmentSize) + } + } + } + +} + // ---------------------------------------- type testCyclesSpec struct { From 715144d6323850d01ed970f8794692c1b99d5b9d Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 13 Mar 2018 13:34:48 -0700 Subject: [PATCH 292/728] MB-27385: De-duplicate the list of requested fields De-duplicate the list of fields provided by the client as part of the search request, so as to not inadvertantly load the same stored field more than once. 
--- index_impl.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/index_impl.go b/index_impl.go index 68777f072..4d03b78af 100644 --- a/index_impl.go +++ b/index_impl.go @@ -534,7 +534,8 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr doc, err := indexReader.Document(hit.ID) if err == nil && doc != nil { if len(req.Fields) > 0 { - for _, f := range req.Fields { + fieldsToLoad := deDuplicate(req.Fields) + for _, f := range fieldsToLoad { for _, docF := range doc.Fields { if f == "*" || docF.Name() == f { var value interface{} @@ -830,3 +831,16 @@ func (f *indexImplFieldDict) Close() error { } return f.indexReader.Close() } + +// helper function to remove duplicate entries from slice of strings +func deDuplicate(fields []string) []string { + entries := make(map[string]struct{}) + ret := []string{} + for _, entry := range fields { + if _, exists := entries[entry]; !exists { + entries[entry] = struct{}{} + ret = append(ret, entry) + } + } + return ret +} From 7578ff7cb8df1279853e29684f5c43d7a5c9af70 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 13 Mar 2018 11:10:31 -0700 Subject: [PATCH 293/728] scorch zap optimize interim's reuse of vellum builders Since interim structs are now sync.Pool'ed, we can now also hold onto and reuse the associated vellum builder. 
--- index/scorch/segment/zap/new.go | 52 ++++++++++++++++++++------------- 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 51971ba51..4c9ec9c19 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -52,7 +52,9 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, s.FieldsMap, s.FieldsInv, uint64(len(results)), storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) - interimPool.Put(s.cleanse()) + if err == nil && s.reset() == nil { + interimPool.Put(s) + } return sb, err } @@ -105,12 +107,16 @@ type interim struct { numTermsPerPostingsList []int // key is postings list id numLocsPerPostingsList []int // key is postings list id - buf0 bytes.Buffer + builder *vellum.Builder + builderBuf bytes.Buffer + + metaBuf bytes.Buffer + tmp0 []byte tmp1 []byte } -func (s *interim) cleanse() *interim { +func (s *interim) reset() (err error) { s.results = nil s.chunkFactor = 0 s.w = nil @@ -148,11 +154,15 @@ func (s *interim) cleanse() *interim { s.locsBacking = s.locsBacking[:0] s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] - s.buf0.Reset() + s.builderBuf.Reset() + if s.builder != nil { + err = s.builder.Reset(&s.builderBuf) + } + s.metaBuf.Reset() s.tmp0 = s.tmp0[:0] s.tmp1 = s.tmp1[:0] - return s + return err } func (s *interim) grabBuf(size int) []byte { @@ -475,8 +485,7 @@ func (s *interim) processDocument(docNum uint64, func (s *interim) writeStoredFields() ( storedIndexOffset uint64, err error) { - metaBuf := &s.buf0 - metaEncoder := govarint.NewU64Base128Encoder(metaBuf) + metaEncoder := govarint.NewU64Base128Encoder(&s.metaBuf) data, compressed := s.tmp0[:0], s.tmp1[:0] defer func() { s.tmp0, s.tmp1 = data, compressed }() @@ -512,7 +521,7 @@ func (s *interim) writeStoredFields() ( var curr int - metaBuf.Reset() + s.metaBuf.Reset() data = data[:0] compressed = 
compressed[:0] @@ -529,7 +538,7 @@ func (s *interim) writeStoredFields() ( } metaEncoder.Close() - metaBytes := metaBuf.Bytes() + metaBytes := s.metaBuf.Bytes() compressed = snappy.Encode(compressed, data) @@ -565,8 +574,8 @@ func (s *interim) writeStoredFields() ( return storedIndexOffset, nil } -func (s *interim) writeDicts() (uint64, []uint64, error) { - dictOffsets := make([]uint64, len(s.FieldsInv)) +func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { + dictOffsets = make([]uint64, len(s.FieldsInv)) fdvOffsets := make([]uint64, len(s.FieldsInv)) @@ -578,10 +587,11 @@ func (s *interim) writeDicts() (uint64, []uint64, error) { var docTermMap [][]byte - s.buf0.Reset() - builder, err := vellum.New(&s.buf0, nil) - if err != nil { - return 0, nil, err + if s.builder == nil { + s.builder, err = vellum.New(&s.builderBuf, nil) + if err != nil { + return 0, nil, err + } } for fieldID, terms := range s.DictKeys { @@ -658,7 +668,7 @@ func (s *interim) writeDicts() (uint64, []uint64, error) { } if postingsOffset > uint64(0) { - err = builder.Insert([]byte(term), postingsOffset) + err = s.builder.Insert([]byte(term), postingsOffset) if err != nil { return 0, nil, err } @@ -668,7 +678,7 @@ func (s *interim) writeDicts() (uint64, []uint64, error) { locEncoder.Reset() } - err = builder.Close() + err = s.builder.Close() if err != nil { return 0, nil, err } @@ -676,7 +686,7 @@ func (s *interim) writeDicts() (uint64, []uint64, error) { // record where this dictionary starts dictOffsets[fieldID] = uint64(s.w.Count()) - vellumData := s.buf0.Bytes() + vellumData := s.builderBuf.Bytes() // write out the length of the vellum data n := binary.PutUvarint(buf, uint64(len(vellumData))) @@ -692,9 +702,9 @@ func (s *interim) writeDicts() (uint64, []uint64, error) { } // reset vellum for reuse - s.buf0.Reset() + s.builderBuf.Reset() - err = builder.Reset(&s.buf0) + err = s.builder.Reset(&s.builderBuf) if err != nil { return 0, nil, err } @@ -727,7 +737,7 
@@ func (s *interim) writeDicts() (uint64, []uint64, error) { } } - fdvIndexOffset := uint64(s.w.Count()) + fdvIndexOffset = uint64(s.w.Count()) for _, fdvOffset := range fdvOffsets { n := binary.PutUvarint(buf, fdvOffset) From 4af65a78460e4168ed1e8474b30d2b39a202e93c Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 13 Mar 2018 11:44:56 -0700 Subject: [PATCH 294/728] scorch zap prealloc buf via estimate from previous interim work --- index/scorch/segment/zap/new.go | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 4c9ec9c19..dd2740fb2 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -34,10 +34,18 @@ import ( // SegmentBase from analysis results func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, chunkFactor uint32) (*SegmentBase, error) { - var br bytes.Buffer - s := interimPool.Get().(*interim) + var br bytes.Buffer + if s.lastNumDocs > 0 { + // use previous results to initialize the buf with an estimate + // size, but note that the interim instance comes from a + // global interimPool, so multiple scorch instances indexing + // different docs can lead to low quality estimates + avgBytesPerDoc := s.lastOutSize / s.lastNumDocs + br.Grow(avgBytesPerDoc * (len(results) + 1)) + } + s.results = results s.chunkFactor = chunkFactor s.w = NewCountHashWriter(&br) @@ -53,6 +61,8 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) if err == nil && s.reset() == nil { + s.lastNumDocs = len(results) + s.lastOutSize = len(br.Bytes()) interimPool.Put(s) } @@ -114,6 +124,9 @@ type interim struct { tmp0 []byte tmp1 []byte + + lastNumDocs int + lastOutSize int } func (s *interim) reset() (err error) { @@ -161,6 +174,8 @@ func (s *interim) reset() (err error) { s.metaBuf.Reset() s.tmp0 = s.tmp0[:0] s.tmp1 = s.tmp1[:0] + s.lastNumDocs = 0 + 
s.lastOutSize = 0 return err } From 441065a41b436b0cc8ff1e4d7d86a7c45668ffbc Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 14 Mar 2018 10:42:19 +0530 Subject: [PATCH 295/728] comments,simplification --- index/scorch/segment/zap/contentcoder.go | 34 ++++--------------- index/scorch/segment/zap/contentcoder_test.go | 2 +- 2 files changed, 7 insertions(+), 29 deletions(-) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 3bb904caa..30197c0e5 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -93,18 +93,6 @@ func (c *chunkedContentCoder) flushContents() error { return err } - // convert the document data lens to data offsets - if len(c.chunkMeta) > 1 { - var runningOffset uint64 - var index, i int - for i = 1; i < len(c.chunkMeta); i++ { - runningOffset += c.chunkMeta[i-1].DocDvOffset - c.chunkMeta[index].DocDvOffset = runningOffset - index++ - } - c.chunkMeta[index].DocDvOffset = c.chunkMeta[i-1].DocDvOffset - } - // write out the metaData slice for _, meta := range c.chunkMeta { _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) @@ -141,7 +129,8 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { c.currChunk = chunk } - // mark the data size for this doc + // get the starting offset for this doc + dvOffset := c.chunkBuf.Len() dvSize, err := c.chunkBuf.Write(vals) if err != nil { return err @@ -149,7 +138,7 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { c.chunkMeta = append(c.chunkMeta, MetaData{ DocNum: docNum, - DocDvOffset: uint64(dvSize), + DocDvOffset: uint64(dvOffset + dvSize), }) return nil } @@ -184,22 +173,11 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { } // ReadDocValueBoundary elicits the start, end offsets from a -// starting offset based metaData header slice +// metaData header slice func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) 
(uint64, uint64) { - var start, end uint64 - if chunk > len(metaHeaders) { - return start, end - } - + var start uint64 if chunk > 0 { start = metaHeaders[chunk-1].DocDvOffset } - - if chunk < len(metaHeaders)-1 { - end = metaHeaders[chunk].DocDvOffset - } else { - end = start + metaHeaders[chunk].DocDvOffset - } - - return start, end + return start, metaHeaders[chunk].DocDvOffset } diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index c6b3df82c..fce84714b 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -69,7 +69,7 @@ func TestChunkContentCoder(t *testing.T) { } if !reflect.DeepEqual(test.expected, string(actual.Bytes())) { - t.Errorf("got % s, expected % s", string(actual.Bytes()), test.expected) + t.Errorf("got:%s, expected:%s", string(actual.Bytes()), test.expected) } } } From 1775602958abb6358f2f490d9914865280d026fa Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 15 Mar 2018 14:40:00 +0530 Subject: [PATCH 296/728] posting iterator array positions clean up, max segment size limit adjustment for hit-1 optimisation --- index/scorch/mergeplan/merge_plan.go | 6 +++--- index/scorch/segment/zap/posting.go | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index f0d6f162d..b09e5381e 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -117,14 +117,14 @@ func (o *MergePlanOptions) RaiseToFloorSegmentSize(s int64) int64 { } // MaxSegmentSizeLimit represents the maximum size of a segment, -// this limit comes as the roaring lib supports uint32. -const MaxSegmentSizeLimit = 1<<32 - 1 +// this limit comes with hit-1 optimisation/max encoding limit uint31. 
+const MaxSegmentSizeLimit = 1<<31 - 1 // ErrMaxSegmentSizeTooLarge is returned when the size of the segment // exceeds the MaxSegmentSizeLimit var ErrMaxSegmentSizeTooLarge = errors.New("MaxSegmentSize exceeds the size limit") -// Suggested default options. +// DefaultMergePlanOptions suggests the default options. var DefaultMergePlanOptions = MergePlanOptions{ MaxSegmentsPerTier: 10, MaxSegmentSize: 5000000, diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index bc533ad14..fbe703c10 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -444,6 +444,8 @@ func (i *PostingsIterator) readLocation(l *Location) error { l.end = end if numArrayPos > 0 { l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:0] } } From d1155c223a431bdd0e564ff5f15547aa9fe8e640 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 13 Mar 2018 12:13:48 +0530 Subject: [PATCH 297/728] zap version bump, changed the offset slice format ,UTs --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/contentcoder.go | 11 +- index/scorch/segment/zap/contentcoder_test.go | 2 +- index/scorch/segment/zap/docvalues.go | 8 +- index/scorch/segment/zap/intcoder.go | 68 +++++------ index/scorch/segment/zap/intcoder_test.go | 114 ++++++++---------- index/scorch/segment/zap/posting.go | 8 +- 7 files changed, 93 insertions(+), 120 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 30ae8d774..cd56e3b54 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 4 +const version uint32 = 5 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index c731f52c4..5ba15d69c 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ 
b/index/scorch/segment/zap/contentcoder.go @@ -157,13 +157,10 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { return tw, err } - if len(c.chunkLens) > 1 { - chunkLengthsToOffsets(c.chunkLens) - } - - // write out the chunk starting offsets - for _, chunkLen := range c.chunkLens { - n := binary.PutUvarint(buf, uint64(chunkLen)) + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) + // write out the chunk offsets + for _, chunkOffset := range chunkOffsets { + n := binary.PutUvarint(buf, chunkOffset) nw, err = w.Write(buf[:n]) tw += nw if err != nil { diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index 0e45b783e..da80f9479 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -46,7 +46,7 @@ func TestChunkContentCoder(t *testing.T) { []byte("scorch"), }, - expected: string([]byte{0x02, 0x0c, 0x0c, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, + expected: string([]byte{0x02, 0x0c, 0x18, 0x01, 0x00, 0x00, 0x06, 0x06, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x00, 0x06, 0x06, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}), }, diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 882ff43dd..61b83877f 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -69,7 +69,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, } // read the number of chunks, chunk lengths - var offset, clen uint64 + var offset, loc uint64 numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("failed to read the field "+ @@ -83,11 +83,11 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, chunkOffsets: make([]uint64, int(numChunks)), } for i := 0; i < int(numChunks); i++ { - clen, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + loc, 
read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) if read <= 0 { - return nil, fmt.Errorf("corrupted chunk length during segment load") + return nil, fmt.Errorf("corrupted chunk offset during segment load") } - fdvIter.chunkOffsets[i] = clen + fdvIter.chunkOffsets[i] = loc offset += uint64(read) } diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 79fe5156e..81ef8bb2e 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -111,15 +111,13 @@ func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { } buf := c.buf - // convert the chunk lengths into starting chunk offsets - if len(c.chunkLens) > 1 { - chunkLengthsToOffsets(c.chunkLens) - } + // convert the chunk lengths into chunk offsets + chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) - // write out the number of chunks & each chunk starting offsets - n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - for _, chunkLen := range c.chunkLens { - n += binary.PutUvarint(buf[n:], uint64(chunkLen)) + // write out the number of chunks & each chunk offsets + n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) + for _, chunkOffset := range chunkOffsets { + n += binary.PutUvarint(buf[n:], chunkOffset) } tw, err := w.Write(buf[:n]) @@ -140,41 +138,35 @@ func (c *chunkedIntCoder) FinalSize() int { return len(c.final) } -// chunkLengthsToOffsets converts the chunk length array -// to a chunk starting offset array. The readChunkBoundary +// modifyLengthsToEndOffsets converts the chunk length array +// to a chunk offset array. The readChunkBoundary // will figure out the start and end of every chunk from -// these offsets. The starting offset of the first/single -// array element will always be zero and this position is -// used for storing the size of the current last item in -// the array at any given point. 
-// For eg: -// Lens -> 5 5 5 5 => 5 5 10 15 -// Lens -> 0 5 0 5 => 5 0 5 5 -// Lens -> 0 0 0 5 => 5 0 0 0 -// Lens -> 5 0 0 0 => 0 5 5 5 -// Lens -> 0 5 0 0 => 0 0 5 5 -// Lens -> 0 0 5 0 => 0 0 0 5 -func chunkLengthsToOffsets(lengths []uint64) { - lengths[1], lengths[0] = lengths[0], lengths[1] - for i := 2; i < len(lengths); i++ { - cur := lengths[i] - lengths[i] = lengths[i-1] + lengths[0] - lengths[0] = cur +// these offsets. Starting offset of i'th index is stored +// in i-1'th position except for 0'th index and ending offset +// is stored at i'th index position. +// For 0'th element, starting position is always zero. +// eg: +// Lens -> 5 5 5 5 => 5 10 15 20 +// Lens -> 0 5 0 5 => 0 5 5 10 +// Lens -> 0 0 0 5 => 0 0 0 5 +// Lens -> 5 0 0 0 => 5 5 5 5 +// Lens -> 0 5 0 0 => 0 5 5 5 +// Lens -> 0 0 5 0 => 0 0 5 5 +func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { + var runningOffset uint64 + var index, i int + for i = 1; i <= len(lengths); i++ { + runningOffset += lengths[i-1] + lengths[index] = runningOffset + index++ } + return lengths } func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { - var start, end uint64 + var start uint64 if chunk > 0 { - start = offsets[chunk] - } - // single element case - if chunk == 0 && len(offsets) == 1 { - end = offsets[chunk] - } else if chunk < len(offsets)-1 { - end = offsets[chunk+1] - } else { // for last element - end = start + offsets[0] + start = offsets[chunk-1] } - return start, end + return start, offsets[chunk] } diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go index 8c77eab61..952e0669d 100644 --- a/index/scorch/segment/zap/intcoder_test.go +++ b/index/scorch/segment/zap/intcoder_test.go @@ -46,8 +46,8 @@ func TestChunkIntCoder(t *testing.T) { []uint64{3}, []uint64{7}, }, - // 2 chunks, chunk-0 length 1, chunk-1 length 1, value 3, value 7 - expected: []byte{0x2, 0x1, 0x1, 0x3, 0x7}, + // 2 chunks, chunk-0 offset 1, chunk-1 offset 2, 
value 3, value 7 + expected: []byte{0x2, 0x1, 0x2, 0x3, 0x7}, }, } @@ -80,40 +80,48 @@ func TestChunkLengthToOffsets(t *testing.T) { }{ { lengths: []uint64{5, 5, 5, 5, 5}, - expectedOffsets: []uint64{5, 5, 10, 15, 20}, + expectedOffsets: []uint64{5, 10, 15, 20, 25}, }, { lengths: []uint64{0, 5, 0, 5, 0}, - expectedOffsets: []uint64{0, 0, 5, 5, 10}, + expectedOffsets: []uint64{0, 5, 5, 10, 10}, }, { lengths: []uint64{0, 0, 0, 0, 5}, - expectedOffsets: []uint64{5, 0, 0, 0, 0}, + expectedOffsets: []uint64{0, 0, 0, 0, 5}, }, { lengths: []uint64{5, 0, 0, 0, 0}, - expectedOffsets: []uint64{0, 5, 5, 5, 5}, + expectedOffsets: []uint64{5, 5, 5, 5, 5}, }, { lengths: []uint64{0, 5, 0, 0, 0}, - expectedOffsets: []uint64{0, 0, 5, 5, 5}, + expectedOffsets: []uint64{0, 5, 5, 5, 5}, }, { lengths: []uint64{0, 0, 0, 5, 0}, - expectedOffsets: []uint64{0, 0, 0, 0, 5}, + expectedOffsets: []uint64{0, 0, 0, 5, 5}, }, { lengths: []uint64{0, 0, 0, 5, 5}, - expectedOffsets: []uint64{5, 0, 0, 0, 5}, + expectedOffsets: []uint64{0, 0, 0, 5, 10}, }, { lengths: []uint64{5, 5, 5, 0, 0}, - expectedOffsets: []uint64{0, 5, 10, 15, 15}, + expectedOffsets: []uint64{5, 10, 15, 15, 15}, + }, + { + lengths: []uint64{5}, + expectedOffsets: []uint64{5}, + }, + { + lengths: []uint64{5, 5}, + expectedOffsets: []uint64{5, 10}, }, } for i, test := range tests { - chunkLengthsToOffsets(test.lengths) + modifyLengthsToEndOffsets(test.lengths) if !reflect.DeepEqual(test.expectedOffsets, test.lengths) { t.Errorf("Test: %d failed, got %+v, expected %+v", i, test.lengths, test.expectedOffsets) } @@ -129,86 +137,80 @@ func TestChunkReadBoundaryFromOffsets(t *testing.T) { expectedEnd uint64 }{ { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: []uint64{5, 10, 15, 20, 25}, chunkNumber: 4, expectedStart: 20, expectedEnd: 25, }, { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: []uint64{5, 10, 15, 20, 25}, chunkNumber: 0, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{5, 5, 10, 15, 20}, + offsets: 
[]uint64{5, 10, 15, 20, 25}, chunkNumber: 2, expectedStart: 10, expectedEnd: 15, }, { - offsets: []uint64{0, 0, 5, 5, 10}, + offsets: []uint64{0, 5, 5, 10, 10}, chunkNumber: 4, expectedStart: 10, expectedEnd: 10, }, { - offsets: []uint64{0, 0, 5, 5, 10}, + offsets: []uint64{0, 5, 5, 10, 10}, chunkNumber: 1, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{5, 0, 0, 0, 0}, + offsets: []uint64{5, 5, 5, 5, 5}, chunkNumber: 0, expectedStart: 0, - expectedEnd: 0, + expectedEnd: 5, }, { - offsets: []uint64{5, 0, 0, 0, 0}, + offsets: []uint64{5, 5, 5, 5, 5}, chunkNumber: 4, - expectedStart: 0, + expectedStart: 5, expectedEnd: 5, }, { - offsets: []uint64{5, 0, 0, 0, 0}, + offsets: []uint64{5, 5, 5, 5, 5}, chunkNumber: 1, - expectedStart: 0, - expectedEnd: 0, + expectedStart: 5, + expectedEnd: 5, }, { offsets: []uint64{0, 5, 5, 5, 5}, chunkNumber: 1, - expectedStart: 5, + expectedStart: 0, expectedEnd: 5, }, { offsets: []uint64{0, 5, 5, 5, 5}, chunkNumber: 0, expectedStart: 0, - expectedEnd: 5, + expectedEnd: 0, }, { - offsets: []uint64{0, 0, 5, 5, 5}, + offsets: []uint64{0, 0, 0, 5, 5}, chunkNumber: 2, - expectedStart: 5, - expectedEnd: 5, + expectedStart: 0, + expectedEnd: 0, }, { - offsets: []uint64{0, 0, 5, 5, 5}, + offsets: []uint64{0, 0, 0, 5, 5}, chunkNumber: 1, expectedStart: 0, - expectedEnd: 5, + expectedEnd: 0, }, { offsets: []uint64{0, 0, 0, 0, 5}, chunkNumber: 4, - expectedStart: 5, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 0, 0, 0, 5}, - chunkNumber: 3, expectedStart: 0, expectedEnd: 5, }, @@ -219,59 +221,41 @@ func TestChunkReadBoundaryFromOffsets(t *testing.T) { expectedEnd: 0, }, { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 3, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 0, 0, 0, 5}, - chunkNumber: 4, - 
expectedStart: 5, - expectedEnd: 10, - }, - { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 0, expectedStart: 0, expectedEnd: 5, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 1, expectedStart: 5, expectedEnd: 10, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 2, expectedStart: 10, expectedEnd: 15, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 3, expectedStart: 15, expectedEnd: 15, }, { - offsets: []uint64{0, 5, 10, 15, 15}, + offsets: []uint64{5, 10, 15, 15, 15}, chunkNumber: 4, expectedStart: 15, expectedEnd: 15, }, + { + offsets: []uint64{5}, + chunkNumber: 0, + expectedStart: 0, + expectedEnd: 5, + }, } for i, test := range tests { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index c3fc2330c..bdbb47e31 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -189,9 +189,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numFreqChunks uint64 numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkLens = make([]uint64, int(numFreqChunks)) + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkLens[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.freqChunkStart = p.freqOffset + n @@ -201,9 +201,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkLens = make([]uint64, 
int(numLocChunks)) + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) for i := 0; i < int(numLocChunks); i++ { - rv.locChunkLens[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) } rv.locChunkStart = p.locOffset + n From 65fed52d0b5b6cbb8e116a308d76bdf10dd96b52 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 15 Mar 2018 13:20:32 -0700 Subject: [PATCH 298/728] Do not account IndexReader's size in the query RAM estimate Since its just the pointer size of the IndexReader that is being accounted for while estimating the RAM needed to execute a search query, get rid of the Size() API in the IndexReader interface. --- index/index.go | 2 -- index/scorch/snapshot_index.go | 7 ------- index/upsidedown/index_reader.go | 5 ----- index/upsidedown/reader.go | 2 +- search/facets_builder.go | 3 +-- search/searcher/search_boolean.go | 3 +-- search/searcher/search_disjunction.go | 1 - search/searcher/search_match_all.go | 1 - search/searcher/search_match_none.go | 3 +-- search/searcher/search_phrase.go | 3 +-- search/searcher/search_term.go | 1 - 11 files changed, 5 insertions(+), 26 deletions(-) diff --git a/index/index.go b/index/index.go index c25d7fa46..e5a69297b 100644 --- a/index/index.go +++ b/index/index.go @@ -94,8 +94,6 @@ type IndexReader interface { DumpFields() chan interface{} Close() error - - Size() int } // FieldTerms contains the terms used by a document, keyed by field diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 9394f391e..5289b1434 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -27,7 +27,6 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" ) type asynchSegmentResult struct { @@ 
-90,12 +89,6 @@ func (i *IndexSnapshot) Close() error { return i.DecRef() } -func (i *IndexSnapshot) Size() int { - // Just return the size of the pointer for estimating the overhead - // during Search, a reference of the IndexSnapshot serves as the reader. - return size.SizeOfPtr -} - func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) diff --git a/index/upsidedown/index_reader.go b/index/upsidedown/index_reader.go index 4e5755219..e045f67c7 100644 --- a/index/upsidedown/index_reader.go +++ b/index/upsidedown/index_reader.go @@ -20,7 +20,6 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" - "github.com/blevesearch/bleve/size" ) var reflectStaticSizeIndexReader int @@ -36,10 +35,6 @@ type IndexReader struct { docCount uint64 } -func (i *IndexReader) Size() int { - return reflectStaticSizeIndexReader + size.SizeOfPtr -} - func (i *IndexReader) TermFieldReader(term []byte, fieldName string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { fieldIndex, fieldExists := i.index.fieldCache.FieldNamed(fieldName, false) if fieldExists { diff --git a/index/upsidedown/reader.go b/index/upsidedown/reader.go index 646d4d8ac..bc0fef119 100644 --- a/index/upsidedown/reader.go +++ b/index/upsidedown/reader.go @@ -203,7 +203,7 @@ type UpsideDownCouchDocIDReader struct { func (r *UpsideDownCouchDocIDReader) Size() int { sizeInBytes := reflectStaticSizeUpsideDownCouchDocIDReader + - r.indexReader.Size() + reflectStaticSizeIndexReader + size.SizeOfPtr for _, entry := range r.only { sizeInBytes += size.SizeOfString + len(entry) diff --git a/search/facets_builder.go b/search/facets_builder.go index 34e45af84..c5d41e2d3 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -66,8 +66,7 @@ func 
NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { } func (fb *FacetsBuilder) Size() int { - sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr + - fb.indexReader.Size() + sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr for k, v := range fb.facets { sizeInBytes += size.SizeOfString + len(k) + diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index b87337e1e..f7ee2cd83 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -62,8 +62,7 @@ func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searc } func (s *BooleanSearcher) Size() int { - sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr + - s.indexReader.Size() + sizeInBytes := reflectStaticSizeBooleanSearcher + size.SizeOfPtr if s.mustSearcher != nil { sizeInBytes += s.mustSearcher.Size() diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 119bac970..32d614801 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -101,7 +101,6 @@ func newDisjunctionSearcher(indexReader index.IndexReader, func (s *DisjunctionSearcher) Size() int { sizeInBytes := reflectStaticSizeDisjunctionSearcher + size.SizeOfPtr + - s.indexReader.Size() + s.scorer.Size() for _, entry := range s.searchers { diff --git a/search/searcher/search_match_all.go b/search/searcher/search_match_all.go index 3f34e5918..bb6640122 100644 --- a/search/searcher/search_match_all.go +++ b/search/searcher/search_match_all.go @@ -58,7 +58,6 @@ func NewMatchAllSearcher(indexReader index.IndexReader, boost float64, options s func (s *MatchAllSearcher) Size() int { return reflectStaticSizeMatchAllSearcher + size.SizeOfPtr + - s.indexReader.Size() + s.reader.Size() + s.scorer.Size() } diff --git a/search/searcher/search_match_none.go b/search/searcher/search_match_none.go index 6b50b3222..a345e17f7 100644 --- 
a/search/searcher/search_match_none.go +++ b/search/searcher/search_match_none.go @@ -40,8 +40,7 @@ func NewMatchNoneSearcher(indexReader index.IndexReader) (*MatchNoneSearcher, er } func (s *MatchNoneSearcher) Size() int { - return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr + - s.indexReader.Size() + return reflectStaticSizeMatchNoneSearcher + size.SizeOfPtr } func (s *MatchNoneSearcher) Count() uint64 { diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 23a359bd7..0026794dd 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -42,8 +42,7 @@ type PhraseSearcher struct { } func (s *PhraseSearcher) Size() int { - sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr + - s.indexReader.Size() + sizeInBytes := reflectStaticSizePhraseSearcher + size.SizeOfPtr if s.mustSearcher != nil { sizeInBytes += s.mustSearcher.Size() diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 576d6643a..b99e4c263 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -75,7 +75,6 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri func (s *TermSearcher) Size() int { return reflectStaticSizeTermSearcher + size.SizeOfPtr + - s.indexReader.Size() + s.reader.Size() + s.tfd.Size() + s.scorer.Size() From 45e0e5c666df945cfd015b085b76ecc3ce9f0771 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 15 Mar 2018 16:31:04 -0400 Subject: [PATCH 299/728] memoize the size of an entire index snapshot by memoizing the size of index snapshots and their constituent parts, we significantly reduce the amount of time that the lock is held in the app_herder, when calculating the total memory used --- index/scorch/introducer.go | 4 ++++ index/scorch/scorch.go | 25 ++++++++++--------------- index/scorch/segment/zap/build.go | 1 + index/scorch/segment/zap/segment.go | 8 +++++++- index/scorch/snapshot_index.go | 20 ++++++++++++++++++++ 
index/scorch/snapshot_segment.go | 26 ++++++++++++++++++++++---- 6 files changed, 64 insertions(+), 20 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 7f1d0073e..98769ed56 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -186,6 +186,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } } + newSnapshot.updateSize() s.rootLock.Lock() if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) @@ -251,6 +252,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { newIndexSnapshot.internal[k] = v } + newIndexSnapshot.updateSize() s.rootLock.Lock() rootPrev := s.root s.root = newIndexSnapshot @@ -348,6 +350,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSnapshot.AddRef() // 1 ref for the nextMerge.notify response + newSnapshot.updateSize() s.rootLock.Lock() // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch @@ -409,6 +412,7 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { s.rootPersisted = append(s.rootPersisted, revertTo.persisted) } + newSnapshot.updateSize() // swap in new snapshot rootPrev := s.root s.root = newSnapshot diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 4bda2d34b..85f9082c5 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -397,11 +397,15 @@ func (s *Scorch) DeleteInternal(key []byte) error { // Reader returns a low-level accessor on the index data. Close it to // release associated resources. 
func (s *Scorch) Reader() (index.IndexReader, error) { + return s.currentSnapshot(), nil +} + +func (s *Scorch) currentSnapshot() *IndexSnapshot { s.rootLock.RLock() rv := s.root rv.AddRef() s.rootLock.RUnlock() - return rv, nil + return rv } func (s *Scorch) Stats() json.Marshaler { @@ -484,20 +488,11 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { } func (s *Scorch) MemoryUsed() uint64 { - var memUsed int - s.rootLock.RLock() - if s.root != nil { - for _, segmentSnapshot := range s.root.segment { - memUsed += 8 /* size of id -> uint64 */ + - segmentSnapshot.segment.Size() - if segmentSnapshot.deleted != nil { - memUsed += int(segmentSnapshot.deleted.GetSizeInBytes()) - } - memUsed += segmentSnapshot.cachedDocs.size() - } - } - s.rootLock.RUnlock() - return uint64(memUsed) + indexSnapshot := s.currentSnapshot() + defer func() { + _ = indexSnapshot.Close() + }() + return uint64(indexSnapshot.SizeFull()) } func (s *Scorch) markIneligibleForRemoval(filename string) { diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 30ae8d774..e7abe7dfb 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -138,6 +138,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, dictLocs: dictLocs, fieldDvIterMap: make(map[uint16]*docValueIterator), } + sb.updateSize() err := sb.loadDvIterators() if err != nil { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index e1d2a14f7..0d2ad072f 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -63,6 +63,7 @@ func Open(path string) (segment.Segment, error) { path: path, refs: 1, } + rv.SegmentBase.updateSize() err = rv.loadConfig() if err != nil { @@ -99,9 +100,14 @@ type SegmentBase struct { docValueOffset uint64 dictLocs []uint64 fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field + size uint64 } func (sb *SegmentBase) Size() int { + return 
int(sb.size) +} + +func (sb *SegmentBase) updateSize() { sizeInBytes := reflectStaticSizeSegmentBase + len(sb.mem) @@ -124,7 +130,7 @@ func (sb *SegmentBase) Size() int { } } - return sizeInBytes + sb.size = uint64(sizeInBytes) } func (sb *SegmentBase) AddRef() {} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 9394f391e..ddf42c60f 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -19,6 +19,7 @@ import ( "container/heap" "encoding/binary" "fmt" + "reflect" "sort" "sync" "sync/atomic" @@ -41,12 +42,20 @@ type asynchSegmentResult struct { err error } +var reflectStaticSizeIndexSnapshot int + +func init() { + var is interface{} = IndexSnapshot{} + reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) +} + type IndexSnapshot struct { parent *Scorch segment []*SegmentSnapshot offsets []uint64 internal map[string][]byte epoch uint64 + size uint64 m sync.Mutex // Protects the fields that follow. refs int64 @@ -96,6 +105,17 @@ func (i *IndexSnapshot) Size() int { return size.SizeOfPtr } +func (i *IndexSnapshot) SizeFull() int { + return int(i.size) +} + +func (i *IndexSnapshot) updateSize() { + i.size += uint64(reflectStaticSizeIndexSnapshot) + for _, s := range i.segment { + i.size += uint64(s.Size()) + } +} + func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index cdfe317fe..edf52a6e7 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -19,6 +19,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/size" ) var TermSeparator byte = 0xff @@ -128,15 +129,26 @@ func (s *SegmentSnapshot) Fields() []string { return s.segment.Fields() } +func (s 
*SegmentSnapshot) Size() (rv int) { + rv = s.segment.Size() + if s.deleted != nil { + rv += int(s.deleted.GetSizeInBytes()) + } + rv += s.cachedDocs.Size() + return +} + type cachedFieldDocs struct { readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. err error // Non-nil if there was an error when preparing this cachedFieldDocs. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. + size uint64 } func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { defer close(cfd.readyCh) + cfd.size += uint64(size.SizeOfUint64) /* size field */ dict, err := ss.segment.Dictionary(field) if err != nil { cfd.err = err @@ -152,12 +164,14 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { return } + cfd.size += uint64(size.SizeOfUint64) /* map key */ postingsItr := postings.Iterator() nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() cfd.docs[docNum] = append(cfd.docs[docNum], []byte(next.Term)...) 
cfd.docs[docNum] = append(cfd.docs[docNum], TermSeparator) + cfd.size += uint64(len(next.Term) + 1) // map value nextPosting, err2 = postingsItr.Next() } @@ -178,6 +192,7 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { type cachedDocs struct { m sync.Mutex // As the cache is asynchronously prepared, need a lock cache map[string]*cachedFieldDocs // Keyed by field + size uint64 } func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { @@ -208,14 +223,18 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e } c.m.Lock() } + c.updateSizeLOCKED() c.m.Unlock() return nil } -func (c *cachedDocs) size() int { +func (c *cachedDocs) Size() int { + return int(c.size) +} + +func (c *cachedDocs) updateSizeLOCKED() { sizeInBytes := 0 - c.m.Lock() for k, v := range c.cache { // cachedFieldDocs sizeInBytes += len(k) if v != nil { @@ -224,6 +243,5 @@ func (c *cachedDocs) size() int { } } } - c.m.Unlock() - return sizeInBytes + c.size = uint64(sizeInBytes) } From 11ff31c2f993d98770ec79b481bccf87da206e7c Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 16 Mar 2018 11:31:47 -0400 Subject: [PATCH 300/728] rename SizeFull to Size --- index/scorch/scorch.go | 2 +- index/scorch/snapshot_index.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 85f9082c5..cc47cda86 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -492,7 +492,7 @@ func (s *Scorch) MemoryUsed() uint64 { defer func() { _ = indexSnapshot.Close() }() - return uint64(indexSnapshot.SizeFull()) + return uint64(indexSnapshot.Size()) } func (s *Scorch) markIneligibleForRemoval(filename string) { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 755c4637f..6f4b0288e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -98,7 +98,7 @@ func (i *IndexSnapshot) Close() error { return 
i.DecRef() } -func (i *IndexSnapshot) SizeFull() int { +func (i *IndexSnapshot) Size() int { return int(i.size) } From e52eb84e37958bd7aed9c06412e131318400472e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 16 Mar 2018 09:51:23 -0700 Subject: [PATCH 301/728] scorch zap optimize merge when deletion bitmap is empty This change detects whether a deletion bitmap is empty, and treats that as a nil bitmap, which allows further postings iterator codepaths to avoid roaring bitmap operations (like, AndNot(docNums, drops)). --- index/scorch/segment/zap/merge.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 07de0943c..d0d08c101 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -207,7 +207,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } if itr != nil { newDocNums = append(newDocNums, newDocNumsIn[segmentI]) - drops = append(drops, dropsIn[segmentI]) + if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { + drops = append(drops, dropsIn[segmentI]) + } else { + drops = append(drops, nil) + } dicts = append(dicts, dict) itrs = append(itrs, itr) } From b411e65234c9a2805e9de826a70612a29045b956 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 16 Mar 2018 09:58:53 -0700 Subject: [PATCH 302/728] scorch zap optimize postingsIterator reuse of freq/locChunkOffsets --- index/scorch/segment/zap/posting.go | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 6bc6e9268..081ec5f6c 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -152,6 +152,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } locDecoder := rv.locDecoder + freqChunkOffsets := rv.freqChunkOffsets[:0] + locChunkOffsets := rv.locChunkOffsets[:0] + buf := 
rv.buf *rv = PostingsIterator{} // clear the struct @@ -162,6 +165,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { rv.locReader = locReader rv.locDecoder = locDecoder + rv.freqChunkOffsets = freqChunkOffsets + rv.locChunkOffsets = locChunkOffsets + rv.buf = buf } rv.postings = p @@ -189,7 +195,11 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numFreqChunks uint64 numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) + if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { + rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] + } else { + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) + } for i := 0; i < int(numFreqChunks); i++ { rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -201,7 +211,11 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { var numLocChunks uint64 numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) - rv.locChunkOffsets = make([]uint64, int(numLocChunks)) + if cap(rv.locChunkOffsets) >= int(numLocChunks) { + rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] + } else { + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) + } for i := 0; i < int(numLocChunks); i++ { rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) From 60bdf6d247d1611bc18b1f654ea6ba48f0bf3a49 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 16 Mar 2018 11:43:25 -0700 Subject: [PATCH 303/728] Return an error when the snapshotEpoch is invalid Avoiding this stacktrace (SIGSEGV) while using bleve scorch cmd-line utility when snapshotEpoch provided is invalid: 
github.com/blevesearch/bleve/index/scorch.(*IndexSnapshot).Segments(...) /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/index/scorch/snapshot_index.go:56 github.com/blevesearch/bleve/cmd/bleve/cmd/scorch.glob..func1(0x1f347e0, 0xc4201f1400, 0x2, 0x2, 0x0, 0x0) /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/cmd/bleve/cmd/scorch/ascii.go:43 +0xe4 github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra.(*Command).execute(0x1f347e0, 0xc4201f12e0, 0x2, 0x2, 0x1f347e0, 0xc4201f12e0) /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra/command.go:646 +0x3e8 github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra.(*Command).ExecuteC(0x1f334c0, 0x0, 0x0, 0x0) /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra/command.go:737 +0x2fe github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra.(*Command).Execute(0x1f334c0, 0x0, 0x0) /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/cmd/bleve/vendor/github.com/spf13/cobra/command.go:695 +0x2b github.com/blevesearch/bleve/cmd/bleve/cmd.Execute() /Users/abhinavdangeti/Documents/couchbaseV/godeps/src/github.com/blevesearch/bleve/cmd/bleve/cmd/root.go:74 +0x31 main.main() /Users/abhinavdangeti/Documents/couchbaseV/goproj/src/github.com/couchbase/cbft/cmd/cbft-bleve/main.go:39 +0x1cb --- index/scorch/persister.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index ccb0c1f21..dda4bdfbb 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -514,7 +514,7 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { snapshotKey := segment.EncodeUvarintAscending(nil, epoch) snapshot := snapshots.Bucket(snapshotKey) if snapshot == nil { - return nil + 
return fmt.Errorf("snapshot with epoch: %v - doesn't exist", epoch) } rv, err = s.loadSnapshot(snapshot) return err From 5df53c8e1ff6677ba1920f8abb0e14195e5273c3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 16 Mar 2018 11:49:50 -0700 Subject: [PATCH 304/728] scorch zap file merger uses 1MB buffered writer pprof of bleve-blast was showing file merging was in syscall/write a lot. The bufio.NewWriter() provides a default buffer size of 4K, which is too small, and using bufio.NewWriterSize(1MB buffer size) leads to syscall/write dropping out of the file merging flame graphs. --- index/scorch/segment/zap/merge.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index d0d08c101..f3b6a68e7 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -29,6 +29,8 @@ import ( "github.com/golang/snappy" ) +var DefaultFileMergerBufferSize = 1024 * 1024 + const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of zap segments and bit masks describing which @@ -55,7 +57,7 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, } // buffer the output - br := bufio.NewWriter(f) + br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) // wrap it for counting (tracking offsets) cr := NewCountHashWriter(br) From 980ce9ebb300d66abedcc430277fd228d4b0d772 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 19 Mar 2018 11:29:58 +0530 Subject: [PATCH 305/728] MB-28753 - document number "xxx" not found err with update workload Introducer was incorrectly updating the offsets slice of segments, by considering only the live doc count while computing the "running". This can result in incorrectly computing the residing segment as well as the local doc numbers while loading a document after a search hit. 
--- index/scorch/introducer.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 98769ed56..627d4e4cd 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -154,7 +154,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newSnapshot.segment = append(newSnapshot.segment, newss) root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += root.segment[i].Count() + running += newss.segment.Count() } } @@ -315,7 +315,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { }) root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) - running += root.segment[i].Count() + running += root.segment[i].segment.Count() } } From 1ef41101bac733727e9930eeab96c458b5ebe68a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 19 Mar 2018 17:20:49 +0530 Subject: [PATCH 306/728] vellum adoption for regex and fuzzy queries --- index/index.go | 7 ++++++ index/scorch/segment/empty.go | 9 +++++++ index/scorch/segment/mem/dict.go | 24 ++++++++++++++++++ index/scorch/segment/segment.go | 2 ++ index/scorch/segment/zap/dict.go | 42 ++++++++++++++++++++++++++++++++ index/scorch/snapshot_index.go | 14 +++++++++++ index/scorch/snapshot_segment.go | 9 +++++++ search/searcher/search_fuzzy.go | 22 ++++++++++++++++- search/searcher/search_regexp.go | 41 ++++++++++++++++++++++++------- 9 files changed, 160 insertions(+), 10 deletions(-) diff --git a/index/index.go b/index/index.go index e5a69297b..96fb7d25e 100644 --- a/index/index.go +++ b/index/index.go @@ -96,6 +96,13 @@ type IndexReader interface { Close() error } +// IndexReaderAdv is an optional interface for advanced users +// Hope to have a better name here... 
+type IndexReaderAdv interface { + FieldDictRegex(field string, regex []byte) (FieldDict, error) + FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error) +} + // FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 6c19f60f9..918875e17 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -76,6 +76,15 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + +func (e *EmptyDictionary) FuzzyIterator(term string, + fuzziness int) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index 9f5a873ae..2877f9453 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -98,6 +98,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator } } +// RegexIterator returns an iterator which only visits terms matching +// the given regex expression. +// TODO complete the implementation +func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex) + return &DictionaryIterator{ + d: d, + offset: offset, + prefix: regex, + } +} + +// FuzzyIterator returns an iterator which only visits terms matching +// the given edit distance. 
+// TODO complete the implementation +func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator { + offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term) + return &DictionaryIterator{ + d: d, + offset: offset, + prefix: term, + } +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 8eee5f75f..cf3b21f41 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -48,6 +48,8 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator + RegexIterator(regex string) DictionaryIterator + FuzzyIterator(term string, fuzziness int) DictionaryIterator } type DictionaryIterator interface { diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3b8132f2c..2281ec044 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -21,6 +21,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" + "github.com/couchbase/vellum/levenshtein" "github.com/couchbase/vellum/regexp" ) @@ -148,6 +149,47 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator return rv } +// RegexIterator returns an iterator which only visits terms having the +// the specified regex +func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + r, err := regexp.New(regex) + if err == nil { + itr, err := d.fst.Search(r, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + +// FuzzyIterator returns an iterator which only visits terms having the +// the specified edit/levenshtein distance +func (d *Dictionary) FuzzyIterator(term string, + 
fuzziness int) segment.DictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + la, err := levenshtein.New(term, fuzziness) + if err == nil { + itr, err := d.fst.Search(la, nil, nil) + if err == nil { + rv.itr = itr + } + } + } + + return rv +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6f4b0288e..172086980 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -175,6 +175,20 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, }) } +func (i *IndexSnapshot) FieldDictRegex(field string, + termRegex []byte) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.RegexIterator(string(termRegex)) + }) +} + +func (i *IndexSnapshot) FieldDictFuzzy(field string, + term []byte, fuzziness int) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.FuzzyIterator(string(term), fuzziness) + }) +} + func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index edf52a6e7..6edc6ae6e 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -48,6 +48,15 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } +func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator { + return s.d.RegexIterator(regex) +} + +func (s *SegmentDictionarySnapshot) FuzzyIterator(term string, + fuzziness int) segment.DictionaryIterator { + return s.d.FuzzyIterator(term, fuzziness) +} + type SegmentSnapshot struct { id uint64 
segment segment.Segment diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 90abaa0a8..69aab2f73 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -15,6 +15,9 @@ package searcher import ( + "log" + "time" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -31,9 +34,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, break } } - + t := time.Now() candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, field, prefixTerm) + log.Printf("time taken-> %f", time.Since(t).Seconds()) if err != nil { return nil, err } @@ -49,6 +53,22 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if len(prefixTerm) > 0 { fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) } else { + // in case of advanced reader implementations directly call + // the levenshtein automaton based iterator to collect the + // candidate terms + if ir, ok := indexReader.(index.IndexReaderAdv); ok { + fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness) + if err != nil { + return rv, err + } + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, tfd.Term) + tfd, err = fieldDict.Next() + } + log.Printf("candidate FSA fuzzy terms: %+v", rv) + return rv, nil + } fieldDict, err = indexReader.FieldDict(field) } defer func() { diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b7cf520ac..806f135a3 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -15,7 +15,9 @@ package searcher import ( + "log" "regexp" + "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -29,19 +31,40 @@ import ( func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { - - prefixTerm, complete := 
pattern.LiteralPrefix() var candidateTerms []string - if complete { - // there is no pattern - candidateTerms = []string{prefixTerm} - } else { - var err error - candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, - prefixTerm) + t := time.Now() + if ir, ok := indexReader.(index.IndexReaderAdv); ok { + fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String())) if err != nil { return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + + // enumerate the terms and check against regexp + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + candidateTerms = append(candidateTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + log.Printf("fsa time took-> %f", time.Since(t).Seconds()) + } else { + prefixTerm, complete := pattern.LiteralPrefix() + if complete { + // there is no pattern + candidateTerms = []string{prefixTerm} + } else { + var err error + candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, + prefixTerm) + if err != nil { + return nil, err + } + } + log.Printf("time took-> %f", time.Since(t).Seconds()) } return NewMultiTermSearcher(indexReader, candidateTerms, field, boost, From c881146270fdfb8e55b614f746da256eca6e29df Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 19 Mar 2018 10:06:17 -0700 Subject: [PATCH 307/728] scorch zap mergeTermFreqNormLocsByCopying() helper func --- index/scorch/segment/zap/merge.go | 81 +++++++++++++++++-------------- 1 file changed, 45 insertions(+), 36 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index f3b6a68e7..4fe10edd6 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -305,44 +305,13 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err2 } - newDocNumsI := newDocNums[itrI] - postItr = postings.iterator(postItr) - nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, 
nextLocBytes, err2 := - postItr.nextBytes() - for err2 == nil && len(nextFreqNormBytes) > 0 { - hitNewDocNum := newDocNumsI[nextDocNum] - if hitNewDocNum == docDropped { - return nil, 0, fmt.Errorf("see hit with dropped doc num") - } - - newRoaring.Add(uint32(hitNewDocNum)) - err2 = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) - if err2 != nil { - return nil, 0, err2 - } - - if len(nextLocBytes) > 0 { - newRoaringLocs.Add(uint32(hitNewDocNum)) - err2 = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) - if err2 != nil { - return nil, 0, err2 - } - } - - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - - lastDocNum = hitNewDocNum - lastFreq = nextFreq - lastNorm = nextNorm - - nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err2 = - postItr.nextBytes() - } - if err2 != nil { - return nil, 0, err2 + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + tfEncoder, locEncoder, docTermMap) + if err != nil { + return nil, 0, err } prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem @@ -428,6 +397,46 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return rv, fieldDvLocsOffset, nil } +func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := + postItr.nextBytes() + for err == nil && len(nextFreqNormBytes) > 0 { + hitNewDocNum := newDocNums[nextDocNum] + if hitNewDocNum == docDropped { + return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") + } + + newRoaring.Add(uint32(hitNewDocNum)) + err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) + if 
err != nil { + return 0, 0, 0, err + } + + if len(nextLocBytes) > 0 { + newRoaringLocs.Add(uint32(hitNewDocNum)) + err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) + if err != nil { + return 0, 0, 0, err + } + } + + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = + postItr.nextBytes() + } + + return lastDocNum, lastFreq, lastNorm, err +} + func writePostings(postings, postingLocs *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) (bool, uint64, uint64), From f65ba5c0f4df639ab1a8f4ddb5502eb9e269f6a8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 19 Mar 2018 10:28:01 -0700 Subject: [PATCH 308/728] MB-28781 - scorch zap merge freq/loc copying only when fieldsSame The optimization recently introduced in commit 530a3d24cf0768f4c7a, ("scorch zap optimize merge by byte copying freq/norm/loc's") was to byte-copy freq/norm/loc data directly during merging. But, it was incorrect if the fields were different across segments. This change now performs that byte-copying merging optimization only when the fields are the same across segments, and if not, leverages the old approach of deserializing & re-serializing the freq/norm/loc information, which has the important step of remapping fieldID's. 
See also: https://issues.couchbase.com/browse/MB-28781 --- index/scorch/segment/zap/merge.go | 78 ++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 4fe10edd6..1da5e5269 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -117,7 +117,8 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, return nil, 0, 0, 0, 0, nil, nil, nil, err } - dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, + dictLocs, docValueOffset, err = persistMergedRest(segments, drops, + fieldsInv, fieldsMap, fieldsSame, newDocNums, numDocs, chunkFactor, cr) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err @@ -158,11 +159,12 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 } func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, newDocNumsIn [][]uint64, - newSegDocCount uint64, chunkFactor uint32, + fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, + newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, w *CountHashWriter) ([]uint64, uint64, error) { var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) + var bufLoc []uint64 var postings *PostingsList var postItr *PostingsIterator @@ -307,9 +309,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, postItr = postings.iterator(postItr) - lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( - term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, - tfEncoder, locEncoder, docTermMap) + if fieldsSame { + // can optimize by copying freq/norm/loc bytes directly + lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( + term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + tfEncoder, locEncoder, docTermMap) + } else { 
+ lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( + fieldsMap, term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + tfEncoder, locEncoder, docTermMap, bufLoc) + } if err != nil { return nil, 0, err } @@ -397,6 +406,63 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return rv, fieldDvLocsOffset, nil } +func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, + newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, + bufLoc []uint64) ( + lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { + next, err := postItr.Next() + for next != nil && err == nil { + hitNewDocNum := newDocNums[next.Number()] + if hitNewDocNum == docDropped { + return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") + } + + newRoaring.Add(uint32(hitNewDocNum)) + + nextFreq := next.Frequency() + nextNorm := uint64(math.Float32bits(float32(next.Norm()))) + + err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) + if err != nil { + return 0, 0, 0, nil, err + } + + locs := next.Locations() + if len(locs) > 0 { + newRoaringLocs.Add(uint32(hitNewDocNum)) + + for _, loc := range locs { + if cap(bufLoc) < 5+len(loc.ArrayPositions()) { + bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) + } + args := bufLoc[0:5] + args[0] = uint64(fieldsMap[loc.Field()] - 1) + args[1] = loc.Pos() + args[2] = loc.Start() + args[3] = loc.End() + args[4] = uint64(len(loc.ArrayPositions())) + args = append(args, loc.ArrayPositions()...) + err = locEncoder.Add(hitNewDocNum, args...) 
+ if err != nil { + return 0, 0, 0, nil, err + } + } + } + + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + + lastDocNum = hitNewDocNum + lastFreq = nextFreq + lastNorm = nextNorm + + next, err = postItr.Next() + } + + return lastDocNum, lastFreq, lastNorm, bufLoc, err +} + func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( From e9b228bcdd09472170b27840539436e17da034c0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 19 Mar 2018 14:14:59 -0400 Subject: [PATCH 309/728] improve command-line tool for zap correctly handle/print additional loc bitmap address this fixes bitmap length that is output instantiate roaring bitmap and print it out removed some unnecessary debug logging updated dict command to print 1-hit encoded vals this makes dict command usable for seeing which doc ids are in a segment and their corresponding doc number --- cmd/bleve/cmd/zap/dict.go | 11 ++++++++++- cmd/bleve/cmd/zap/explore.go | 22 ++++++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cmd/bleve/cmd/zap/dict.go b/cmd/bleve/cmd/zap/dict.go index 3e2727195..e80be3601 100644 --- a/cmd/bleve/cmd/zap/dict.go +++ b/cmd/bleve/cmd/zap/dict.go @@ -17,7 +17,9 @@ package zap import ( "encoding/binary" "fmt" + "math" + "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/couchbase/vellum" "github.com/spf13/cobra" ) @@ -54,7 +56,14 @@ var dictCmd = &cobra.Command{ itr, err := fst.Iterator(nil, nil) for err == nil { currTerm, currVal := itr.Current() - fmt.Printf("%s - %d (%x)\n", currTerm, currVal, currVal) + extra := "" + if currVal&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit { + docNum, normBits := zap.FSTValDecode1Hit(currVal) + norm := math.Float32frombits(uint32(normBits)) + extra = fmt.Sprintf("-- docNum: %d, 
norm: %f", docNum, norm) + } + + fmt.Printf("%s - %d (%x) %s\n", currTerm, currVal, currVal, extra) err = itr.Next() } if err != nil && err != vellum.ErrIteratorDone { diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 543b572fd..225b7373f 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -17,9 +17,9 @@ package zap import ( "encoding/binary" "fmt" - "log" "math" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/couchbase/vellum" "github.com/spf13/cobra" @@ -59,7 +59,7 @@ var exploreCmd = &cobra.Command{ return fmt.Errorf("error looking for term : %v", err) } if exists { - fmt.Printf("fst val is %d (%x)\n", postingsAddr, postingsAddr) + fmt.Printf("FST val is %d (%x)\n", postingsAddr, postingsAddr) if postingsAddr&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit { docNum, normBits := zap.FSTValDecode1Hit(postingsAddr) @@ -81,10 +81,21 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) + var locBitmapAddr uint64 + locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + n += uint64(read) + var postingListLen uint64 - postingListLen, _ = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + n += uint64(read) fmt.Printf("Posting List Length: %d\n", postingListLen) + bitmap := roaring.New() + _, err = bitmap.FromBuffer(data[postingsAddr+n : postingsAddr+n+postingListLen]) + if err != nil { + return err + } + fmt.Printf("Posting List: %v\n", bitmap) fmt.Printf("Freq details at: %d (%x)\n", freqAddr, freqAddr) numChunks, r2 := binary.Uvarint(data[freqAddr : freqAddr+binary.MaxVarintLen64]) @@ -109,11 +120,8 @@ var exploreCmd = &cobra.Command{ var locOffsets []uint64 for j := uint64(0); j < numLChunks; 
j++ { - log.Printf("reading from %d(%x)\n", locAddr+n, locAddr+n) - log.Printf("data i see here: % x\n", data[locAddr+n:locAddr+n+binary.MaxVarintLen64]) lchunkLen, r4 := binary.Uvarint(data[locAddr+n : locAddr+n+binary.MaxVarintLen64]) n += uint64(r4) - log.Printf("see chunk len %d(%x)\n", lchunkLen, lchunkLen) locOffsets = append(locOffsets, lchunkLen) } @@ -123,6 +131,8 @@ var exploreCmd = &cobra.Command{ running2 += offset } + fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) + } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } From 85df86ba17c1dceaa69264e22238432a6fd495fd Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 19 Mar 2018 12:33:13 -0700 Subject: [PATCH 310/728] Unit tests for segments with docs with non-overlapping fields --- index/scorch/segment/mem/segment_test.go | 177 +++++++++++++++++++++++ index/scorch/segment/zap/build_test.go | 165 +++++++++++++++++++++ index/scorch/segment/zap/segment_test.go | 137 ++++++++++++++++++ 3 files changed, 479 insertions(+) diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 6c5625d86..565719278 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -697,3 +697,180 @@ func TestMultiple(t *testing.T) { } } + +func TestMultipleWithNonOverlappingFields(t *testing.T) { + doc1 := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("a")), + document.NewTextField("name", []uint64{}, []byte("ABC")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), + document.NewTextField("manages.count", []uint64{}, []byte("1")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + doc2 := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextField("_id", 
[]uint64{}, []byte("b")), + document.NewTextField("name", []uint64{}, []byte("XYZ")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + results := []*index.AnalysisResult{ + &index.AnalysisResult{ + Document: doc1, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("1"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + }, + &index.AnalysisResult{ + Document: doc2, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + 
analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, []uint64{0}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + }, + }, + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + segment := NewFromAnalyzedDocs(results) + if segment == nil { + t.Fatalf("segment nil, not expected") + } + + if segment.Count() != 2 { + t.Errorf("expected count 2, got %d", segment.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +} diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index 65de7931d..e8189f760 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -137,6 +137,12 @@ func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, err return AnalysisResultsToSegmentBase(results, chunkFactor) } +func buildTestSegmentMultiWithDifferentFields(includeDocA, includeDocB bool) (*SegmentBase, error) { + results := buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB) + + return AnalysisResultsToSegmentBase(results, 1024) +} + func buildTestAnalysisResultsMulti() []*index.AnalysisResult { doc := &document.Document{ ID: "a", @@ -298,6 +304,165 @@ func buildTestAnalysisResultsMulti() []*index.AnalysisResult { return results } +func 
buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB bool) []*index.AnalysisResult { + results := []*index.AnalysisResult{} + + if includeDocA { + doc := &document.Document{ + ID: "a", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("a")), + document.NewTextField("name", []uint64{}, []byte("ABC")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), + document.NewTextField("manages.count", []uint64{}, []byte("1")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + result := &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("a"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, []uint64{0}, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("1"), + }, + }, []uint64{1}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + 1, + }, + } + + results = append(results, result) + } + + if includeDocB { + doc := &document.Document{ + ID: "b", + Fields: []document.Field{ + document.NewTextField("_id", []uint64{}, []byte("b")), + document.NewTextField("name", []uint64{}, []byte("XYZ")), + document.NewTextField("dept", []uint64{}, []byte("ABC dept")), + 
document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), + }, + CompositeFields: []*document.CompositeField{ + document.NewCompositeField("_all", true, nil, []string{"_id"}), + }, + } + + result := &index.AnalysisResult{ + Document: doc, + Analyzed: []analysis.TokenFrequencies{ + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 1, + Position: 1, + Term: []byte("b"), + }, + }, nil, false), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("XYZ"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + &analysis.Token{ + Start: 4, + End: 8, + Position: 2, + Term: []byte("dept"), + }, + }, nil, true), + analysis.TokenFrequency(analysis.TokenStream{ + &analysis.Token{ + Start: 0, + End: 3, + Position: 1, + Term: []byte("ABC"), + }, + }, []uint64{0}, true), + }, + Length: []int{ + 1, + 1, + 2, + 1, + }, + } + + results = append(results, result) + } + + // fix up composite fields + for _, ar := range results { + for i, f := range ar.Document.Fields { + for _, cf := range ar.Document.CompositeFields { + cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) + } + } + } + + return results +} + func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( *SegmentBase, []string, error) { doc := &document.Document{ diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 50d5dbd7f..339d24ce5 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -21,6 +21,7 @@ import ( "sort" "testing" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" ) @@ -600,3 +601,139 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } } + +func TestSegmentDocsWithNonOverlappingFields(t *testing.T) { + _ = 
os.RemoveAll("/tmp/scorch.zap") + + testSeg, err := buildTestSegmentMultiWithDifferentFields(true, true) + if err != nil { + t.Fatalf("error building segment: %v", err) + } + err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + if segment.Count() != 2 { + t.Errorf("expected 2, got %d", segment.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segment.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +} + +func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { + _ = os.RemoveAll("/tmp/scorch1.zap") + _ = os.RemoveAll("/tmp/scorch2.zap") + _ = os.RemoveAll("/tmp/scorch3.zap") + + testSeg1, _ := buildTestSegmentMultiWithDifferentFields(true, false) + err := PersistSegmentBase(testSeg1, "/tmp/scorch1.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + testSeg2, _ := buildTestSegmentMultiWithDifferentFields(false, true) + err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment1, err := Open("/tmp/scorch1.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment1.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + segment2, err := Open("/tmp/scorch2.zap") + if err != nil { 
+ t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment2.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + segsToMerge := make([]*Segment, 2) + segsToMerge[0] = segment1.(*Segment) + segsToMerge[1] = segment2.(*Segment) + + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + if err != nil { + t.Fatal(err) + } + + if nBytes == 0 { + t.Fatalf("expected a non zero total_compaction_written_bytes") + } + + segmentM, err := Open("/tmp/scorch3.zap") + if err != nil { + t.Fatalf("error opening merged segment: %v", err) + } + defer func() { + cerr := segmentM.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", cerr) + } + }() + + if segmentM.Count() != 2 { + t.Errorf("expected 2, got %d", segmentM.Count()) + } + + expectFields := map[string]struct{}{ + "_id": struct{}{}, + "_all": struct{}{}, + "name": struct{}{}, + "dept": struct{}{}, + "manages.id": struct{}{}, + "manages.count": struct{}{}, + "reportsTo.id": struct{}{}, + } + + fields := segmentM.Fields() + if len(fields) != len(expectFields) { + t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) + } + for _, field := range fields { + if _, ok := expectFields[field]; !ok { + t.Errorf("got unexpected field: %s", field) + } + } +} From 85b4a31e2a05bac763f72ba8a129887e0772c0b3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 20 Mar 2018 11:12:18 -0700 Subject: [PATCH 311/728] scorch zap getField() which panics if the field is unknown --- index/scorch/segment/zap/new.go | 23 +++++++++++++++++------ 1 file changed, 17 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index dd2740fb2..68827f0b7 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -17,6 +17,7 @@ package zap import ( "bytes" "encoding/binary" + "fmt" "math" "sort" "sync" @@ -288,6 +289,16 @@ func (s *interim) 
getOrDefineField(fieldName string) int { return int(fieldIDPlus1 - 1) } +// the fieldName must be for a known field +func (s *interim) getField(fieldName string) int { + fieldIDPlus1, exists := s.FieldsMap[fieldName] + if !exists || fieldIDPlus1 <= 0 { + panic(fmt.Sprintf("getField saw unknown fieldName: %s, fieldsMap: %#v", + fieldName, s.FieldsMap)) + } + return int(fieldIDPlus1 - 1) +} + // fill Dicts and DictKeys from analysis results func (s *interim) prepareDicts() { var pidNext int @@ -328,14 +339,14 @@ func (s *interim) prepareDicts() { for _, result := range s.results { // walk each composite field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) + fieldID := uint16(s.getField(field.Name())) _, tf := field.Analyze() visitField(fieldID, tf) } // walk each field for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) + fieldID := uint16(s.getField(field.Name())) tf := result.Analyzed[i] visitField(fieldID, tf) } @@ -439,14 +450,14 @@ func (s *interim) processDocument(docNum uint64, // walk each composite field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) + fieldID := uint16(s.getField(field.Name())) ln, tf := field.Analyze() visitField(fieldID, field.Name(), ln, tf) } // walk each field for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) + fieldID := uint16(s.getField(field.Name())) ln := result.Length[i] tf := result.Analyzed[i] visitField(fieldID, field.Name(), ln, tf) @@ -477,7 +488,7 @@ func (s *interim) processDocument(docNum uint64, for _, loc := range tf.Locations { var locf = uint16(fieldID) if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field)) + locf = uint16(s.getField(loc.Field)) } var arrayposs []uint64 if len(loc.ArrayPositions) > 0 { @@ -517,7 +528,7 @@ func (s *interim) writeStoredFields() ( } for _, field := 
range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) + fieldID := uint16(s.getField(field.Name())) opts := field.Options() From 2f4d3d858751a019650379443709d700454149a7 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 20 Mar 2018 11:17:46 -0700 Subject: [PATCH 312/728] scorch zap panic if mergeFields() sees unsorted fields mergeFields depends on the fields from the various segments being sorted for the fieldsSame comparison to work. Of note, the 'fieldi > 1' guard skips the 0th field, which should always be the '_id' field. --- index/scorch/segment/zap/merge.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1da5e5269..622cf8cc2 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -755,6 +755,10 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { fieldsSame = false } + + if fieldi > 1 && field <= fields[fieldi-1] { + panic(fmt.Sprintf("mergeFields on unsorted fields: %#v", fields)) + } } } From 35ea1d4423cc186a2540f442f62b975c43fcd4b8 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 20 Mar 2018 17:41:56 -0400 Subject: [PATCH 313/728] fix MB-28719 and MB-28781 invalid/missing field in scorch Use of sync.Pool to reuse the interm structure relied on resetting the fieldsInv slice. However, actual segments continued to use this same fieldsInv slice after returning it to the pool. Simple fix is to nil out fieldsInv slice in reset method and let the newly built segment keep the one from the interim struct. 
--- index/scorch/segment/zap/new.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 68827f0b7..e9f0b2c06 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -135,7 +135,7 @@ func (s *interim) reset() (err error) { s.chunkFactor = 0 s.w = nil s.FieldsMap = nil - s.FieldsInv = s.FieldsInv[:0] + s.FieldsInv = nil for i := range s.Dicts { s.Dicts[i] = nil } From 844845b5d251cbd8b745cc5e2bd39139ad86393a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 20 Mar 2018 14:51:25 -0700 Subject: [PATCH 314/728] Revert "scorch zap panic if mergeFields() sees unsorted fields" This reverts commit 2f4d3d858751a019650379443709d700454149a7. --- index/scorch/segment/zap/merge.go | 4 ---- 1 file changed, 4 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 622cf8cc2..1da5e5269 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -755,10 +755,6 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { fieldsSame = false } - - if fieldi > 1 && field <= fields[fieldi-1] { - panic(fmt.Sprintf("mergeFields on unsorted fields: %#v", fields)) - } } } From 0e3c57c4651599020763e1a577e352d449e57bfc Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 20 Mar 2018 14:51:33 -0700 Subject: [PATCH 315/728] Revert "scorch zap getField() which panics if the field is unknown" This reverts commit 85b4a31e2a05bac763f72ba8a129887e0772c0b3. 
--- index/scorch/segment/zap/new.go | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 68827f0b7..dd2740fb2 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -17,7 +17,6 @@ package zap import ( "bytes" "encoding/binary" - "fmt" "math" "sort" "sync" @@ -289,16 +288,6 @@ func (s *interim) getOrDefineField(fieldName string) int { return int(fieldIDPlus1 - 1) } -// the fieldName must be for a known field -func (s *interim) getField(fieldName string) int { - fieldIDPlus1, exists := s.FieldsMap[fieldName] - if !exists || fieldIDPlus1 <= 0 { - panic(fmt.Sprintf("getField saw unknown fieldName: %s, fieldsMap: %#v", - fieldName, s.FieldsMap)) - } - return int(fieldIDPlus1 - 1) -} - // fill Dicts and DictKeys from analysis results func (s *interim) prepareDicts() { var pidNext int @@ -339,14 +328,14 @@ func (s *interim) prepareDicts() { for _, result := range s.results { // walk each composite field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getField(field.Name())) + fieldID := uint16(s.getOrDefineField(field.Name())) _, tf := field.Analyze() visitField(fieldID, tf) } // walk each field for i, field := range result.Document.Fields { - fieldID := uint16(s.getField(field.Name())) + fieldID := uint16(s.getOrDefineField(field.Name())) tf := result.Analyzed[i] visitField(fieldID, tf) } @@ -450,14 +439,14 @@ func (s *interim) processDocument(docNum uint64, // walk each composite field for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getField(field.Name())) + fieldID := uint16(s.getOrDefineField(field.Name())) ln, tf := field.Analyze() visitField(fieldID, field.Name(), ln, tf) } // walk each field for i, field := range result.Document.Fields { - fieldID := uint16(s.getField(field.Name())) + fieldID := uint16(s.getOrDefineField(field.Name())) ln := result.Length[i] tf := 
result.Analyzed[i] visitField(fieldID, field.Name(), ln, tf) @@ -488,7 +477,7 @@ func (s *interim) processDocument(docNum uint64, for _, loc := range tf.Locations { var locf = uint16(fieldID) if loc.Field != "" { - locf = uint16(s.getField(loc.Field)) + locf = uint16(s.getOrDefineField(loc.Field)) } var arrayposs []uint64 if len(loc.ArrayPositions) > 0 { @@ -528,7 +517,7 @@ func (s *interim) writeStoredFields() ( } for _, field := range result.Document.Fields { - fieldID := uint16(s.getField(field.Name())) + fieldID := uint16(s.getOrDefineField(field.Name())) opts := field.Options() From d1e2b55c72f543300a1848f6c68fa4096846ed86 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 21 Mar 2018 17:47:56 -0700 Subject: [PATCH 316/728] scorch zap postingsItr.nextDocNum() maintains allNChunk correctly When PostingsIterator.nextDocNum() moves the 'all' roaring bitmap iterator forwards, it was incorrectly not keeping the allNChunk value aligned. --- index/scorch/segment/zap/posting.go | 1 + 1 file changed, 1 insertion(+) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 081ec5f6c..4b4b4933c 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -620,6 +620,7 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } allN = i.all.Next() + allNChunk = allN / i.postings.sb.chunkFactor } if i.currChunk != nChunk || i.currChunkFreqNorm == nil { From b506fae4f73dcb73674b6d2f8f88df8f5726a78e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 21 Mar 2018 17:54:22 -0700 Subject: [PATCH 317/728] scorch zap postingsItr remove unused offset/locoffset fields --- index/scorch/segment/zap/posting.go | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 4b4b4933c..f5ccad1ad 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -316,11 +316,9 
@@ func (rv *PostingsList) init1Hit(fstVal uint64) error { // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { - postings *PostingsList - all roaring.IntIterable - offset int - locoffset int - actual roaring.IntIterable + postings *PostingsList + all roaring.IntIterable + actual roaring.IntIterable currChunk uint32 currChunkFreqNorm []byte @@ -584,16 +582,12 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { nChunk := n / i.postings.sb.chunkFactor allNChunk := allN / i.postings.sb.chunkFactor - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - // incr the all iterator, and check again + // n is the next actual hit (excluding some postings), and + // allN is the next hit in the full postings, and + // if they don't match, move 'all' forwards until they do for allN != n { - // in different chunks, reset offsets - if allNChunk != nChunk { - i.locoffset = 0 - i.offset = 0 - } else { + // in the same chunk, so move the freq/norm/loc decoders forward + if allNChunk == nChunk { if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { @@ -614,9 +608,6 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } } } - - // in same chunk, need to account for offsets - i.offset++ } allN = i.all.Next() From 6b78dd4184d9d8fd8c4f2c8d0422e7d7d16f64ac Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 22 Mar 2018 06:46:06 -0700 Subject: [PATCH 318/728] fix cmd/bleve scorch ascii cmd help text Initially, there was a typo with an extra space char, but then I realized there were some copy-paste errors to correct. 
--- cmd/bleve/cmd/scorch/ascii.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/bleve/cmd/scorch/ascii.go b/cmd/bleve/cmd/scorch/ascii.go index 7b36b5b9c..34fb1ed70 100644 --- a/cmd/bleve/cmd/scorch/ascii.go +++ b/cmd/bleve/cmd/scorch/ascii.go @@ -22,11 +22,11 @@ import ( "github.com/spf13/cobra" ) -// asciiCmd represents the snapshots command +// asciiCmd represents the ascii command var asciiCmd = &cobra.Command{ Use: "ascii", - Short: "ascii prints details an ascii representation of the snapshots in the index", - Long: `The ascii command prints an ascii representation of the snapshots in the index.`, + Short: "ascii prints an ascii representation of the segments in a snapshot", + Long: `The ascii command prints an ascii representation of the segments in a given snapshot.`, RunE: func(cmd *cobra.Command, args []string) error { if len(args) < 2 { From 18cfcd11d102a83271ab1d96b9035bdfe0e3f019 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 21 Mar 2018 12:12:49 -0700 Subject: [PATCH 319/728] MB-28782: Error handling in merger/persister when index is closed When the index is closed, do not fire an AsyncError (fatal) from either the merger or the persister that is actively working. This is quite a probable situation, so exit the loop within the goroutine. 
--- index/scorch/merge.go | 14 +++++++++----- index/scorch/persister.go | 5 +++++ 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 41086ad3d..42b5e950f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -17,7 +17,6 @@ package scorch import ( "bytes" "encoding/json" - "fmt" "os" "sync/atomic" @@ -59,6 +58,11 @@ OUTER: // lets get started err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) if err != nil { + if err == ErrClosed { + // index has been closed + _ = ourSnapshot.DecRef() + break OUTER + } s.fireAsyncError(fmt.Errorf("merging err: %v", err)) _ = ourSnapshot.DecRef() atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) @@ -231,7 +235,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, select { case <-s.closeCh: _ = segment.Close() - return nil + return ErrClosed case s.merges <- sm: atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } @@ -242,7 +246,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, for _, notification := range notifications { select { case <-s.closeCh: - return nil + return ErrClosed case newSnapshot := <-notification: atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) if newSnapshot != nil { @@ -338,13 +342,13 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, select { // send to introducer case <-s.closeCh: _ = segment.DecRef() - return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? + return 0, nil, 0, ErrClosed case s.merges <- sm: } select { // wait for introduction to complete case <-s.closeCh: - return 0, nil, 0, nil // TODO: return ErrInterruptedClosed? 
+ return 0, nil, 0, ErrClosed case newSnapshot := <-sm.notify: atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index dda4bdfbb..2fab53240 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -94,6 +94,11 @@ OUTER: close(ch) } if err != nil { + if err == ErrClosed { + // index has been closed + _ = ourSnapshot.DecRef() + break OUTER + } s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) From 621b58dd8341232c01653db36372f0e0361bc90f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 22 Mar 2018 16:36:38 -0700 Subject: [PATCH 320/728] scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding NOTE: this is a zap file format change. The separate "postings locations" roaring Bitmap that encoded whether a posting has locations info is now replaced by the least significant bit in the freq varint encoded in the freq-norm chunkedIntCoder. encode/decodeFreqHasLocs() are added as helper functions. 
--- cmd/bleve/cmd/zap/explore.go | 6 --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/dict.go | 5 -- index/scorch/segment/zap/merge.go | 26 +++------- index/scorch/segment/zap/new.go | 42 ++++----------- index/scorch/segment/zap/posting.go | 79 ++++++++++++++--------------- 6 files changed, 57 insertions(+), 103 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 225b7373f..0c2471edc 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -81,10 +81,6 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapAddr uint64 - locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - var postingListLen uint64 postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) @@ -131,8 +127,6 @@ var exploreCmd = &cobra.Command{ running2 += offset } - fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) - } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 20b892ca3..9e9d787bb 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 6 +const version uint32 = 7 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3b8132f2c..38b4faca1 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -72,15 +72,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if postings != nil { postings.Clear() } - locBitmap := rv.locBitmap - if locBitmap != nil { - locBitmap.Clear() - } *rv = PostingsList{} // clear the struct 
rv.postings = postings - rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1da5e5269..51dd74206 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -259,9 +259,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - newRoaring, newRoaringLocs, tfEncoder, locEncoder, - use1HitEncoding, w, bufMaxVarintLen64) + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) if err != nil { return err } @@ -423,12 +422,14 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po nextFreq := next.Frequency() nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) if err != nil { return 0, 0, 0, nil, err } - locs := next.Locations() if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) @@ -503,8 +504,7 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, return lastDocNum, lastFreq, lastNorm, err } -func writePostings(postings, postingLocs *roaring.Bitmap, - tfEncoder, locEncoder *chunkedIntCoder, +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) (bool, uint64, uint64), w *CountHashWriter, bufMaxVarintLen64 []byte) ( offset uint64, err error) { @@ -532,12 +532,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - postingLocsOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) - if err != nil { - return 0, err - } - postingsOffset := uint64(w.Count()) n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) @@ -552,12 +546,6 @@ 
func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return 0, err - } - _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 7d098349d..5837436fe 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -103,9 +103,6 @@ type interim struct { // postings id -> bitmap of docNums Postings []*roaring.Bitmap - // postings id -> bitmap of docNums that have locations - PostingsLocs []*roaring.Bitmap - // postings id -> freq/norm's, one for each docNum in postings FreqNorms [][]interimFreqNorm freqNormsBacking []interimFreqNorm @@ -151,10 +148,6 @@ func (s *interim) reset() (err error) { idn.Clear() } s.Postings = s.Postings[:0] - for _, idn := range s.PostingsLocs { - idn.Clear() - } - s.PostingsLocs = s.PostingsLocs[:0] s.FreqNorms = s.FreqNorms[:0] for i := range s.freqNormsBacking { s.freqNormsBacking[i] = interimFreqNorm{} @@ -196,8 +189,9 @@ type interimStoredField struct { } type interimFreqNorm struct { - freq uint64 - norm float32 + freq uint64 + norm float32 + hasLocs bool } type interimLoc struct { @@ -356,19 +350,6 @@ func (s *interim) prepareDicts() { s.Postings = postings } - if cap(s.PostingsLocs) >= numPostingsLists { - s.PostingsLocs = s.PostingsLocs[:numPostingsLists] - } else { - postingsLocs := make([]*roaring.Bitmap, numPostingsLists) - copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) - for i := 0; i < numPostingsLists; i++ { - if postingsLocs[i] == nil { - postingsLocs[i] = roaring.New() - } - } - s.PostingsLocs = postingsLocs - } - if cap(s.FreqNorms) >= numPostingsLists { s.FreqNorms = s.FreqNorms[:numPostingsLists] } else { @@ -464,14 +445,12 @@ func (s *interim) processDocument(docNum uint64, s.FreqNorms[pid] = append(s.FreqNorms[pid], 
interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, + freq: uint64(tf.Frequency()), + norm: norm, + hasLocs: len(tf.Locations) > 0, }) if len(tf.Locations) > 0 { - locBS := s.PostingsLocs[pid] - locBS.Add(uint32(docNum)) - locs := s.Locs[pid] for _, loc := range tf.Locations { @@ -625,7 +604,6 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err pid := dict[term] - 1 postingsBS := s.Postings[pid] - postingsLocsBS := s.PostingsLocs[pid] freqNorms := s.FreqNorms[pid] freqNormOffset := 0 @@ -639,7 +617,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] - err = tfEncoder.Add(docNum, freqNorm.freq, + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err @@ -675,9 +654,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - postingsBS, postingsLocsBS, tfEncoder, locEncoder, - nil, s.w, buf) + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f5ccad1ad..004b80317 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -100,7 +100,6 @@ type PostingsList struct { postingsOffset uint64 freqOffset uint64 locOffset uint64 - locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap @@ -222,8 +221,6 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.locChunkStart = p.locOffset + n - rv.locBitmap = p.locBitmap - rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) @@ -271,23 +268,6 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { 
rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - - locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - - if rv.locBitmap == nil { - rv.locBitmap = roaring.NewBitmap() - } - _, err := rv.locBitmap.FromBuffer(locRoaringBytes) - if err != nil { - return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) - } - var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -297,7 +277,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { if rv.postings == nil { rv.postings = roaring.NewBitmap() } - _, err = rv.postings.FromBuffer(roaringBytes) + _, err := rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -334,8 +314,6 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 - locBitmap *roaring.Bitmap - next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls @@ -353,10 +331,6 @@ func (i *PostingsIterator) Size() int { len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() - if i.locBitmap != nil { - sizeInBytes += int(i.locBitmap.GetSizeInBytes()) - } - for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } @@ -397,20 +371,37 @@ func (i *PostingsIterator) loadChunk(chunk int) error { return nil } -func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { if i.normBits1Hit != 0 { - return 1, 
i.normBits1Hit, nil + return 1, i.normBits1Hit, false, nil } - freq, err := i.freqNormDecoder.GetU64() + freqHasLocs, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + normBits, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading norm: %v", err) + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } - return freq, normBits, err + + return freq, normBits, hasLocs, err +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs } // readLocation processes all the integers on the stream representing a single @@ -484,13 +475,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.docNum = docNum var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return nil, err } + rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -514,6 +508,8 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( @@ -528,14 +524,16 @@ func (i *PostingsIterator) nextBytes() ( if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } - n := binary.PutUvarint(i.buf, 
uint64(1)) + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) n += binary.PutUvarint(i.buf, i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return 0, 0, 0, nil, nil, err } @@ -543,7 +541,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -596,11 +594,12 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() + freq, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { return 0, false, err } - if i.locBitmap.Contains(allN) { + + if hasLocs { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { From 67f75005c4ba45391a562d6c70089c3ef9485fe6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 22 Mar 2018 17:05:27 -0700 Subject: [PATCH 321/728] fix cmd/bleve help string for internal command --- cmd/bleve/cmd/scorch/internal.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/bleve/cmd/scorch/internal.go b/cmd/bleve/cmd/scorch/internal.go index 027e90282..dc9497969 100644 --- a/cmd/bleve/cmd/scorch/internal.go +++ b/cmd/bleve/cmd/scorch/internal.go @@ -23,7 +23,7 @@ import ( var ascii bool -// internalCmd represents the snapshots command +// internalCmd represents the internal command var internalCmd = &cobra.Command{ Use: "internal", Short: "internal prints the internal k/v pairs in a snapshot", From 49a4ee60ba1d9569403a9c980aa87c5131048bf3 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 
Mar 2018 10:01:30 -0700 Subject: [PATCH 322/728] Revert "scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding" Testing with the cbft application led to cbft process exits... AsyncError exit()... error reading location field: EOF -- main.initBleveOptions.func1() at init_bleve.go:85 This reverts commit 621b58dd8341232c01653db36372f0e0361bc90f. --- cmd/bleve/cmd/zap/explore.go | 6 +++ index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/dict.go | 5 ++ index/scorch/segment/zap/merge.go | 26 +++++++--- index/scorch/segment/zap/new.go | 42 +++++++++++---- index/scorch/segment/zap/posting.go | 79 +++++++++++++++-------------- 6 files changed, 103 insertions(+), 57 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 0c2471edc..225b7373f 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -81,6 +81,10 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) + var locBitmapAddr uint64 + locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) + n += uint64(read) + var postingListLen uint64 postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) @@ -127,6 +131,8 @@ var exploreCmd = &cobra.Command{ running2 += offset } + fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) + } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 9e9d787bb..20b892ca3 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 7 +const version uint32 = 6 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 
38b4faca1..3b8132f2c 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -72,10 +72,15 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if postings != nil { postings.Clear() } + locBitmap := rv.locBitmap + if locBitmap != nil { + locBitmap.Clear() + } *rv = PostingsList{} // clear the struct rv.postings = postings + rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 51dd74206..1da5e5269 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -259,8 +259,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings(newRoaring, - tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) + postingsOffset, err := writePostings( + newRoaring, newRoaringLocs, tfEncoder, locEncoder, + use1HitEncoding, w, bufMaxVarintLen64) if err != nil { return err } @@ -422,14 +423,12 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po nextFreq := next.Frequency() nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - locs := next.Locations() - - err = tfEncoder.Add(hitNewDocNum, - encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) + err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) if err != nil { return 0, 0, 0, nil, err } + locs := next.Locations() if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) @@ -504,7 +503,8 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, return lastDocNum, lastFreq, lastNorm, err } -func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, +func writePostings(postings, postingLocs *roaring.Bitmap, + tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) (bool, uint64, uint64), w *CountHashWriter, bufMaxVarintLen64 []byte) ( offset uint64, err 
error) { @@ -532,6 +532,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return 0, err } + postingLocsOffset := uint64(w.Count()) + _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) + if err != nil { + return 0, err + } + postingsOffset := uint64(w.Count()) n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) @@ -546,6 +552,12 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return 0, err } + n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) + _, err = w.Write(bufMaxVarintLen64[:n]) + if err != nil { + return 0, err + } + _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 5837436fe..7d098349d 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -103,6 +103,9 @@ type interim struct { // postings id -> bitmap of docNums Postings []*roaring.Bitmap + // postings id -> bitmap of docNums that have locations + PostingsLocs []*roaring.Bitmap + // postings id -> freq/norm's, one for each docNum in postings FreqNorms [][]interimFreqNorm freqNormsBacking []interimFreqNorm @@ -148,6 +151,10 @@ func (s *interim) reset() (err error) { idn.Clear() } s.Postings = s.Postings[:0] + for _, idn := range s.PostingsLocs { + idn.Clear() + } + s.PostingsLocs = s.PostingsLocs[:0] s.FreqNorms = s.FreqNorms[:0] for i := range s.freqNormsBacking { s.freqNormsBacking[i] = interimFreqNorm{} @@ -189,9 +196,8 @@ type interimStoredField struct { } type interimFreqNorm struct { - freq uint64 - norm float32 - hasLocs bool + freq uint64 + norm float32 } type interimLoc struct { @@ -350,6 +356,19 @@ func (s *interim) prepareDicts() { s.Postings = postings } + if cap(s.PostingsLocs) >= numPostingsLists { + s.PostingsLocs = s.PostingsLocs[:numPostingsLists] + } else { + postingsLocs := make([]*roaring.Bitmap, numPostingsLists) + copy(postingsLocs, 
s.PostingsLocs[:cap(s.PostingsLocs)]) + for i := 0; i < numPostingsLists; i++ { + if postingsLocs[i] == nil { + postingsLocs[i] = roaring.New() + } + } + s.PostingsLocs = postingsLocs + } + if cap(s.FreqNorms) >= numPostingsLists { s.FreqNorms = s.FreqNorms[:numPostingsLists] } else { @@ -445,12 +464,14 @@ func (s *interim) processDocument(docNum uint64, s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, - hasLocs: len(tf.Locations) > 0, + freq: uint64(tf.Frequency()), + norm: norm, }) if len(tf.Locations) > 0 { + locBS := s.PostingsLocs[pid] + locBS.Add(uint32(docNum)) + locs := s.Locs[pid] for _, loc := range tf.Locations { @@ -604,6 +625,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err pid := dict[term] - 1 postingsBS := s.Postings[pid] + postingsLocsBS := s.PostingsLocs[pid] freqNorms := s.FreqNorms[pid] freqNormOffset := 0 @@ -617,8 +639,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] - err = tfEncoder.Add(docNum, - encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), + err = tfEncoder.Add(docNum, freqNorm.freq, uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err @@ -654,8 +675,9 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder.Close() locEncoder.Close() - postingsOffset, err := - writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) + postingsOffset, err := writePostings( + postingsBS, postingsLocsBS, tfEncoder, locEncoder, + nil, s.w, buf) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 004b80317..f5ccad1ad 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -100,6 +100,7 @@ type PostingsList struct { postingsOffset uint64 freqOffset uint64 locOffset uint64 + locBitmap *roaring.Bitmap postings 
*roaring.Bitmap except *roaring.Bitmap @@ -221,6 +222,8 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.locChunkStart = p.locOffset + n + rv.locBitmap = p.locBitmap + rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) @@ -268,6 +271,23 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) + var locBitmapOffset uint64 + locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var locBitmapLen uint64 + locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) + + locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] + + if rv.locBitmap == nil { + rv.locBitmap = roaring.NewBitmap() + } + _, err := rv.locBitmap.FromBuffer(locRoaringBytes) + if err != nil { + return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) + } + var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -277,7 +297,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { if rv.postings == nil { rv.postings = roaring.NewBitmap() } - _, err := rv.postings.FromBuffer(roaringBytes) + _, err = rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -314,6 +334,8 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 + locBitmap *roaring.Bitmap + next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls @@ -331,6 +353,10 @@ func (i *PostingsIterator) Size() int { len(i.locChunkOffsets)*size.SizeOfUint64 + i.next.Size() + if i.locBitmap != nil { + 
sizeInBytes += int(i.locBitmap.GetSizeInBytes()) + } + for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } @@ -371,37 +397,20 @@ func (i *PostingsIterator) loadChunk(chunk int) error { return nil } -func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { +func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { if i.normBits1Hit != 0 { - return 1, i.normBits1Hit, false, nil + return 1, i.normBits1Hit, nil } - freqHasLocs, err := i.freqNormDecoder.GetU64() + freq, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, fmt.Errorf("error reading frequency: %v", err) } - freq, hasLocs := decodeFreqHasLocs(freqHasLocs) - normBits, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, false, fmt.Errorf("error reading norm: %v", err) + return 0, 0, fmt.Errorf("error reading norm: %v", err) } - - return freq, normBits, hasLocs, err -} - -func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { - rv := freq << 1 - if hasLocs { - rv = rv | 0x01 // 0'th LSB encodes whether there are locations - } - return rv -} - -func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { - freq := freqHasLocs >> 1 - hasLocs := freqHasLocs&0x01 != 0 - return freq, hasLocs + return freq, normBits, err } // readLocation processes all the integers on the stream representing a single @@ -475,16 +484,13 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.docNum = docNum var normBits uint64 - var hasLocs bool - - rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + rv.freq, normBits, err = i.readFreqNorm() if err != nil { return nil, err } - rv.norm = math.Float32frombits(uint32(normBits)) - if hasLocs { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -508,8 +514,6 @@ func (i *PostingsIterator) 
Next() (segment.Posting, error) { return rv, nil } -var freqHasLocs1Hit = encodeFreqHasLocs(1, false) - // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( @@ -524,16 +528,14 @@ func (i *PostingsIterator) nextBytes() ( if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } - n := binary.PutUvarint(i.buf, freqHasLocs1Hit) + n := binary.PutUvarint(i.buf, uint64(1)) n += binary.PutUvarint(i.buf, i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - var hasLocs bool - - freq, normBits, hasLocs, err = i.readFreqNormHasLocs() + freq, normBits, err = i.readFreqNorm() if err != nil { return 0, 0, 0, nil, nil, err } @@ -541,7 +543,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if hasLocs { + if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -594,12 +596,11 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } // read off freq/offsets even though we don't care about them - freq, _, hasLocs, err := i.readFreqNormHasLocs() + freq, _, err := i.readFreqNorm() if err != nil { return 0, false, err } - - if hasLocs { + if i.locBitmap.Contains(allN) { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { From 7a19e6fd7e807171abbc2f823f3a12b037fefb89 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 Mar 2018 12:40:02 -0700 Subject: [PATCH 323/728] scorch zap replace locsBitmap w/ 1 bit from freq-norm varint encoding This is attempt #2 of the optimization that replaces the locsBitmap, without any changes from the original commit attempt. A commit that follows this one contains the actual fix. See also... 
- commit 621b58dd834123 (the 1st attempt) - commit 49a4ee60ba1d95 (the revert) ------------- The original commit message body from 621b58 was... NOTE: this is a zap file format change. The separate "postings locations" roaring Bitmap that encoded whether a posting has locations info is now replaced by the least significant bit in the freq varint encoded in the freq-norm chunkedIntCoder. encode/decodeFreqHasLocs() are added as helper functions. --- cmd/bleve/cmd/zap/explore.go | 6 --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/dict.go | 5 -- index/scorch/segment/zap/merge.go | 26 +++------- index/scorch/segment/zap/new.go | 42 ++++----------- index/scorch/segment/zap/posting.go | 79 ++++++++++++++--------------- 6 files changed, 57 insertions(+), 103 deletions(-) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 225b7373f..0c2471edc 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -81,10 +81,6 @@ var exploreCmd = &cobra.Command{ locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapAddr uint64 - locBitmapAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - var postingListLen uint64 postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) n += uint64(read) @@ -131,8 +127,6 @@ var exploreCmd = &cobra.Command{ running2 += offset } - fmt.Printf("Loc Bitmap at: %d (%x)\n", locBitmapAddr, locBitmapAddr) - } else { fmt.Printf("dictionary does not contain term '%s'\n", args[2]) } diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 20b892ca3..9e9d787bb 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 6 +const version uint32 = 7 const fieldNotUninverted = math.MaxUint64 
diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3b8132f2c..38b4faca1 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -72,15 +72,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) if postings != nil { postings.Clear() } - locBitmap := rv.locBitmap - if locBitmap != nil { - locBitmap.Clear() - } *rv = PostingsList{} // clear the struct rv.postings = postings - rv.locBitmap = locBitmap } rv.sb = d.sb rv.except = except diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 1da5e5269..51dd74206 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -259,9 +259,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - newRoaring, newRoaringLocs, tfEncoder, locEncoder, - use1HitEncoding, w, bufMaxVarintLen64) + postingsOffset, err := writePostings(newRoaring, + tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) if err != nil { return err } @@ -423,12 +422,14 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po nextFreq := next.Frequency() nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - err = tfEncoder.Add(hitNewDocNum, nextFreq, nextNorm) + locs := next.Locations() + + err = tfEncoder.Add(hitNewDocNum, + encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) if err != nil { return 0, 0, 0, nil, err } - locs := next.Locations() if len(locs) > 0 { newRoaringLocs.Add(uint32(hitNewDocNum)) @@ -503,8 +504,7 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, return lastDocNum, lastFreq, lastNorm, err } -func writePostings(postings, postingLocs *roaring.Bitmap, - tfEncoder, locEncoder *chunkedIntCoder, +func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, use1HitEncoding func(uint64) 
(bool, uint64, uint64), w *CountHashWriter, bufMaxVarintLen64 []byte) ( offset uint64, err error) { @@ -532,12 +532,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - postingLocsOffset := uint64(w.Count()) - _, err = writeRoaringWithLen(postingLocs, w, bufMaxVarintLen64) - if err != nil { - return 0, err - } - postingsOffset := uint64(w.Count()) n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) @@ -552,12 +546,6 @@ func writePostings(postings, postingLocs *roaring.Bitmap, return 0, err } - n = binary.PutUvarint(bufMaxVarintLen64, postingLocsOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return 0, err - } - _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 7d098349d..5837436fe 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -103,9 +103,6 @@ type interim struct { // postings id -> bitmap of docNums Postings []*roaring.Bitmap - // postings id -> bitmap of docNums that have locations - PostingsLocs []*roaring.Bitmap - // postings id -> freq/norm's, one for each docNum in postings FreqNorms [][]interimFreqNorm freqNormsBacking []interimFreqNorm @@ -151,10 +148,6 @@ func (s *interim) reset() (err error) { idn.Clear() } s.Postings = s.Postings[:0] - for _, idn := range s.PostingsLocs { - idn.Clear() - } - s.PostingsLocs = s.PostingsLocs[:0] s.FreqNorms = s.FreqNorms[:0] for i := range s.freqNormsBacking { s.freqNormsBacking[i] = interimFreqNorm{} @@ -196,8 +189,9 @@ type interimStoredField struct { } type interimFreqNorm struct { - freq uint64 - norm float32 + freq uint64 + norm float32 + hasLocs bool } type interimLoc struct { @@ -356,19 +350,6 @@ func (s *interim) prepareDicts() { s.Postings = postings } - if cap(s.PostingsLocs) >= numPostingsLists { - s.PostingsLocs = s.PostingsLocs[:numPostingsLists] - } else { - postingsLocs := make([]*roaring.Bitmap, 
numPostingsLists) - copy(postingsLocs, s.PostingsLocs[:cap(s.PostingsLocs)]) - for i := 0; i < numPostingsLists; i++ { - if postingsLocs[i] == nil { - postingsLocs[i] = roaring.New() - } - } - s.PostingsLocs = postingsLocs - } - if cap(s.FreqNorms) >= numPostingsLists { s.FreqNorms = s.FreqNorms[:numPostingsLists] } else { @@ -464,14 +445,12 @@ func (s *interim) processDocument(docNum uint64, s.FreqNorms[pid] = append(s.FreqNorms[pid], interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, + freq: uint64(tf.Frequency()), + norm: norm, + hasLocs: len(tf.Locations) > 0, }) if len(tf.Locations) > 0 { - locBS := s.PostingsLocs[pid] - locBS.Add(uint32(docNum)) - locs := s.Locs[pid] for _, loc := range tf.Locations { @@ -625,7 +604,6 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err pid := dict[term] - 1 postingsBS := s.Postings[pid] - postingsLocsBS := s.PostingsLocs[pid] freqNorms := s.FreqNorms[pid] freqNormOffset := 0 @@ -639,7 +617,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] - err = tfEncoder.Add(docNum, freqNorm.freq, + err = tfEncoder.Add(docNum, + encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err @@ -675,9 +654,8 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder.Close() locEncoder.Close() - postingsOffset, err := writePostings( - postingsBS, postingsLocsBS, tfEncoder, locEncoder, - nil, s.w, buf) + postingsOffset, err := + writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f5ccad1ad..004b80317 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -100,7 +100,6 @@ type PostingsList struct { postingsOffset uint64 freqOffset uint64 locOffset 
uint64 - locBitmap *roaring.Bitmap postings *roaring.Bitmap except *roaring.Bitmap @@ -222,8 +221,6 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { } rv.locChunkStart = p.locOffset + n - rv.locBitmap = p.locBitmap - rv.all = p.postings.Iterator() if p.except != nil { allExcept := roaring.AndNot(p.postings, p.except) @@ -271,23 +268,6 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) - var locBitmapOffset uint64 - locBitmapOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var locBitmapLen uint64 - locBitmapLen, read = binary.Uvarint(d.sb.mem[locBitmapOffset : locBitmapOffset+binary.MaxVarintLen64]) - - locRoaringBytes := d.sb.mem[locBitmapOffset+uint64(read) : locBitmapOffset+uint64(read)+locBitmapLen] - - if rv.locBitmap == nil { - rv.locBitmap = roaring.NewBitmap() - } - _, err := rv.locBitmap.FromBuffer(locRoaringBytes) - if err != nil { - return fmt.Errorf("error loading roaring bitmap of locations with hits: %v", err) - } - var postingsLen uint64 postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) n += uint64(read) @@ -297,7 +277,7 @@ func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { if rv.postings == nil { rv.postings = roaring.NewBitmap() } - _, err = rv.postings.FromBuffer(roaringBytes) + _, err := rv.postings.FromBuffer(roaringBytes) if err != nil { return fmt.Errorf("error loading roaring bitmap: %v", err) } @@ -334,8 +314,6 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 - locBitmap *roaring.Bitmap - next Posting // reused across Next() calls nextLocs []Location // reused across Next() calls @@ -353,10 +331,6 @@ func (i *PostingsIterator) Size() int { len(i.locChunkOffsets)*size.SizeOfUint64 + 
i.next.Size() - if i.locBitmap != nil { - sizeInBytes += int(i.locBitmap.GetSizeInBytes()) - } - for _, entry := range i.nextLocs { sizeInBytes += entry.Size() } @@ -397,20 +371,37 @@ func (i *PostingsIterator) loadChunk(chunk int) error { return nil } -func (i *PostingsIterator) readFreqNorm() (uint64, uint64, error) { +func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { if i.normBits1Hit != 0 { - return 1, i.normBits1Hit, nil + return 1, i.normBits1Hit, false, nil } - freq, err := i.freqNormDecoder.GetU64() + freqHasLocs, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading frequency: %v", err) + return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) + normBits, err := i.freqNormDecoder.GetU64() if err != nil { - return 0, 0, fmt.Errorf("error reading norm: %v", err) + return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } - return freq, normBits, err + + return freq, normBits, hasLocs, err +} + +func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { + rv := freq << 1 + if hasLocs { + rv = rv | 0x01 // 0'th LSB encodes whether there are locations + } + return rv +} + +func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { + freq := freqHasLocs >> 1 + hasLocs := freqHasLocs&0x01 != 0 + return freq, hasLocs } // readLocation processes all the integers on the stream representing a single @@ -484,13 +475,16 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.docNum = docNum var normBits uint64 - rv.freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return nil, err } + rv.norm = math.Float32frombits(uint32(normBits)) - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ 
-514,6 +508,8 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return rv, nil } +var freqHasLocs1Hit = encodeFreqHasLocs(1, false) + // nextBytes returns the docNum and the encoded freq & loc bytes for // the next posting func (i *PostingsIterator) nextBytes() ( @@ -528,14 +524,16 @@ func (i *PostingsIterator) nextBytes() ( if i.buf == nil { i.buf = make([]byte, binary.MaxVarintLen64*2) } - n := binary.PutUvarint(i.buf, uint64(1)) + n := binary.PutUvarint(i.buf, freqHasLocs1Hit) n += binary.PutUvarint(i.buf, i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } startFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - freq, normBits, err = i.readFreqNorm() + var hasLocs bool + + freq, normBits, hasLocs, err = i.readFreqNormHasLocs() if err != nil { return 0, 0, 0, nil, nil, err } @@ -543,7 +541,7 @@ func (i *PostingsIterator) nextBytes() ( endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - if i.locBitmap != nil && i.locBitmap.Contains(uint32(docNum)) { + if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() for j := uint64(0); j < freq; j++ { @@ -596,11 +594,12 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } // read off freq/offsets even though we don't care about them - freq, _, err := i.readFreqNorm() + freq, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { return 0, false, err } - if i.locBitmap.Contains(allN) { + + if hasLocs { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { From ba644f38935bfc35a5578af6ce08a50555e896ea Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 Mar 2018 12:39:20 -0700 Subject: [PATCH 324/728] scorch zap fix postingsIter.nextBytes() when 1-bit encoded The previous commit's optimization that replaced the locsBitmap was incorrectly handling the case when there was a 1-bit encoding optimization in the postingsIterator.nextBytes() method, 
incorrectly generating the freq-norm bytes. Also as part of this change, more unused locsBitmap's were removed. --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/merge.go | 14 ++++---------- index/scorch/segment/zap/posting.go | 2 +- 3 files changed, 6 insertions(+), 12 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 9e9d787bb..03e4cbae8 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 7 +const version uint32 = 8 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 51dd74206..6faca646f 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -188,7 +188,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } newRoaring := roaring.NewBitmap() - newRoaringLocs := roaring.NewBitmap() // for each field for fieldID, fieldName := range fieldsInv { @@ -234,7 +233,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var prevTerm []byte newRoaring.Clear() - newRoaringLocs.Clear() var lastDocNum, lastFreq, lastNorm uint64 @@ -273,7 +271,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } newRoaring.Clear() - newRoaringLocs.Clear() tfEncoder.Reset() locEncoder.Reset() @@ -311,11 +308,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if fieldsSame { // can optimize by copying freq/norm/loc bytes directly lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( - term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, + term, postItr, newDocNums[itrI], newRoaring, tfEncoder, locEncoder, docTermMap) } else { lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( - fieldsMap, term, postItr, newDocNums[itrI], newRoaring, newRoaringLocs, 
+ fieldsMap, term, postItr, newDocNums[itrI], newRoaring, tfEncoder, locEncoder, docTermMap, bufLoc) } if err != nil { @@ -406,7 +403,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, - newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, + newDocNums []uint64, newRoaring *roaring.Bitmap, tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, bufLoc []uint64) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { @@ -431,8 +428,6 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } if len(locs) > 0 { - newRoaringLocs.Add(uint32(hitNewDocNum)) - for _, loc := range locs { if cap(bufLoc) < 5+len(loc.ArrayPositions()) { bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) @@ -465,7 +460,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, - newDocNums []uint64, newRoaring *roaring.Bitmap, newRoaringLocs *roaring.Bitmap, + newDocNums []uint64, newRoaring *roaring.Bitmap, tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := @@ -483,7 +478,6 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, } if len(nextLocBytes) > 0 { - newRoaringLocs.Add(uint32(hitNewDocNum)) err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) if err != nil { return 0, 0, 0, err diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 004b80317..02e286575 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -525,7 +525,7 @@ func (i *PostingsIterator) nextBytes() ( i.buf = 
make([]byte, binary.MaxVarintLen64*2) } n := binary.PutUvarint(i.buf, freqHasLocs1Hit) - n += binary.PutUvarint(i.buf, i.normBits1Hit) + n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil } From 84424edcad4d55d7cd4ae69f40babc3ff8c479e4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 16 Mar 2018 22:31:04 -0700 Subject: [PATCH 325/728] scorch zap sync.Pool for reusable VisitDocument() data structures As part of this, snappy.Decode() is also provided a reused buffer for decompression. --- index/scorch/segment/zap/segment.go | 34 +++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 0d2ad072f..3a3fd02eb 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -273,19 +273,39 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { return rv, nil } +// visitDocumentCtx holds data structures that are reusable across +// multiple VisitDocument() calls to avoid memory allocations +type visitDocumentCtx struct { + buf []byte + reader bytes.Reader + decoder *govarint.Base128Decoder + arrayPos []uint64 +} + +var visitDocumentCtxPool = sync.Pool{ + New: func() interface{} { + reuse := &visitDocumentCtx{} + reuse.decoder = govarint.NewU64Base128Decoder(&reuse.reader) + return reuse + }, +} + // VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { // first make sure this is a valid number in this segment if num < s.numDocs { + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + meta, compressed := s.getDocStoredMetaAndCompressed(num) - uncompressed, err := snappy.Decode(nil, compressed) + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) if err != nil { return err } + // 
now decode meta and process - reader := bytes.NewReader(meta) - decoder := govarint.NewU64Base128Decoder(reader) + vdc.reader.Reset(meta) + decoder := vdc.decoder keepGoing := true for keepGoing { @@ -314,7 +334,10 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal } var arrayPos []uint64 if numap > 0 { - arrayPos = make([]uint64, numap) + if cap(vdc.arrayPos) < int(numap) { + vdc.arrayPos = make([]uint64, numap) + } + arrayPos = vdc.arrayPos[:numap] for i := 0; i < int(numap); i++ { ap, err := decoder.GetU64() if err != nil { @@ -327,6 +350,9 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal value := uncompressed[offset : offset+l] keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) } + + vdc.buf = uncompressed + visitDocumentCtxPool.Put(vdc) } return nil } From 6540b197d495053d66b48141bb1525e9a86f61a5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 17 Mar 2018 10:40:34 -0700 Subject: [PATCH 326/728] scorch zap provide full buffer capacity to snappy Encode/Decode() The snappy Encode/Decode() API's accept an optional destination buffer param where their encoded/decoded output results will be placed, but they only check that the buffer has enough len() rather than enough capacity before deciding to allocate a new buffer. 
--- index/scorch/segment/zap/contentcoder.go | 8 +++++--- index/scorch/segment/zap/docvalues.go | 4 +++- index/scorch/segment/zap/merge.go | 3 +-- index/scorch/segment/zap/new.go | 3 +-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 1e7a785ca..2148d1d45 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -42,6 +42,8 @@ type chunkedContentCoder struct { chunkBuf bytes.Buffer chunkMeta []MetaData + + compressed []byte // temp buf for snappy compression } // MetaData represents the data information inside a @@ -105,10 +107,10 @@ func (c *chunkedContentCoder) flushContents() error { metaData := c.chunkMetaBuf.Bytes() c.final = append(c.final, c.chunkMetaBuf.Bytes()...) // write the compressed data to the final data - compressedData := snappy.Encode(nil, c.chunkBuf.Bytes()) - c.final = append(c.final, compressedData...) + c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) + c.final = append(c.final, c.compressed...) 
- c.chunkLens[c.currChunk] = uint64(len(compressedData) + len(metaData)) + c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) return nil } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 844271902..dcd2cb052 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -42,6 +42,7 @@ type docValueIterator struct { dvDataLoc uint64 curChunkHeader []MetaData curChunkData []byte // compressed data cache + uncompressed []byte // temp buf for snappy decompression } func (di *docValueIterator) size() int { @@ -135,10 +136,11 @@ func (di *docValueIterator) visitDocValues(docNum uint64, return nil } // uncompress the already loaded data - uncompressed, err := snappy.Decode(nil, di.curChunkData) + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) if err != nil { return err } + di.uncompressed = uncompressed // pick the terms for the given docNum uncompressed = uncompressed[start:end] diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 6faca646f..167ebfa24 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -604,7 +604,6 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, curr = 0 metaBuf.Reset() data = data[:0] - compressed = compressed[:0] // collect all the data for i := 0; i < len(fieldsInv); i++ { @@ -641,7 +640,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, metaEncoder.Close() metaBytes := metaBuf.Bytes() - compressed = snappy.Encode(compressed, data) + compressed = snappy.Encode(compressed[:cap(compressed)], data) // record where we're about to start writing docNumOffsets[newDocNum] = uint64(w.Count()) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 5837436fe..83a0cbced 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ 
-517,7 +517,6 @@ func (s *interim) writeStoredFields() ( s.metaBuf.Reset() data = data[:0] - compressed = compressed[:0] for fieldID := range s.FieldsInv { isf, exists := docStoredFields[uint16(fieldID)] @@ -534,7 +533,7 @@ func (s *interim) writeStoredFields() ( metaEncoder.Close() metaBytes := s.metaBuf.Bytes() - compressed = snappy.Encode(compressed, data) + compressed = snappy.Encode(compressed[:cap(compressed)], data) docStoredOffsets[docNum] = uint64(s.w.Count()) From db792717a6d83b0354dfbf9aa5a7ad0e9331db5b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 17 Mar 2018 10:59:57 -0700 Subject: [PATCH 327/728] scorch zap postingsIter reuses nextLocs/nextSegmentLocs The previous code would inefficiently throw away the nextLocs and would also throw away the []segment.Location slice if there were no locations, such as if it was a 1-hit postings list. This change tries to reuse the nextLocs/nextSegmentLocs for all cases. --- index/scorch/segment/zap/posting.go | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 02e286575..3fecaa23e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -154,6 +154,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { freqChunkOffsets := rv.freqChunkOffsets[:0] locChunkOffsets := rv.locChunkOffsets[:0] + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + buf := rv.buf *rv = PostingsIterator{} // clear the struct @@ -167,6 +170,9 @@ func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { rv.freqChunkOffsets = freqChunkOffsets rv.locChunkOffsets = locChunkOffsets + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + rv.buf = buf } rv.postings = p @@ -314,8 +320,9 @@ type PostingsIterator struct { locChunkOffsets []uint64 locChunkStart uint64 - next Posting // reused across Next() calls - nextLocs 
[]Location // reused across Next() calls + next Posting // reused across Next() calls + nextLocs []Location // reused across Next() calls + nextSegmentLocs []segment.Location // reused across Next() calls docNum1Hit uint64 normBits1Hit uint64 @@ -469,8 +476,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return nil, err } - reuseLocs := i.next.locs // hold for reuse before struct clearing - i.next = Posting{} // clear the struct + i.next = Posting{} // clear the struct rv := &i.next rv.docNum = docNum @@ -491,11 +497,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { } else { i.nextLocs = make([]Location, rv.freq) } - if cap(reuseLocs) >= int(rv.freq) { - rv.locs = reuseLocs[0:rv.freq] - } else { - rv.locs = make([]segment.Location, rv.freq) + if cap(i.nextSegmentLocs) < int(rv.freq) { + i.nextSegmentLocs = make([]segment.Location, rv.freq) } + rv.locs = i.nextSegmentLocs[0:rv.freq] for j := 0; j < int(rv.freq); j++ { err := i.readLocation(&i.nextLocs[j]) if err != nil { From 3f4b161850618c9c026b06fcc694753e8ccff83d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 21 Mar 2018 10:19:02 -0700 Subject: [PATCH 328/728] scorch zap postingsIter reuses array positions slice --- index/scorch/segment/zap/posting.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 3fecaa23e..e994617ad 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -448,10 +448,10 @@ func (i *PostingsIterator) readLocation(l *Location) error { l.pos = pos l.start = start l.end = end - if numArrayPos > 0 { + if cap(l.ap) < int(numArrayPos) { l.ap = make([]uint64, int(numArrayPos)) } else { - l.ap = l.ap[:0] + l.ap = l.ap[:int(numArrayPos)] } } From fc7584f5a012ff5919156ecc8a0a414985218883 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 23 Mar 2018 16:56:37 -0700 Subject: [PATCH 329/728] scorch zap prealloc extra locs for 
future growth --- index/scorch/segment/zap/posting.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index e994617ad..cf1e1ab0e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -495,10 +495,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] } else { - i.nextLocs = make([]Location, rv.freq) + i.nextLocs = make([]Location, rv.freq, rv.freq * 2) } if cap(i.nextSegmentLocs) < int(rv.freq) { - i.nextSegmentLocs = make([]segment.Location, rv.freq) + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq * 2) } rv.locs = i.nextSegmentLocs[0:rv.freq] for j := 0; j < int(rv.freq); j++ { From 192621f402857bd9bd1aa091be3ee38572256bee Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 24 Mar 2018 10:03:57 -0700 Subject: [PATCH 330/728] scorch includeFreq/Norm/Locs params for postingsList.Iterator API This commit adds boolean flag params to the scorch PostingsList.Iterator() method, so that the caller can specify whether freq/norm/locs information is needed or not. Future changes can leverage these params for optimizations. 
--- index/scorch/segment/empty.go | 2 +- index/scorch/segment/mem/posting.go | 2 +- index/scorch/segment/mem/segment_test.go | 14 +++++++------- index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/merge.go | 2 +- index/scorch/segment/zap/merge_test.go | 4 ++-- index/scorch/segment/zap/posting.go | 11 ++++++----- index/scorch/segment/zap/segment_test.go | 16 ++++++++-------- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_segment.go | 2 +- 10 files changed, 29 insertions(+), 28 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 6c19f60f9..6fa85f657 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -84,7 +84,7 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { type EmptyPostingsList struct{} -func (e *EmptyPostingsList) Iterator() PostingsIterator { +func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) PostingsIterator { return &EmptyPostingsIterator{} } diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 4203acbe5..362fdb7c5 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -78,7 +78,7 @@ func (p *PostingsList) Count() uint64 { } // Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator() segment.PostingsIterator { +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) segment.PostingsIterator { return p.InitIterator(nil) } func (p *PostingsList) InitIterator(prealloc *PostingsIterator) *PostingsIterator { diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go index 6c5625d86..c4c01f144 100644 --- a/index/scorch/segment/mem/segment_test.go +++ b/index/scorch/segment/mem/segment_test.go @@ -48,7 +48,7 @@ func TestEmpty(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + 
postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -211,7 +211,7 @@ func TestSingle(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -257,7 +257,7 @@ func TestSingle(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -325,7 +325,7 @@ func TestSingle(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -394,7 +394,7 @@ func TestSingle(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -638,7 +638,7 @@ func TestMultiple(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -677,7 +677,7 @@ func TestMultiple(t *testing.T) { t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) } - postingsItrExcluding := postingsListExcluding.Iterator() + postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 8eee5f75f..adab7a01e 100644 --- 
a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -55,7 +55,7 @@ type DictionaryIterator interface { } type PostingsList interface { - Iterator() PostingsIterator + Iterator(includeFreq, includeNorm, includeLocations bool) PostingsIterator Size() int diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 167ebfa24..0d40d5f28 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -303,7 +303,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err2 } - postItr = postings.iterator(postItr) + postItr = postings.iterator(true, true, true, postItr) if fieldsSame { // can optimize by copying freq/norm/loc bytes directly diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index d931f6c23..2675bf838 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -332,8 +332,8 @@ func compareSegments(a, b *Segment) string { fieldName, next.Term, aplist.Count(), bplist.Count())) } - apitr := aplist.Iterator() - bpitr := bplist.Iterator() + apitr := aplist.Iterator(true, true, true) + bpitr := bplist.Iterator(true, true, true) if (apitr != nil) != (bpitr != nil) { rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList.Iterator() results different: %v %v", fieldName, next.Term, apitr, bpitr)) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index cf1e1ab0e..4092b685e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -131,11 +131,12 @@ func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { } // Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator() segment.PostingsIterator { - return p.iterator(nil) +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) segment.PostingsIterator { + return p.iterator(includeFreq, 
includeNorm, includeLocations, nil) } -func (p *PostingsList) iterator(rv *PostingsIterator) *PostingsIterator { +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocations bool, + rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} } else { @@ -495,10 +496,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] } else { - i.nextLocs = make([]Location, rv.freq, rv.freq * 2) + i.nextLocs = make([]Location, rv.freq, rv.freq*2) } if cap(i.nextSegmentLocs) < int(rv.freq) { - i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq * 2) + i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) } rv.locs = i.nextSegmentLocs[0:rv.freq] for j := 0; j < int(rv.freq); j++ { diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 50d5dbd7f..9f0a4015d 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -84,7 +84,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -130,7 +130,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -198,7 +198,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -267,7 +267,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator() + 
postingsItr = postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -366,7 +366,7 @@ func TestOpenMulti(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -405,7 +405,7 @@ func TestOpenMulti(t *testing.T) { t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) } - postingsItrExcluding := postingsListExcluding.Iterator() + postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -466,7 +466,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator() + postingsItr := postingsList.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -500,7 +500,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItrExcluding := postingsListExcluding.Iterator() + postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6f4b0288e..95343af7f 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -394,7 +394,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return nil, err } rv.postings[i] = pl - rv.iterators[i] = pl.Iterator() + rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors) } atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 
edf52a6e7..805e56642 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -165,7 +165,7 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { } cfd.size += uint64(size.SizeOfUint64) /* map key */ - postingsItr := postings.Iterator() + postingsItr := postings.Iterator(false, false, false) nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() From 1cab701f85fba914edf192ded54d76a088b415f1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 24 Mar 2018 10:32:01 -0700 Subject: [PATCH 331/728] scorch zap postingsIter skips freq/norm/locs parsing if allowed In this optimization, the zap PostingsIterator skips the parsing of freq/norm/locs chunks based on the includeFreq|Norm|Locs flags. In bleve-query microbenchmark on dev macbookpro, with 50K en-wiki docs, on a medium frequency term search that does not ask for term vectors, throughput was ~750 q/sec before the change and went to ~1400 q/sec after the change. 
--- index/scorch/segment/zap/posting.go | 134 ++++++++++++++++------------ 1 file changed, 79 insertions(+), 55 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 4092b685e..b3a1891ec 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -131,11 +131,11 @@ func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { } // Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) segment.PostingsIterator { - return p.iterator(includeFreq, includeNorm, includeLocations, nil) +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool) segment.PostingsIterator { + return p.iterator(includeFreq, includeNorm, includeLocs, nil) } -func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocations bool, +func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv *PostingsIterator) *PostingsIterator { if rv == nil { rv = &PostingsIterator{} @@ -195,38 +195,45 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocations bool, return rv } - // prepare the freq chunk details var n uint64 var read int - var numFreqChunks uint64 - numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { - rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] - } else { - rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) - } - for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + + // prepare the freq chunk details + rv.includeFreqNorm = includeFreq || includeNorm + if rv.includeFreqNorm { + var numFreqChunks uint64 + numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) n += uint64(read) + 
if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { + rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] + } else { + rv.freqChunkOffsets = make([]uint64, int(numFreqChunks)) + } + for i := 0; i < int(numFreqChunks); i++ { + rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.freqChunkStart = p.freqOffset + n } - rv.freqChunkStart = p.freqOffset + n // prepare the loc chunk details - n = 0 - var numLocChunks uint64 - numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - if cap(rv.locChunkOffsets) >= int(numLocChunks) { - rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] - } else { - rv.locChunkOffsets = make([]uint64, int(numLocChunks)) - } - for i := 0; i < int(numLocChunks); i++ { - rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + rv.includeLocs = includeLocs + if rv.includeLocs { + n = 0 + var numLocChunks uint64 + numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) n += uint64(read) + if cap(rv.locChunkOffsets) >= int(numLocChunks) { + rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] + } else { + rv.locChunkOffsets = make([]uint64, int(numLocChunks)) + } + for i := 0; i < int(numLocChunks); i++ { + rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + } + rv.locChunkStart = p.locOffset + n } - rv.locChunkStart = p.locOffset + n rv.all = p.postings.Iterator() if p.except != nil { @@ -329,6 +336,9 @@ type PostingsIterator struct { normBits1Hit uint64 buf []byte + + includeFreqNorm bool + includeLocs bool } func (i *PostingsIterator) Size() int { @@ -347,32 +357,42 @@ func (i *PostingsIterator) Size() int { } func (i *PostingsIterator) loadChunk(chunk int) error { - if chunk >= 
len(i.freqChunkOffsets) || chunk >= len(i.locChunkOffsets) { - return fmt.Errorf("tried to load chunk that doesn't exist %d/(%d %d)", chunk, len(i.freqChunkOffsets), len(i.locChunkOffsets)) - } - - end, start := i.freqChunkStart, i.freqChunkStart - s, e := readChunkBoundary(chunk, i.freqChunkOffsets) - start += s - end += e - i.currChunkFreqNorm = i.postings.sb.mem[start:end] - if i.freqNormReader == nil { - i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) - i.freqNormDecoder = govarint.NewU64Base128Decoder(i.freqNormReader) - } else { - i.freqNormReader.Reset(i.currChunkFreqNorm) + if i.includeFreqNorm { + if chunk >= len(i.freqChunkOffsets) { + return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", + chunk, len(i.freqChunkOffsets)) + } + + end, start := i.freqChunkStart, i.freqChunkStart + s, e := readChunkBoundary(chunk, i.freqChunkOffsets) + start += s + end += e + i.currChunkFreqNorm = i.postings.sb.mem[start:end] + if i.freqNormReader == nil { + i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) + i.freqNormDecoder = govarint.NewU64Base128Decoder(i.freqNormReader) + } else { + i.freqNormReader.Reset(i.currChunkFreqNorm) + } } - end, start = i.locChunkStart, i.locChunkStart - s, e = readChunkBoundary(chunk, i.locChunkOffsets) - start += s - end += e - i.currChunkLoc = i.postings.sb.mem[start:end] - if i.locReader == nil { - i.locReader = bytes.NewReader(i.currChunkLoc) - i.locDecoder = govarint.NewU64Base128Decoder(i.locReader) - } else { - i.locReader.Reset(i.currChunkLoc) + if i.includeLocs { + if chunk >= len(i.locChunkOffsets) { + return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)", + chunk, len(i.locChunkOffsets)) + } + + end, start := i.locChunkStart, i.locChunkStart + s, e := readChunkBoundary(chunk, i.locChunkOffsets) + start += s + end += e + i.currChunkLoc = i.postings.sb.mem[start:end] + if i.locReader == nil { + i.locReader = bytes.NewReader(i.currChunkLoc) + i.locDecoder = 
govarint.NewU64Base128Decoder(i.locReader) + } else { + i.locReader.Reset(i.currChunkLoc) + } } i.currChunk = uint32(chunk) @@ -481,6 +501,10 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv := &i.next rv.docNum = docNum + if !i.includeFreqNorm { + return rv, nil + } + var normBits uint64 var hasLocs bool @@ -491,7 +515,7 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { rv.norm = math.Float32frombits(uint32(normBits)) - if hasLocs { + if i.includeLocs && hasLocs { // read off 'freq' locations, into reused slices if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] @@ -591,7 +615,7 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { // if they don't match, move 'all' forwards until they do for allN != n { // in the same chunk, so move the freq/norm/loc decoders forward - if allNChunk == nChunk { + if i.includeFreqNorm && allNChunk == nChunk { if i.currChunk != nChunk || i.currChunkFreqNorm == nil { err := i.loadChunk(int(nChunk)) if err != nil { @@ -605,7 +629,7 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { return 0, false, err } - if hasLocs { + if i.includeLocs && hasLocs { for j := 0; j < int(freq); j++ { err := i.readLocation(nil) if err != nil { @@ -619,7 +643,7 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { allNChunk = allN / i.postings.sb.chunkFactor } - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) { err := i.loadChunk(int(nChunk)) if err != nil { return 0, false, fmt.Errorf("error loading chunk: %v", err) From 72ac35296132e4ab3da85f56237bebbf3cf2bcbe Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 23 Mar 2018 16:07:14 +0530 Subject: [PATCH 332/728] TermFieldReader Advance optimization skips to the target segment and avoid un necesary read of freq,loc,norm details --- index/scorch/segment/empty.go | 4 ++++ index/scorch/segment/mem/posting.go | 21 
+++++++++++++++++ index/scorch/segment/segment.go | 4 ++++ index/scorch/segment/zap/posting.go | 31 +++++++++++++++++++++++++ index/scorch/snapshot_index_tfr.go | 35 +++++++++++++++++------------ 5 files changed, 81 insertions(+), 14 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 6c19f60f9..21f8db3fc 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -105,3 +105,7 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Size() int { return 0 } + +func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { + return nil, nil +} diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 4203acbe5..2b65a91c8 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -155,6 +155,27 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return &i.reuse, nil } +func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { + if i.reuse.Number() == docNumber { + return &i.reuse, nil + } + next, err := i.Next() + if err != nil { + return next, err + } + + nnum := next.Number() + for nnum < docNumber { + next, err = i.Next() + if err != nil || next == nil { + return next, err + } + nnum = next.Number() + } + + return next, nil +} + // Posting is a single entry in a postings list type Posting struct { iterator *PostingsIterator diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 8eee5f75f..5e917b798 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -75,6 +75,10 @@ type PostingsIterator interface { Next() (Posting, error) Size() int + + // Advance will return the respective posting of the + // sepcified doc number or its immediate follower. 
+ Advance(docNum uint64) (Posting, error) } type Posting interface { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 004b80317..b756c936e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -558,6 +558,37 @@ func (i *PostingsIterator) nextBytes() ( return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil } +func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { + // check if we are already there + if i.next.Number() == docNumber { + return &i.next, nil + } + + nChunk := uint32(docNumber) / i.postings.sb.chunkFactor + if i.currChunk != nChunk { + err := i.loadChunk(int(nChunk)) + if err != nil { + return nil, fmt.Errorf("Advance, error loading chunk: %v", err) + } + } + + next, err := i.Next() + if err != nil || next == nil { + return nil, err + } + + nnum := next.Number() + for nnum < docNumber { + next, err = i.Next() + if err != nil || next == nil { + return next, err + } + nnum = next.Number() + } + + return next, nil +} + // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. 
func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index e1a0e9a59..4c3d08edd 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -115,7 +115,8 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } } -func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { +func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, + preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { @@ -126,24 +127,30 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } *i = *(i2.(*IndexSnapshotTermFieldReader)) } - // FIXME do something better - next, err := i.Next(preAlloced) + + num, err := docInternalToNumber(ID) if err != nil { - return nil, err + return nil, nil } - if next == nil { + segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) + if segIndex > len(i.snapshot.segment) { return nil, nil } - for bytes.Compare(next.ID, ID) < 0 { - next, err = i.Next(preAlloced) - if err != nil { - return nil, err - } - if next == nil { - break - } + // skip directly to the target segment + next, err := i.iterators[segIndex].Advance(ldocNum) + if err != nil || next == nil { + return nil, err + } + + if preAlloced == nil { + preAlloced = &index.TermFieldDoc{} } - return next, nil + preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + i.snapshot.offsets[segIndex]) + i.postingToTermFieldDoc(next, preAlloced) + i.currID = preAlloced.ID + i.currPosting = next + return preAlloced, nil } func (i *IndexSnapshotTermFieldReader) Count() uint64 { From db6a2c274f692fc77093ea93ea7146a3cba1b7e2 Mon Sep 17 00:00:00 2001 
From: Sreekanth Sivasankaran Date: Tue, 27 Mar 2018 22:10:09 +0530 Subject: [PATCH 333/728] adding nil check --- index/scorch/segment/mem/posting.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 2b65a91c8..27be5dbc6 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -160,7 +160,7 @@ func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { return &i.reuse, nil } next, err := i.Next() - if err != nil { + if err != nil || next == nil { return next, err } From 596d990eb95198f7d0fc19353b052622587f8895 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 27 Mar 2018 13:57:46 -0700 Subject: [PATCH 334/728] scorch zap optimize when zero hits Instead of allocating brand-new empty postingsList/Iterator instances, reuse some empty singletons. --- index/scorch/segment/zap/dict.go | 8 +++++++- index/scorch/segment/zap/posting.go | 13 +++++++++++++ index/scorch/segment/zap/segment.go | 2 +- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 38b4faca1..049ebb366 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -39,6 +39,9 @@ func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment. 
func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { if d.fst == nil { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } return d.postingsListInit(rv, except), nil } @@ -47,6 +50,9 @@ func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *Posti return nil, fmt.Errorf("vellum err: %v", err) } if !exists { + if rv == nil || rv == emptyPostingsList { + return emptyPostingsList, nil + } return d.postingsListInit(rv, except), nil } @@ -65,7 +71,7 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari } func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { - if rv == nil { + if rv == nil || rv == emptyPostingsList { rv = &PostingsList{} } else { postings := rv.postings diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 6b9e1a533..c0c39571a 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -109,6 +109,9 @@ type PostingsList struct { normBits1Hit uint64 } +// represents an immutable, empty postings list +var emptyPostingsList = &PostingsList{} + func (p *PostingsList) Size() int { sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr @@ -132,6 +135,10 @@ func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { // Iterator returns an iterator for this postings list func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool) segment.PostingsIterator { + if p.normBits1Hit == 0 && p.postings == nil { + return emptyPostingsIterator + } + return p.iterator(includeFreq, includeNorm, includeLocs, nil) } @@ -341,6 +348,8 @@ type PostingsIterator struct { includeLocs bool } +var emptyPostingsIterator = &PostingsIterator{} + func (i *PostingsIterator) Size() int { sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + len(i.currChunkFreqNorm) + @@ -589,6 +598,10 @@ func (i 
*PostingsIterator) nextBytes() ( } func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { + if i.postings == nil { + return nil, nil + } + // check if we are already there if i.next.Number() == docNumber { return &i.next, nil diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 3a3fd02eb..58f8bee02 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -373,7 +373,7 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { return nil, err } - var postingsList *PostingsList + postingsList := emptyPostingsList for _, id := range ids { postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) if err != nil { From 013d06d756ca362eb06586e7ac88ca508a2ddb9c Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 27 Mar 2018 14:05:17 -0700 Subject: [PATCH 335/728] scorch TermFieldReader() reuses string(term) --- index/scorch/snapshot_index.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 95343af7f..251aa9fad 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -373,7 +373,7 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { - + termStr := string(term) rv := &IndexSnapshotTermFieldReader{ term: term, field: field, @@ -389,7 +389,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, if err != nil { return nil, err } - pl, err := dict.PostingsList(string(term), nil) + pl, err := dict.PostingsList(termStr, nil) if err != nil { return nil, err } From b955bdcd72657b268eb6e6a8d530d919557a3ad0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 28 Mar 2018 10:01:01 -0700 Subject: [PATCH 336/728] scorch optimize 
docInternalToNumber() to avoid allocations docInternalToNumber() no longer allocates a reader instance and a heap uint64 to hold the result. --- index/scorch/snapshot_index.go | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 251aa9fad..800d250f6 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -413,12 +413,10 @@ func docNumberToBytes(buf []byte, in uint64) []byte { } func docInternalToNumber(in index.IndexInternalID) (uint64, error) { - var res uint64 - err := binary.Read(bytes.NewReader(in), binary.BigEndian, &res) - if err != nil { - return 0, err + if len(in) != 8 { + return 0, fmt.Errorf("wrong len for IndexInternalID: %q", in) } - return res, nil + return binary.BigEndian.Uint64(in), nil } func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, From 72d65ab1e18d2959aa9690fcdb2d02450d1367cf Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 28 Mar 2018 15:53:25 -0700 Subject: [PATCH 337/728] scorch zap optimize stored field format for _id field NOTE: This is a zap version and file format change. This optimization treats the _id field as a special case, where the _id field is no longer snappy compressed along with all the other non-"_id" stored fields. Instead, the _id field's length and bytes are written out early and as-is, making the _id value easier to access. This can be beneficial to search-related methods, such as ExternalID(), which only require _id field information. As part of this, we're also no longer writing out unnecessary data for the _id field that we persist for other non-"_id" fields, such as: the numeric fieldID (i.e., uint16(0)), the type (i.e., assumed to be 't' as a text field), the start offset, the end len, and the arrayPositions. 
Various unit tests were corrected as part of this change in order to more uniformly treat the _id field as having IndexingOptions as a stored field and not having DocValues. --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/build_test.go | 9 +++++--- index/scorch/segment/zap/merge.go | 26 ++++++++++++++++++------ index/scorch/segment/zap/new.go | 17 ++++++++++++++-- index/scorch/segment/zap/segment.go | 25 ++++++++++++++++++----- index/scorch/segment/zap/segment_test.go | 6 ++---- 6 files changed, 64 insertions(+), 21 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 03e4cbae8..e8f3499b4 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,7 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 8 +const version uint32 = 9 const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index e8189f760..5b6ffd559 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -311,7 +311,8 @@ func buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB b doc := &document.Document{ ID: "a", Fields: []document.Field{ - document.NewTextField("_id", []uint64{}, []byte("a")), + document.NewTextFieldCustom("_id", nil, []byte("a"), + document.IndexField|document.StoreField, nil), document.NewTextField("name", []uint64{}, []byte("ABC")), document.NewTextField("dept", []uint64{}, []byte("ABC dept")), document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), @@ -388,7 +389,8 @@ func buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB b doc := &document.Document{ ID: "b", Fields: []document.Field{ - document.NewTextField("_id", []uint64{}, []byte("b")), + document.NewTextFieldCustom("_id", nil, []byte("b"), + document.IndexField|document.StoreField, nil), document.NewTextField("name", []uint64{}, 
[]byte("XYZ")), document.NewTextField("dept", []uint64{}, []byte("ABC dept")), document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), @@ -468,7 +470,8 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( doc := &document.Document{ ID: "a", Fields: []document.Field{ - document.NewTextField("_id", nil, []byte("a")), + document.NewTextFieldCustom("_id", nil, []byte("a"), + document.IndexField|document.StoreField, nil), document.NewTextField("name", nil, []byte("wow")), document.NewTextField("desc", nil, []byte("some thing")), document.NewTextField("tag", []uint64{0}, []byte("cold")), diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 0d40d5f28..3eafa0c77 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -622,12 +622,19 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, return 0, nil, err } - // now walk the fields in order - for fieldID := range fieldsInv { - storedFieldValues := vals[int(fieldID)] + // _id field special case optimizes ExternalID() lookups + idFieldVal := vals[uint16(0)][0] + _, err = metaEncoder.PutU64(uint64(len(idFieldVal))) + if err != nil { + return 0, nil, err + } + + // now walk the non-"_id" fields in order + for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { + storedFieldValues := vals[fieldID] - stf := typs[int(fieldID)] - spf := poss[int(fieldID)] + stf := typs[fieldID] + spf := poss[fieldID] var err2 error curr, data, err2 = persistStoredFieldValues(fieldID, @@ -646,7 +653,9 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, docNumOffsets[newDocNum] = uint64(w.Count()) // write out the meta len and compressed data len - _, err = writeUvarints(w, uint64(len(metaBytes)), uint64(len(compressed))) + _, err = writeUvarints(w, + uint64(len(metaBytes)), + uint64(len(idFieldVal)+len(compressed))) if err != nil { return 0, nil, err } @@ -655,6 +664,11 @@ func 
mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, if err != nil { return 0, nil, err } + // now write the _id field val (counted as part of the 'compressed' data) + _, err = w.Write(idFieldVal) + if err != nil { + return 0, nil, err + } // now write the compressed data _, err = w.Write(compressed) if err != nil { diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 83a0cbced..63b154622 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -518,7 +518,15 @@ func (s *interim) writeStoredFields() ( s.metaBuf.Reset() data = data[:0] - for fieldID := range s.FieldsInv { + // _id field special case optimizes ExternalID() lookups + idFieldVal := docStoredFields[uint16(0)].vals[0] + _, err = metaEncoder.PutU64(uint64(len(idFieldVal))) + if err != nil { + return 0, err + } + + // handle non-"_id" fields + for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { isf, exists := docStoredFields[uint16(fieldID)] if exists { curr, data, err = persistStoredFieldValues( @@ -539,7 +547,7 @@ func (s *interim) writeStoredFields() ( _, err := writeUvarints(s.w, uint64(len(metaBytes)), - uint64(len(compressed))) + uint64(len(idFieldVal)+len(compressed))) if err != nil { return 0, err } @@ -549,6 +557,11 @@ func (s *interim) writeStoredFields() ( return 0, err } + _, err = s.w.Write(idFieldVal) + if err != nil { + return 0, err + } + _, err = s.w.Write(compressed) if err != nil { return 0, err diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 58f8bee02..541647abb 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -298,16 +298,31 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) meta, compressed := s.getDocStoredMetaAndCompressed(num) - uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + + vdc.reader.Reset(meta) 
+ decoder := vdc.decoder + + // handle _id field special case + idFieldValLen, err := decoder.GetU64() if err != nil { return err } + idFieldVal := compressed[:idFieldValLen] - // now decode meta and process - vdc.reader.Reset(meta) - decoder := vdc.decoder + keepGoing := visitor("_id", byte('t'), idFieldVal, nil) + if !keepGoing { + visitDocumentCtxPool.Put(vdc) + return nil + } + + // handle non-"_id" fields + compressed = compressed[idFieldValLen:] + + uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) + if err != nil { + return err + } - keepGoing := true for keepGoing { field, err := decoder.GetU64() if err == io.EOF { diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index eb7ed65e4..00ae1c2f6 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -18,7 +18,6 @@ import ( "math" "os" "reflect" - "sort" "testing" "github.com/RoaringBitmap/roaring" @@ -552,7 +551,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, expectedFields, _ := buildTestSegmentWithDefaultFieldMapping(1) + testSeg, _, _ = buildTestSegmentWithDefaultFieldMapping(1) err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) @@ -576,7 +575,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { t.Fatalf("segment VisitableDocValueFields err: %v", err) } - sort.Strings(expectedFields[1:]) // keep _id as first field + expectedFields := []string{"desc", "name", "tag"} if !reflect.DeepEqual(fields, expectedFields) { t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields) } @@ -593,7 +592,6 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { "name": []string{"wow"}, "desc": []string{"some", "thing"}, "tag": []string{"cold"}, - "_id": []string{"a"}, } if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { t.Errorf("expected field 
terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) From c892c4175100ca79d387fe4eb123b13ac3618384 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 28 Mar 2018 17:36:48 -0700 Subject: [PATCH 338/728] scorch optimize conjunctions via push-down to roaring.And() This change pushes down conjunction information to indexers for potential TermFieldReader optimizations. The scorch indexer can optimize this situation by constructing a more selective roaring "actual" bitmap for any PostingsIterator's that are part of the conjunction. On a bleve-query microbenchmark, dev macbook, with a scorch index of 50K en-wiki docs, using a repeated, single-threaded conjunction query of several relatively high-frequency terms (query-string of "+text:see +text:also +text:where +text:http"), perf was ~265 queries/sec before this change, and ~310 queries/sec after this change. --- index/index.go | 16 +++++ index/scorch/optimize.go | 93 +++++++++++++++++++++++++++ index/scorch/segment/zap/posting.go | 14 ++-- search/searcher/search_conjunction.go | 24 +++++++ search/searcher/search_disjunction.go | 15 +++++ search/searcher/search_term.go | 10 +++ 6 files changed, 166 insertions(+), 6 deletions(-) create mode 100644 index/scorch/optimize.go diff --git a/index/index.go b/index/index.go index e5a69297b..e3731e3f8 100644 --- a/index/index.go +++ b/index/index.go @@ -272,3 +272,19 @@ func (b *Batch) Reset() { b.IndexOps = make(map[string]*document.Document) b.InternalOps = make(map[string][]byte) } + +// Optimizable represents an optional interface that implementable by +// optimizable resources (e.g., TermFieldReaders, Searchers). These +// optimizable resources are provided the same OptimizableContext +// instance, so that they can coordinate via dynamic interface +// casting. 
+type Optimizable interface { + Optimize(kind string, octx OptimizableContext) (OptimizableContext, error) +} + +type OptimizableContext interface { + // Once all the optimzable resources have been provided the same + // OptimizableContext instance, the optimization preparations are + // finished or completed via the Finish() method. + Finish() error +} diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go new file mode 100644 index 000000000..b45fc8b0d --- /dev/null +++ b/index/scorch/optimize.go @@ -0,0 +1,93 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package scorch + +import ( + "fmt" + + "github.com/RoaringBitmap/roaring" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment/zap" +) + +func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if kind != "conjunction" { + return octx, nil + } + + if octx == nil { + octx = &OptimizeTFRConjunction{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRConjunction) + if !ok { + return octx, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRConjunction struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +func (o *OptimizeTFRConjunction) Finish() error { + if len(o.tfrs) <= 1 { + return nil + } + + for i := range o.snapshot.segment { + itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator) + if !ok || itr0.ActualBM == nil { + continue + } + + itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator) + if !ok || itr1.ActualBM == nil { + continue + } + + bm := roaring.And(itr0.ActualBM, itr1.ActualBM) + + for _, tfr := range o.tfrs[2:] { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok || itr.ActualBM == nil { + continue + } + + bm.And(itr.ActualBM) + } + + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if ok && itr.ActualBM != nil { + itr.ActualBM = bm + itr.Actual = bm.Iterator() + } + } + } + + return nil +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index c0c39571a..d2e51e039 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -244,10 +244,11 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv.all = p.postings.Iterator() if p.except != nil { - allExcept := roaring.AndNot(p.postings, p.except) - rv.actual = 
allExcept.Iterator() + rv.ActualBM = roaring.AndNot(p.postings, p.except) + rv.Actual = rv.ActualBM.Iterator() } else { - rv.actual = p.postings.Iterator() + rv.ActualBM = p.postings + rv.Actual = p.postings.Iterator() } return rv @@ -319,7 +320,8 @@ func (rv *PostingsList) init1Hit(fstVal uint64) error { type PostingsIterator struct { postings *PostingsList all roaring.IntIterable - actual roaring.IntIterable + Actual roaring.IntIterable + ActualBM *roaring.Bitmap currChunk uint32 currChunkFreqNorm []byte @@ -644,11 +646,11 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { return docNum, true, nil } - if i.actual == nil || !i.actual.HasNext() { + if i.Actual == nil || !i.Actual.HasNext() { return 0, false, nil } - n := i.actual.Next() + n := i.Actual.Next() allN := i.all.Next() nChunk := n / i.postings.sb.chunkFactor diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index da65f3981..a48052679 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -60,6 +60,30 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S scorer: scorer.NewConjunctionQueryScorer(options), } rv.computeQueryNorm() + + // attempt push-down conjunction optimization when there's >1 searchers + if len(searchers) > 1 { + var octx index.OptimizableContext + + for _, searcher := range searchers { + o, ok := searcher.(index.Optimizable) + if ok { + var err error + octx, err = o.Optimize("conjunction", octx) + if err != nil { + return nil, err + } + } + } + + if octx != nil { + err := octx.Finish() + if err != nil { + return nil, err + } + } + } + return &rv, nil } diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 32d614801..b75041371 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -306,3 +306,18 @@ func (s *DisjunctionSearcher) DocumentMatchPoolSize() int { } return rv } + +// 
a disjunction searcher implements the index.Optimizable interface +// but only activates on an edge case where the disjunction is a +// wrapper around a single Optimizable child searcher +func (s *DisjunctionSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if len(s.searchers) == 1 { + o, ok := s.searchers[0].(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + } + + return octx, nil +} diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index b99e4c263..4fee58bbf 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -137,3 +137,13 @@ func (s *TermSearcher) Min() int { func (s *TermSearcher) DocumentMatchPoolSize() int { return 1 } + +func (s *TermSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + o, ok := s.reader.(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + + return octx, nil +} From 71a9f6c1b36409fa34e2e4782932110a40ee4ab8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 29 Mar 2018 16:12:08 -0700 Subject: [PATCH 339/728] scorch optimize DictionaryIterator.Next() entry reuse Optimize DictionaryIterator to reuse a index.DictEntry rather than allocating a new instance on every Next(). This follows an approach used by upsidedown's dictionary iterators to avoid mem allocations. 
--- index/scorch/segment/zap/dict.go | 17 ++++++++--------- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_index_dict.go | 15 ++++++++------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 049ebb366..33240df2c 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -151,10 +151,11 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { - d *Dictionary - itr vellum.Iterator - err error - tmp PostingsList + d *Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry } // Next returns the next entry in the dictionary @@ -169,10 +170,8 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { if i.err != nil { return nil, i.err } - rv := &index.DictEntry{ - Term: string(term), - Count: i.tmp.Count(), - } + i.entry.Term = string(term) + i.entry.Count = i.tmp.Count() i.err = i.itr.Next() - return rv, nil + return &i.entry, nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 800d250f6..4e22f68c5 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -140,7 +140,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s if next != nil { rv.cursors = append(rv.cursors, &segmentDictCursor{ itr: asr.dictItr, - curr: next, + curr: *next, }) } } diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 3c902cad6..2d229ca0f 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -23,12 +23,13 @@ import ( type segmentDictCursor struct { itr segment.DictionaryIterator - curr *index.DictEntry + curr index.DictEntry } type IndexSnapshotFieldDict struct { snapshot *IndexSnapshot cursors []*segmentDictCursor + entry index.DictEntry } 
func (i *IndexSnapshotFieldDict) Len() int { return len(i.cursors) } @@ -54,7 +55,7 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { if len(i.cursors) <= 0 { return nil, nil } - rv := i.cursors[0].curr + i.entry = i.cursors[0].curr next, err := i.cursors[0].itr.Next() if err != nil { return nil, err @@ -64,12 +65,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { heap.Pop(i) } else { // modified heap, fix it - i.cursors[0].curr = next + i.cursors[0].curr = *next heap.Fix(i, 0) } // look for any other entries with the exact same term - for len(i.cursors) > 0 && i.cursors[0].curr.Term == rv.Term { - rv.Count += i.cursors[0].curr.Count + for len(i.cursors) > 0 && i.cursors[0].curr.Term == i.entry.Term { + i.entry.Count += i.cursors[0].curr.Count next, err := i.cursors[0].itr.Next() if err != nil { return nil, err @@ -79,12 +80,12 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { heap.Pop(i) } else { // modified heap, fix it - i.cursors[0].curr = next + i.cursors[0].curr = *next heap.Fix(i, 0) } } - return rv, nil + return &i.entry, nil } func (i *IndexSnapshotFieldDict) Close() error { From dd6ffbcb8ab38177809a2ce489d0a2ef65c7cfed Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 29 Mar 2018 18:36:48 -0700 Subject: [PATCH 340/728] Address data race with eligibleForRemoval WARNING: DATA RACE Read at 0x00c4201945c8 by goroutine 70: runtime.growslice() /usr/local/Cellar/go/1.9.3/libexec/src/runtime/slice.go:82 +0x0 github.com/blevesearch/bleve/index/scorch.(*Scorch).AddEligibleForRemoval() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:485 +0x1bd Previous write at 0x00c4201945c8 by goroutine 47: github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt.func1() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:476 +0x32b github.com/boltdb/bolt.(*DB).View() 
/Users/abhinavdangeti/Documents/go/src/github.com/boltdb/bolt/db.go:629 +0xc1 github.com/blevesearch/bleve/index/scorch.(*Scorch).loadFromBolt() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/persister.go:462 +0xa1 github.com/blevesearch/bleve/index/scorch.(*Scorch).openBolt() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:172 +0x9f7 github.com/blevesearch/bleve/index/scorch.(*Scorch).Open() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/scorch.go:121 +0x3c github.com/blevesearch/bleve/index/scorch.TestIndexInsertThenDelete() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/scorch_test.go:341 +0x18d3 testing.tRunner() /usr/local/Cellar/go/1.9.3/libexec/src/testing/testing.go:746 +0x16c --- index/scorch/persister.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 2fab53240..9a5c4ca46 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -473,19 +473,19 @@ func (s *Scorch) loadFromBolt() error { continue } if foundRoot { - s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } snapshot := snapshots.Bucket(k) if snapshot == nil { log.Printf("snapshot key, but bucket missing %x, continuing", k) - s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } indexSnapshot, err := s.loadSnapshot(snapshot) if err != nil { log.Printf("unable to load snapshot, %v, continuing", err) - s.eligibleForRemoval = append(s.eligibleForRemoval, snapshotEpoch) + s.AddEligibleForRemoval(snapshotEpoch) continue } indexSnapshot.epoch = snapshotEpoch From 0b4dadf430338183b4ce0da0ffc92f6b2c1e4e25 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 29 Mar 2018 18:56:16 -0700 Subject: [PATCH 341/728] Update nextSnapshotEpoch 
only within root Lock --- index/scorch/introducer.go | 9 +++++---- index/scorch/persister.go | 2 +- index/scorch/scorch.go | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 627d4e4cd..b00260bbe 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -212,19 +212,20 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { atomic.AddUint64(&s.stats.TotIntroducePersistBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroducePersistEnd, 1) - s.rootLock.RLock() + s.rootLock.Lock() root := s.root - s.rootLock.RUnlock() + nextSnapshotEpoch := s.nextSnapshotEpoch + s.nextSnapshotEpoch++ + s.rootLock.Unlock() newIndexSnapshot := &IndexSnapshot{ parent: s, - epoch: s.nextSnapshotEpoch, + epoch: nextSnapshotEpoch, segment: make([]*SegmentSnapshot, len(root.segment)), offsets: make([]uint64, len(root.offsets)), internal: make(map[string][]byte, len(root.internal)), refs: 1, } - s.nextSnapshotEpoch++ for i, segmentSnapshot := range root.segment { // see if this segment has been replaced diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 2fab53240..14fce1f30 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -495,8 +495,8 @@ func (s *Scorch) loadFromBolt() error { return err } s.nextSegmentID++ - s.nextSnapshotEpoch = snapshotEpoch + 1 s.rootLock.Lock() + s.nextSnapshotEpoch = snapshotEpoch + 1 if s.root != nil { _ = s.root.DecRef() } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index cc47cda86..13e9a4027 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -57,8 +57,8 @@ type Scorch struct { nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. 
- numSnapshotsToKeep int + numSnapshotsToKeep int closeCh chan struct{} introductions chan *segmentIntroduction persists chan *persistIntroduction From 7e3d8ae96d16204cda4c4adc310de4699c07b20d Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 29 Mar 2018 19:03:43 -0700 Subject: [PATCH 342/728] Address data race with size of cachedDocs WARNING: DATA RACE Read at 0x00c420901b50 by goroutine 240: github.com/blevesearch/bleve/index/scorch.(*SegmentSnapshot).Size() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go:233 +0xbb github.com/blevesearch/bleve/index/scorch.(*IndexSnapshot).updateSize() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_index.go:108 +0x102 github.com/blevesearch/bleve/index/scorch.(*Scorch).introduceMerge() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/introducer.go:354 +0x131e github.com/blevesearch/bleve/index/scorch.(*Scorch).mainLoop() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/introducer.go:66 +0x82d Previous write at 0x00c420901b50 by goroutine 103: github.com/blevesearch/bleve/index/scorch.(*cachedDocs).updateSizeLOCKED() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go:246 +0x203 github.com/blevesearch/bleve/index/scorch.(*cachedDocs).prepareFields() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_segment.go:226 +0x5fb github.com/blevesearch/bleve/index/scorch.(*IndexSnapshot).DocumentVisitFieldTerms.func1() /Users/abhinavdangeti/Documents/go/src/github.com/blevesearch/bleve/index/scorch/snapshot_index.go:456 +0x9f --- index/scorch/snapshot_segment.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 805e56642..c1fc9b8f3 100644 --- a/index/scorch/snapshot_segment.go +++ 
b/index/scorch/snapshot_segment.go @@ -16,6 +16,7 @@ package scorch import ( "sync" + "sync/atomic" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -230,7 +231,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e } func (c *cachedDocs) Size() int { - return int(c.size) + return int(atomic.LoadUint64(&c.size)) } func (c *cachedDocs) updateSizeLOCKED() { @@ -243,5 +244,5 @@ func (c *cachedDocs) updateSizeLOCKED() { } } } - c.size = uint64(sizeInBytes) + atomic.StoreUint64(&c.size, uint64(sizeInBytes)) } From ecf91a296db857b5bfd9e3b4a6a23cd864ccb825 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 29 Mar 2018 12:32:56 +0530 Subject: [PATCH 343/728] removing dependency on govarint --- index/scorch/segment/zap/README.md | 6 +++--- index/scorch/segment/zap/build.go | 16 +++++++------- index/scorch/segment/zap/intcoder.go | 12 +++++------ index/scorch/segment/zap/merge.go | 13 +++++++----- index/scorch/segment/zap/new.go | 8 ++++--- index/scorch/segment/zap/posting.go | 31 ++++++++++------------------ index/scorch/segment/zap/segment.go | 16 ++++++-------- 7 files changed, 46 insertions(+), 56 deletions(-) diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index 179adceaf..41f5902a0 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -28,7 +28,7 @@ Current usage: - produce a slice of metadata bytes and data bytes - produce these slices in field id order - field value is appended to the data slice - - metadata slice is govarint encoded with the following values for each field value + - metadata slice is varint encoded with the following values for each field value - field id (uint16) - field type (byte) - field value start offset in uncompressed data slice (uint64) @@ -53,7 +53,7 @@ With this index and a known document number, we have direct access to all the st ## posting details (freq/norm) section - for each 
posting list - - produce a slice containing multiple consecutive chunks (each chunk is govarint stream) + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) - produce a slice remembering offsets of where each chunk starts - preparation phase: - for each hit in the posting list @@ -71,7 +71,7 @@ If you know the doc number you're interested in, this format lets you jump to th ## posting details (location) section - for each posting list - - produce a slice containing multiple consecutive chunks (each chunk is govarint stream) + - produce a slice containing multiple consecutive chunks (each chunk is varint stream) - produce a slice remembering offsets of where each chunk starts - preparation phase: - for each hit in the posting list diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 03e4cbae8..184b08d5b 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -18,8 +18,6 @@ import ( "bufio" "math" "os" - - "github.com/Smerity/govarint" ) const version uint32 = 8 @@ -78,37 +76,37 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { func persistStoredFieldValues(fieldID int, storedFieldValues [][]byte, stf []byte, spf [][]uint64, - curr int, metaEncoder *govarint.Base128Encoder, data []byte) ( + curr int, metaEncode VarintEncoder, data []byte) ( int, []byte, error) { for i := 0; i < len(storedFieldValues); i++ { // encode field - _, err := metaEncoder.PutU64(uint64(fieldID)) + _, err := metaEncode(uint64(fieldID)) if err != nil { return 0, nil, err } // encode type - _, err = metaEncoder.PutU64(uint64(stf[i])) + _, err = metaEncode(uint64(stf[i])) if err != nil { return 0, nil, err } // encode start offset - _, err = metaEncoder.PutU64(uint64(curr)) + _, err = metaEncode(uint64(curr)) if err != nil { return 0, nil, err } // end len - _, err = metaEncoder.PutU64(uint64(len(storedFieldValues[i]))) + _, err = metaEncode(uint64(len(storedFieldValues[i]))) 
if err != nil { return 0, nil, err } // encode number of array pos - _, err = metaEncoder.PutU64(uint64(len(spf[i]))) + _, err = metaEncode(uint64(len(spf[i]))) if err != nil { return 0, nil, err } // encode all array positions for _, pos := range spf[i] { - _, err = metaEncoder.PutU64(pos) + _, err = metaEncode(pos) if err != nil { return 0, nil, err } diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go index 81ef8bb2e..571d06edb 100644 --- a/index/scorch/segment/zap/intcoder.go +++ b/index/scorch/segment/zap/intcoder.go @@ -18,15 +18,12 @@ import ( "bytes" "encoding/binary" "io" - - "github.com/Smerity/govarint" ) type chunkedIntCoder struct { final []byte chunkSize uint64 chunkBuf bytes.Buffer - encoder *govarint.Base128Encoder chunkLens []uint64 currChunk uint64 @@ -43,7 +40,6 @@ func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { chunkLens: make([]uint64, total), final: make([]byte, 0, 64), } - rv.encoder = govarint.NewU64Base128Encoder(&rv.chunkBuf) return rv } @@ -70,8 +66,13 @@ func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { c.currChunk = chunk } + if len(c.buf) < binary.MaxVarintLen64 { + c.buf = make([]byte, binary.MaxVarintLen64) + } + for _, val := range vals { - _, err := c.encoder.PutU64(val) + wb := binary.PutUvarint(c.buf, val) + _, err := c.chunkBuf.Write(c.buf[:wb]) if err != nil { return err } @@ -96,7 +97,6 @@ func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { // Close indicates you are done calling Add() this allows the final chunk // to be encoded. func (c *chunkedIntCoder) Close() { - c.encoder.Close() encodingBytes := c.chunkBuf.Bytes() c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) c.final = append(c.final, encodingBytes...) 
diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 0d40d5f28..54c6b62cc 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -24,7 +24,6 @@ import ( "sort" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/couchbase/vellum" "github.com/golang/snappy" ) @@ -548,6 +547,8 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return postingsOffset, nil } +type VarintEncoder func(uint64) (int, error) + func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, w *CountHashWriter) (uint64, [][]uint64, error) { @@ -556,10 +557,13 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, var newDocNum uint64 var curr int - var metaBuf bytes.Buffer var data, compressed []byte - - metaEncoder := govarint.NewU64Base128Encoder(&metaBuf) + var metaBuf bytes.Buffer + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncoder := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return metaBuf.Write(varBuf[:wb]) + } vals := make([][][]byte, len(fieldsInv)) typs := make([][]byte, len(fieldsInv)) @@ -637,7 +641,6 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, } } - metaEncoder.Close() metaBytes := metaBuf.Bytes() compressed = snappy.Encode(compressed[:cap(compressed)], data) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 83a0cbced..7e64eb834 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -22,7 +22,6 @@ import ( "sync" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" @@ -479,7 +478,11 @@ func (s *interim) processDocument(docNum uint64, func (s *interim) 
writeStoredFields() ( storedIndexOffset uint64, err error) { - metaEncoder := govarint.NewU64Base128Encoder(&s.metaBuf) + varBuf := make([]byte, binary.MaxVarintLen64) + metaEncoder := func(val uint64) (int, error) { + wb := binary.PutUvarint(varBuf, val) + return s.metaBuf.Write(varBuf[:wb]) + } data, compressed := s.tmp0[:0], s.tmp1[:0] defer func() { s.tmp0, s.tmp1 = data, compressed }() @@ -530,7 +533,6 @@ func (s *interim) writeStoredFields() ( } } - metaEncoder.Close() metaBytes := s.metaBuf.Bytes() compressed = snappy.Encode(compressed[:cap(compressed)], data) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index c0c39571a..f96683a14 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -22,7 +22,6 @@ import ( "reflect" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" ) @@ -151,13 +150,11 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, if freqNormReader != nil { freqNormReader.Reset([]byte(nil)) } - freqNormDecoder := rv.freqNormDecoder locReader := rv.locReader if locReader != nil { locReader.Reset([]byte(nil)) } - locDecoder := rv.locDecoder freqChunkOffsets := rv.freqChunkOffsets[:0] locChunkOffsets := rv.locChunkOffsets[:0] @@ -170,10 +167,7 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, *rv = PostingsIterator{} // clear the struct rv.freqNormReader = freqNormReader - rv.freqNormDecoder = freqNormDecoder - rv.locReader = locReader - rv.locDecoder = locDecoder rv.freqChunkOffsets = freqChunkOffsets rv.locChunkOffsets = locChunkOffsets @@ -324,10 +318,9 @@ type PostingsIterator struct { currChunk uint32 currChunkFreqNorm []byte currChunkLoc []byte - freqNormDecoder *govarint.Base128Decoder - freqNormReader *bytes.Reader - locDecoder *govarint.Base128Decoder - locReader *bytes.Reader + + freqNormReader 
*bytes.Reader + locReader *bytes.Reader freqChunkOffsets []uint64 freqChunkStart uint64 @@ -379,7 +372,6 @@ func (i *PostingsIterator) loadChunk(chunk int) error { i.currChunkFreqNorm = i.postings.sb.mem[start:end] if i.freqNormReader == nil { i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) - i.freqNormDecoder = govarint.NewU64Base128Decoder(i.freqNormReader) } else { i.freqNormReader.Reset(i.currChunkFreqNorm) } @@ -398,7 +390,6 @@ func (i *PostingsIterator) loadChunk(chunk int) error { i.currChunkLoc = i.postings.sb.mem[start:end] if i.locReader == nil { i.locReader = bytes.NewReader(i.currChunkLoc) - i.locDecoder = govarint.NewU64Base128Decoder(i.locReader) } else { i.locReader.Reset(i.currChunkLoc) } @@ -413,13 +404,13 @@ func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { return 1, i.normBits1Hit, false, nil } - freqHasLocs, err := i.freqNormDecoder.GetU64() + freqHasLocs, err := binary.ReadUvarint(i.freqNormReader) if err != nil { return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } freq, hasLocs := decodeFreqHasLocs(freqHasLocs) - normBits, err := i.freqNormDecoder.GetU64() + normBits, err := binary.ReadUvarint(i.freqNormReader) if err != nil { return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } @@ -447,27 +438,27 @@ func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { // the contents. 
func (i *PostingsIterator) readLocation(l *Location) error { // read off field - fieldID, err := i.locDecoder.GetU64() + fieldID, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location field: %v", err) } // read off pos - pos, err := i.locDecoder.GetU64() + pos, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location pos: %v", err) } // read off start - start, err := i.locDecoder.GetU64() + start, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location start: %v", err) } // read off end - end, err := i.locDecoder.GetU64() + end, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location end: %v", err) } // read off num array pos - numArrayPos, err := i.locDecoder.GetU64() + numArrayPos, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading location num array pos: %v", err) } @@ -487,7 +478,7 @@ func (i *PostingsIterator) readLocation(l *Location) error { // read off array positions for k := 0; k < int(numArrayPos); k++ { - ap, err := i.locDecoder.GetU64() + ap, err := binary.ReadUvarint(i.locReader) if err != nil { return fmt.Errorf("error reading array position: %v", err) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 58f8bee02..95a45d86d 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -24,7 +24,6 @@ import ( "sync" "github.com/RoaringBitmap/roaring" - "github.com/Smerity/govarint" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" "github.com/couchbase/vellum" @@ -278,14 +277,12 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { type visitDocumentCtx struct { buf []byte reader bytes.Reader - decoder *govarint.Base128Decoder arrayPos []uint64 } var visitDocumentCtxPool = sync.Pool{ New: func() interface{} { reuse 
:= &visitDocumentCtx{} - reuse.decoder = govarint.NewU64Base128Decoder(&reuse.reader) return reuse }, } @@ -305,30 +302,29 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal // now decode meta and process vdc.reader.Reset(meta) - decoder := vdc.decoder keepGoing := true for keepGoing { - field, err := decoder.GetU64() + field, err := binary.ReadUvarint(&vdc.reader) if err == io.EOF { break } if err != nil { return err } - typ, err := decoder.GetU64() + typ, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - offset, err := decoder.GetU64() + offset, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - l, err := decoder.GetU64() + l, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } - numap, err := decoder.GetU64() + numap, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } @@ -339,7 +335,7 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal } arrayPos = vdc.arrayPos[:numap] for i := 0; i < int(numap); i++ { - ap, err := decoder.GetU64() + ap, err := binary.ReadUvarint(&vdc.reader) if err != nil { return err } From ff374bfb3acd87a29cd1b1d0fcf17bc82bba68ef Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 30 Mar 2018 15:51:51 +0530 Subject: [PATCH 344/728] fixing the function name --- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/merge.go | 8 ++++---- index/scorch/segment/zap/new.go | 6 +++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 53da82f18..755808c27 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -76,7 +76,7 @@ func PersistSegmentBase(sb *SegmentBase, path string) error { func persistStoredFieldValues(fieldID int, storedFieldValues [][]byte, stf []byte, spf [][]uint64, - curr int, metaEncode VarintEncoder, data []byte) ( + curr int, metaEncode varintEncoder, 
data []byte) ( int, []byte, error) { for i := 0; i < len(storedFieldValues); i++ { // encode field diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 925a41a16..f2a881856 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -547,7 +547,7 @@ func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCo return postingsOffset, nil } -type VarintEncoder func(uint64) (int, error) +type varintEncoder func(uint64) (int, error) func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, @@ -560,7 +560,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, var data, compressed []byte var metaBuf bytes.Buffer varBuf := make([]byte, binary.MaxVarintLen64) - metaEncoder := func(val uint64) (int, error) { + metaEncode := func(val uint64) (int, error) { wb := binary.PutUvarint(varBuf, val) return metaBuf.Write(varBuf[:wb]) } @@ -628,7 +628,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, // _id field special case optimizes ExternalID() lookups idFieldVal := vals[uint16(0)][0] - _, err = metaEncoder.PutU64(uint64(len(idFieldVal))) + _, err = metaEncode(uint64(len(idFieldVal))) if err != nil { return 0, nil, err } @@ -642,7 +642,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, var err2 error curr, data, err2 = persistStoredFieldValues(fieldID, - storedFieldValues, stf, spf, curr, metaEncoder, data) + storedFieldValues, stf, spf, curr, metaEncode, data) if err2 != nil { return 0, nil, err2 } diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index d9f6e3409..da24988ae 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -479,7 +479,7 @@ func (s *interim) processDocument(docNum uint64, func (s *interim) writeStoredFields() ( storedIndexOffset 
uint64, err error) { varBuf := make([]byte, binary.MaxVarintLen64) - metaEncoder := func(val uint64) (int, error) { + metaEncode := func(val uint64) (int, error) { wb := binary.PutUvarint(varBuf, val) return s.metaBuf.Write(varBuf[:wb]) } @@ -523,7 +523,7 @@ func (s *interim) writeStoredFields() ( // _id field special case optimizes ExternalID() lookups idFieldVal := docStoredFields[uint16(0)].vals[0] - _, err = metaEncoder.PutU64(uint64(len(idFieldVal))) + _, err = metaEncode(uint64(len(idFieldVal))) if err != nil { return 0, err } @@ -534,7 +534,7 @@ func (s *interim) writeStoredFields() ( if exists { curr, data, err = persistStoredFieldValues( fieldID, isf.vals, isf.typs, isf.arrayposs, - curr, metaEncoder, data) + curr, metaEncode, data) if err != nil { return 0, err } From 0ba1dd6139ab00443b224c7f895bf0a4d2372ab6 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 30 Mar 2018 16:14:00 -0400 Subject: [PATCH 345/728] code review changes based on my own feedback split interface into two changed regex to regexp finished mem segment impl removed log/time printfs --- index/index.go | 9 +- index/scorch/segment/empty.go | 2 +- index/scorch/segment/mem/dict.go | 115 ++++++++++++++++++++++---- index/scorch/segment/mem/dict_test.go | 51 ++++++++++++ index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/dict.go | 2 +- index/scorch/snapshot_index.go | 4 +- index/scorch/snapshot_segment.go | 4 +- search/searcher/search_fuzzy.go | 8 +- search/searcher/search_regexp.go | 9 +- 10 files changed, 165 insertions(+), 41 deletions(-) diff --git a/index/index.go b/index/index.go index c03796d1c..ea53ee5d6 100644 --- a/index/index.go +++ b/index/index.go @@ -96,10 +96,11 @@ type IndexReader interface { Close() error } -// IndexReaderAdv is an optional interface for advanced users -// Hope to have a better name here... 
-type IndexReaderAdv interface { - FieldDictRegex(field string, regex []byte) (FieldDict, error) +type IndexReaderRegexp interface { + FieldDictRegexp(field string, regex []byte) (FieldDict, error) +} + +type IndexReaderFuzzy interface { FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error) } diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index d8ac45f47..f3315a804 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -76,7 +76,7 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) RegexIterator(start string) DictionaryIterator { +func (e *EmptyDictionary) RegexpIterator(start string) DictionaryIterator { return &EmptyDictionaryIterator{} } diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index 2877f9453..178c41cc8 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -15,7 +15,9 @@ package mem import ( + "math" "reflect" + "regexp" "sort" "strings" @@ -98,36 +100,42 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator } } -// RegexIterator returns an iterator which only visits terms matching +// RegexpIterator returns an iterator which only visits terms matching // the given regex expression. 
-// TODO complete the implementation -func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], regex) +func (d *Dictionary) RegexpIterator(pattern string) segment.DictionaryIterator { + regex, err := regexp.Compile(pattern) + if err != nil { + // invalid regexp, so set offset to the end + return &DictionaryIterator{ + d: d, + offset: len(d.segment.DictKeys[d.fieldID]), + } + } return &DictionaryIterator{ - d: d, - offset: offset, - prefix: regex, + d: d, + regex: regex, } } // FuzzyIterator returns an iterator which only visits terms matching // the given edit distance. -// TODO complete the implementation func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], term) return &DictionaryIterator{ - d: d, - offset: offset, - prefix: term, + d: d, + fuzzyTerm: term, + fuzziness: fuzziness, } } // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { - d *Dictionary - prefix string - end string - offset int + d *Dictionary + prefix string + end string + offset int + regex *regexp.Regexp + fuzzyTerm string + fuzziness int dictEntry index.DictEntry // reused across Next()'s } @@ -146,6 +154,28 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) { if d.end != "" && next > d.end { return nil, nil } + // check regexp + if d.regex != nil { + // keep going until we find a match, mindful of the end of the slice + for !d.regex.MatchString(next) { + d.offset++ + if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { + return nil, nil + } + next = d.d.segment.DictKeys[d.d.fieldID][d.offset] + } + } + if d.fuzzyTerm != "" { + _, exceeded := LevenshteinDistanceMax(d.fuzzyTerm, next, d.fuzziness) + for exceeded { + d.offset++ + if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { + return nil, nil + } + next = d.d.segment.DictKeys[d.d.fieldID][d.offset] + 
_, exceeded = LevenshteinDistanceMax(d.fuzzyTerm, next, d.fuzziness) + } + } d.offset++ postingID := d.d.segment.Dicts[d.d.fieldID][next] @@ -153,3 +183,56 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) { d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() return &d.dictEntry, nil } + +// LevenshteinDistanceMax same as LevenshteinDistance but +// attempts to bail early once we know the distance +// will be greater than max +// in which case the first return val will be the max +// and the second will be true, indicating max was exceeded +func LevenshteinDistanceMax(a, b string, max int) (int, bool) { + la := len(a) + lb := len(b) + + ld := int(math.Abs(float64(la - lb))) + if ld > max { + return max, true + } + + d := make([]int, la+1) + var lastdiag, olddiag, temp int + + for i := 1; i <= la; i++ { + d[i] = i + } + for i := 1; i <= lb; i++ { + d[0] = i + lastdiag = i - 1 + rowmin := max + 1 + for j := 1; j <= la; j++ { + olddiag = d[j] + min := d[j] + 1 + if (d[j-1] + 1) < min { + min = d[j-1] + 1 + } + if a[j-1] == b[i-1] { + temp = 0 + } else { + temp = 1 + } + if (lastdiag + temp) < min { + min = lastdiag + temp + } + if min < rowmin { + rowmin = min + } + d[j] = min + + lastdiag = olddiag + } + // after each row if rowmin isn't less than max stop + if rowmin > max { + return max, true + } + } + return d[la], false +} diff --git a/index/scorch/segment/mem/dict_test.go b/index/scorch/segment/mem/dict_test.go index adfa4957d..e69016a81 100644 --- a/index/scorch/segment/mem/dict_test.go +++ b/index/scorch/segment/mem/dict_test.go @@ -157,4 +157,55 @@ func TestDictionary(t *testing.T) { if !reflect.DeepEqual(expected, got) { t.Errorf("expected: %v, got: %v", expected, got) } + + // test regexp iterator + expected = []string{"ball", "bat"} + got = got[:0] + itr = dict.RegexpIterator("ba.*") + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + 
t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test regexp iterator with invalid regexp + expected = []string{} + got = got[:0] + itr = dict.RegexpIterator(string([]byte{0xff})) + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } + + // test fuzzy iterator + expected = []string{"bat", "cat"} + got = got[:0] + itr = dict.FuzzyIterator("vat", 1) + next, err = itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index de678e094..a79611363 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -48,7 +48,7 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator - RegexIterator(regex string) DictionaryIterator + RegexpIterator(regex string) DictionaryIterator FuzzyIterator(term string, fuzziness int) DictionaryIterator } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index fa846bf3f..4cda63be1 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -152,7 +152,7 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator // RegexIterator returns an iterator which only visits terms having the // the specified regex -func (d *Dictionary) RegexIterator(regex string) segment.DictionaryIterator { +func (d *Dictionary) RegexpIterator(regex string) 
segment.DictionaryIterator { rv := &DictionaryIterator{ d: d, } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 3e9d97f59..cdd983857 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -175,10 +175,10 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, }) } -func (i *IndexSnapshot) FieldDictRegex(field string, +func (i *IndexSnapshot) FieldDictRegexp(field string, termRegex []byte) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.RegexIterator(string(termRegex)) + return i.RegexpIterator(string(termRegex)) }) } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 422d1e814..ac95d450f 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -49,8 +49,8 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } -func (s *SegmentDictionarySnapshot) RegexIterator(regex string) segment.DictionaryIterator { - return s.d.RegexIterator(regex) +func (s *SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator { + return s.d.RegexpIterator(regex) } func (s *SegmentDictionarySnapshot) FuzzyIterator(term string, diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 69aab2f73..7d165b067 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -15,9 +15,6 @@ package searcher import ( - "log" - "time" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -34,10 +31,8 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, break } } - t := time.Now() candidateTerms, err := findFuzzyCandidateTerms(indexReader, term, fuzziness, field, prefixTerm) - log.Printf("time taken-> %f", time.Since(t).Seconds()) if err != nil { return nil, err } @@ -56,7 +51,7 @@ func 
findFuzzyCandidateTerms(indexReader index.IndexReader, term string, // in case of advanced reader implementations directly call // the levenshtein automaton based iterator to collect the // candidate terms - if ir, ok := indexReader.(index.IndexReaderAdv); ok { + if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness) if err != nil { return rv, err @@ -66,7 +61,6 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, rv = append(rv, tfd.Term) tfd, err = fieldDict.Next() } - log.Printf("candidate FSA fuzzy terms: %+v", rv) return rv, nil } fieldDict, err = indexReader.FieldDict(field) diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index 806f135a3..b88eb3eb5 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -15,9 +15,7 @@ package searcher import ( - "log" "regexp" - "time" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -32,9 +30,8 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { var candidateTerms []string - t := time.Now() - if ir, ok := indexReader.(index.IndexReaderAdv); ok { - fieldDict, err := ir.FieldDictRegex(field, []byte(pattern.String())) + if ir, ok := indexReader.(index.IndexReaderRegexp); ok { + fieldDict, err := ir.FieldDictRegexp(field, []byte(pattern.String())) if err != nil { return nil, err } @@ -50,7 +47,6 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, candidateTerms = append(candidateTerms, tfd.Term) tfd, err = fieldDict.Next() } - log.Printf("fsa time took-> %f", time.Since(t).Seconds()) } else { prefixTerm, complete := pattern.LiteralPrefix() if complete { @@ -64,7 +60,6 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, return nil, err } } - log.Printf("time took-> %f", 
time.Since(t).Seconds()) } return NewMultiTermSearcher(indexReader, candidateTerms, field, boost, From f8498ae0e903fda095ef5b15064bff4a62fd566a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 30 Mar 2018 16:16:32 -0400 Subject: [PATCH 346/728] add comment for consistency --- index/scorch/segment/mem/dict.go | 1 + 1 file changed, 1 insertion(+) diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go index 178c41cc8..b74872371 100644 --- a/index/scorch/segment/mem/dict.go +++ b/index/scorch/segment/mem/dict.go @@ -165,6 +165,7 @@ func (d *DictionaryIterator) Next() (*index.DictEntry, error) { next = d.d.segment.DictKeys[d.d.fieldID][d.offset] } } + // check fuzziness if d.fuzzyTerm != "" { _, exceeded := LevenshteinDistanceMax(d.fuzzyTerm, next, d.fuzziness) for exceeded { From ce15a8d22ebd51c7366066e57b3e2556f9fc0b10 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 30 Mar 2018 14:55:42 -0700 Subject: [PATCH 347/728] remove unused PhraseSearcher fields --- search/searcher/search_phrase.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 0026794dd..85d0f6b52 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -32,11 +32,9 @@ func init() { } type PhraseSearcher struct { - indexReader index.IndexReader mustSearcher *ConjunctionSearcher queryNorm float64 currMust *search.DocumentMatch - slop int terms [][]string initialized bool } @@ -126,7 +124,6 @@ func NewMultiPhraseSearcher(indexReader index.IndexReader, terms [][]string, fie // build our searcher rv := PhraseSearcher{ - indexReader: indexReader, mustSearcher: mustSearcher, terms: terms, } From 2e36384e3d0616354d0ead765d84d86bd091b682 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 30 Mar 2018 15:13:58 -0700 Subject: [PATCH 348/728] phrase searcher returns currMust back to pool --- search/searcher/search_phrase.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git 
a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 85d0f6b52..7591c7d76 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -160,6 +160,9 @@ func (s *PhraseSearcher) advanceNextMust(ctx *search.SearchContext) error { var err error if s.mustSearcher != nil { + if s.currMust != nil { + ctx.DocumentMatchPool.Put(s.currMust) + } s.currMust, err = s.mustSearcher.Next(ctx) if err != nil { return err @@ -227,6 +230,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D if freq > 0 { // return match rv := s.currMust + s.currMust = nil rv.Locations = rvftlm return rv } From f5e01aa54e0b23a86309169e7525f06a32e50c3f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 30 Mar 2018 15:31:40 -0700 Subject: [PATCH 349/728] remove a level of phrasePath indirection for fewer alloc's --- search/searcher/search_phrase.go | 10 +-- search/searcher/search_phrase_test.go | 96 +++++++++++++-------------- 2 files changed, 54 insertions(+), 52 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 7591c7d76..dfb7267f7 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -243,7 +243,8 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D // constraints (possibly more than once). if so, the number of times it was // satisfied, and these locations are returned. 
otherwise 0 and either // a nil or empty TermLocationMap -func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) (int, search.TermLocationMap) { +func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) ( + int, search.TermLocationMap) { paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0) rv := make(search.TermLocationMap, len(s.terms)) for _, p := range paths { @@ -261,7 +262,7 @@ func (p *phrasePart) String() string { return fmt.Sprintf("[%s %v]", p.term, p.loc) } -type phrasePath []*phrasePart +type phrasePath []phrasePart func (p phrasePath) MergeInto(in search.TermLocationMap) { for _, pp := range p { @@ -282,7 +283,8 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) { // this is the primary state being built during the traversal // // returns slice of paths, or nil if invocation did not find any successul paths -func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath { +func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, + tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath { // no more terms if len(phraseTerms) < 1 { @@ -321,7 +323,7 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s // if enough slop reamining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { // this location works, add it to the path (but not for empty term) - px := append(p, &phrasePart{term: carTerm, loc: loc}) + px := append(p, phrasePart{term: carTerm, loc: loc}) rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...) 
} } diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index 046b8d81d..a764ac2fe 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -185,8 +185,8 @@ func TestFindPhrasePaths(t *testing.T) { }, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"dog", &search.Location{Pos: 2}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, }, }, }, @@ -242,12 +242,12 @@ func TestFindPhrasePaths(t *testing.T) { }, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"dog", &search.Location{Pos: 2}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, }, phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 8}}, - &phrasePart{"dog", &search.Location{Pos: 9}}, + phrasePart{"cat", &search.Location{Pos: 8}}, + phrasePart{"dog", &search.Location{Pos: 9}}, }, }, }, @@ -268,8 +268,8 @@ func TestFindPhrasePaths(t *testing.T) { }, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"dog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 3}}, }, }, }, @@ -290,8 +290,8 @@ func TestFindPhrasePaths(t *testing.T) { }, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 2}}, - &phrasePart{"dog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 2}}, + phrasePart{"dog", &search.Location{Pos: 3}}, }, }, }, @@ -312,8 +312,8 @@ func TestFindPhrasePaths(t *testing.T) { }, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 2}}, - &phrasePart{"dog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 2}}, + phrasePart{"dog", &search.Location{Pos: 3}}, }, }, }, @@ -372,8 +372,8 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 3, paths: 
[]phrasePath{ phrasePath{ - &phrasePart{"one", &search.Location{Pos: 1}}, - &phrasePart{"five", &search.Location{Pos: 5}}, + phrasePart{"one", &search.Location{Pos: 1}}, + phrasePart{"five", &search.Location{Pos: 5}}, }, }, }, @@ -383,8 +383,8 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 0, paths: []phrasePath{ phrasePath{ - &phrasePart{"four", &search.Location{Pos: 4}}, - &phrasePart{"five", &search.Location{Pos: 5}}, + phrasePart{"four", &search.Location{Pos: 4}}, + phrasePart{"five", &search.Location{Pos: 5}}, }, }, }, @@ -399,8 +399,8 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 1, paths: []phrasePath{ phrasePath{ - &phrasePart{"one", &search.Location{Pos: 1}}, - &phrasePart{"two", &search.Location{Pos: 2}}, + phrasePart{"one", &search.Location{Pos: 1}}, + phrasePart{"two", &search.Location{Pos: 2}}, }, }, }, @@ -415,8 +415,8 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 2, paths: []phrasePath{ phrasePath{ - &phrasePart{"two", &search.Location{Pos: 2}}, - &phrasePart{"one", &search.Location{Pos: 1}}, + phrasePart{"two", &search.Location{Pos: 2}}, + phrasePart{"one", &search.Location{Pos: 1}}, }, }, }, @@ -431,9 +431,9 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 2, paths: []phrasePath{ phrasePath{ - &phrasePart{"one", &search.Location{Pos: 1}}, - &phrasePart{"three", &search.Location{Pos: 3}}, - &phrasePart{"five", &search.Location{Pos: 5}}, + phrasePart{"one", &search.Location{Pos: 1}}, + phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"five", &search.Location{Pos: 5}}, }, }, }, @@ -448,9 +448,9 @@ func TestFindPhrasePathsSloppy(t *testing.T) { slop: 6, paths: []phrasePath{ phrasePath{ - &phrasePart{"five", &search.Location{Pos: 5}}, - &phrasePart{"three", &search.Location{Pos: 3}}, - &phrasePart{"one", &search.Location{Pos: 1}}, + phrasePart{"five", &search.Location{Pos: 5}}, + phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"one", &search.Location{Pos: 1}}, }, }, }, @@ -500,8 +500,8 @@ func 
TestFindPhrasePathsSloppyPalyndrome(t *testing.T) { slop: 0, paths: []phrasePath{ phrasePath{ - &phrasePart{"two", &search.Location{Pos: 2}}, - &phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"two", &search.Location{Pos: 2}}, + phrasePart{"three", &search.Location{Pos: 3}}, }, }, }, @@ -511,12 +511,12 @@ func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) { slop: 2, paths: []phrasePath{ phrasePath{ - &phrasePart{"two", &search.Location{Pos: 2}}, - &phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"two", &search.Location{Pos: 2}}, + phrasePart{"three", &search.Location{Pos: 3}}, }, phrasePath{ - &phrasePart{"two", &search.Location{Pos: 4}}, - &phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"two", &search.Location{Pos: 4}}, + phrasePart{"three", &search.Location{Pos: 3}}, }, }, }, @@ -526,12 +526,12 @@ func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) { slop: 2, paths: []phrasePath{ phrasePath{ - &phrasePart{"three", &search.Location{Pos: 3}}, - &phrasePart{"two", &search.Location{Pos: 2}}, + phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"two", &search.Location{Pos: 2}}, }, phrasePath{ - &phrasePart{"three", &search.Location{Pos: 3}}, - &phrasePart{"two", &search.Location{Pos: 4}}, + phrasePart{"three", &search.Location{Pos: 3}}, + phrasePart{"two", &search.Location{Pos: 4}}, }, }, }, @@ -574,8 +574,8 @@ func TestFindMultiPhrasePaths(t *testing.T) { phrase: [][]string{[]string{"cat", "rat"}, []string{"dog"}}, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"dog", &search.Location{Pos: 2}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, }, }, }, @@ -592,12 +592,12 @@ func TestFindMultiPhrasePaths(t *testing.T) { phrase: [][]string{[]string{"cat", "dog"}, []string{"dog", "frog"}}, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"dog", &search.Location{Pos: 2}}, + 
phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, }, phrasePath{ - &phrasePart{"dog", &search.Location{Pos: 2}}, - &phrasePart{"frog", &search.Location{Pos: 3}}, + phrasePart{"dog", &search.Location{Pos: 2}}, + phrasePart{"frog", &search.Location{Pos: 3}}, }, }, }, @@ -606,8 +606,8 @@ func TestFindMultiPhrasePaths(t *testing.T) { phrase: [][]string{[]string{"cat", "rat"}, []string{""}, []string{"frog"}}, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"frog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"frog", &search.Location{Pos: 3}}, }, }, }, @@ -616,8 +616,8 @@ func TestFindMultiPhrasePaths(t *testing.T) { phrase: [][]string{[]string{"cat", "rat"}, []string{}, []string{"frog"}}, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"frog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"frog", &search.Location{Pos: 3}}, }, }, }, @@ -626,8 +626,8 @@ func TestFindMultiPhrasePaths(t *testing.T) { phrase: [][]string{[]string{"cat", "rat"}, nil, []string{"frog"}}, paths: []phrasePath{ phrasePath{ - &phrasePart{"cat", &search.Location{Pos: 1}}, - &phrasePart{"frog", &search.Location{Pos: 3}}, + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"frog", &search.Location{Pos: 3}}, }, }, }, From d690422f6425e599f01574b928bef7a37cd77717 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 30 Mar 2018 16:34:31 -0700 Subject: [PATCH 350/728] scorch postingToTermFieldDoc() uses a backing array This optimization is so that postingToTermFieldDoc() will hit the allocator at most only twice per call rather than on every location entry. 
--- index/scorch/snapshot_index_tfr.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 4c3d08edd..0f015c8db 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -103,14 +103,16 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin if i.includeTermVectors { locs := next.Locations() rv.Vectors = make([]*index.TermFieldVector, len(locs)) + backing := make([]index.TermFieldVector, len(locs)) for i, loc := range locs { - rv.Vectors[i] = &index.TermFieldVector{ + backing[i] = index.TermFieldVector{ Start: loc.Start(), End: loc.End(), Pos: loc.Pos(), ArrayPositions: loc.ArrayPositions(), Field: loc.Field(), } + rv.Vectors[i] = &backing[i] } } } From 01839100dcd0e40694645051057ef35a854d7768 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 30 Mar 2018 16:20:27 +0530 Subject: [PATCH 351/728] persisting the zap version to bolt zap version is already a part of footer s with '#' will be ignored, and an empty message aborts the commit. 
--- index/scorch/persister.go | 20 +++++++++++++++++++- index/scorch/segment/zap/build.go | 4 +++- index/scorch/segment/zap/segment.go | 2 +- index/scorch/segment/zap/write.go | 2 +- 4 files changed, 24 insertions(+), 4 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 2fab53240..1279a5c2a 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -16,6 +16,7 @@ package scorch import ( "bytes" + "encoding/binary" "fmt" "io/ioutil" "log" @@ -317,6 +318,22 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { return err } + // persist meta values + metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) + if err != nil { + return err + } + err = metaBucket.Put([]byte("type"), []byte(zap.Type)) + if err != nil { + return err + } + buf := make([]byte, binary.MaxVarintLen32) + binary.BigEndian.PutUint32(buf, zap.Version) + err = metaBucket.Put([]byte("version"), buf) + if err != nil { + return err + } + // persist internal values internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) if err != nil { @@ -457,6 +474,7 @@ var boltSnapshotsBucket = []byte{'s'} var boltPathKey = []byte{'p'} var boltDeletedKey = []byte{'d'} var boltInternalKey = []byte{'i'} +var boltMetaDataKey = []byte{'m'} func (s *Scorch) loadFromBolt() error { return s.rootBolt.View(func(tx *bolt.Tx) error { @@ -551,7 +569,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { _ = rv.DecRef() return nil, err } - } else { + } else if k[0] != boltMetaDataKey[0] { segmentBucket := snapshot.Bucket(k) if segmentBucket == nil { _ = rv.DecRef() diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index e8f3499b4..f519d4db4 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -22,7 +22,9 @@ import ( "github.com/Smerity/govarint" ) -const version uint32 = 9 +const Version uint32 = 9 + +const Type 
string = "zap" const fieldNotUninverted = math.MaxUint64 diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 541647abb..4cd44eb27 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -189,7 +189,7 @@ func (s *Segment) loadConfig() error { verOffset := crcOffset - 4 s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) - if s.version != version { + if s.version != Version { return fmt.Errorf("unsupported version %d", s.version) } diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go index 7f4f5a88b..cddaedd00 100644 --- a/index/scorch/segment/zap/write.go +++ b/index/scorch/segment/zap/write.go @@ -118,7 +118,7 @@ func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset return err } // write out 32-bit version - err = binary.Write(w, binary.BigEndian, version) + err = binary.Write(w, binary.BigEndian, Version) if err != nil { return err } From 56166ea4abaef47f729feff62ab9afdcf2343b30 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 2 Apr 2018 12:20:31 -0700 Subject: [PATCH 352/728] search_phrase findPhrasePaths() fix on slice reuse With term locations laid out like "a b c d d", findPhrasePaths() on a search for "a b c d" with sloppiness of 1 ought to have returned a result like... a: 1, b: 2, c: 3, d: 4 a: 1, b: 2, c: 3, d: 5 ...but instead was incorrectly returning results like this due to a subtle slice memory reuse issue... 
a: 1, b: 2, c: 3, d: 5 a: 1, b: 2, c: 3, d: 5 --- search/searcher/search_phrase.go | 4 +-- search/searcher/search_phrase_test.go | 46 ++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 3 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index dfb7267f7..7166d4884 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -288,7 +288,7 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s // no more terms if len(phraseTerms) < 1 { - return []phrasePath{p} + return []phrasePath{append(phrasePath(nil), p...)} } car := phraseTerms[0] @@ -320,7 +320,7 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s dist = editDistance(prevPos+1, loc.Pos) } - // if enough slop reamining, continue recursively + // if enough slop remaining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { // this location works, add it to the path (but not for empty term) px := append(p, phrasePart{term: carTerm, loc: loc}) diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index a764ac2fe..d09be57df 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -360,6 +360,7 @@ func TestFindPhrasePathsSloppy(t *testing.T) { phrase [][]string paths []phrasePath slop int + tlm search.TermLocationMap }{ // no match { @@ -454,10 +455,53 @@ func TestFindPhrasePathsSloppy(t *testing.T) { }, }, }, + // test an append() related edge case, where append()'s + // current behavior needs to be called 3 times starting from a + // nil slice before it grows to a slice with extra capacity -- + // hence, 3 initial terms of ark, bat, cat + { + phrase: [][]string{ + []string{"ark"}, []string{"bat"}, []string{"cat"}, []string{"dog"}, + }, + slop: 1, + paths: []phrasePath{ + phrasePath{ + phrasePart{"ark", &search.Location{Pos: 1}}, + phrasePart{"bat", &search.Location{Pos: 2}}, + 
phrasePart{"cat", &search.Location{Pos: 3}}, + phrasePart{"dog", &search.Location{Pos: 4}}, + }, + phrasePath{ + phrasePart{"ark", &search.Location{Pos: 1}}, + phrasePart{"bat", &search.Location{Pos: 2}}, + phrasePart{"cat", &search.Location{Pos: 3}}, + phrasePart{"dog", &search.Location{Pos: 5}}, + }, + }, + tlm: search.TermLocationMap{ // ark bat cat dog dog + "ark": search.Locations{ + &search.Location{Pos: 1}, + }, + "bat": search.Locations{ + &search.Location{Pos: 2}, + }, + "cat": search.Locations{ + &search.Location{Pos: 3}, + }, + "dog": search.Locations{ + &search.Location{Pos: 4}, + &search.Location{Pos: 5}, + }, + }, + }, } for i, test := range tests { - actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, test.slop) + tlmToUse := test.tlm + if tlmToUse == nil { + tlmToUse = tlm + } + actualPaths := findPhrasePaths(0, nil, test.phrase, tlmToUse, nil, test.slop) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } From 640d3c218ccb5c65375dbe06f8c13e6c62b59349 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 2 Apr 2018 22:53:45 -0700 Subject: [PATCH 353/728] optimization for LevenshteinDistanceMaxReuseSlice() This optimization allows the levenshtein-distance-max calculation to not necessarily have to allocate memory on every invocation. This optimization is used in the prefix edge case by the fuzzy searcher during the findFuzzyCandidateTerms() inner loop. 
--- search/levenshtein.go | 17 +++++++++++++---- search/searcher/search_fuzzy.go | 5 ++++- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/search/levenshtein.go b/search/levenshtein.go index ec033143a..687608d3f 100644 --- a/search/levenshtein.go +++ b/search/levenshtein.go @@ -57,15 +57,24 @@ func LevenshteinDistance(a, b string) int { // in which case the first return val will be the max // and the second will be true, indicating max was exceeded func LevenshteinDistanceMax(a, b string, max int) (int, bool) { + v, wasMax, _ := LevenshteinDistanceMaxReuseSlice(a, b, max, nil) + return v, wasMax +} + +func LevenshteinDistanceMaxReuseSlice(a, b string, max int, d []int) (int, bool, []int) { la := len(a) lb := len(b) ld := int(math.Abs(float64(la - lb))) if ld > max { - return max, true + return max, true, d } - d := make([]int, la+1) + if cap(d) < la+1 { + d = make([]int, la+1) + } + d = d[:la+1] + var lastdiag, olddiag, temp int for i := 1; i <= la; i++ { @@ -98,8 +107,8 @@ func LevenshteinDistanceMax(a, b string, max int) (int, bool) { } // after each row if rowmin isn't less than max stop if rowmin > max { - return max, true + return max, true, d } } - return d[la], false + return d[la], false, d } diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 7d165b067..41ad804f1 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -72,9 +72,12 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, }() // enumerate terms and check levenshtein distance + var reuse []int tfd, err := fieldDict.Next() for err == nil && tfd != nil { - ld, exceeded := search.LevenshteinDistanceMax(term, tfd.Term, fuzziness) + var ld int + var exceeded bool + ld, exceeded, reuse = search.LevenshteinDistanceMaxReuseSlice(term, tfd.Term, fuzziness, reuse) if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { From 57e0ff8b50d723c8cd471702a9a1046a912b1a56 Mon Sep 17 
00:00:00 2001 From: Marty Schoch Date: Wed, 4 Apr 2018 11:31:19 -0400 Subject: [PATCH 354/728] Revert "TermFieldReader Advance optimization" This reverts commit 72ac35296132e4ab3da85f56237bebbf3cf2bcbe. --- index/scorch/segment/empty.go | 4 ---- index/scorch/segment/mem/posting.go | 21 ----------------- index/scorch/segment/segment.go | 4 ---- index/scorch/segment/zap/posting.go | 35 ----------------------------- index/scorch/snapshot_index_tfr.go | 35 ++++++++++++----------------- 5 files changed, 14 insertions(+), 85 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index f3315a804..a0b8434c7 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -114,7 +114,3 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Size() int { return 0 } - -func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { - return nil, nil -} diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go index 710f6e764..362fdb7c5 100644 --- a/index/scorch/segment/mem/posting.go +++ b/index/scorch/segment/mem/posting.go @@ -155,27 +155,6 @@ func (i *PostingsIterator) Next() (segment.Posting, error) { return &i.reuse, nil } -func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { - if i.reuse.Number() == docNumber { - return &i.reuse, nil - } - next, err := i.Next() - if err != nil || next == nil { - return next, err - } - - nnum := next.Number() - for nnum < docNumber { - next, err = i.Next() - if err != nil || next == nil { - return next, err - } - nnum = next.Number() - } - - return next, nil -} - // Posting is a single entry in a postings list type Posting struct { iterator *PostingsIterator diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index a79611363..b754e94b4 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -77,10 +77,6 @@ type PostingsIterator interface 
{ Next() (Posting, error) Size() int - - // Advance will return the respective posting of the - // sepcified doc number or its immediate follower. - Advance(docNum uint64) (Posting, error) } type Posting interface { diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 593773d07..1f198df5e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -590,41 +590,6 @@ func (i *PostingsIterator) nextBytes() ( return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil } -func (i *PostingsIterator) Advance(docNumber uint64) (segment.Posting, error) { - if i.postings == nil { - return nil, nil - } - - // check if we are already there - if i.next.Number() == docNumber { - return &i.next, nil - } - - nChunk := uint32(docNumber) / i.postings.sb.chunkFactor - if i.currChunk != nChunk { - err := i.loadChunk(int(nChunk)) - if err != nil { - return nil, fmt.Errorf("Advance, error loading chunk: %v", err) - } - } - - next, err := i.Next() - if err != nil || next == nil { - return nil, err - } - - nnum := next.Number() - for nnum < docNumber { - next, err = i.Next() - if err != nil || next == nil { - return next, err - } - nnum = next.Number() - } - - return next, nil -} - // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. 
func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 0f015c8db..46e657488 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -117,8 +117,7 @@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } } -func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, - preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { +func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAlloced *index.TermFieldDoc) (*index.TermFieldDoc, error) { // FIXME do something better // for now, if we need to seek backwards, then restart from the beginning if i.currPosting != nil && bytes.Compare(i.currID, ID) >= 0 { @@ -129,30 +128,24 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, } *i = *(i2.(*IndexSnapshotTermFieldReader)) } - - num, err := docInternalToNumber(ID) + // FIXME do something better + next, err := i.Next(preAlloced) if err != nil { - return nil, nil + return nil, err } - segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) - if segIndex > len(i.snapshot.segment) { + if next == nil { return nil, nil } - // skip directly to the target segment - next, err := i.iterators[segIndex].Advance(ldocNum) - if err != nil || next == nil { - return nil, err - } - - if preAlloced == nil { - preAlloced = &index.TermFieldDoc{} + for bytes.Compare(next.ID, ID) < 0 { + next, err = i.Next(preAlloced) + if err != nil { + return nil, err + } + if next == nil { + break + } } - preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ - i.snapshot.offsets[segIndex]) - i.postingToTermFieldDoc(next, preAlloced) - i.currID = preAlloced.ID - i.currPosting = next - return preAlloced, nil + return next, nil } func (i *IndexSnapshotTermFieldReader) Count() uint64 { From 1bee8b51a0624219ac118ddae6e379b27a5c7508 Mon Sep 17 00:00:00 2001 From: 
Steve Yen Date: Mon, 2 Apr 2018 15:03:21 -0700 Subject: [PATCH 355/728] search_phrase findPhrasePaths() fix to avoid replays of locations With term locations that look like "a b b", findPhrasePaths() on a search for "a b b" with sloppiness of 1 ought to have a result like... a: 1, b: 2, b: 3 ...but instead incorrectly had results like this due to not ignoring previously used locations during recursion... a: 1, b: 2, b: 2 a: 1, b: 2, b: 3 --- search/searcher/search_phrase.go | 30 ++++++++++++-- search/searcher/search_phrase_test.go | 58 +++++++++++++++++++++++++++ 2 files changed, 84 insertions(+), 4 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 7166d4884..270d54dfb 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -270,6 +270,18 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) { } } +func (p phrasePath) String() string { + rv := "[" + for i, pp := range p { + if i > 0 { + rv += ", " + } + rv += pp.String() + } + rv += "]" + return rv +} + // findPhrasePaths is a function to identify phase matches from a set of known // term locations. the implementation is recursive, so care must be taken // with arguments and return values. 
@@ -285,10 +297,14 @@ func (p phrasePath) MergeInto(in search.TermLocationMap) { // returns slice of paths, or nil if invocation did not find any successul paths func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath { + return findPhrasePathsRecur(prevPos, ap, phraseTerms, tlm, p, remainingSlop, nil) +} +func findPhrasePathsRecur(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, + tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath { // no more terms if len(phraseTerms) < 1 { - return []phrasePath{append(phrasePath(nil), p...)} + return append(rv, append(phrasePath(nil), p...)) } car := phraseTerms[0] @@ -301,13 +317,13 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s // if prevPos was 0, don't set it to 1 (as thats not a real abs pos) nextPos = 0 // don't advance nextPos if prevPos was 0 } - return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop) + return findPhrasePathsRecur(nextPos, ap, cdr, tlm, p, remainingSlop, rv) } - var rv []phrasePath // locations for this term for _, carTerm := range car { locations := tlm[carTerm] + LOCATIONS_LOOP: for _, loc := range locations { if prevPos != 0 && !loc.ArrayPositions.Equals(ap) { // if the array positions are wrong, can't match, try next location @@ -322,9 +338,15 @@ func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]s // if enough slop remaining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { + for _, ppart := range p { + if ppart.term == carTerm && ppart.loc == loc { + continue LOCATIONS_LOOP + } + } + // this location works, add it to the path (but not for empty term) px := append(p, phrasePart{term: carTerm, loc: loc}) - rv = append(rv, findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist)...) 
+ rv = findPhrasePathsRecur(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv) } } } diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index d09be57df..c14b3e5f4 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -494,6 +494,64 @@ func TestFindPhrasePathsSloppy(t *testing.T) { }, }, }, + // test that we don't see multiple hits from the same location + { + phrase: [][]string{ + []string{"cat"}, []string{"dog"}, []string{"dog"}, + }, + slop: 1, + paths: []phrasePath{ + phrasePath{ + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, + phrasePart{"dog", &search.Location{Pos: 3}}, + }, + }, + tlm: search.TermLocationMap{ // cat dog dog + "cat": search.Locations{ + &search.Location{Pos: 1}, + }, + "dog": search.Locations{ + &search.Location{Pos: 2}, + &search.Location{Pos: 3}, + }, + }, + }, + // test that we don't see multiple hits from the same location + { + phrase: [][]string{ + []string{"cat"}, []string{"dog"}, + }, + slop: 10, + paths: []phrasePath{ + phrasePath{ + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 2}}, + }, + phrasePath{ + phrasePart{"cat", &search.Location{Pos: 1}}, + phrasePart{"dog", &search.Location{Pos: 4}}, + }, + phrasePath{ + phrasePart{"cat", &search.Location{Pos: 3}}, + phrasePart{"dog", &search.Location{Pos: 2}}, + }, + phrasePath{ + phrasePart{"cat", &search.Location{Pos: 3}}, + phrasePart{"dog", &search.Location{Pos: 4}}, + }, + }, + tlm: search.TermLocationMap{ // cat dog cat dog + "cat": search.Locations{ + &search.Location{Pos: 1}, + &search.Location{Pos: 3}, + }, + "dog": search.Locations{ + &search.Location{Pos: 2}, + &search.Location{Pos: 4}, + }, + }, + }, } for i, test := range tests { From 9d3ee754b01fbf6bc81e6b405bcc6e374754d1d0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 4 Apr 2018 15:08:01 -0700 Subject: [PATCH 356/728] scorch introduce 
empty deleted bitmaps as nil --- index/scorch/introducer.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index b00260bbe..23749c906 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -148,6 +148,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { } else { newss.deleted = roaring.Or(root.segment[i].deleted, delta) } + if newss.deleted.IsEmpty() { + newss.deleted = nil + } // check for live size before copying if newss.LiveSize() > 0 { From 213e3b2197c92287a180b5590f4c3f74090c1147 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 4 Apr 2018 15:07:15 -0700 Subject: [PATCH 357/728] scorch track provenance or creator of snapshots I've found that having this info helps a little with debugging. --- index/scorch/introducer.go | 10 ++++++++++ index/scorch/persister.go | 2 ++ index/scorch/scorch.go | 2 +- index/scorch/snapshot_index.go | 1 + index/scorch/snapshot_segment.go | 1 + 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index b00260bbe..3b44e4c82 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -118,6 +118,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { offsets: make([]uint64, 0, nsegs+1), internal: make(map[string][]byte, len(root.internal)), refs: 1, + creator: "introduceSegment", } // iterate through current segments @@ -140,6 +141,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { id: root.segment[i].id, segment: root.segment[i].segment, cachedDocs: root.segment[i].cachedDocs, + creator: root.segment[i].creator, } // apply new obsoletions @@ -164,6 +166,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { id: next.id, segment: next.data, // take ownership of next.data's ref-count cachedDocs: &cachedDocs{cache: nil}, + creator: "introduceSegment", } newSnapshot.segment = 
append(newSnapshot.segment, newSegmentSnapshot) newSnapshot.offsets = append(newSnapshot.offsets, running) @@ -225,6 +228,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { offsets: make([]uint64, len(root.offsets)), internal: make(map[string][]byte, len(root.internal)), refs: 1, + creator: "introducePersist", } for i, segmentSnapshot := range root.segment { @@ -235,6 +239,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { segment: replacement, deleted: segmentSnapshot.deleted, cachedDocs: segmentSnapshot.cachedDocs, + creator: "introducePersist", } newIndexSnapshot.segment[i] = newSegmentSnapshot delete(persist.persisted, segmentSnapshot.id) @@ -278,6 +283,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { parent: s, internal: root.internal, refs: 1, + creator: "introduceMerge", } // iterate through current segments @@ -313,6 +319,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { segment: root.segment[i].segment, deleted: root.segment[i].deleted, cachedDocs: root.segment[i].cachedDocs, + creator: root.segment[i].creator, }) root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) @@ -344,6 +351,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { segment: nextMerge.new, // take ownership for nextMerge.new's ref-count deleted: newSegmentDeleted, cachedDocs: &cachedDocs{cache: nil}, + creator: "introduceMerge", }) newSnapshot.offsets = append(newSnapshot.offsets, running) atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) @@ -391,6 +399,7 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { internal: revertTo.snapshot.internal, epoch: s.nextSnapshotEpoch, refs: 1, + creator: "revertToSnapshot", } s.nextSnapshotEpoch++ @@ -401,6 +410,7 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { segment: segmentSnapshot.segment, deleted: segmentSnapshot.deleted, cachedDocs: segmentSnapshot.cachedDocs, + creator: 
segmentSnapshot.creator, } newSnapshot.segment[i].segment.AddRef() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 6039b5097..b31686069 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -265,6 +265,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( segment: make([]*SegmentSnapshot, 0, len(snapshot.segment)), internal: snapshot.internal, epoch: snapshot.epoch, + creator: "persistSnapshotMaybeMerge", } // copy to the equiv the segments that weren't replaced @@ -536,6 +537,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { parent: s, internal: make(map[string][]byte), refs: 1, + creator: "loadSnapshot", } var running uint64 c := snapshot.Cursor() diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 13e9a4027..f218f1d54 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -84,7 +84,7 @@ func NewScorch(storeName string, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, } - rv.root = &IndexSnapshot{parent: rv, refs: 1} + rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} ro, ok := config["read_only"].(bool) if ok { rv.readOnly = ro diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2e84b3f4b..cf6e6250d 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -55,6 +55,7 @@ type IndexSnapshot struct { internal map[string][]byte epoch uint64 size uint64 + creator string m sync.Mutex // Protects the fields that follow. 
refs int64 diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index ac95d450f..c29fac997 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -62,6 +62,7 @@ type SegmentSnapshot struct { id uint64 segment segment.Segment deleted *roaring.Bitmap + creator string cachedDocs *cachedDocs } From 418f98ede8854ffb5528043f035f3b762bcb9d19 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 2 Apr 2018 10:28:09 -0700 Subject: [PATCH 358/728] Account for memory overhead from snapshots held by merger, persister --- index/scorch/merge.go | 3 +++ index/scorch/persister.go | 3 +++ index/scorch/scorch.go | 33 ++++++++++++++++++++++++++++++++- 3 files changed, 38 insertions(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 42b5e950f..73351aa4c 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -50,6 +50,8 @@ OUTER: s.rootLock.RLock() ourSnapshot := s.root ourSnapshot.AddRef() + atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size())) + atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) s.rootLock.RUnlock() if ourSnapshot.epoch != lastEpochMergePlanned { @@ -58,6 +60,7 @@ OUTER: // lets get started err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) if err != nil { + atomic.StoreUint64(&s.iStats.mergeEpoch, 0) if err == ErrClosed { // index has been closed _ = ourSnapshot.DecRef() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 6039b5097..ff6ac8f26 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -80,6 +80,8 @@ OUTER: ourSnapshot.AddRef() ourPersisted = s.rootPersisted s.rootPersisted = nil + atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) + atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) } s.rootLock.Unlock() @@ -94,6 +96,7 @@ OUTER: close(ch) } if err != nil { + atomic.StoreUint64(&s.iStats.persistEpoch, 0) if err == ErrClosed { // 
index has been closed _ = ourSnapshot.DecRef() diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 13e9a4027..0947e7501 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -71,6 +71,15 @@ type Scorch struct { onEvent func(event Event) onAsyncError func(err error) + + iStats internalStats +} + +type internalStats struct { + persistEpoch uint64 + persistSnapshotSize uint64 + mergeEpoch uint64 + mergeSnapshotSize uint64 } func NewScorch(storeName string, @@ -492,7 +501,29 @@ func (s *Scorch) MemoryUsed() uint64 { defer func() { _ = indexSnapshot.Close() }() - return uint64(indexSnapshot.Size()) + + // Account for current root snapshot overhead + memUsed := uint64(indexSnapshot.Size()) + + // Account for snapshot that the persister may be working on + persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch) + persistSnapshotSize := atomic.LoadUint64(&s.iStats.persistSnapshotSize) + if persistEpoch != 0 && indexSnapshot.epoch > persistEpoch { + // the snapshot that the persister is working on isn't the same as + // the current snapshot + memUsed += persistSnapshotSize + } + + // Account for snapshot that the merger may be working on + mergeEpoch := atomic.LoadUint64(&s.iStats.mergeEpoch) + mergeSnapshotSize := atomic.LoadUint64(&s.iStats.mergeSnapshotSize) + if mergeEpoch != 0 && indexSnapshot.epoch > mergeEpoch { + // the snapshot that the merger is working on isn't the same as + // the current snapshot + memUsed += mergeSnapshotSize + } + + return memUsed } func (s *Scorch) markIneligibleForRemoval(filename string) { From 7dc965232c43ecbb2a5977bcdb6ae18fc5a0bf2e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 5 Apr 2018 14:18:53 -0700 Subject: [PATCH 359/728] improve findPhrasePaths() doc comments and call signature From feedback on a previous PR, the doc comments on findPhrasePaths() was outdated. See... 
https://github.com/blevesearch/bleve/pull/872 Additionally, the recently introduced recursive helper func allowed simplification of the params of findPhrasePaths(). --- search/searcher/search_phrase.go | 38 +++++++++++++++++++-------- search/searcher/search_phrase_test.go | 8 +++--- 2 files changed, 31 insertions(+), 15 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 270d54dfb..76d9dc23f 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -245,7 +245,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D // a nil or empty TermLocationMap func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) ( int, search.TermLocationMap) { - paths := findPhrasePaths(0, nil, s.terms, tlm, nil, 0) + paths := findPhrasePaths(s.terms, tlm, 0) rv := make(search.TermLocationMap, len(s.terms)) for _, p := range paths { p.MergeInto(rv) @@ -282,24 +282,39 @@ func (p phrasePath) String() string { return rv } -// findPhrasePaths is a function to identify phase matches from a set of known -// term locations. the implementation is recursive, so care must be taken -// with arguments and return values. +// findPhrasePaths is a function to identify phase matches from a set +// of known term locations. 
// -// prev - the previous location, nil on first invocation +// phraseTerms - slice containing the phrase terms, +// may contain empty string as placeholder (don't care) +// tlm - the Term Location Map containing all relevant term locations +// slop - amount of sloppiness that's allowed, which is the cummulative +// sum of the editDistances of each matching phrase part, +// where 0 means no sloppiness allowed (all editDistances must be 0) +// +// returns slice of paths, or nil if invocation did not find any successul paths +func findPhrasePaths(phraseTerms [][]string, + tlm search.TermLocationMap, slop int) []phrasePath { + return findPhrasePathsRecur(0, nil, phraseTerms, tlm, nil, slop, nil) +} + +// findPhrasePathsRecur is the recursive implementation of +// findPhrasePaths, so care must be taken with arguments and return +// values. +// +// prevPos - the previous location, 0 on first invocation +// ap - array positions of the first candidate phrase part to +// which further recursive phrase parts must match, +// nil on initial invocation or when there are no array positions // phraseTerms - slice containing the phrase terms themselves // may contain empty string as placeholder (don't care) // tlm - the Term Location Map containing all relevant term locations -// offset - the offset from the previous that this next term must match // p - the current path being explored (appended to in recursive calls) // this is the primary state being built during the traversal +// remainingSlop - decremented during recursion +// rv - the final result being appended to by all the recursive calls // // returns slice of paths, or nil if invocation did not find any successul paths -func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, - tlm search.TermLocationMap, p phrasePath, remainingSlop int) []phrasePath { - return findPhrasePathsRecur(prevPos, ap, phraseTerms, tlm, p, remainingSlop, nil) -} - func findPhrasePathsRecur(prevPos uint64, ap 
search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath { // no more terms @@ -338,6 +353,7 @@ func findPhrasePathsRecur(prevPos uint64, ap search.ArrayPositions, phraseTerms // if enough slop remaining, continue recursively if prevPos == 0 || (remainingSlop-dist) >= 0 { + // skip if we've already used this term+loc already for _, ppart := range p { if ppart.term == carTerm && ppart.loc == loc { continue LOCATIONS_LOOP diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index c14b3e5f4..ea351380f 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -320,7 +320,7 @@ func TestFindPhrasePaths(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(0, nil, test.phrase, test.tlm, nil, 0) + actualPaths := findPhrasePaths(test.phrase, test.tlm, 0) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -559,7 +559,7 @@ func TestFindPhrasePathsSloppy(t *testing.T) { if tlmToUse == nil { tlmToUse = tlm } - actualPaths := findPhrasePaths(0, nil, test.phrase, tlmToUse, nil, test.slop) + actualPaths := findPhrasePaths(test.phrase, tlmToUse, test.slop) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -640,7 +640,7 @@ func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, test.slop) + actualPaths := findPhrasePaths(test.phrase, tlm, test.slop) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -736,7 +736,7 @@ func TestFindMultiPhrasePaths(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, 0) + actualPaths := 
findPhrasePaths(test.phrase, tlm, 0) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } From 6b448cffe55a79637b9dee0beac55904c307fdb0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 6 Apr 2018 14:51:29 -0400 Subject: [PATCH 360/728] add ability change default open read-only flag this change allows an alternate distribution of the bleve command-line tool to change the default behavior and always open the index read-only --- cmd/bleve/cmd/root.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/cmd/bleve/cmd/root.go b/cmd/bleve/cmd/root.go index f0bea29fe..8dcf165b2 100644 --- a/cmd/bleve/cmd/root.go +++ b/cmd/bleve/cmd/root.go @@ -27,6 +27,10 @@ var cfgFile string var idx bleve.Index +// DefaultOpenReadOnly allows some distributions of this command to default +// to always opening the index read-only +var DefaultOpenReadOnly = false + const canMutateBleveIndex = "canMutateBleveIndex" // CanMutateBleveIndex returns true if the command is capable @@ -52,8 +56,11 @@ var RootCmd = &cobra.Command{ if len(args) < 1 { return fmt.Errorf("must specify path to index") } + runtimeConfig := map[string]interface{}{ + "read_only": DefaultOpenReadOnly, + } var err error - idx, err = bleve.Open(args[0]) + idx, err = bleve.OpenUsing(args[0], runtimeConfig) if err != nil { return fmt.Errorf("error opening bleve index: %v", err) } From 5af813d7e5f8424f09cbbf6ab414cf13a9ed0d0e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 5 Apr 2018 15:59:19 -0700 Subject: [PATCH 361/728] optimize findPhrasePaths with path memory reuse This optimization renames findPhrasePathsRecur() back to findPhrasePaths(). The PhraseSearcher now also calls findPhrasePaths() with preallocated / recycled phrase path slices. Microbenchmarks of bleve-query for query-string of 'text:"see also"' on a 50K en-wiki scorch index went from ~68 q/sec before this change to ~74 q/sec after this change. 
--- search/searcher/search_phrase.go | 54 +++++++++++++-------------- search/searcher/search_phrase_test.go | 8 ++-- 2 files changed, 31 insertions(+), 31 deletions(-) diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 76d9dc23f..95d7a6b3e 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -36,6 +36,8 @@ type PhraseSearcher struct { queryNorm float64 currMust *search.DocumentMatch terms [][]string + path phrasePath + paths []phrasePath initialized bool } @@ -245,12 +247,15 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D // a nil or empty TermLocationMap func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) ( int, search.TermLocationMap) { - paths := findPhrasePaths(s.terms, tlm, 0) + if s.path == nil { + s.path = make(phrasePath, 0, len(s.terms)) + } + s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0]) rv := make(search.TermLocationMap, len(s.terms)) - for _, p := range paths { + for _, p := range s.paths { p.MergeInto(rv) } - return len(paths), rv + return len(s.paths), rv } type phrasePart struct { @@ -283,43 +288,38 @@ func (p phrasePath) String() string { } // findPhrasePaths is a function to identify phase matches from a set -// of known term locations. 
-// -// phraseTerms - slice containing the phrase terms, -// may contain empty string as placeholder (don't care) -// tlm - the Term Location Map containing all relevant term locations -// slop - amount of sloppiness that's allowed, which is the cummulative -// sum of the editDistances of each matching phrase part, -// where 0 means no sloppiness allowed (all editDistances must be 0) -// -// returns slice of paths, or nil if invocation did not find any successul paths -func findPhrasePaths(phraseTerms [][]string, - tlm search.TermLocationMap, slop int) []phrasePath { - return findPhrasePathsRecur(0, nil, phraseTerms, tlm, nil, slop, nil) -} - -// findPhrasePathsRecur is the recursive implementation of -// findPhrasePaths, so care must be taken with arguments and return -// values. +// of known term locations. it recursive so care must be taken with +// arguments and return values. // // prevPos - the previous location, 0 on first invocation // ap - array positions of the first candidate phrase part to // which further recursive phrase parts must match, // nil on initial invocation or when there are no array positions -// phraseTerms - slice containing the phrase terms themselves +// phraseTerms - slice containing the phrase terms, // may contain empty string as placeholder (don't care) // tlm - the Term Location Map containing all relevant term locations // p - the current path being explored (appended to in recursive calls) // this is the primary state being built during the traversal -// remainingSlop - decremented during recursion +// remainingSlop - amount of sloppiness that's allowed, which is the +// sum of the editDistances from each matching phrase part, +// where 0 means no sloppiness allowed (all editDistances must be 0), +// decremented during recursion // rv - the final result being appended to by all the recursive calls // // returns slice of paths, or nil if invocation did not find any successul paths -func findPhrasePathsRecur(prevPos uint64, ap 
search.ArrayPositions, phraseTerms [][]string, +func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string, tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath { // no more terms if len(phraseTerms) < 1 { - return append(rv, append(phrasePath(nil), p...)) + // snapshot or copy the recursively built phrasePath p and + // append it to the rv, also optimizing by checking if next + // phrasePath item in the rv (which we're about to overwrite) + // is available for reuse + var pcopy phrasePath + if len(rv) < cap(rv) { + pcopy = rv[:len(rv)+1][len(rv)][:0] + } + return append(rv, append(pcopy, p...)) } car := phraseTerms[0] @@ -332,7 +332,7 @@ func findPhrasePathsRecur(prevPos uint64, ap search.ArrayPositions, phraseTerms // if prevPos was 0, don't set it to 1 (as thats not a real abs pos) nextPos = 0 // don't advance nextPos if prevPos was 0 } - return findPhrasePathsRecur(nextPos, ap, cdr, tlm, p, remainingSlop, rv) + return findPhrasePaths(nextPos, ap, cdr, tlm, p, remainingSlop, rv) } // locations for this term @@ -362,7 +362,7 @@ func findPhrasePathsRecur(prevPos uint64, ap search.ArrayPositions, phraseTerms // this location works, add it to the path (but not for empty term) px := append(p, phrasePart{term: carTerm, loc: loc}) - rv = findPhrasePathsRecur(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv) + rv = findPhrasePaths(loc.Pos, loc.ArrayPositions, cdr, tlm, px, remainingSlop-dist, rv) } } } diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index ea351380f..1c3e18a5e 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -320,7 +320,7 @@ func TestFindPhrasePaths(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(test.phrase, test.tlm, 0) + actualPaths := findPhrasePaths(0, nil, test.phrase, test.tlm, nil, 0, nil) if !reflect.DeepEqual(actualPaths, test.paths) { 
t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -559,7 +559,7 @@ func TestFindPhrasePathsSloppy(t *testing.T) { if tlmToUse == nil { tlmToUse = tlm } - actualPaths := findPhrasePaths(test.phrase, tlmToUse, test.slop) + actualPaths := findPhrasePaths(0, nil, test.phrase, tlmToUse, nil, test.slop, nil) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -640,7 +640,7 @@ func TestFindPhrasePathsSloppyPalyndrome(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(test.phrase, tlm, test.slop) + actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, test.slop, nil) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } @@ -736,7 +736,7 @@ func TestFindMultiPhrasePaths(t *testing.T) { } for i, test := range tests { - actualPaths := findPhrasePaths(test.phrase, tlm, 0) + actualPaths := findPhrasePaths(0, nil, test.phrase, tlm, nil, 0, nil) if !reflect.DeepEqual(actualPaths, test.paths) { t.Fatalf("expected: %v got %v for test %d", test.paths, actualPaths, i) } From 4dd4bc7d63c4b191621e930824085df35a2a5b34 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 5 Apr 2018 23:29:11 -0700 Subject: [PATCH 362/728] optimize to return nil facet results when there's no FacetBuilder --- search/collector/topn.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/search/collector/topn.go b/search/collector/topn.go index d684868cc..28f284a94 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -313,5 +313,5 @@ func (hc *TopNCollector) FacetResults() search.FacetResults { if hc.facetsBuilder != nil { return hc.facetsBuilder.Results() } - return search.FacetResults{} + return nil } From d54898da6a50532bb18396417a63ae4e237696a2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 5 Apr 2018 23:37:11 -0700 Subject: [PATCH 363/728] optimize to return nil 
SearchResult.Errors on success --- index_impl.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/index_impl.go b/index_impl.go index 4d03b78af..b5373ff0d 100644 --- a/index_impl.go +++ b/index_impl.go @@ -609,9 +609,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return &SearchResult{ Status: &SearchStatus{ Total: 1, - Failed: 0, Successful: 1, - Errors: make(map[string]error), }, Request: req, Hits: hits, From baf73756eaa11b3951ed941986a168b143688bc5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 6 Apr 2018 00:34:11 -0700 Subject: [PATCH 364/728] optimize sort cache preallocations --- search/sort.go | 29 ++++++++++------------------- 1 file changed, 10 insertions(+), 19 deletions(-) diff --git a/search/sort.go b/search/sort.go index 28705d369..6afc9789b 100644 --- a/search/sort.go +++ b/search/sort.go @@ -251,23 +251,21 @@ func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatc } func (so SortOrder) RequiresScore() bool { - rv := false for _, soi := range so { if soi.RequiresScoring() { - rv = true + return true } } - return rv + return false } func (so SortOrder) RequiresDocID() bool { - rv := false for _, soi := range so { if soi.RequiresDocID() { - rv = true + return true } } - return rv + return false } func (so SortOrder) RequiredFields() []string { @@ -279,7 +277,7 @@ func (so SortOrder) RequiredFields() []string { } func (so SortOrder) CacheIsScore() []bool { - var rv []bool + rv := make([]bool, 0, len(so)) for _, soi := range so { rv = append(rv, soi.RequiresScoring()) } @@ -287,7 +285,7 @@ func (so SortOrder) CacheIsScore() []bool { } func (so SortOrder) CacheDescending() []bool { - var rv []bool + rv := make([]bool, 0, len(so)) for _, soi := range so { rv = append(rv, soi.Descending()) } @@ -486,8 +484,7 @@ func (s *SortField) MarshalJSON() ([]byte, error) { } func (s *SortField) Copy() SearchSort { - var rv SortField - rv = *s + rv := *s return &rv } @@ -499,7 +496,6 @@ type 
SortDocID struct { // UpdateVisitor is a no-op for SortDocID as it's value // is not dependent on any field terms func (s *SortDocID) UpdateVisitor(field string, term []byte) { - } // Value returns the sort value of the DocumentMatch @@ -529,8 +525,7 @@ func (s *SortDocID) MarshalJSON() ([]byte, error) { } func (s *SortDocID) Copy() SearchSort { - var rv SortDocID - rv = *s + rv := *s return &rv } @@ -542,7 +537,6 @@ type SortScore struct { // UpdateVisitor is a no-op for SortScore as it's value // is not dependent on any field terms func (s *SortScore) UpdateVisitor(field string, term []byte) { - } // Value returns the sort value of the DocumentMatch @@ -572,8 +566,7 @@ func (s *SortScore) MarshalJSON() ([]byte, error) { } func (s *SortScore) Copy() SearchSort { - var rv SortScore - rv = *s + rv := *s return &rv } @@ -583,7 +576,6 @@ var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0)) // their distance from the specified point. func NewSortGeoDistance(field, unit string, lon, lat float64, desc bool) ( *SortGeoDistance, error) { - rv := &SortGeoDistance{ Field: field, Desc: desc, @@ -705,7 +697,6 @@ func (s *SortGeoDistance) MarshalJSON() ([]byte, error) { } func (s *SortGeoDistance) Copy() SearchSort { - var rv SortGeoDistance - rv = *s + rv := *s return &rv } From b4ac8051d116093a3ac168e70a835e3ffae53964 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 5 Apr 2018 18:11:37 -0700 Subject: [PATCH 365/728] optimize reuse of DocMatch.FieldTermLocations & Vectors during search In this commit, a new field, FieldTermLocations is introduced into the DocumentMatch struct. The FieldTermLocations is a slice that is not JSON serialized and is more ammenable than the existing DocumentMatch.Locations field for higher performance search processing, as slices are more friendly than maps for memory reuse. 
As the search collector finalizes results, it invokes the newly introduced DocumentMatch.Complete() method on the final hits, which converts the DocumentMatch.FieldTermLocations slice into a DocumentMatch.Locations map. All scorers and searchers that used to rely on DocumentMatch.Locations are appropriately updated. Of note, the phrase searcher also invokes DocumentMatch.Complete() so that it has the Locations map available for faster phrase processing. Term vector slice capacity are now also reused as part of this change. Microbenchmarks of bleve-query for query-string of 'text:"see also"' on a 50K en-wiki scorch index went from ~74 q/sec before this change to ~89 q/sec after this change. --- index/index.go | 2 + index/scorch/snapshot_index_tfr.go | 13 +++-- search/collector/topn.go | 1 + search/scorer/scorer_conjunction.go | 11 +---- search/scorer/scorer_disjunction.go | 11 +---- search/scorer/scorer_term.go | 60 ++++++++++------------ search/scorer/scorer_term_test.go | 4 ++ search/search.go | 71 +++++++++++++++++++++++++++ search/searcher/search_phrase.go | 54 ++++++++++++-------- search/searcher/search_phrase_test.go | 1 + search/util.go | 27 ++++++++++ test/versus_test.go | 44 +++++++++++------ 12 files changed, 205 insertions(+), 94 deletions(-) diff --git a/index/index.go b/index/index.go index ea53ee5d6..861f6a2ea 100644 --- a/index/index.go +++ b/index/index.go @@ -174,10 +174,12 @@ func (tfd *TermFieldDoc) Size() int { func (tfd *TermFieldDoc) Reset() *TermFieldDoc { // remember the []byte used for the ID id := tfd.ID + vectors := tfd.Vectors // idiom to copy over from empty TermFieldDoc (0 allocations) *tfd = TermFieldDoc{} // reuse the []byte already allocated (and reset len to 0) tfd.ID = id[:0] + tfd.Vectors = vectors[:0] return tfd } diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 46e657488..c111d5177 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -102,17 +102,22 
@@ func (i *IndexSnapshotTermFieldReader) postingToTermFieldDoc(next segment.Postin } if i.includeTermVectors { locs := next.Locations() - rv.Vectors = make([]*index.TermFieldVector, len(locs)) - backing := make([]index.TermFieldVector, len(locs)) + if cap(rv.Vectors) < len(locs) { + rv.Vectors = make([]*index.TermFieldVector, len(locs)) + backing := make([]index.TermFieldVector, len(locs)) + for i := range backing { + rv.Vectors[i] = &backing[i] + } + } + rv.Vectors = rv.Vectors[:len(locs)] for i, loc := range locs { - backing[i] = index.TermFieldVector{ + *rv.Vectors[i] = index.TermFieldVector{ Start: loc.Start(), End: loc.End(), Pos: loc.Pos(), ArrayPositions: loc.ArrayPositions(), Field: loc.Field(), } - rv.Vectors[i] = &backing[i] } } } diff --git a/search/collector/topn.go b/search/collector/topn.go index d684868cc..7a0bf9860 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -282,6 +282,7 @@ func (hc *TopNCollector) finalizeResults(r index.IndexReader) error { return err } } + doc.Complete(nil) return nil }) diff --git a/search/scorer/scorer_conjunction.go b/search/scorer/scorer_conjunction.go index b866293e0..48cdf3ae9 100644 --- a/search/scorer/scorer_conjunction.go +++ b/search/scorer/scorer_conjunction.go @@ -49,15 +49,11 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ childrenExplanations = make([]*search.Explanation, len(constituents)) } - locations := []search.FieldTermLocationMap{} for i, docMatch := range constituents { sum += docMatch.Score if s.options.Explain { childrenExplanations[i] = docMatch.Expl } - if docMatch.Locations != nil { - locations = append(locations, docMatch.Locations) - } } newScore := sum var newExpl *search.Explanation @@ -69,11 +65,8 @@ func (s *ConjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ rv := constituents[0] rv.Score = newScore rv.Expl = newExpl - if len(locations) == 1 { - rv.Locations = locations[0] - } else if len(locations) > 1 { - 
rv.Locations = search.MergeLocations(locations) - } + rv.FieldTermLocations = search.MergeFieldTermLocations( + rv.FieldTermLocations, constituents[1:]) return rv } diff --git a/search/scorer/scorer_disjunction.go b/search/scorer/scorer_disjunction.go index 36a601c72..7a955e168 100644 --- a/search/scorer/scorer_disjunction.go +++ b/search/scorer/scorer_disjunction.go @@ -50,15 +50,11 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ childrenExplanations = make([]*search.Explanation, len(constituents)) } - var locations []search.FieldTermLocationMap for i, docMatch := range constituents { sum += docMatch.Score if s.options.Explain { childrenExplanations[i] = docMatch.Expl } - if docMatch.Locations != nil { - locations = append(locations, docMatch.Locations) - } } var rawExpl *search.Explanation @@ -80,11 +76,8 @@ func (s *DisjunctionQueryScorer) Score(ctx *search.SearchContext, constituents [ rv := constituents[0] rv.Score = newScore rv.Expl = newExpl - if len(locations) == 1 { - rv.Locations = locations[0] - } else if len(locations) > 1 { - rv.Locations = search.MergeLocations(locations) - } + rv.FieldTermLocations = search.MergeFieldTermLocations( + rv.FieldTermLocations, constituents[1:]) return rv } diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index 077e38e0f..5544f2d01 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -32,7 +32,7 @@ func init() { } type TermQueryScorer struct { - queryTerm []byte + queryTerm string queryField string queryBoost float64 docTerm uint64 @@ -62,7 +62,7 @@ func (s *TermQueryScorer) Size() int { func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { rv := TermQueryScorer{ - queryTerm: queryTerm, + queryTerm: string(queryTerm), queryField: queryField, queryBoost: queryBoost, docTerm: docTerm, @@ -106,7 +106,7 @@ func (s *TermQueryScorer) 
SetQueryNorm(qnorm float64) { } s.queryWeightExplanation = &search.Explanation{ Value: s.queryWeight, - Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, string(s.queryTerm), s.queryBoost), + Message: fmt.Sprintf("queryWeight(%s:%s^%f), product of:", s.queryField, s.queryTerm, s.queryBoost), Children: childrenExplanations, } } @@ -128,7 +128,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childrenExplanations := make([]*search.Explanation, 3) childrenExplanations[0] = &search.Explanation{ Value: tf, - Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, string(s.queryTerm), termMatch.Freq), + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), } childrenExplanations[1] = &search.Explanation{ Value: termMatch.Norm, @@ -137,7 +137,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childrenExplanations[2] = s.idfExplanation scoreExplanation = &search.Explanation{ Value: score, - Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, string(s.queryTerm), termMatch.ID), + Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), Children: childrenExplanations, } } @@ -151,7 +151,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term childExplanations[1] = scoreExplanation scoreExplanation = &search.Explanation{ Value: score, - Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, string(s.queryTerm), s.queryBoost, termMatch.ID), + Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), Children: childExplanations, } } @@ -164,41 +164,31 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term rv.Expl = scoreExplanation } - if termMatch.Vectors != nil && len(termMatch.Vectors) > 0 { - locs := make([]search.Location, 
len(termMatch.Vectors)) - locsUsed := 0 - - totalPositions := 0 - for _, v := range termMatch.Vectors { - totalPositions += len(v.ArrayPositions) + if len(termMatch.Vectors) > 0 { + if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { + rv.FieldTermLocations = make([]search.FieldTermLocation, 0, len(termMatch.Vectors)) } - positions := make(search.ArrayPositions, totalPositions) - positionsUsed := 0 - rv.Locations = make(search.FieldTermLocationMap) for _, v := range termMatch.Vectors { - tlm := rv.Locations[v.Field] - if tlm == nil { - tlm = make(search.TermLocationMap) - rv.Locations[v.Field] = tlm - } - - loc := &locs[locsUsed] - locsUsed++ - - loc.Pos = v.Pos - loc.Start = v.Start - loc.End = v.End - + var ap search.ArrayPositions if len(v.ArrayPositions) > 0 { - loc.ArrayPositions = positions[positionsUsed : positionsUsed+len(v.ArrayPositions)] - for i, ap := range v.ArrayPositions { - loc.ArrayPositions[i] = ap + n := len(rv.FieldTermLocations) + if n < cap(rv.FieldTermLocations) { // reuse ap slice if available + ap = rv.FieldTermLocations[:n+1][n].Location.ArrayPositions[:0] } - positionsUsed += len(v.ArrayPositions) + ap = append(ap, v.ArrayPositions...) 
} - - tlm[string(s.queryTerm)] = append(tlm[string(s.queryTerm)], loc) + rv.FieldTermLocations = + append(rv.FieldTermLocations, search.FieldTermLocation{ + Field: v.Field, + Term: s.queryTerm, + Location: search.Location{ + Pos: v.Pos, + Start: v.Start, + End: v.End, + ArrayPositions: ap, + }, + }) } } diff --git a/search/scorer/scorer_term_test.go b/search/scorer/scorer_term_test.go index bacc00295..23d449788 100644 --- a/search/scorer/scorer_term_test.go +++ b/search/scorer/scorer_term_test.go @@ -156,6 +156,10 @@ func TestTermScorer(t *testing.T) { DocumentMatchPool: search.NewDocumentMatchPool(1, 0), } actual := scorer.Score(ctx, test.termMatch) + actual.Complete(nil) + if len(actual.FieldTermLocations) <= 0 { + actual.FieldTermLocations = nil + } if !reflect.DeepEqual(actual, test.result) { t.Errorf("expected %#v got %#v for %#v", test.result, actual, test.termMatch) diff --git a/search/search.go b/search/search.go index ca030df4b..440c09571 100644 --- a/search/search.go +++ b/search/search.go @@ -77,6 +77,12 @@ func (t TermLocationMap) AddLocation(term string, location *Location) { type FieldTermLocationMap map[string]TermLocationMap +type FieldTermLocation struct { + Field string + Term string + Location Location +} + type FieldFragmentMap map[string][]string type DocumentMatch struct { @@ -99,6 +105,12 @@ type DocumentMatch struct { // used to maintain natural index order HitNumber uint64 `json:"-"` + + // used to temporarily hold field term location information during + // search processing in an efficient, recycle-friendly manner, to + // be later incorporated into the Locations map when search + // results are completed + FieldTermLocations []FieldTermLocation `json:"-"` } func (dm *DocumentMatch) AddFieldValue(name string, value interface{}) { @@ -128,12 +140,19 @@ func (dm *DocumentMatch) Reset() *DocumentMatch { indexInternalID := dm.IndexInternalID // remember the []interface{} used for sort sort := dm.Sort + // remember the FieldTermLocations 
backing array + ftls := dm.FieldTermLocations + for i := range ftls { // recycle the ArrayPositions of each location + ftls[i].Location.ArrayPositions = ftls[i].Location.ArrayPositions[:0] + } // idiom to copy over from empty DocumentMatch (0 allocations) *dm = DocumentMatch{} // reuse the []byte already allocated (and reset len to 0) dm.IndexInternalID = indexInternalID[:0] // reuse the []interface{} already allocated (and reset len to 0) dm.Sort = sort[:0] + // reuse the FieldTermLocations already allocated (and reset len to 0) + dm.FieldTermLocations = ftls[:0] return dm } @@ -183,6 +202,58 @@ func (dm *DocumentMatch) Size() int { return sizeInBytes } +// Complete performs final preparation & transformation of the +// DocumentMatch at the end of search processing, also allowing the +// caller to provide an optional preallocated locations slice +func (dm *DocumentMatch) Complete(prealloc []Location) []Location { + // transform the FieldTermLocations slice into the Locations map + nlocs := len(dm.FieldTermLocations) + if nlocs > 0 { + if cap(prealloc) < nlocs { + prealloc = make([]Location, nlocs) + } + prealloc = prealloc[:nlocs] + + var lastField string + var tlm TermLocationMap + + for i, ftl := range dm.FieldTermLocations { + if lastField != ftl.Field { + lastField = ftl.Field + + if dm.Locations == nil { + dm.Locations = make(FieldTermLocationMap) + } + + tlm = dm.Locations[ftl.Field] + if tlm == nil { + tlm = make(TermLocationMap) + dm.Locations[ftl.Field] = tlm + } + } + + loc := &prealloc[i] + *loc = ftl.Location + + if len(loc.ArrayPositions) > 0 { // copy + loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...) 
+ } + + tlm[ftl.Term] = append(tlm[ftl.Term], loc) + + dm.FieldTermLocations[i] = FieldTermLocation{ // recycle + Location: Location{ + ArrayPositions: ftl.Location.ArrayPositions[:0], + }, + } + } + } + + dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle + + return prealloc +} + func (dm *DocumentMatch) String() string { return fmt.Sprintf("[%s-%f]", string(dm.IndexInternalID), dm.Score) } diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 95d7a6b3e..3711da063 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -38,6 +38,7 @@ type PhraseSearcher struct { terms [][]string path phrasePath paths []phrasePath + locations []search.Location initialized bool } @@ -214,48 +215,59 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, // also satisfies the phase constraints. if so, it returns a DocumentMatch // for this document, otherwise nil func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch { - rvftlm := make(search.FieldTermLocationMap, 0) - freq := 0 + s.locations = s.currMust.Complete(s.locations) + + locations := s.currMust.Locations + s.currMust.Locations = nil + + ftls := s.currMust.FieldTermLocations + // typically we would expect there to only actually be results in // one field, but we allow for this to not be the case // but, we note that phrase constraints can only be satisfied within // a single field, so we can check them each independently - for field, tlm := range s.currMust.Locations { - - f, rvtlm := s.checkCurrMustMatchField(ctx, tlm) - if f > 0 { - freq += f - rvftlm[field] = rvtlm - } + for field, tlm := range locations { + ftls = s.checkCurrMustMatchField(ctx, field, tlm, ftls) } - if freq > 0 { + if len(ftls) > 0 { // return match rv := s.currMust s.currMust = nil - rv.Locations = rvftlm + rv.FieldTermLocations = ftls return rv } return nil } -// checkCurrMustMatchField is soley concerned with 
determining if one particular -// field within the currMust DocumentMatch Locations satisfies the phase -// constraints (possibly more than once). if so, the number of times it was -// satisfied, and these locations are returned. otherwise 0 and either -// a nil or empty TermLocationMap -func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, tlm search.TermLocationMap) ( - int, search.TermLocationMap) { +// checkCurrMustMatchField is soley concerned with determining if one +// particular field within the currMust DocumentMatch Locations +// satisfies the phase constraints (possibly more than once). if so, +// the matching field term locations are appended to the provided +// slice +func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext, + field string, tlm search.TermLocationMap, + ftls []search.FieldTermLocation) []search.FieldTermLocation { if s.path == nil { s.path = make(phrasePath, 0, len(s.terms)) } s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0]) - rv := make(search.TermLocationMap, len(s.terms)) for _, p := range s.paths { - p.MergeInto(rv) + for _, pp := range p { + ftls = append(ftls, search.FieldTermLocation{ + Field: field, + Term: pp.term, + Location: search.Location{ + Pos: pp.loc.Pos, + Start: pp.loc.Start, + End: pp.loc.End, + ArrayPositions: pp.loc.ArrayPositions, + }, + }) + } } - return len(s.paths), rv + return ftls } type phrasePart struct { diff --git a/search/searcher/search_phrase_test.go b/search/searcher/search_phrase_test.go index 1c3e18a5e..04af20f08 100644 --- a/search/searcher/search_phrase_test.go +++ b/search/searcher/search_phrase_test.go @@ -74,6 +74,7 @@ func TestPhraseSearch(t *testing.T) { next, err := test.searcher.Next(ctx) i := 0 for err == nil && next != nil { + next.Complete(nil) if i < len(test.results) { if !next.IndexInternalID.Equals(test.results[i].IndexInternalID) { t.Errorf("expected result %d to have id %s got %s for test %d\n", i, 
test.results[i].IndexInternalID, next.IndexInternalID, testIndex) diff --git a/search/util.go b/search/util.go index 83212af1f..19dd5d68b 100644 --- a/search/util.go +++ b/search/util.go @@ -40,3 +40,30 @@ func MergeTermLocationMaps(rv, other TermLocationMap) TermLocationMap { } return rv } + +func MergeFieldTermLocations(dest []FieldTermLocation, matches []*DocumentMatch) []FieldTermLocation { + n := len(dest) + for _, dm := range matches { + n += len(dm.FieldTermLocations) + } + if cap(dest) < n { + dest = append(make([]FieldTermLocation, 0, n), dest...) + } + + for _, dm := range matches { + for _, ftl := range dm.FieldTermLocations { + dest = append(dest, FieldTermLocation{ + Field: ftl.Field, + Term: ftl.Term, + Location: Location{ + Pos: ftl.Location.Pos, + Start: ftl.Location.Start, + End: ftl.Location.End, + ArrayPositions: append(ArrayPositions(nil), ftl.Location.ArrayPositions...), + }, + }) + } + } + + return dest +} diff --git a/test/versus_test.go b/test/versus_test.go index 70463a93c..dbc7dd752 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -316,27 +316,39 @@ func testVersusSearches(vt *VersusTest, searchTemplates []string, idxA, idxB ble // putting the hits from A & B into maps. 
hitsA := hitsById(resA) hitsB := hitsById(resB) + for id, hitA := range hitsA { + hitB := hitsB[id] + if len(hitA.FieldTermLocations) <= 0 { + hitA.FieldTermLocations = nil + } + if len(hitB.FieldTermLocations) <= 0 { + hitB.FieldTermLocations = nil + } + if !reflect.DeepEqual(hitA, hitB) { + t.Errorf("\n driving from hitsA\n hitA: %#v,\n hitB: %#v", hitA, hitB) + idx, _ := strconv.Atoi(id) + t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) + } + } + for id, hitB := range hitsB { + hitA := hitsA[id] + if len(hitA.FieldTermLocations) <= 0 { + hitA.FieldTermLocations = nil + } + if len(hitB.FieldTermLocations) <= 0 { + hitB.FieldTermLocations = nil + } + if !reflect.DeepEqual(hitA, hitB) { + t.Errorf("\n driving from hitsB\n hitA: %#v,\n hitB: %#v", hitA, hitB) + idx, _ := strconv.Atoi(id) + t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) + } + } if !reflect.DeepEqual(hitsA, hitsB) { t.Errorf("=========\nsearch: (%d) %s,\n res hits mismatch,\n len(hitsA): %d,\n len(hitsB): %d", i, bufBytes, len(hitsA), len(hitsB)) t.Errorf("\n hitsA: %#v,\n hitsB: %#v", hitsA, hitsB) - for id, hitA := range hitsA { - hitB := hitsB[id] - if !reflect.DeepEqual(hitA, hitB) { - t.Errorf("\n driving from hitsA\n hitA: %#v,\n hitB: %#v", hitA, hitB) - idx, _ := strconv.Atoi(id) - t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) - } - } - for id, hitB := range hitsB { - hitA := hitsA[id] - if !reflect.DeepEqual(hitA, hitB) { - t.Errorf("\n driving from hitsB\n hitA: %#v,\n hitB: %#v", hitA, hitB) - idx, _ := strconv.Atoi(id) - t.Errorf("\n doc: %d, body: %s", idx, strings.Join(vt.Bodies[idx], " ")) - } - } } resA.Hits = nil From bdd917bb1228d67837b566da78565e87fc94bd4b Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Sun, 8 Apr 2018 19:34:32 -0700 Subject: [PATCH 366/728] MB-28847: Account for total documents' size within a batch + Supporting APIs to fetch these stats: last added document's size and total documents' 
size. --- document/document.go | 16 ++++++++++++---- document/field.go | 2 ++ document/field_boolean.go | 16 ++++++++++++++++ document/field_composite.go | 16 ++++++++++++++++ document/field_datetime.go | 15 +++++++++++++++ document/field_geopoint.go | 15 +++++++++++++++ document/field_numeric.go | 15 +++++++++++++++ document/field_text.go | 16 ++++++++++++++++ index.go | 17 +++++++++++++++++ 9 files changed, 124 insertions(+), 4 deletions(-) diff --git a/document/document.go b/document/document.go index 921098b0b..6ac17b9ab 100644 --- a/document/document.go +++ b/document/document.go @@ -43,10 +43,18 @@ func NewDocument(id string) *Document { } func (d *Document) Size() int { - return reflectStaticSizeDocument + size.SizeOfPtr + - len(d.ID) + - len(d.Fields)*size.SizeOfPtr + - len(d.CompositeFields)*(size.SizeOfPtr+reflectStaticSizeCompositeField) + sizeInBytes := reflectStaticSizeDocument + size.SizeOfPtr + + len(d.ID) + + for _, entry := range d.Fields { + sizeInBytes += entry.Size() + } + + for _, entry := range d.CompositeFields { + sizeInBytes += entry.Size() + } + + return sizeInBytes } func (d *Document) AddField(f Field) *Document { diff --git a/document/field.go b/document/field.go index c17f81e5d..2fe916698 100644 --- a/document/field.go +++ b/document/field.go @@ -36,4 +36,6 @@ type Field interface { // that this field represents - this is a common metric for tracking // the rate of indexing NumPlainTextBytes() uint64 + + Size() int } diff --git a/document/field_boolean.go b/document/field_boolean.go index c226374c0..6864b16f4 100644 --- a/document/field_boolean.go +++ b/document/field_boolean.go @@ -16,10 +16,19 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeBooleanField int + +func init() { + var f BooleanField + reflectStaticSizeBooleanField = int(reflect.TypeOf(f).Size()) +} + const DefaultBooleanIndexingOptions = StoreField | IndexField | DocValues 
type BooleanField struct { @@ -30,6 +39,13 @@ type BooleanField struct { numPlainTextBytes uint64 } +func (b *BooleanField) Size() int { + return reflectStaticSizeBooleanField + size.SizeOfPtr + + len(b.name) + + len(b.arrayPositions)*size.SizeOfUint64 + + len(b.value) +} + func (b *BooleanField) Name() string { return b.name } diff --git a/document/field_composite.go b/document/field_composite.go index e53cd4566..a8285880f 100644 --- a/document/field_composite.go +++ b/document/field_composite.go @@ -18,6 +18,7 @@ import ( "reflect" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) var reflectStaticSizeCompositeField int @@ -63,6 +64,21 @@ func NewCompositeFieldWithIndexingOptions(name string, defaultInclude bool, incl return rv } +func (c *CompositeField) Size() int { + sizeInBytes := reflectStaticSizeCompositeField + size.SizeOfPtr + + len(c.name) + + for k, _ := range c.includedFields { + sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool + } + + for k, _ := range c.excludedFields { + sizeInBytes += size.SizeOfString + len(k) + size.SizeOfBool + } + + return sizeInBytes +} + func (c *CompositeField) Name() string { return c.name } diff --git a/document/field_datetime.go b/document/field_datetime.go index 1db068c87..583b44cde 100644 --- a/document/field_datetime.go +++ b/document/field_datetime.go @@ -17,12 +17,21 @@ package document import ( "fmt" "math" + "reflect" "time" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeDateTimeField int + +func init() { + var f DateTimeField + reflectStaticSizeDateTimeField = int(reflect.TypeOf(f).Size()) +} + const DefaultDateTimeIndexingOptions = StoreField | IndexField | DocValues const DefaultDateTimePrecisionStep uint = 4 @@ -37,6 +46,12 @@ type DateTimeField struct { numPlainTextBytes uint64 } +func (n *DateTimeField) Size() int { + return reflectStaticSizeDateTimeField + size.SizeOfPtr + + 
len(n.name) + + len(n.arrayPositions)*size.SizeOfUint64 +} + func (n *DateTimeField) Name() string { return n.name } diff --git a/document/field_geopoint.go b/document/field_geopoint.go index f508b3625..91fe23f96 100644 --- a/document/field_geopoint.go +++ b/document/field_geopoint.go @@ -16,12 +16,21 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeGeoPointField int + +func init() { + var f GeoPointField + reflectStaticSizeGeoPointField = int(reflect.TypeOf(f).Size()) +} + var GeoPrecisionStep uint = 9 type GeoPointField struct { @@ -32,6 +41,12 @@ type GeoPointField struct { numPlainTextBytes uint64 } +func (n *GeoPointField) Size() int { + return reflectStaticSizeGeoPointField + size.SizeOfPtr + + len(n.name) + + len(n.arrayPositions)*size.SizeOfUint64 +} + func (n *GeoPointField) Name() string { return n.name } diff --git a/document/field_numeric.go b/document/field_numeric.go index e32993c88..46c685e84 100644 --- a/document/field_numeric.go +++ b/document/field_numeric.go @@ -16,11 +16,20 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/numeric" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeNumericField int + +func init() { + var f NumericField + reflectStaticSizeNumericField = int(reflect.TypeOf(f).Size()) +} + const DefaultNumericIndexingOptions = StoreField | IndexField | DocValues const DefaultPrecisionStep uint = 4 @@ -33,6 +42,12 @@ type NumericField struct { numPlainTextBytes uint64 } +func (n *NumericField) Size() int { + return reflectStaticSizeNumericField + size.SizeOfPtr + + len(n.name) + + len(n.arrayPositions)*size.SizeOfPtr +} + func (n *NumericField) Name() string { return n.name } diff --git a/document/field_text.go b/document/field_text.go index 5f7a3ab64..c8e871c9d 100644 --- 
a/document/field_text.go +++ b/document/field_text.go @@ -16,10 +16,19 @@ package document import ( "fmt" + "reflect" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeTextField int + +func init() { + var f TextField + reflectStaticSizeTextField = int(reflect.TypeOf(f).Size()) +} + const DefaultTextIndexingOptions = IndexField | DocValues type TextField struct { @@ -31,6 +40,13 @@ type TextField struct { numPlainTextBytes uint64 } +func (t *TextField) Size() int { + return reflectStaticSizeTextField + size.SizeOfPtr + + len(t.name) + + len(t.arrayPositions)*size.SizeOfUint64 + + len(t.value) +} + func (t *TextField) Name() string { return t.name } diff --git a/index.go b/index.go index ea7b3832a..197f4d4df 100644 --- a/index.go +++ b/index.go @@ -21,6 +21,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/mapping" + "github.com/blevesearch/bleve/size" ) // A Batch groups together multiple Index and Delete @@ -32,6 +33,9 @@ import ( type Batch struct { index Index internal *index.Batch + + lastDocSize uint64 + totalSize uint64 } // Index adds the specified index operation to the @@ -47,9 +51,22 @@ func (b *Batch) Index(id string, data interface{}) error { return err } b.internal.Update(doc) + + b.lastDocSize = uint64(doc.Size() + + len(id) + size.SizeOfString) // overhead from internal + b.totalSize += b.lastDocSize + return nil } +func (b *Batch) LastDocSize() uint64 { + return b.lastDocSize +} + +func (b *Batch) TotalDocsSize() uint64 { + return b.totalSize +} + // IndexAdvanced adds the specified index operation to the // batch which skips the mapping. NOTE: the bleve Index is not updated // until the batch is executed. 
From b1c9ab49814da0d5738026d846b665d1137966a8 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 9 Apr 2018 18:13:03 +0530 Subject: [PATCH 367/728] bumping the scorch version with meta persisence in rootbolt --- index/scorch/scorch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 162e70291..14796c5e8 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -36,7 +36,7 @@ import ( const Name = "scorch" -const Version uint8 = 1 +const Version uint8 = 2 var ErrClosed = fmt.Errorf("scorch closed") From ce8f7057f0a0679dff2ff6d4dcc7a7b141fcb679 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 8 Apr 2018 22:49:26 -0700 Subject: [PATCH 368/728] scorch renamed dvIterator to dvReader The structure wasn't actually iterating, but instead allowed for random access reading by docNum of doc values. --- index/scorch/segment/zap/build.go | 4 ++-- index/scorch/segment/zap/docvalues.go | 34 +++++++++++++-------------- index/scorch/segment/zap/segment.go | 16 ++++++------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index f3fa1e9fa..2c261a3eb 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -136,11 +136,11 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, fieldsIndexOffset: fieldsIndexOffset, docValueOffset: docValueOffset, dictLocs: dictLocs, - fieldDvIterMap: make(map[uint16]*docValueIterator), + fieldDvReaders: make(map[uint16]*docValueReader), } sb.updateSize() - err := sb.loadDvIterators() + err := sb.loadDvReaders() if err != nil { return nil, err } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index dcd2cb052..f46f2bd5b 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -27,14 +27,14 @@ import ( "github.com/golang/snappy" ) -var 
reflectStaticSizedocValueIterator int +var reflectStaticSizedocValueReader int func init() { - var dvi docValueIterator - reflectStaticSizedocValueIterator = int(reflect.TypeOf(dvi).Size()) + var dvi docValueReader + reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) } -type docValueIterator struct { +type docValueReader struct { field string curChunkNum uint64 numChunks uint64 @@ -45,27 +45,27 @@ type docValueIterator struct { uncompressed []byte // temp buf for snappy decompression } -func (di *docValueIterator) size() int { - return reflectStaticSizedocValueIterator + size.SizeOfPtr + +func (di *docValueReader) size() int { + return reflectStaticSizedocValueReader + size.SizeOfPtr + len(di.field) + len(di.chunkOffsets)*size.SizeOfUint64 + len(di.curChunkHeader)*reflectStaticSizeMetaData + len(di.curChunkData) } -func (di *docValueIterator) fieldName() string { +func (di *docValueReader) fieldName() string { return di.field } -func (di *docValueIterator) curChunkNumber() uint64 { +func (di *docValueReader) curChunkNumber() uint64 { return di.curChunkNum } -func (s *SegmentBase) loadFieldDocValueIterator(field string, - fieldDvLoc uint64) (*docValueIterator, error) { +func (s *SegmentBase) loadFieldDocValueReader(field string, + fieldDvLoc uint64) (*docValueReader, error) { // get the docValue offset for the given fields if fieldDvLoc == fieldNotUninverted { - return nil, fmt.Errorf("loadFieldDocValueIterator: "+ + return nil, fmt.Errorf("loadFieldDocValueReader: "+ "no docValues found for field: %s", field) } @@ -78,7 +78,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, } offset += uint64(read) - fdvIter := &docValueIterator{ + fdvIter := &docValueReader{ curChunkNum: math.MaxUint64, field: field, chunkOffsets: make([]uint64, int(numChunks)), @@ -96,7 +96,7 @@ func (s *SegmentBase) loadFieldDocValueIterator(field string, return fdvIter, nil } -func (di *docValueIterator) loadDvChunk(chunkNumber, +func (di *docValueReader) 
loadDvChunk(chunkNumber, localDocNum uint64, s *SegmentBase) error { // advance to the chunk where the docValues // reside for the given docNum @@ -128,7 +128,7 @@ func (di *docValueIterator) loadDvChunk(chunkNumber, return nil } -func (di *docValueIterator) visitDocValues(docNum uint64, +func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum start, end := di.getDocValueLocs(docNum) @@ -157,7 +157,7 @@ func (di *docValueIterator) visitDocValues(docNum uint64, return nil } -func (di *docValueIterator) getDocValueLocs(docNum uint64) (uint64, uint64) { +func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { i := sort.Search(len(di.curChunkHeader), func(i int) bool { return di.curChunkHeader[i].DocNum >= docNum }) @@ -180,7 +180,7 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin // find the chunkNumber where the docValues are stored docInChunk := localDocNum / uint64(s.chunkFactor) - if dvIter, exists := s.fieldDvIterMap[fieldIDPlus1-1]; exists && + if dvIter, exists := s.fieldDvReaders[fieldIDPlus1-1]; exists && dvIter != nil { // check if the chunk is already loaded if docInChunk != dvIter.curChunkNumber() { @@ -202,7 +202,7 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin func (s *Segment) VisitableDocValueFields() ([]string, error) { var rv []string for fieldID, field := range s.fieldsInv { - if dvIter, ok := s.fieldDvIterMap[uint16(fieldID)]; ok && + if dvIter, ok := s.fieldDvReaders[uint16(fieldID)]; ok && dvIter != nil { rv = append(rv, field) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 138d88ab0..f9549416a 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -55,7 +55,7 @@ func Open(path string) (segment.Segment, error) { SegmentBase: SegmentBase{ mem: mm[0 : len(mm)-FooterSize], 
fieldsMap: make(map[string]uint16), - fieldDvIterMap: make(map[uint16]*docValueIterator), + fieldDvReaders: make(map[uint16]*docValueReader), }, f: f, mm: mm, @@ -76,7 +76,7 @@ func Open(path string) (segment.Segment, error) { return nil, err } - err = rv.loadDvIterators() + err = rv.loadDvReaders() if err != nil { _ = rv.Close() return nil, err @@ -98,7 +98,7 @@ type SegmentBase struct { fieldsIndexOffset uint64 docValueOffset uint64 dictLocs []uint64 - fieldDvIterMap map[uint16]*docValueIterator // naive chunk cache per field + fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field size uint64 } @@ -121,8 +121,8 @@ func (sb *SegmentBase) updateSize() { } sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 - // fieldDvIterMap - for _, v := range sb.fieldDvIterMap { + // fieldDvReaders + for _, v := range sb.fieldDvReaders { sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr if v != nil { sizeInBytes += v.size() @@ -480,7 +480,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) { return s.dictLocs[fieldIDPlus1-1], nil } -func (s *SegmentBase) loadDvIterators() error { +func (s *SegmentBase) loadDvReaders() error { if s.docValueOffset == fieldNotUninverted { return nil } @@ -489,9 +489,9 @@ func (s *SegmentBase) loadDvIterators() error { for fieldID, field := range s.fieldsInv { fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) if n <= 0 { - return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) + return fmt.Errorf("loadDvReaders: failed to read the docvalue offsets for field %d", fieldID) } - s.fieldDvIterMap[uint16(fieldID)], _ = s.loadFieldDocValueIterator(field, fieldLoc) + s.fieldDvReaders[uint16(fieldID)], _ = s.loadFieldDocValueReader(field, fieldLoc) read += uint64(n) } return nil From 69f797b438d06f958b7ca75f3da6c7932d3fd504 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 9 Apr 2018 17:03:18 -0400 Subject: [PATCH 369/728] remove 
mem segment this was no longer used and becoming more work to maintain --- index/scorch/segment/mem/build.go | 347 --------- index/scorch/segment/mem/dict.go | 239 ------- index/scorch/segment/mem/dict_test.go | 211 ------ index/scorch/segment/mem/posting.go | 247 ------- index/scorch/segment/mem/segment.go | 286 -------- index/scorch/segment/mem/segment_test.go | 876 ----------------------- 6 files changed, 2206 deletions(-) delete mode 100644 index/scorch/segment/mem/build.go delete mode 100644 index/scorch/segment/mem/dict.go delete mode 100644 index/scorch/segment/mem/dict_test.go delete mode 100644 index/scorch/segment/mem/posting.go delete mode 100644 index/scorch/segment/mem/segment.go delete mode 100644 index/scorch/segment/mem/segment_test.go diff --git a/index/scorch/segment/mem/build.go b/index/scorch/segment/mem/build.go deleted file mode 100644 index 0b329704a..000000000 --- a/index/scorch/segment/mem/build.go +++ /dev/null @@ -1,347 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "math" - "sort" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -// NewFromAnalyzedDocs places the analyzed document mutations into a new segment -func NewFromAnalyzedDocs(results []*index.AnalysisResult) *Segment { - s := New() - - // ensure that _id field get fieldID 0 - s.getOrDefineField("_id") - - // fill Dicts/DictKeys and preallocate memory - s.initializeDict(results) - - // walk each doc - fieldLensReuse := make([]int, len(s.FieldsMap)) - docMapReuse := make([]analysis.TokenFrequencies, len(s.FieldsMap)) - for _, result := range results { - s.processDocument(result, fieldLensReuse, docMapReuse) - } - - // go back and sort the dictKeys - for _, dict := range s.DictKeys { - sort.Strings(dict) - } - - // compute memory usage of segment - s.updateSize() - - // professional debugging - // - // log.Printf("fields: %v\n", s.FieldsMap) - // log.Printf("fieldsInv: %v\n", s.FieldsInv) - // log.Printf("fieldsLoc: %v\n", s.FieldsLoc) - // log.Printf("dicts: %v\n", s.Dicts) - // log.Printf("dict keys: %v\n", s.DictKeys) - // for i, posting := range s.Postings { - // log.Printf("posting %d: %v\n", i, posting) - // } - // for i, freq := range s.Freqs { - // log.Printf("freq %d: %v\n", i, freq) - // } - // for i, norm := range s.Norms { - // log.Printf("norm %d: %v\n", i, norm) - // } - // for i, field := range s.Locfields { - // log.Printf("field %d: %v\n", i, field) - // } - // for i, start := range s.Locstarts { - // log.Printf("start %d: %v\n", i, start) - // } - // for i, end := range s.Locends { - // log.Printf("end %d: %v\n", i, end) - // } - // for i, pos := range s.Locpos { - // log.Printf("pos %d: %v\n", i, pos) - // } - // for i, apos := range s.Locarraypos { - // log.Printf("apos %d: %v\n", i, apos) - // } - // log.Printf("stored: %v\n", s.Stored) - // log.Printf("stored types: %v\n", s.StoredTypes) - // 
log.Printf("stored pos: %v\n", s.StoredPos) - - return s -} - -// fill Dicts/DictKeys and preallocate memory for postings -func (s *Segment) initializeDict(results []*index.AnalysisResult) { - var numPostingsLists int - - numTermsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. - numLocsPerPostingsList := make([]int, 0, 64) // Keyed by postings list id. - - var numTokenFrequencies int - var totLocs int - - // initial scan for all fieldID's to sort them - for _, result := range results { - for _, field := range result.Document.CompositeFields { - s.getOrDefineField(field.Name()) - } - for _, field := range result.Document.Fields { - s.getOrDefineField(field.Name()) - } - } - sort.Strings(s.FieldsInv[1:]) // keep _id as first field - s.FieldsMap = make(map[string]uint16, len(s.FieldsInv)) - for fieldID, fieldName := range s.FieldsInv { - s.FieldsMap[fieldName] = uint16(fieldID + 1) - } - - processField := func(fieldID uint16, tfs analysis.TokenFrequencies) { - dict := s.Dicts[fieldID] - dictKeys := s.DictKeys[fieldID] - for term, tf := range tfs { - pidPlus1, exists := dict[term] - if !exists { - numPostingsLists++ - pidPlus1 = uint64(numPostingsLists) - dict[term] = pidPlus1 - dictKeys = append(dictKeys, term) - numTermsPerPostingsList = append(numTermsPerPostingsList, 0) - numLocsPerPostingsList = append(numLocsPerPostingsList, 0) - } - pid := pidPlus1 - 1 - numTermsPerPostingsList[pid] += 1 - numLocsPerPostingsList[pid] += len(tf.Locations) - totLocs += len(tf.Locations) - } - numTokenFrequencies += len(tfs) - s.DictKeys[fieldID] = dictKeys - } - - for _, result := range results { - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) - _, tf := field.Analyze() - processField(fieldID, tf) - } - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - tf := result.Analyzed[i] - 
processField(fieldID, tf) - } - } - - s.Postings = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.Postings[i] = roaring.New() - } - s.PostingsLocs = make([]*roaring.Bitmap, numPostingsLists) - for i := 0; i < numPostingsLists; i++ { - s.PostingsLocs[i] = roaring.New() - } - - // Preallocate big, contiguous backing arrays. - auint64Backing := make([][]uint64, numPostingsLists*4+totLocs) // For Freqs, Locstarts, Locends, Locpos, sub-Locarraypos. - uint64Backing := make([]uint64, numTokenFrequencies+totLocs*3) // For sub-Freqs, sub-Locstarts, sub-Locends, sub-Locpos. - float32Backing := make([]float32, numTokenFrequencies) // For sub-Norms. - uint16Backing := make([]uint16, totLocs) // For sub-Locfields. - - // Point top-level slices to the backing arrays. - s.Freqs = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Norms = make([][]float32, numPostingsLists) - - s.Locfields = make([][]uint16, numPostingsLists) - - s.Locstarts = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locends = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locpos = auint64Backing[0:numPostingsLists] - auint64Backing = auint64Backing[numPostingsLists:] - - s.Locarraypos = make([][][]uint64, numPostingsLists) - - // Point sub-slices to the backing arrays. 
- for pid, numTerms := range numTermsPerPostingsList { - s.Freqs[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numTerms:] - - s.Norms[pid] = float32Backing[0:0] - float32Backing = float32Backing[numTerms:] - } - - for pid, numLocs := range numLocsPerPostingsList { - s.Locfields[pid] = uint16Backing[0:0] - uint16Backing = uint16Backing[numLocs:] - - s.Locstarts[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locends[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locpos[pid] = uint64Backing[0:0] - uint64Backing = uint64Backing[numLocs:] - - s.Locarraypos[pid] = auint64Backing[0:0] - auint64Backing = auint64Backing[numLocs:] - } -} - -func (s *Segment) processDocument(result *index.AnalysisResult, - fieldLens []int, docMap []analysis.TokenFrequencies) { - // clear the fieldLens and docMap for reuse - n := len(s.FieldsMap) - for i := 0; i < n; i++ { - fieldLens[i] = 0 - docMap[i] = nil - } - - docNum := uint64(s.addDocument()) - - processField := func(fieldID uint16, name string, l int, tf analysis.TokenFrequencies) { - fieldLens[fieldID] += l - - existingFreqs := docMap[fieldID] - if existingFreqs != nil { - existingFreqs.MergeAll(name, tf) - } else { - docMap[fieldID] = tf - } - } - - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) - l, tf := field.Analyze() - processField(fieldID, field.Name(), l, tf) - } - - docStored := s.Stored[docNum] - docStoredTypes := s.StoredTypes[docNum] - docStoredPos := s.StoredPos[docNum] - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - l := result.Length[i] - tf := result.Analyzed[i] - processField(fieldID, field.Name(), l, tf) - if field.Options().IsStored() { - docStored[fieldID] = append(docStored[fieldID], field.Value()) - docStoredTypes[fieldID] = append(docStoredTypes[fieldID], 
encodeFieldType(field)) - docStoredPos[fieldID] = append(docStoredPos[fieldID], field.ArrayPositions()) - } - - if field.Options().IncludeDocValues() { - s.DocValueFields[fieldID] = true - } - } - - // now that its been rolled up into docMap, walk that - for fieldID, tokenFrequencies := range docMap { - dict := s.Dicts[fieldID] - norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) - for term, tokenFreq := range tokenFrequencies { - pid := dict[term] - 1 - bs := s.Postings[pid] - bs.AddInt(int(docNum)) - s.Freqs[pid] = append(s.Freqs[pid], uint64(tokenFreq.Frequency())) - s.Norms[pid] = append(s.Norms[pid], norm) - locationBS := s.PostingsLocs[pid] - if len(tokenFreq.Locations) > 0 { - locationBS.AddInt(int(docNum)) - - locfields := s.Locfields[pid] - locstarts := s.Locstarts[pid] - locends := s.Locends[pid] - locpos := s.Locpos[pid] - locarraypos := s.Locarraypos[pid] - - for _, loc := range tokenFreq.Locations { - var locf = uint16(fieldID) - if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field)) - } - locfields = append(locfields, locf) - locstarts = append(locstarts, uint64(loc.Start)) - locends = append(locends, uint64(loc.End)) - locpos = append(locpos, uint64(loc.Position)) - if len(loc.ArrayPositions) > 0 { - locarraypos = append(locarraypos, loc.ArrayPositions) - } else { - locarraypos = append(locarraypos, nil) - } - } - - s.Locfields[pid] = locfields - s.Locstarts[pid] = locstarts - s.Locends[pid] = locends - s.Locpos[pid] = locpos - s.Locarraypos[pid] = locarraypos - } - } - } -} - -func (s *Segment) getOrDefineField(name string) int { - fieldIDPlus1, ok := s.FieldsMap[name] - if !ok { - fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) - s.FieldsMap[name] = fieldIDPlus1 - s.FieldsInv = append(s.FieldsInv, name) - s.Dicts = append(s.Dicts, make(map[string]uint64)) - s.DictKeys = append(s.DictKeys, make([]string, 0)) - } - return int(fieldIDPlus1 - 1) -} - -func (s *Segment) addDocument() int { - docNum := len(s.Stored) - s.Stored = 
append(s.Stored, map[uint16][][]byte{}) - s.StoredTypes = append(s.StoredTypes, map[uint16][]byte{}) - s.StoredPos = append(s.StoredPos, map[uint16][][]uint64{}) - return docNum -} - -func encodeFieldType(f document.Field) byte { - fieldType := byte('x') - switch f.(type) { - case *document.TextField: - fieldType = 't' - case *document.NumericField: - fieldType = 'n' - case *document.DateTimeField: - fieldType = 'd' - case *document.BooleanField: - fieldType = 'b' - case *document.GeoPointField: - fieldType = 'g' - case *document.CompositeField: - fieldType = 'c' - } - return fieldType -} diff --git a/index/scorch/segment/mem/dict.go b/index/scorch/segment/mem/dict.go deleted file mode 100644 index b74872371..000000000 --- a/index/scorch/segment/mem/dict.go +++ /dev/null @@ -1,239 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "math" - "reflect" - "regexp" - "sort" - "strings" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" -) - -var reflectStaticSizeDictionary int - -func init() { - var d Dictionary - reflectStaticSizeDictionary = int(reflect.TypeOf(d).Size()) -} - -// Dictionary is the in-memory representation of the term dictionary -type Dictionary struct { - segment *Segment - field string - fieldID uint16 -} - -func (d *Dictionary) Size() int { - sizeInBytes := reflectStaticSizeDictionary + size.SizeOfPtr + - len(d.field) - - if d.segment != nil { - sizeInBytes += int(d.segment.Size()) - } - - return sizeInBytes -} - -// PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, - except *roaring.Bitmap) (segment.PostingsList, error) { - return d.InitPostingsList(term, except, nil) -} - -func (d *Dictionary) InitPostingsList(term string, except *roaring.Bitmap, - prealloc *PostingsList) (*PostingsList, error) { - rv := prealloc - if rv == nil { - rv = &PostingsList{} - } - rv.dictionary = d - rv.term = term - rv.postingsID = d.segment.Dicts[d.fieldID][term] - rv.except = except - return rv, nil -} - -// Iterator returns an iterator for this dictionary -func (d *Dictionary) Iterator() segment.DictionaryIterator { - return &DictionaryIterator{ - d: d, - } -} - -// PrefixIterator returns an iterator which only visits terms having the -// the specified prefix -func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], prefix) - return &DictionaryIterator{ - d: d, - prefix: prefix, - offset: offset, - } -} - -// RangeIterator returns an iterator which only visits terms between the -// start and end terms. NOTE: bleve.index API specifies the end is inclusive. 
-func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { - offset := sort.SearchStrings(d.segment.DictKeys[d.fieldID], start) - return &DictionaryIterator{ - d: d, - offset: offset, - end: end, - } -} - -// RegexpIterator returns an iterator which only visits terms matching -// the given regex expression. -func (d *Dictionary) RegexpIterator(pattern string) segment.DictionaryIterator { - regex, err := regexp.Compile(pattern) - if err != nil { - // invalid regexp, so set offset to the end - return &DictionaryIterator{ - d: d, - offset: len(d.segment.DictKeys[d.fieldID]), - } - } - return &DictionaryIterator{ - d: d, - regex: regex, - } -} - -// FuzzyIterator returns an iterator which only visits terms matching -// the given edit distance. -func (d *Dictionary) FuzzyIterator(term string, fuzziness int) segment.DictionaryIterator { - return &DictionaryIterator{ - d: d, - fuzzyTerm: term, - fuzziness: fuzziness, - } -} - -// DictionaryIterator is an iterator for term dictionary -type DictionaryIterator struct { - d *Dictionary - prefix string - end string - offset int - regex *regexp.Regexp - fuzzyTerm string - fuzziness int - - dictEntry index.DictEntry // reused across Next()'s -} - -// Next returns the next entry in the dictionary -func (d *DictionaryIterator) Next() (*index.DictEntry, error) { - if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { - return nil, nil - } - next := d.d.segment.DictKeys[d.d.fieldID][d.offset] - // check prefix - if d.prefix != "" && !strings.HasPrefix(next, d.prefix) { - return nil, nil - } - // check end (bleve.index API demands inclusive end) - if d.end != "" && next > d.end { - return nil, nil - } - // check regexp - if d.regex != nil { - // keep going until we find a match, mindful of the end of the slice - for !d.regex.MatchString(next) { - d.offset++ - if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { - return nil, nil - } - next = d.d.segment.DictKeys[d.d.fieldID][d.offset] - } - } - // 
check fuzziness - if d.fuzzyTerm != "" { - _, exceeded := LevenshteinDistanceMax(d.fuzzyTerm, next, d.fuzziness) - for exceeded { - d.offset++ - if d.offset > len(d.d.segment.DictKeys[d.d.fieldID])-1 { - return nil, nil - } - next = d.d.segment.DictKeys[d.d.fieldID][d.offset] - _, exceeded = LevenshteinDistanceMax(d.fuzzyTerm, next, d.fuzziness) - } - } - - d.offset++ - postingID := d.d.segment.Dicts[d.d.fieldID][next] - d.dictEntry.Term = next - d.dictEntry.Count = d.d.segment.Postings[postingID-1].GetCardinality() - return &d.dictEntry, nil -} - -// LevenshteinDistanceMax same as LevenshteinDistance but -// attempts to bail early once we know the distance -// will be greater than max -// in which case the first return val will be the max -// and the second will be true, indicating max was exceeded -func LevenshteinDistanceMax(a, b string, max int) (int, bool) { - la := len(a) - lb := len(b) - - ld := int(math.Abs(float64(la - lb))) - if ld > max { - return max, true - } - - d := make([]int, la+1) - var lastdiag, olddiag, temp int - - for i := 1; i <= la; i++ { - d[i] = i - } - for i := 1; i <= lb; i++ { - d[0] = i - lastdiag = i - 1 - rowmin := max + 1 - for j := 1; j <= la; j++ { - olddiag = d[j] - min := d[j] + 1 - if (d[j-1] + 1) < min { - min = d[j-1] + 1 - } - if a[j-1] == b[i-1] { - temp = 0 - } else { - temp = 1 - } - if (lastdiag + temp) < min { - min = lastdiag + temp - } - if min < rowmin { - rowmin = min - } - d[j] = min - - lastdiag = olddiag - } - // after each row if rowmin isn't less than max stop - if rowmin > max { - return max, true - } - } - return d[la], false -} diff --git a/index/scorch/segment/mem/dict_test.go b/index/scorch/segment/mem/dict_test.go deleted file mode 100644 index e69016a81..000000000 --- a/index/scorch/segment/mem/dict_test.go +++ /dev/null @@ -1,211 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package mem - -import ( - "reflect" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -func TestDictionary(t *testing.T) { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 5, - Position: 1, - Term: []byte("apple"), - }, - &analysis.Token{ - Start: 6, - End: 10, - Position: 2, - Term: []byte("ball"), - }, - &analysis.Token{ - Start: 11, - End: 14, - Position: 3, - Term: []byte("cat"), - }, - &analysis.Token{ - Start: 15, - End: 18, - Position: 4, - Term: []byte("dog"), - }, - &analysis.Token{ - Start: 19, - End: 22, - Position: 5, - Term: []byte("egg"), - }, - &analysis.Token{ - Start: 20, - End: 24, - Position: 6, - Term: []byte("fish"), - }, - 
&analysis.Token{ - Start: 25, - End: 28, - Position: 7, - Term: []byte("bat"), - }, - }, nil, true), - }, - Length: []int{ - 1, - 7, - }, - }, - } - - segment := NewFromAnalyzedDocs(results) - if segment == nil { - t.Fatalf("segment nil, not expected") - } - - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - - // test basic full iterator - expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"} - var got []string - itr := dict.Iterator() - next, err := itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test prefix iterator - expected = []string{"ball", "bat"} - got = got[:0] - itr = dict.PrefixIterator("b") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test range iterator - expected = []string{"cat", "dog", "egg"} - got = got[:0] - itr = dict.RangeIterator("cat", "egg") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test regexp iterator - expected = []string{"ball", "bat"} - got = got[:0] - itr = dict.RegexpIterator("ba.*") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test regexp iterator with invalid 
regexp - expected = []string{} - got = got[:0] - itr = dict.RegexpIterator(string([]byte{0xff})) - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test fuzzy iterator - expected = []string{"bat", "cat"} - got = got[:0] - itr = dict.FuzzyIterator("vat", 1) - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } -} diff --git a/index/scorch/segment/mem/posting.go b/index/scorch/segment/mem/posting.go deleted file mode 100644 index 362fdb7c5..000000000 --- a/index/scorch/segment/mem/posting.go +++ /dev/null @@ -1,247 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "reflect" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" -) - -var reflectStaticSizePostingsList int -var reflectStaticSizePostingsIterator int -var reflectStaticSizePosting int -var reflectStaticSizeLocation int - -func init() { - var pl PostingsList - reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) - var pi PostingsIterator - reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) - var p Posting - reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) - var l Location - reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) -} - -// PostingsList is an in-memory represenation of a postings list -type PostingsList struct { - dictionary *Dictionary - term string - postingsID uint64 - except *roaring.Bitmap -} - -func (p *PostingsList) Size() int { - sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr - - if p.dictionary != nil { - sizeInBytes += p.dictionary.Size() - } - - if p.except != nil { - sizeInBytes += int(p.except.GetSizeInBytes()) - } - - return sizeInBytes -} - -// Count returns the number of items on this postings list -func (p *PostingsList) Count() uint64 { - var rv uint64 - if p.postingsID > 0 { - rv = p.dictionary.segment.Postings[p.postingsID-1].GetCardinality() - if p.except != nil { - except := p.except.GetCardinality() - if except > rv { - // avoid underflow - except = rv - } - rv -= except - } - } - return rv -} - -// Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) segment.PostingsIterator { - return p.InitIterator(nil) -} -func (p *PostingsList) InitIterator(prealloc *PostingsIterator) *PostingsIterator { - rv := prealloc - if rv == nil { - rv = &PostingsIterator{postings: p} - } else { - *rv = PostingsIterator{postings: p} - } - - if p.postingsID > 0 { - allbits := 
p.dictionary.segment.Postings[p.postingsID-1] - rv.locations = p.dictionary.segment.PostingsLocs[p.postingsID-1] - rv.all = allbits.Iterator() - if p.except != nil { - allExcept := allbits.Clone() - allExcept.AndNot(p.except) - rv.actual = allExcept.Iterator() - } else { - rv.actual = allbits.Iterator() - } - } - - return rv -} - -// PostingsIterator provides a way to iterate through the postings list -type PostingsIterator struct { - postings *PostingsList - all roaring.IntIterable - locations *roaring.Bitmap - offset int - locoffset int - actual roaring.IntIterable - reuse Posting -} - -func (i *PostingsIterator) Size() int { - sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr - - if i.locations != nil { - sizeInBytes += int(i.locations.GetSizeInBytes()) - } - - return sizeInBytes -} - -// Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) Next() (segment.Posting, error) { - if i.actual == nil || !i.actual.HasNext() { - return nil, nil - } - n := i.actual.Next() - allN := i.all.Next() - - // n is the next actual hit (excluding some postings) - // allN is the next hit in the full postings - // if they don't match, adjust offsets to factor in item we're skipping over - // incr the all iterator, and check again - for allN != n { - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) - i.offset++ - allN = i.all.Next() - } - i.reuse = Posting{ - iterator: i, - docNum: uint64(n), - offset: i.offset, - locoffset: i.locoffset, - hasLoc: i.locations.Contains(n), - } - i.locoffset += int(i.postings.dictionary.segment.Freqs[i.postings.postingsID-1][i.offset]) - i.offset++ - return &i.reuse, nil -} - -// Posting is a single entry in a postings list -type Posting struct { - iterator *PostingsIterator - docNum uint64 - offset int - locoffset int - hasLoc bool -} - -func (p *Posting) Size() int { - sizeInBytes := reflectStaticSizePosting + size.SizeOfPtr - - if p.iterator != nil 
{ - sizeInBytes += p.iterator.Size() - } - - return sizeInBytes -} - -// Number returns the document number of this posting in this segment -func (p *Posting) Number() uint64 { - return p.docNum -} - -// Frequency returns the frequence of occurance of this term in this doc/field -func (p *Posting) Frequency() uint64 { - return p.iterator.postings.dictionary.segment.Freqs[p.iterator.postings.postingsID-1][p.offset] -} - -// Norm returns the normalization factor for this posting -func (p *Posting) Norm() float64 { - return float64(p.iterator.postings.dictionary.segment.Norms[p.iterator.postings.postingsID-1][p.offset]) -} - -// Locations returns the location information for each occurance -func (p *Posting) Locations() []segment.Location { - if !p.hasLoc { - return nil - } - freq := int(p.Frequency()) - rv := make([]segment.Location, freq) - for i := 0; i < freq; i++ { - rv[i] = &Location{ - p: p, - offset: p.locoffset + i, - } - } - return rv -} - -// Location represents the location of a single occurance -type Location struct { - p *Posting - offset int -} - -func (l *Location) Size() int { - sizeInBytes := reflectStaticSizeLocation - if l.p != nil { - sizeInBytes += l.p.Size() - } - - return sizeInBytes -} - -// Field returns the name of the field (useful in composite fields to know -// which original field the value came from) -func (l *Location) Field() string { - return l.p.iterator.postings.dictionary.segment.FieldsInv[l.p.iterator.postings.dictionary.segment.Locfields[l.p.iterator.postings.postingsID-1][l.offset]] -} - -// Start returns the start byte offset of this occurance -func (l *Location) Start() uint64 { - return l.p.iterator.postings.dictionary.segment.Locstarts[l.p.iterator.postings.postingsID-1][l.offset] -} - -// End returns the end byte offset of this occurance -func (l *Location) End() uint64 { - return l.p.iterator.postings.dictionary.segment.Locends[l.p.iterator.postings.postingsID-1][l.offset] -} - -// Pos returns the 1-based phrase position 
of this occurance -func (l *Location) Pos() uint64 { - return l.p.iterator.postings.dictionary.segment.Locpos[l.p.iterator.postings.postingsID-1][l.offset] -} - -// ArrayPositions returns the array position vector associated with this occurance -func (l *Location) ArrayPositions() []uint64 { - return l.p.iterator.postings.dictionary.segment.Locarraypos[l.p.iterator.postings.postingsID-1][l.offset] -} diff --git a/index/scorch/segment/mem/segment.go b/index/scorch/segment/mem/segment.go deleted file mode 100644 index e9c4a2730..000000000 --- a/index/scorch/segment/mem/segment.go +++ /dev/null @@ -1,286 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package mem - -import ( - "fmt" - "reflect" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" -) - -var reflectStaticSizeSegment int - -func init() { - var s Segment - reflectStaticSizeSegment = int(reflect.TypeOf(s).Size()) -} - -// _id field is always guaranteed to have fieldID of 0 -const idFieldID uint16 = 0 - -// KNOWN ISSUES -// - LIMITATION - we decided whether or not to store term vectors for a field -// at the segment level, based on the first definition of a -// field we see. in normal bleve usage this is fine, all -// instances of a field definition will be the same. however, -// advanced users may violate this and provide unique field -// definitions with each document. 
this segment does not -// support this usage. - -// TODO -// - need better testing of multiple docs, iterating freqs, locations and -// and verifying the correct results are returned - -// Segment is an in memory implementation of scorch.Segment -type Segment struct { - - // FieldsMap adds 1 to field id to avoid zero value issues - // name -> field id + 1 - FieldsMap map[string]uint16 - - // FieldsInv is the inverse of FieldsMap - // field id -> name - FieldsInv []string - - // Term dictionaries for each field - // field id -> term -> postings list id + 1 - Dicts []map[string]uint64 - - // Terms for each field, where terms are sorted ascending - // field id -> []term - DictKeys [][]string - - // Postings list - // postings list id -> bitmap by docNum - Postings []*roaring.Bitmap - - // Postings list has locations - PostingsLocs []*roaring.Bitmap - - // Term frequencies - // postings list id -> Freqs (one for each hit in bitmap) - Freqs [][]uint64 - - // Field norms - // postings list id -> Norms (one for each hit in bitmap) - Norms [][]float32 - - // Field/start/end/pos/locarraypos - // postings list id -> start/end/pos/locarraypos (one for each freq) - Locfields [][]uint16 - Locstarts [][]uint64 - Locends [][]uint64 - Locpos [][]uint64 - Locarraypos [][][]uint64 - - // Stored field values - // docNum -> field id -> slice of values (each value []byte) - Stored []map[uint16][][]byte - - // Stored field types - // docNum -> field id -> slice of types (each type byte) - StoredTypes []map[uint16][]byte - - // Stored field array positions - // docNum -> field id -> slice of array positions (each is []uint64) - StoredPos []map[uint16][][]uint64 - - // For storing the docValue persisted fields - DocValueFields map[uint16]bool - - // Footprint of the segment, updated when analyzed document mutations - // are added into the segment - sizeInBytes int -} - -// New builds a new empty Segment -func New() *Segment { - return &Segment{ - FieldsMap: map[string]uint16{}, - 
DocValueFields: map[uint16]bool{}, - } -} - -func (s *Segment) updateSize() { - sizeInBytes := reflectStaticSizeSegment - - // FieldsMap, FieldsInv - for k, _ := range s.FieldsMap { - sizeInBytes += (len(k)+size.SizeOfString)*2 + - size.SizeOfUint16 - } - - // Dicts, DictKeys - for _, entry := range s.Dicts { - for k, _ := range entry { - sizeInBytes += (len(k)+size.SizeOfString)*2 + - size.SizeOfUint64 - } - // overhead from the data structures - sizeInBytes += (size.SizeOfMap + size.SizeOfSlice) - } - - // Postings, PostingsLocs - for i := 0; i < len(s.Postings); i++ { - sizeInBytes += (int(s.Postings[i].GetSizeInBytes()) + size.SizeOfPtr) + - (int(s.PostingsLocs[i].GetSizeInBytes()) + size.SizeOfPtr) - } - - // Freqs, Norms - for i := 0; i < len(s.Freqs); i++ { - sizeInBytes += (len(s.Freqs[i])*size.SizeOfUint64 + - len(s.Norms[i])*size.SizeOfFloat32) + - (size.SizeOfSlice * 2) - } - - // Location data - for i := 0; i < len(s.Locfields); i++ { - sizeInBytes += len(s.Locfields[i])*size.SizeOfUint16 + - len(s.Locstarts[i])*size.SizeOfUint64 + - len(s.Locends[i])*size.SizeOfUint64 + - len(s.Locpos[i])*size.SizeOfUint64 - - for j := 0; j < len(s.Locarraypos[i]); j++ { - sizeInBytes += len(s.Locarraypos[i][j])*size.SizeOfUint64 + - size.SizeOfSlice - } - - sizeInBytes += (size.SizeOfSlice * 5) - } - - // Stored data - for i := 0; i < len(s.Stored); i++ { - for _, v := range s.Stored[i] { - sizeInBytes += size.SizeOfUint16 - for _, arr := range v { - sizeInBytes += len(arr) + size.SizeOfSlice - } - sizeInBytes += size.SizeOfSlice - } - - for _, v := range s.StoredTypes[i] { - sizeInBytes += size.SizeOfUint16 + len(v) + size.SizeOfSlice - } - - for _, v := range s.StoredPos[i] { - sizeInBytes += size.SizeOfUint16 - for _, arr := range v { - sizeInBytes += len(arr)*size.SizeOfUint64 + - size.SizeOfSlice - } - sizeInBytes += size.SizeOfSlice - } - - // overhead from map(s) within Stored, StoredTypes, StoredPos - sizeInBytes += (size.SizeOfMap * 3) - } - - // 
DocValueFields - sizeInBytes += len(s.DocValueFields) * (size.SizeOfUint16 + size.SizeOfBool) - - s.sizeInBytes = sizeInBytes -} - -func (s *Segment) Size() int { - return s.sizeInBytes -} - -func (s *Segment) AddRef() { -} - -func (s *Segment) DecRef() error { - return nil -} - -// Fields returns the field names used in this segment -func (s *Segment) Fields() []string { - return s.FieldsInv -} - -// VisitDocument invokes the DocFieldValueVistor for each stored field -// for the specified doc number -func (s *Segment) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - // ensure document number exists - if int(num) > len(s.Stored)-1 { - return nil - } - docFields := s.Stored[int(num)] - st := s.StoredTypes[int(num)] - sp := s.StoredPos[int(num)] - for field, values := range docFields { - for i, value := range values { - keepGoing := visitor(s.FieldsInv[field], st[field][i], value, sp[field][i]) - if !keepGoing { - return nil - } - } - } - return nil -} - -func (s *Segment) getField(name string) (int, error) { - fieldID, ok := s.FieldsMap[name] - if !ok { - return 0, fmt.Errorf("no field named %s", name) - } - return int(fieldID - 1), nil -} - -// Dictionary returns the term dictionary for the specified field -func (s *Segment) Dictionary(field string) (segment.TermDictionary, error) { - fieldID, err := s.getField(field) - if err != nil { - // no such field, return empty dictionary - return &segment.EmptyDictionary{}, nil - } - return &Dictionary{ - segment: s, - field: field, - fieldID: uint16(fieldID), - }, nil -} - -// Count returns the number of documents in this segment -// (this has no notion of deleted docs) -func (s *Segment) Count() uint64 { - return uint64(len(s.Stored)) -} - -// DocNumbers returns a bitset corresponding to the doc numbers of all the -// provided _id strings -func (s *Segment) DocNumbers(ids []string) (*roaring.Bitmap, error) { - rv := roaring.New() - - // guard against empty segment - if len(s.FieldsMap) > 0 { 
- idDictionary := s.Dicts[idFieldID] - - for _, id := range ids { - postingID := idDictionary[id] - if postingID > 0 { - rv.Or(s.Postings[postingID-1]) - } - } - } - return rv, nil -} - -// Close releases all resources associated with this segment -func (s *Segment) Close() error { - return nil -} diff --git a/index/scorch/segment/mem/segment_test.go b/index/scorch/segment/mem/segment_test.go deleted file mode 100644 index 79a11d122..000000000 --- a/index/scorch/segment/mem/segment_test.go +++ /dev/null @@ -1,876 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package mem - -import ( - "math" - "reflect" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -func TestEmpty(t *testing.T) { - - emptySegment := New() - - if emptySegment.Count() != 0 { - t.Errorf("expected count 0, got %d", emptySegment.Count()) - } - - dict, err := emptySegment.Dictionary("name") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("marty", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 0 { - t.Errorf("expected count to be 0, got %d", count) - } - - // now try and visit a document - err = emptySegment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { - t.Errorf("document visitor called, not expected") - return true - }) - if err != nil { - t.Fatal(err) - } -} - -func TestSingle(t *testing.T) { - - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, 
[]byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, nil), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - segment := NewFromAnalyzedDocs(results) - if segment == nil { - t.Fatalf("segment nil, not expected") - } - - if segment.Size() <= 0 { - t.Fatalf("segment size not updated") - } - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "desc": struct{}{}, - "tag": struct{}{}, - } - fields := segment.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) - } - for _, field := range 
fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } - - if segment.Count() != 1 { - t.Errorf("expected count 1, got %d", segment.Count()) - } - - // check the _id field - dict, err := segment.Dictionary("_id") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("a", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the name field - dict, err = segment.Dictionary("name") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("wow", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - 
t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the _all field (composite) - dict, err = segment.Dictionary("_all") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("wow", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - expectedNorm := float32(1.0 / math.Sqrt(float64(6))) - if nextPosting.Norm() != float64(expectedNorm) { - t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) - 
} - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // now try a field with array positions - dict, err = segment.Dictionary("tag") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList("dark", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "tag" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 4 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - 
if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - expectArrayPos := []uint64{1} - if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { - t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - // now try and visit a document - var fieldValuesSeen int - err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { - fieldValuesSeen++ - return true - }) - if err != nil { - t.Fatal(err) - } - if fieldValuesSeen != 5 { - t.Errorf("expected 5 field values, got %d", fieldValuesSeen) - } - -} - -func TestMultiple(t *testing.T) { - - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, nil), - }, - } - - doc2 := &document.Document{ - ID: "b", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, 
nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, nil), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - &index.AnalysisResult{ - Document: doc2, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("b"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("who"), - }, - }, nil, true), - 
analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - segment := NewFromAnalyzedDocs(results) - if segment == nil { - t.Fatalf("segment nil, not expected") - } - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - // check the desc field - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList("thing", nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 2 { - t.Errorf("expected count to be 2, got %d", count) - } - - // get docnum of a - exclude, err := segment.DocNumbers([]string{"a"}) - if err != nil { - t.Fatal(err) - } - - // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := 
dict.PostingsList("thing", exclude) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsListExcludingCount := postingsListExcluding.Count() - if postingsListExcludingCount != 1 { - t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) - } - - postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItrExcluding.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItrExcluding.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - -} - -func TestMultipleWithNonOverlappingFields(t *testing.T) { - doc1 := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextField("_id", []uint64{}, []byte("a")), - document.NewTextField("name", []uint64{}, []byte("ABC")), - document.NewTextField("dept", []uint64{}, []byte("ABC dept")), - document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), - document.NewTextField("manages.count", []uint64{}, []byte("1")), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - doc2 := &document.Document{ - ID: "b", - Fields: []document.Field{ - document.NewTextField("_id", []uint64{}, []byte("b")), - document.NewTextField("name", []uint64{}, []byte("XYZ")), - document.NewTextField("dept", []uint64{}, []byte("ABC dept")), - document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc1, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - 
&analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - &analysis.Token{ - Start: 4, - End: 8, - Position: 2, - Term: []byte("dept"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("XYZ"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("1"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - &index.AnalysisResult{ - Document: doc2, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("b"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("XYZ"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - &analysis.Token{ - Start: 4, - End: 8, - Position: 2, - Term: []byte("dept"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - }, []uint64{0}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - segment := NewFromAnalyzedDocs(results) - if segment == nil { - t.Fatalf("segment nil, not 
expected") - } - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "dept": struct{}{}, - "manages.id": struct{}{}, - "manages.count": struct{}{}, - "reportsTo.id": struct{}{}, - } - - fields := segment.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) - } - for _, field := range fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } -} From 5fea87cf9d52de20448e3b454e137120e6acef43 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 9 Apr 2018 17:05:35 -0400 Subject: [PATCH 370/728] new optional interface, access dictionary just for some terms this interface is useful when attempting to filter a larger list of terms which may or may not be in the dictionary this interface also allows you to not include the count in your dictionary iterator traversal --- index/index.go | 4 ++ index/scorch/segment/empty.go | 5 ++ index/scorch/segment/segment.go | 1 + index/scorch/segment/zap/dict.go | 65 +++++++++++++++++++++---- index/scorch/snapshot_index.go | 7 +++ index/scorch/snapshot_segment.go | 5 ++ search/searcher/search_numeric_range.go | 19 ++++++++ 7 files changed, 96 insertions(+), 10 deletions(-) diff --git a/index/index.go b/index/index.go index 861f6a2ea..42a452cde 100644 --- a/index/index.go +++ b/index/index.go @@ -104,6 +104,10 @@ type IndexReaderFuzzy interface { FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error) } +type IndexReaderOnly interface { + FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) +} + // FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index a0b8434c7..17749f1bf 100644 --- 
a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -85,6 +85,11 @@ func (e *EmptyDictionary) FuzzyIterator(term string, return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) DictionaryIterator { + return &EmptyDictionaryIterator{} +} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index b754e94b4..62b8a29b2 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -50,6 +50,7 @@ type TermDictionary interface { RangeIterator(start, end string) DictionaryIterator RegexpIterator(regex string) DictionaryIterator FuzzyIterator(term string, fuzziness int) DictionaryIterator + OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator } type DictionaryIterator interface { diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index b06cd7f6c..7cf3364bc 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -15,6 +15,7 @@ package zap import ( + "bytes" "fmt" "github.com/RoaringBitmap/roaring" @@ -191,13 +192,55 @@ func (d *Dictionary) FuzzyIterator(term string, return rv } +func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, + includeCount bool) segment.DictionaryIterator { + + rv := &DictionaryIterator{ + d: d, + omitCount: !includeCount, + } + + var buf bytes.Buffer + builder, err := vellum.New(&buf, nil) + if err != nil { + rv.err = err + return rv + } + for _, term := range onlyTerms { + err = builder.Insert(term, 0) + if err != nil { + rv.err = err + return rv + } + } + err = builder.Close() + if err != nil { + rv.err = err + return rv + } + + onlyFST, err := vellum.Load(buf.Bytes()) + if err != nil { + rv.err = err + return rv + } + + itr, err := d.fst.Search(onlyFST, nil, nil) + if err == nil { + rv.itr = itr + } + + return rv +} + // 
DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { - d *Dictionary - itr vellum.Iterator - err error - tmp PostingsList - entry index.DictEntry + d *Dictionary + itr vellum.Iterator + err error + tmp PostingsList + entry index.DictEntry + omitCount bool } // Next returns the next entry in the dictionary @@ -208,12 +251,14 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { return nil, i.err } term, postingsOffset := i.itr.Current() - i.err = i.tmp.read(postingsOffset, i.d) - if i.err != nil { - return nil, i.err - } i.entry.Term = string(term) - i.entry.Count = i.tmp.Count() + if !i.omitCount { + i.err = i.tmp.read(postingsOffset, i.d) + if i.err != nil { + return nil, i.err + } + i.entry.Count = i.tmp.Count() + } i.err = i.itr.Next() return &i.entry, nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index cf6e6250d..f4c75fdea 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -190,6 +190,13 @@ func (i *IndexSnapshot) FieldDictFuzzy(field string, }) } +func (i *IndexSnapshot) FieldDictOnly(field string, + onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { + return i.OnlyIterator(onlyTerms, includeCount) + }) +} + func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index c29fac997..39539364c 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -58,6 +58,11 @@ func (s *SegmentDictionarySnapshot) FuzzyIterator(term string, return s.d.FuzzyIterator(term, fuzziness) } +func (s *SegmentDictionarySnapshot) OnlyIterator(onlyTerms [][]byte, + includeCount bool) segment.DictionaryIterator { + return s.d.OnlyIterator(onlyTerms, 
includeCount) +} + type SegmentSnapshot struct { id uint64 segment segment.Segment diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index 7f42d7250..1eae7a5ec 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -77,6 +77,25 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, func filterCandidateTerms(indexReader index.IndexReader, terms [][]byte, field string) (rv [][]byte, err error) { + + if ir, ok := indexReader.(index.IndexReaderOnly); ok { + fieldDict, err := ir.FieldDictOnly(field, terms, false) + if err != nil { + return nil, err + } + // enumerate the terms (no need to check them again) + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, []byte(tfd.Term)) + tfd, err = fieldDict.Next() + } + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + + return rv, err + } + fieldDict, err := indexReader.FieldDictRange(field, terms[0], terms[len(terms)-1]) if err != nil { return nil, err From 6aadf2b305c0758f6e170f7ad2bccb55ac89ce2b Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 9 Apr 2018 17:27:47 -0400 Subject: [PATCH 371/728] make error handing inside iterator creation mroe consistent --- index/scorch/segment/zap/dict.go | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 7cf3364bc..68c062c43 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -100,6 +100,8 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { itr, err := d.fst.Iterator(nil, nil) if err == nil { rv.itr = itr + } else if err != nil && err != vellum.ErrIteratorDone { + rv.err = err } } @@ -119,7 +121,11 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { itr, err := d.fst.Search(r, nil, nil) if err == nil { rv.itr = itr + } else if err != nil 
&& err != vellum.ErrIteratorDone { + rv.err = err } + } else { + rv.err = err } } @@ -145,13 +151,15 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator itr, err := d.fst.Iterator([]byte(start), endBytes) if err == nil { rv.itr = itr + } else if err != nil && err != vellum.ErrIteratorDone { + rv.err = err } } return rv } -// RegexIterator returns an iterator which only visits terms having the +// RegexpIterator returns an iterator which only visits terms having the // the specified regex func (d *Dictionary) RegexpIterator(regex string) segment.DictionaryIterator { rv := &DictionaryIterator{ @@ -161,10 +169,14 @@ func (d *Dictionary) RegexpIterator(regex string) segment.DictionaryIterator { if d.fst != nil { r, err := regexp.New(regex) if err == nil { - itr, err := d.fst.Search(r, nil, nil) - if err == nil { + itr, err2 := d.fst.Search(r, nil, nil) + if err2 == nil { rv.itr = itr + } else if err2 != nil && err2 != vellum.ErrIteratorDone { + rv.err = err2 } + } else { + rv.err = err } } @@ -182,10 +194,14 @@ func (d *Dictionary) FuzzyIterator(term string, if d.fst != nil { la, err := levenshtein.New(term, fuzziness) if err == nil { - itr, err := d.fst.Search(la, nil, nil) - if err == nil { + itr, err2 := d.fst.Search(la, nil, nil) + if err2 == nil { rv.itr = itr + } else if err2 != nil && err2 != vellum.ErrIteratorDone { + rv.err = err2 } + } else { + rv.err = err } } @@ -228,6 +244,8 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, itr, err := d.fst.Search(onlyFST, nil, nil) if err == nil { rv.itr = itr + } else if err != nil && err != vellum.ErrIteratorDone { + rv.err = err } return rv From 901d847f49b419a30cc2dab79fc8abf28d176291 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 9 Apr 2018 16:04:08 -0700 Subject: [PATCH 372/728] scorch optimize loadSegment() to treat empty deleted bitmaps as nil In this optimization, loadSegment() will convert an empty 'deleted' roaring bitmap to nil, so that later codepaths that test for 
nil'ness can avoid roaring operations like AndNot() -- which helps in the case when there's no mutations / insertions-only. --- index/scorch/persister.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 758b342f2..76fd746d5 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -622,7 +622,9 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro _ = segment.Close() return nil, fmt.Errorf("error reading deleted bytes: %v", err) } - rv.deleted = deletedBitmap + if !deletedBitmap.IsEmpty() { + rv.deleted = deletedBitmap + } } return rv, nil From 02c6befe408658e5d03bc548b96ddf3120fd1245 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 11 Apr 2018 15:46:33 +0530 Subject: [PATCH 373/728] avoiding redundant snappy decode calls in dv --- index/scorch/segment/zap/docvalues.go | 21 +++++++++++++++------ index/scorch/snapshot_index.go | 2 +- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index f46f2bd5b..13f41bd2d 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -125,6 +125,7 @@ func (di *docValueReader) loadDvChunk(chunkNumber, dataLength := curChunkEnd - compressedDataLoc di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] return nil } @@ -135,12 +136,20 @@ func (di *docValueReader) visitDocValues(docNum uint64, if start == math.MaxUint64 || end == math.MaxUint64 { return nil } - // uncompress the already loaded data - uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) - if err != nil { - return err + + var uncompressed []byte + var err error + // use the uncompressed copy if available + if len(di.uncompressed) > 0 { + uncompressed = di.uncompressed + } else { + 
// uncompress the already loaded data + uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed } - di.uncompressed = uncompressed // pick the terms for the given docNum uncompressed = uncompressed[start:end] @@ -200,7 +209,7 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin // persisted doc value terms ready to be visitable using the // VisitDocumentFieldTerms method. func (s *Segment) VisitableDocValueFields() ([]string, error) { - var rv []string + rv := make([]string, 0, len(s.fieldDvReaders)) for fieldID, field := range s.fieldsInv { if dvIter, ok := s.fieldDvReaders[uint16(fieldID)]; ok && dvIter != nil { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index f4c75fdea..888e95fb9 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -532,7 +532,7 @@ func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, } func extractDvPendingFields(requestedFields, persistedFields []string) []string { - removeMap := map[string]struct{}{} + removeMap := make(map[string]struct{}, len(persistedFields)) for _, str := range persistedFields { removeMap[str] = struct{}{} } From 4c642cffdc2af390030c87314586beeb89b80739 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 4 Apr 2018 08:59:11 -0700 Subject: [PATCH 374/728] scorch optimize TermFieldReaders via Dict/PostingsList/Iterator reuse This optimization is helpful for low-frequency term searches. The scorch IndexSnapshot struct now tracks a mutable slice of recycled TermFieldReader instances, to avoid TFR allocations. The TermFieldReader.Close() method recycles the instance back onto the IndexSnapshot. 
This commit also adds an optional 'prealloc' param to the TermDictionary.PostingsList() and PostingsList.Iterator() methods, which allows the TermFieldReader to avoid allocations by reusing recycled postings list and postings iterators. --- index/scorch/segment/empty.go | 5 +- index/scorch/segment/segment.go | 4 +- index/scorch/segment/zap/dict.go | 10 +++- index/scorch/segment/zap/merge_test.go | 4 +- index/scorch/segment/zap/posting.go | 14 ++++- index/scorch/segment/zap/segment_test.go | 32 +++++----- index/scorch/snapshot_index.go | 76 +++++++++++++++++++----- index/scorch/snapshot_index_tfr.go | 2 + index/scorch/snapshot_segment.go | 13 ++-- 9 files changed, 114 insertions(+), 46 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 17749f1bf..9a64f223e 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -60,7 +60,7 @@ func (e *EmptySegment) DecRef() error { type EmptyDictionary struct{} func (e *EmptyDictionary) PostingsList(term string, - except *roaring.Bitmap) (PostingsList, error) { + except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) { return &EmptyPostingsList{}, nil } @@ -98,7 +98,8 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { type EmptyPostingsList struct{} -func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool) PostingsIterator { +func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, + prealloc PostingsIterator) PostingsIterator { return &EmptyPostingsIterator{} } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 62b8a29b2..ff0b5124e 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -43,7 +43,7 @@ type Segment interface { } type TermDictionary interface { - PostingsList(term string, except *roaring.Bitmap) (PostingsList, error) + PostingsList(term string, except *roaring.Bitmap, prealloc PostingsList) 
(PostingsList, error) Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator @@ -58,7 +58,7 @@ type DictionaryIterator interface { } type PostingsList interface { - Iterator(includeFreq, includeNorm, includeLocations bool) PostingsIterator + Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator Size() int diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 68c062c43..3010fe6dc 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -35,8 +35,14 @@ type Dictionary struct { } // PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap) (segment.PostingsList, error) { - return d.postingsList([]byte(term), except, nil) +func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { + var preallocPL *PostingsList + pl, ok := prealloc.(*PostingsList) + if ok && pl != nil { + preallocPL = pl + } + return d.postingsList([]byte(term), except, preallocPL) } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 2675bf838..6b168c907 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -332,8 +332,8 @@ func compareSegments(a, b *Segment) string { fieldName, next.Term, aplist.Count(), bplist.Count())) } - apitr := aplist.Iterator(true, true, true) - bpitr := bplist.Iterator(true, true, true) + apitr := aplist.Iterator(true, true, true, nil) + bpitr := bplist.Iterator(true, true, true, nil) if (apitr != nil) != (bpitr != nil) { rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList.Iterator() results different: %v %v", fieldName, next.Term, apitr, bpitr)) diff --git 
a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 1f198df5e..cc8bfefee 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -133,12 +133,22 @@ func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { } // Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool) segment.PostingsIterator { +func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, + prealloc segment.PostingsIterator) segment.PostingsIterator { if p.normBits1Hit == 0 && p.postings == nil { return emptyPostingsIterator } - return p.iterator(includeFreq, includeNorm, includeLocs, nil) + var preallocPI *PostingsIterator + pi, ok := prealloc.(*PostingsIterator) + if ok && pi != nil { + preallocPI = pi + } + if preallocPI == emptyPostingsIterator { + preallocPI = nil + } + + return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) } func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 00ae1c2f6..a6698e054 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -76,7 +76,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("a", nil) + postingsList, err := dict.PostingsList("a", nil, nil) if err != nil { t.Fatal(err) } @@ -84,7 +84,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator(true, true, true) + postingsItr := postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -122,7 +122,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err = dict.PostingsList("wow", nil) + postingsList, err = dict.PostingsList("wow", nil, 
nil) if err != nil { t.Fatal(err) } @@ -130,7 +130,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator(true, true, true) + postingsItr = postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -190,7 +190,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err = dict.PostingsList("wow", nil) + postingsList, err = dict.PostingsList("wow", nil, nil) if err != nil { t.Fatal(err) } @@ -198,7 +198,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator(true, true, true) + postingsItr = postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -259,7 +259,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err = dict.PostingsList("dark", nil) + postingsList, err = dict.PostingsList("dark", nil, nil) if err != nil { t.Fatal(err) } @@ -267,7 +267,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr = postingsList.Iterator(true, true, true) + postingsItr = postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -358,7 +358,7 @@ func TestOpenMulti(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("thing", nil) + postingsList, err := dict.PostingsList("thing", nil, nil) if err != nil { t.Fatal(err) } @@ -366,7 +366,7 @@ func TestOpenMulti(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator(true, true, true) + postingsItr := postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -392,7 +392,7 @@ func TestOpenMulti(t *testing.T) { } // look for term 
'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude) + postingsListExcluding, err := dict.PostingsList("thing", exclude, nil) if err != nil { t.Fatal(err) } @@ -405,7 +405,7 @@ func TestOpenMulti(t *testing.T) { t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) } - postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) + postingsItrExcluding := postingsListExcluding.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -458,7 +458,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("thing", nil) + postingsList, err := dict.PostingsList("thing", nil, nil) if err != nil { t.Fatal(err) } @@ -466,7 +466,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItr := postingsList.Iterator(true, true, true) + postingsItr := postingsList.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } @@ -492,7 +492,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { } // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude) + postingsListExcluding, err := dict.PostingsList("thing", exclude, nil) if err != nil { t.Fatal(err) } @@ -500,7 +500,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil postings list, expected non-nil") } - postingsItrExcluding := postingsListExcluding.Iterator(true, true, true) + postingsItrExcluding := postingsListExcluding.Iterator(true, true, true, nil) if postingsItr == nil { t.Fatal("got nil iterator, expected non-nil") } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 888e95fb9..f98dab24c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -59,6 +59,9 @@ type IndexSnapshot 
struct { m sync.Mutex // Protects the fields that follow. refs int64 + + tfrsm sync.Mutex // Protects the fields that follow. + tfrs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -394,34 +397,75 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err } func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, - includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { + includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) { + rv := i.allocTermFieldReader(field) + rv.term = term + rv.field = field + rv.snapshot = i + if rv.dicts == nil { + rv.dicts = make([]segment.TermDictionary, len(i.segment)) + } + if rv.postings == nil { + rv.postings = make([]segment.PostingsList, len(i.segment)) + } + if rv.iterators == nil { + rv.iterators = make([]segment.PostingsIterator, len(i.segment)) + } + rv.segmentOffset = 0 + rv.includeFreq = includeFreq + rv.includeNorm = includeNorm + rv.includeTermVectors = includeTermVectors + rv.currPosting = nil + rv.currID = rv.currID[:0] + termStr := string(term) - rv := &IndexSnapshotTermFieldReader{ - term: term, - field: field, - snapshot: i, - postings: make([]segment.PostingsList, len(i.segment)), - iterators: make([]segment.PostingsIterator, len(i.segment)), - includeFreq: includeFreq, - includeNorm: includeNorm, - includeTermVectors: includeTermVectors, - } + for i, segment := range i.segment { - dict, err := segment.Dictionary(field) - if err != nil { - return nil, err + dict := rv.dicts[i] + if dict == nil { + dict, err = segment.Dictionary(field) + if err != nil { + return nil, err + } + rv.dicts[i] = dict } - pl, err := dict.PostingsList(termStr, nil) + pl, err := dict.PostingsList(termStr, nil, rv.postings[i]) if err != nil { return nil, err } rv.postings[i] = pl - rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors) + 
rv.iterators[i] = pl.Iterator(includeFreq, includeNorm, includeTermVectors, rv.iterators[i]) } atomic.AddUint64(&i.parent.stats.TotTermSearchersStarted, uint64(1)) return rv, nil } +func (i *IndexSnapshot) allocTermFieldReader(field string) *IndexSnapshotTermFieldReader { + i.tfrsm.Lock() + if i.tfrs != nil { + tfrs := i.tfrs[field] + last := len(tfrs) - 1 + if last >= 0 { + rv := tfrs[last] + tfrs[last] = nil + i.tfrs[field] = tfrs[:last] + i.tfrsm.Unlock() + return rv + } + } + i.tfrsm.Unlock() + return &IndexSnapshotTermFieldReader{} +} + +func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { + i.tfrsm.Lock() + if i.tfrs == nil { + i.tfrs = map[string][]*IndexSnapshotTermFieldReader{} + } + i.tfrs[tfr.field] = append(i.tfrs[tfr.field], tfr) + i.tfrsm.Unlock() +} + func docNumberToBytes(buf []byte, in uint64) []byte { if len(buf) != 8 { if cap(buf) >= 8 { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index c111d5177..11f1283ec 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -35,6 +35,7 @@ type IndexSnapshotTermFieldReader struct { term []byte field string snapshot *IndexSnapshot + dicts []segment.TermDictionary postings []segment.PostingsList iterators []segment.PostingsIterator segmentOffset int @@ -164,6 +165,7 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Close() error { if i.snapshot != nil { atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) + i.snapshot.recycleTermFieldReader(i) } return nil } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 39539364c..247131ae5 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -32,9 +32,10 @@ type SegmentDictionarySnapshot struct { d segment.TermDictionary } -func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap) 
(segment.PostingsList, error) { +func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap, + prealloc segment.PostingsList) (segment.PostingsList, error) { // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? - return s.d.PostingsList(term, s.s.deleted) + return s.d.PostingsList(term, s.s.deleted, prealloc) } func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { @@ -171,17 +172,21 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { return } + var postings segment.PostingsList + var postingsItr segment.PostingsIterator + dictItr := dict.Iterator() next, err := dictItr.Next() for err == nil && next != nil { - postings, err1 := dict.PostingsList(next.Term, nil) + var err1 error + postings, err1 = dict.PostingsList(next.Term, nil, postings) if err1 != nil { cfd.err = err1 return } cfd.size += uint64(size.SizeOfUint64) /* map key */ - postingsItr := postings.Iterator(false, false, false) + postingsItr = postings.Iterator(false, false, false, postingsItr) nextPosting, err2 := postingsItr.Next() for err2 == nil && nextPosting != nil { docNum := nextPosting.Number() From 8a2250b3ee7add726ca4e770537d0fbd24e70947 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 4 Apr 2018 15:36:55 -0700 Subject: [PATCH 375/728] scorch optimize TermFieldReader to reuse dict even more A single scorch TermDictionary can be shared by all TermFieldReaders. --- index/scorch/snapshot_index.go | 67 ++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index f98dab24c..4e80a7156 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -60,8 +60,9 @@ type IndexSnapshot struct { m sync.Mutex // Protects the fields that follow. refs int64 - tfrsm sync.Mutex // Protects the fields that follow. 
- tfrs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's + m2 sync.Mutex // Protects the fields that follow. + fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's + fieldDicts map[string][]segment.TermDictionary // keyed by field, recycled dicts } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -398,13 +399,11 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) { - rv := i.allocTermFieldReader(field) + rv, dicts := i.allocTermFieldReaderDicts(field) + rv.term = term rv.field = field rv.snapshot = i - if rv.dicts == nil { - rv.dicts = make([]segment.TermDictionary, len(i.segment)) - } if rv.postings == nil { rv.postings = make([]segment.PostingsList, len(i.segment)) } @@ -418,18 +417,22 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, rv.currPosting = nil rv.currID = rv.currID[:0] - termStr := string(term) - - for i, segment := range i.segment { - dict := rv.dicts[i] - if dict == nil { - dict, err = segment.Dictionary(field) + if dicts == nil { + dicts = make([]segment.TermDictionary, len(i.segment)) + for i, segment := range i.segment { + dict, err := segment.Dictionary(field) if err != nil { return nil, err } - rv.dicts[i] = dict + dicts[i] = dict } - pl, err := dict.PostingsList(termStr, nil, rv.postings[i]) + } + rv.dicts = dicts + + termStr := string(term) + + for i := range i.segment { + pl, err := dicts[i].PostingsList(termStr, nil, rv.postings[i]) if err != nil { return nil, err } @@ -440,30 +443,38 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return rv, nil } -func (i *IndexSnapshot) allocTermFieldReader(field string) *IndexSnapshotTermFieldReader { - i.tfrsm.Lock() - if i.tfrs != nil { - tfrs := i.tfrs[field] +func (i 
*IndexSnapshot) allocTermFieldReaderDicts(field string) ( + tfr *IndexSnapshotTermFieldReader, dicts []segment.TermDictionary) { + i.m2.Lock() + if i.fieldDicts != nil { + dicts = i.fieldDicts[field] + } + if i.fieldTFRs != nil { + tfrs := i.fieldTFRs[field] last := len(tfrs) - 1 if last >= 0 { rv := tfrs[last] tfrs[last] = nil - i.tfrs[field] = tfrs[:last] - i.tfrsm.Unlock() - return rv + i.fieldTFRs[field] = tfrs[:last] + i.m2.Unlock() + return rv, dicts } } - i.tfrsm.Unlock() - return &IndexSnapshotTermFieldReader{} + i.m2.Unlock() + return &IndexSnapshotTermFieldReader{}, dicts } func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { - i.tfrsm.Lock() - if i.tfrs == nil { - i.tfrs = map[string][]*IndexSnapshotTermFieldReader{} + i.m2.Lock() + if i.fieldTFRs == nil { + i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} + } + i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) + if i.fieldDicts == nil { + i.fieldDicts = map[string][]segment.TermDictionary{} } - i.tfrs[tfr.field] = append(i.tfrs[tfr.field], tfr) - i.tfrsm.Unlock() + i.fieldDicts[tfr.field] = tfr.dicts + i.m2.Unlock() } func docNumberToBytes(buf []byte, in uint64) []byte { From bdc868c07c2ae08fb67826e33d38edcffe5211da Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 11 Apr 2018 19:20:57 -0700 Subject: [PATCH 376/728] Remove unused parameter for docValueReader API: loadDvChunk(..) 
--- index/scorch/segment/zap/docvalues.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 13f41bd2d..220897853 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -96,8 +96,7 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, return fdvIter, nil } -func (di *docValueReader) loadDvChunk(chunkNumber, - localDocNum uint64, s *SegmentBase) error { +func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { // advance to the chunk where the docValues // reside for the given docNum destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc @@ -193,7 +192,7 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin dvIter != nil { // check if the chunk is already loaded if docInChunk != dvIter.curChunkNumber() { - err := dvIter.loadDvChunk(docInChunk, localDocNum, s) + err := dvIter.loadDvChunk(docInChunk, s) if err != nil { continue } From e1cf8b5b8df482033fbf73bf73b92d10a3414f25 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 9 Apr 2018 16:53:51 -0700 Subject: [PATCH 377/728] scorch PostingsList() takes []byte, not string --- index/scorch/segment/empty.go | 2 +- index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/dict.go | 4 ++-- index/scorch/segment/zap/segment_test.go | 16 ++++++++-------- index/scorch/snapshot_index.go | 4 +--- index/scorch/snapshot_segment.go | 4 ++-- 6 files changed, 15 insertions(+), 17 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 9a64f223e..839d4983c 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -59,7 +59,7 @@ func (e *EmptySegment) DecRef() error { type EmptyDictionary struct{} -func (e *EmptyDictionary) PostingsList(term string, +func (e *EmptyDictionary) PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) 
(PostingsList, error) { return &EmptyPostingsList{}, nil } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index ff0b5124e..a3f215804 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -43,7 +43,7 @@ type Segment interface { } type TermDictionary interface { - PostingsList(term string, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) + PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3010fe6dc..b0664534d 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -35,14 +35,14 @@ type Dictionary struct { } // PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term string, except *roaring.Bitmap, +func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, prealloc segment.PostingsList) (segment.PostingsList, error) { var preallocPL *PostingsList pl, ok := prealloc.(*PostingsList) if ok && pl != nil { preallocPL = pl } - return d.postingsList([]byte(term), except, preallocPL) + return d.postingsList(term, except, preallocPL) } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index a6698e054..27f87faff 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -76,7 +76,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("a", nil, nil) + postingsList, err := dict.PostingsList([]byte("a"), nil, nil) if err != nil { t.Fatal(err) } @@ -122,7 +122,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } 
- postingsList, err = dict.PostingsList("wow", nil, nil) + postingsList, err = dict.PostingsList([]byte("wow"), nil, nil) if err != nil { t.Fatal(err) } @@ -190,7 +190,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err = dict.PostingsList("wow", nil, nil) + postingsList, err = dict.PostingsList([]byte("wow"), nil, nil) if err != nil { t.Fatal(err) } @@ -259,7 +259,7 @@ func TestOpen(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err = dict.PostingsList("dark", nil, nil) + postingsList, err = dict.PostingsList([]byte("dark"), nil, nil) if err != nil { t.Fatal(err) } @@ -358,7 +358,7 @@ func TestOpenMulti(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("thing", nil, nil) + postingsList, err := dict.PostingsList([]byte("thing"), nil, nil) if err != nil { t.Fatal(err) } @@ -392,7 +392,7 @@ func TestOpenMulti(t *testing.T) { } // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude, nil) + postingsListExcluding, err := dict.PostingsList([]byte("thing"), exclude, nil) if err != nil { t.Fatal(err) } @@ -458,7 +458,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { t.Fatal("got nil dict, expected non-nil") } - postingsList, err := dict.PostingsList("thing", nil, nil) + postingsList, err := dict.PostingsList([]byte("thing"), nil, nil) if err != nil { t.Fatal(err) } @@ -492,7 +492,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { } // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList("thing", exclude, nil) + postingsListExcluding, err := dict.PostingsList([]byte("thing"), exclude, nil) if err != nil { t.Fatal(err) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4e80a7156..2664fe425 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -429,10 +429,8 @@ func (i *IndexSnapshot) 
TermFieldReader(term []byte, field string, includeFreq, } rv.dicts = dicts - termStr := string(term) - for i := range i.segment { - pl, err := dicts[i].PostingsList(termStr, nil, rv.postings[i]) + pl, err := dicts[i].PostingsList(term, nil, rv.postings[i]) if err != nil { return nil, err } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 247131ae5..44aafa523 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -32,7 +32,7 @@ type SegmentDictionarySnapshot struct { d segment.TermDictionary } -func (s *SegmentDictionarySnapshot) PostingsList(term string, except *roaring.Bitmap, +func (s *SegmentDictionarySnapshot) PostingsList(term []byte, except *roaring.Bitmap, prealloc segment.PostingsList) (segment.PostingsList, error) { // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? return s.d.PostingsList(term, s.s.deleted, prealloc) @@ -179,7 +179,7 @@ func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { next, err := dictItr.Next() for err == nil && next != nil { var err1 error - postings, err1 = dict.PostingsList(next.Term, nil, postings) + postings, err1 = dict.PostingsList([]byte(next.Term), nil, postings) if err1 != nil { cfd.err = err1 return From 293403a012f26f26956ac75e080d1bab7bc3c707 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 13 Apr 2018 09:42:16 -0400 Subject: [PATCH 378/728] improve scorch Advance implementation advance will jump directly to the expected segment then it will skip directly to the expectec chunk --- index/scorch/reader_test.go | 8 +++--- index/scorch/segment/empty.go | 4 +++ index/scorch/segment/segment.go | 6 +++++ index/scorch/segment/zap/posting.go | 29 +++++++++++++++++++--- index/scorch/snapshot_index_tfr.go | 38 ++++++++++++++++++++--------- 5 files changed, 67 insertions(+), 18 deletions(-) diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index 4eb9b5fb9..8414cbdc1 100644 --- 
a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -15,6 +15,7 @@ package scorch import ( + "encoding/binary" "reflect" "testing" @@ -171,9 +172,10 @@ func TestIndexReader(t *testing.T) { if !match.ID.Equals(internalID2) { t.Errorf("Expected ID '2', got '%s'", match.ID) } - // NOTE: no point in changing this to internal id 3, there is no id 3 - // the test is looking for something that doens't exist and this doesn't - match, err = reader.Advance(index.IndexInternalID("3"), nil) + // have to manually construct bogus id, because it doesn't exist + internalID3 := make([]byte, 8) + binary.BigEndian.PutUint64(internalID3, 3) + match, err = reader.Advance(index.IndexInternalID(internalID3), nil) if err != nil { t.Errorf("unexpected error: %v", err) } diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 9a64f223e..0350267b4 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -96,6 +96,10 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } +func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { + return nil, nil +} + type EmptyPostingsList struct{} func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index ff0b5124e..3b4a263a7 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -77,6 +77,12 @@ type PostingsIterator interface { // allocations. Next() (Posting, error) + // Advance will return the posting with the specified doc number + // or if there is no such posting, the next posting. + // Callers MUST NOT attempt to pass a docNum that is less than or + // equal to the currently visited posting doc Num. 
+ Advance(docNum uint64) (Posting, error) + Size() int } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index cc8bfefee..f709920fc 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -504,7 +504,18 @@ func (i *PostingsIterator) readLocation(l *Location) error { // Next returns the next posting on the postings list, or nil at the end func (i *PostingsIterator) Next() (segment.Posting, error) { - docNum, exists, err := i.nextDocNum() + return i.nextAtOrAfter(0) +} + +// Advance returns the posting at the specified docNum or it is not present +// the next posting, or if the end is reached, nil +func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { + return i.nextAtOrAfter(docNum) +} + +// Next returns the next posting on the postings list, or nil at the end +func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { + docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) if err != nil || !exists { return nil, err } @@ -557,7 +568,7 @@ var freqHasLocs1Hit = encodeFreqHasLocs(1, false) func (i *PostingsIterator) nextBytes() ( docNumOut uint64, freq uint64, normBits uint64, bytesFreqNorm []byte, bytesLoc []byte, err error) { - docNum, exists, err := i.nextDocNum() + docNum, exists, err := i.nextDocNumAtOrAfter(0) if err != nil || !exists { return 0, 0, 0, nil, nil, err } @@ -602,11 +613,16 @@ func (i *PostingsIterator) nextBytes() ( // nextDocNum returns the next docNum on the postings list, and also // sets up the currChunk / loc related fields of the iterator. 
-func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { +func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { if i.normBits1Hit != 0 { if i.docNum1Hit == docNum1HitFinished { return 0, false, nil } + if i.docNum1Hit < atOrAfter { + // advanced past our 1-hit + i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum + return 0, false, nil + } docNum := i.docNum1Hit i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum return docNum, true, nil @@ -617,6 +633,13 @@ func (i *PostingsIterator) nextDocNum() (uint64, bool, error) { } n := i.Actual.Next() + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + } + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } allN := i.all.Next() nChunk := n / i.postings.sb.chunkFactor diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 11f1283ec..89af3be4c 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -16,6 +16,7 @@ package scorch import ( "bytes" + "fmt" "reflect" "sync/atomic" @@ -134,24 +135,37 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo } *i = *(i2.(*IndexSnapshotTermFieldReader)) } - // FIXME do something better - next, err := i.Next(preAlloced) + num, err := docInternalToNumber(ID) + if err != nil { + return nil, fmt.Errorf("error converting to doc number % x - %v", ID, err) + } + segIndex, ldocNum := i.snapshot.segmentIndexAndLocalDocNumFromGlobal(num) + if segIndex >= len(i.snapshot.segment) { + return nil, fmt.Errorf("computed segment index %d out of bounds %d", + segIndex, len(i.snapshot.segment)) + } + // skip directly to the target segment + i.segmentOffset = segIndex + next, err := i.iterators[i.segmentOffset].Advance(ldocNum) if err != nil { return nil, err } if next == nil { - return nil, nil + // we jumped directly to the segment that should have contained it + // but it wasn't there, so 
reuse Next() which should correctly + // get the next hit after it (we moved i.segmentOffset) + return i.Next(preAlloced) } - for bytes.Compare(next.ID, ID) < 0 { - next, err = i.Next(preAlloced) - if err != nil { - return nil, err - } - if next == nil { - break - } + + if preAlloced == nil { + preAlloced = &index.TermFieldDoc{} } - return next, nil + preAlloced.ID = docNumberToBytes(preAlloced.ID, next.Number()+ + i.snapshot.offsets[segIndex]) + i.postingToTermFieldDoc(next, preAlloced) + i.currID = preAlloced.ID + i.currPosting = next + return preAlloced, nil } func (i *IndexSnapshotTermFieldReader) Count() uint64 { From a4954aa8577f7da5e543f3c101b2f516eafeea05 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 13 Apr 2018 08:38:56 -0700 Subject: [PATCH 379/728] fixes #895 - scorch zap file format updated w/ # bytes of locations NOTE: This is a zap file format change. In the case of a composite field (i.e., _all field), the component fields might have differing configurations for IncludeTermVectors. This can mean the freq can sometimes be > numberOfLocations, where the previous code incorrectly assumed freq == numberOfLocations. By encoding the # of bytes used for locations data, the loops performing readLocation() will be able to read the correct # of locations. And, encoding the # of bytes used for locations data instead of the count of locations allows for an optimization of no longer reading unneeded locations data when skipping ahead. This fixes issue https://github.com/blevesearch/bleve/issues/895 reported by @xeizmendi. 
--- index/scorch/scorch_test.go | 55 +++++++++++++++++++++++++++++ index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/merge.go | 21 ++++++++--- index/scorch/segment/zap/new.go | 51 +++++++++++++++++++++----- index/scorch/segment/zap/posting.go | 52 +++++++++++++++++++-------- 5 files changed, 152 insertions(+), 29 deletions(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 3be52cb1f..cf784755d 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -25,9 +25,12 @@ import ( "time" "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/analyzer/keyword" + "github.com/blevesearch/bleve/analysis/analyzer/standard" regexpTokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/mapping" ) func DestroyTest() error { @@ -1707,3 +1710,55 @@ func TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions(t *testing.T) { } } + +func TestAllFieldWithDifferentTermVectorsEnabled(t *testing.T) { + // Based on https://github.com/blevesearch/bleve/issues/895 from xeizmendi + mp := mapping.NewIndexMapping() + + keywordMapping := mapping.NewTextFieldMapping() + keywordMapping.Analyzer = keyword.Name + keywordMapping.IncludeTermVectors = false + keywordMapping.IncludeInAll = true + + textMapping := mapping.NewTextFieldMapping() + textMapping.Analyzer = standard.Name + textMapping.IncludeTermVectors = true + textMapping.IncludeInAll = true + + docMapping := mapping.NewDocumentStaticMapping() + docMapping.AddFieldMappingsAt("keyword", keywordMapping) + docMapping.AddFieldMappingsAt("text", textMapping) + + mp.DefaultMapping = docMapping + + _ = os.RemoveAll(testConfig["path"].(string)) + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch("storeName", testConfig, analysisQueue) + if err != nil { + log.Fatalln(err) + } + err = idx.Open() + if err != nil { + 
t.Errorf("error opening index: %v", err) + } + defer func() { + _ = idx.Close() + _ = os.RemoveAll(testConfig["path"].(string)) + }() + + data := map[string]string{ + "keyword": "something", + "text": "A sentence that includes something within.", + } + + doc := document.NewDocument("1") + err = mp.MapDocument(doc, data) + if err != nil { + t.Errorf("error mapping doc: %v", err) + } + + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } +} diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 2c261a3eb..ae460cef1 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -20,7 +20,7 @@ import ( "os" ) -const Version uint32 = 9 +const Version uint32 = 10 const Type string = "zap" diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index f2a881856..9a7041b07 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -427,17 +427,30 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } if len(locs) > 0 { + numBytesLocs := 0 for _, loc := range locs { - if cap(bufLoc) < 5+len(loc.ArrayPositions()) { - bufLoc = make([]uint64, 0, 5+len(loc.ArrayPositions())) + ap := loc.ArrayPositions() + numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), + loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) + } + + err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) + if err != nil { + return 0, 0, 0, nil, err + } + + for _, loc := range locs { + ap := loc.ArrayPositions() + if cap(bufLoc) < 5+len(ap) { + bufLoc = make([]uint64, 0, 5+len(ap)) } args := bufLoc[0:5] args[0] = uint64(fieldsMap[loc.Field()] - 1) args[1] = loc.Pos() args[2] = loc.Start() args[3] = loc.End() - args[4] = uint64(len(loc.ArrayPositions())) - args = append(args, loc.ArrayPositions()...) + args[4] = uint64(len(ap)) + args = append(args, ap...) err = locEncoder.Add(hitNewDocNum, args...) 
if err != nil { return 0, 0, 0, nil, err diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index da24988ae..4d56b0694 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -190,7 +190,7 @@ type interimStoredField struct { type interimFreqNorm struct { freq uint64 norm float32 - hasLocs bool + numLocs int } type interimLoc struct { @@ -446,7 +446,7 @@ func (s *interim) processDocument(docNum uint64, interimFreqNorm{ freq: uint64(tf.Frequency()), norm: norm, - hasLocs: len(tf.Locations) > 0, + numLocs: len(tf.Locations), }) if len(tf.Locations) > 0 { @@ -632,18 +632,28 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err freqNorm := freqNorms[freqNormOffset] err = tfEncoder.Add(docNum, - encodeFreqHasLocs(freqNorm.freq, freqNorm.hasLocs), + encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), uint64(math.Float32bits(freqNorm.norm))) if err != nil { return 0, nil, err } - for i := uint64(0); i < freqNorm.freq; i++ { - if len(locs) > 0 { - loc := locs[locOffset] + if freqNorm.numLocs > 0 { + numBytesLocs := 0 + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + numBytesLocs += totalUvarintBytes( + uint64(loc.fieldID), loc.pos, loc.start, loc.end, + uint64(len(loc.arrayposs)), loc.arrayposs) + } + + err = locEncoder.Add(docNum, uint64(numBytesLocs)) + if err != nil { + return 0, nil, err + } - err = locEncoder.Add(docNum, uint64(loc.fieldID), - loc.pos, loc.start, loc.end, + for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { + err = locEncoder.Add(docNum, + uint64(loc.fieldID), loc.pos, loc.start, loc.end, uint64(len(loc.arrayposs))) if err != nil { return 0, nil, err @@ -655,7 +665,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err } } - locOffset++ + locOffset += freqNorm.numLocs } freqNormOffset++ @@ -775,3 +785,26 @@ func encodeFieldType(f document.Field) byte { } return fieldType } + +// returns the 
total # of bytes needed to encode the given uint64's +// into binary.PutUVarint() encoding +func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { + n = numUvarintBytes(a) + n += numUvarintBytes(b) + n += numUvarintBytes(c) + n += numUvarintBytes(d) + n += numUvarintBytes(e) + for _, v := range more { + n += numUvarintBytes(v) + } + return n +} + +// returns # of bytes needed to encode x in binary.PutUvarint() encoding +func numUvarintBytes(x uint64) (n int) { + for x >= 0x80 { + x >>= 7 + n++ + } + return n + 1 +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index f709920fc..7bfbe0bef 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -18,6 +18,7 @@ import ( "bytes" "encoding/binary" "fmt" + "io" "math" "reflect" @@ -539,7 +540,10 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err rv.norm = math.Float32frombits(uint32(normBits)) if i.includeLocs && hasLocs { - // read off 'freq' locations, into reused slices + // prepare locations into reused slices, where we assume + // rv.freq >= "number of locs", since in a composite field, + // some component fields might have their IncludeTermVector + // flags disabled while other component fields are enabled if cap(i.nextLocs) >= int(rv.freq) { i.nextLocs = i.nextLocs[0:rv.freq] } else { @@ -548,13 +552,22 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err if cap(i.nextSegmentLocs) < int(rv.freq) { i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) } - rv.locs = i.nextSegmentLocs[0:rv.freq] - for j := 0; j < int(rv.freq); j++ { + rv.locs = i.nextSegmentLocs[:0] + + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + j := 0 + startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader + for startBytesRemaining-i.locReader.Len() < 
int(numLocsBytes) { err := i.readLocation(&i.nextLocs[j]) if err != nil { return nil, err } - rv.locs[j] = &i.nextLocs[j] + rv.locs = append(rv.locs, &i.nextLocs[j]) + j++ } } @@ -597,11 +610,16 @@ func (i *PostingsIterator) nextBytes() ( if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() - for j := uint64(0); j < freq; j++ { - err := i.readLocation(nil) - if err != nil { - return 0, 0, 0, nil, nil, err - } + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return 0, 0, 0, nil, nil, + fmt.Errorf("error reading location nextBytes numLocs: %v", err) + } + + // skip over all the location bytes + _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) + if err != nil { + return 0, 0, 0, nil, nil, err } endLoc := len(i.currChunkLoc) - i.locReader.Len() @@ -659,17 +677,21 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, } // read off freq/offsets even though we don't care about them - freq, _, hasLocs, err := i.readFreqNormHasLocs() + _, _, hasLocs, err := i.readFreqNormHasLocs() if err != nil { return 0, false, err } if i.includeLocs && hasLocs { - for j := 0; j < int(freq); j++ { - err := i.readLocation(nil) - if err != nil { - return 0, false, err - } + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return 0, false, fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) + if err != nil { + return 0, false, err } } } From dc6c1f80c138263ea2c22bf81f745974e5bb7b80 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 10 Apr 2018 21:07:15 -0700 Subject: [PATCH 380/728] [1/3] Optimization for reducing merger overhead from doc-values Part1: Changing the format in which chunkedContentCoder writes, essentially reversing the way data is written. *This commit changes the zap file format, bumping up the version. 
--- index/scorch/segment/zap/build.go | 2 +- index/scorch/segment/zap/contentcoder.go | 40 +++++++++++++++---- index/scorch/segment/zap/contentcoder_test.go | 13 ++++-- index/scorch/segment/zap/docvalues.go | 29 +++++++++----- index/scorch/segment/zap/merge.go | 19 ++++++--- index/scorch/segment/zap/new.go | 19 ++++++--- index/scorch/segment/zap/segment.go | 14 +++++-- 7 files changed, 100 insertions(+), 36 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index ae460cef1..91bfd4e24 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -20,7 +20,7 @@ import ( "os" ) -const Version uint32 = 10 +const Version uint32 = 11 const Type string = "zap" diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 2148d1d45..6630a59ce 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -146,33 +146,59 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { } // Write commits all the encoded chunked contents to the provided writer. +// +// | ..... data ..... 
| chunk offsets (varints) +// | position of chunk offsets (uint64) | number of offsets (uint64) | +// func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { var tw int - buf := make([]byte, binary.MaxVarintLen64) - // write out the number of chunks - n := binary.PutUvarint(buf, uint64(len(c.chunkLens))) - nw, err := w.Write(buf[:n]) + + // write out the data section first + nw, err := w.Write(c.final) tw += nw if err != nil { return tw, err } + chunkOffsetsStart := uint64(tw) + + if cap(c.final) < binary.MaxVarintLen64 { + c.final = make([]byte, binary.MaxVarintLen64) + } else { + c.final = c.final[0:binary.MaxVarintLen64] + } chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) // write out the chunk offsets for _, chunkOffset := range chunkOffsets { - n := binary.PutUvarint(buf, chunkOffset) - nw, err = w.Write(buf[:n]) + n := binary.PutUvarint(c.final, chunkOffset) + nw, err = w.Write(c.final[:n]) tw += nw if err != nil { return tw, err } } - // write out the data + + chunkOffsetsLen := uint64(tw) - chunkOffsetsStart + + c.final = c.final[0:8] + // write out the length of chunk offsets + binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) nw, err = w.Write(c.final) tw += nw if err != nil { return tw, err } + + // write out the number of chunks + binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) + nw, err = w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } + + c.final = c.final[:0] + return tw, nil } diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index ff26138a6..4392a4046 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -35,7 +35,10 @@ func TestChunkContentCoder(t *testing.T) { docNums: []uint64{0}, vals: [][]byte{[]byte("bleve")}, // 1 chunk, chunk-0 length 11(b), value - expected: string([]byte{0x1, 0xa, 0x1, 0x0, 0x05, 0x05, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65}), + expected: string([]byte{0x1, 
0x0, 0x5, 0x5, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65, + 0xa, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}), }, { maxDocNum: 1, @@ -46,9 +49,11 @@ func TestChunkContentCoder(t *testing.T) { []byte("scorch"), }, - expected: string([]byte{0x02, 0x0b, 0x16, 0x01, 0x00, 0x06, 0x06, 0x14, - 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x01, 0x01, 0x06, 0x06, - 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68}), + expected: string([]byte{0x1, 0x0, 0x6, 0x6, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, + 0x65, 0x1, 0x1, 0x6, 0x6, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68, + 0xb, 0x16, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}), }, } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 220897853..c87d920b5 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -62,29 +62,34 @@ func (di *docValueReader) curChunkNumber() uint64 { } func (s *SegmentBase) loadFieldDocValueReader(field string, - fieldDvLoc uint64) (*docValueReader, error) { + fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { // get the docValue offset for the given fields - if fieldDvLoc == fieldNotUninverted { + if fieldDvLocStart == fieldNotUninverted { return nil, fmt.Errorf("loadFieldDocValueReader: "+ "no docValues found for field: %s", field) } - // read the number of chunks, chunk lengths - var offset, loc uint64 - numChunks, read := binary.Uvarint(s.mem[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) - if read <= 0 { - return nil, fmt.Errorf("failed to read the field "+ - "doc values for field %s", field) + // read the number of chunks, and chunk offsets position + var numChunks, chunkOffsetsPosition uint64 + + if fieldDvLocEnd-fieldDvLocStart > 16 { + numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) + 
// acquire position of chunk offsets + chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen } - offset += uint64(read) fdvIter := &docValueReader{ curChunkNum: math.MaxUint64, field: field, chunkOffsets: make([]uint64, int(numChunks)), } + + // read the chunk offsets + var offset uint64 for i := 0; i < int(numChunks); i++ { - loc, read = binary.Uvarint(s.mem[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) + loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) if read <= 0 { return nil, fmt.Errorf("corrupted chunk offset during segment load") } @@ -92,7 +97,9 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, offset += uint64(read) } - fdvIter.dvDataLoc = fieldDvLoc + offset + // set the data offset + fdvIter.dvDataLoc = fieldDvLocStart + return fdvIter, nil } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 9a7041b07..fb64a5a26 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -169,7 +169,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var postItr *PostingsIterator rv := make([]uint64, len(fieldsInv)) - fieldDvLocs := make([]uint64, len(fieldsInv)) + fieldDvLocsStart := make([]uint64, len(fieldsInv)) + fieldDvLocsEnd := make([]uint64, len(fieldsInv)) tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) @@ -370,8 +371,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err } - // get the field doc value offset - fieldDvLocs[fieldID] = uint64(w.Count()) + // get the field doc value offset (start) + fieldDvLocsStart[fieldID] = uint64(w.Count()) // persist the doc value details for this field _, err = fdvEncoder.Write(w) @@ -379,6 +380,9 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err 
} + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + // reset vellum buffer and vellum builder vellumBuf.Reset() err = newVellum.Reset(&vellumBuf) @@ -390,12 +394,17 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, fieldDvLocsOffset := uint64(w.Count()) buf := bufMaxVarintLen64 - for _, offset := range fieldDvLocs { - n := binary.PutUvarint(buf, uint64(offset)) + for i := 0; i < len(fieldDvLocsStart); i++ { + n := binary.PutUvarint(buf, fieldDvLocsStart[i]) _, err := w.Write(buf[:n]) if err != nil { return nil, 0, err } + n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) + _, err = w.Write(buf[:n]) + if err != nil { + return nil, 0, err + } } return rv, fieldDvLocsOffset, nil diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 4d56b0694..7b022474e 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -585,7 +585,8 @@ func (s *interim) writeStoredFields() ( func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { dictOffsets = make([]uint64, len(s.FieldsInv)) - fdvOffsets := make([]uint64, len(s.FieldsInv)) + fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) + fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) buf := s.grabBuf(binary.MaxVarintLen64) @@ -741,27 +742,35 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err return 0, nil, err } - fdvOffsets[fieldID] = uint64(s.w.Count()) + fdvOffsetsStart[fieldID] = uint64(s.w.Count()) _, err = fdvEncoder.Write(s.w) if err != nil { return 0, nil, err } + fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) + fdvEncoder.Reset() } else { - fdvOffsets[fieldID] = fieldNotUninverted + fdvOffsetsStart[fieldID] = fieldNotUninverted + fdvOffsetsEnd[fieldID] = fieldNotUninverted } } fdvIndexOffset = uint64(s.w.Count()) - for _, fdvOffset := range fdvOffsets { - n := binary.PutUvarint(buf, fdvOffset) + for i := 0; i < len(fdvOffsetsStart); i++ { + 
n := binary.PutUvarint(buf, fdvOffsetsStart[i]) _, err := s.w.Write(buf[:n]) if err != nil { return 0, nil, err } + n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) + _, err = s.w.Write(buf[:n]) + if err != nil { + return 0, nil, err + } } return fdvIndexOffset, dictOffsets, nil diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index f9549416a..3bae48ddb 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -487,12 +487,20 @@ func (s *SegmentBase) loadDvReaders() error { var read uint64 for fieldID, field := range s.fieldsInv { - fieldLoc, n := binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + var fieldLocStart, fieldLocEnd uint64 + var n int + fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) if n <= 0 { - return fmt.Errorf("loadDvReaders: failed to read the docvalue offsets for field %d", fieldID) + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) } - s.fieldDvReaders[uint16(fieldID)], _ = s.loadFieldDocValueReader(field, fieldLoc) read += uint64(n) + fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) + } + read += uint64(n) + + s.fieldDvReaders[uint16(fieldID)], _ = s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) } return nil } From e3f2a721e979911bc004b40cf4a29e4a70c4b98c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 11 Apr 2018 11:29:59 -0700 Subject: [PATCH 381/728] [2/3] Optimization for reducing merger overhead from doc-values Part2: Introduce a new flag to the chunkedContentCoder to write out to the output buffer in chunks. 
*Added unit tests to compare behavior --- index/scorch/segment/zap/contentcoder.go | 53 +++++++++----- index/scorch/segment/zap/contentcoder_test.go | 69 +++++++++++++++---- index/scorch/segment/zap/merge.go | 10 +-- index/scorch/segment/zap/new.go | 4 +- 4 files changed, 99 insertions(+), 37 deletions(-) diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go index 6630a59ce..b9ff8179b 100644 --- a/index/scorch/segment/zap/contentcoder.go +++ b/index/scorch/segment/zap/contentcoder.go @@ -34,10 +34,14 @@ var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} type chunkedContentCoder struct { - final []byte - chunkSize uint64 - currChunk uint64 - chunkLens []uint64 + final []byte + chunkSize uint64 + currChunk uint64 + chunkLens []uint64 + + w io.Writer + progressiveWrite bool + chunkMetaBuf bytes.Buffer chunkBuf bytes.Buffer @@ -55,13 +59,15 @@ type MetaData struct { // newChunkedContentCoder returns a new chunk content coder which // packs data into chunks based on the provided chunkSize -func newChunkedContentCoder(chunkSize uint64, - maxDocNum uint64) *chunkedContentCoder { +func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, + w io.Writer, progressiveWrite bool) *chunkedContentCoder { total := maxDocNum/chunkSize + 1 rv := &chunkedContentCoder{ - chunkSize: chunkSize, - chunkLens: make([]uint64, total), - chunkMeta: make([]MetaData, 0, total), + chunkSize: chunkSize, + chunkLens: make([]uint64, total), + chunkMeta: make([]MetaData, 0, total), + w: w, + progressiveWrite: progressiveWrite, } return rv @@ -111,6 +117,15 @@ func (c *chunkedContentCoder) flushContents() error { c.final = append(c.final, c.compressed...) 
c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) + + if c.progressiveWrite { + _, err := c.w.Write(c.final) + if err != nil { + return err + } + c.final = c.final[:0] + } + return nil } @@ -150,14 +165,16 @@ func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { // | ..... data ..... | chunk offsets (varints) // | position of chunk offsets (uint64) | number of offsets (uint64) | // -func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { +func (c *chunkedContentCoder) Write() (int, error) { var tw int - // write out the data section first - nw, err := w.Write(c.final) - tw += nw - if err != nil { - return tw, err + if c.final != nil { + // write out the data section first + nw, err := c.w.Write(c.final) + tw += nw + if err != nil { + return tw, err + } } chunkOffsetsStart := uint64(tw) @@ -171,7 +188,7 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { // write out the chunk offsets for _, chunkOffset := range chunkOffsets { n := binary.PutUvarint(c.final, chunkOffset) - nw, err = w.Write(c.final[:n]) + nw, err := c.w.Write(c.final[:n]) tw += nw if err != nil { return tw, err @@ -183,7 +200,7 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { c.final = c.final[0:8] // write out the length of chunk offsets binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) - nw, err = w.Write(c.final) + nw, err := c.w.Write(c.final) tw += nw if err != nil { return tw, err @@ -191,7 +208,7 @@ func (c *chunkedContentCoder) Write(w io.Writer) (int, error) { // write out the number of chunks binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) - nw, err = w.Write(c.final) + nw, err = c.w.Write(c.final) tw += nw if err != nil { return tw, err diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go index 4392a4046..62ffde413 100644 --- a/index/scorch/segment/zap/contentcoder_test.go +++ b/index/scorch/segment/zap/contentcoder_test.go @@ -16,18 +16,17 @@ 
package zap import ( "bytes" - "reflect" "testing" ) -func TestChunkContentCoder(t *testing.T) { +func TestChunkedContentCoder(t *testing.T) { tests := []struct { maxDocNum uint64 chunkSize uint64 docNums []uint64 vals [][]byte - expected string + expected []byte }{ { maxDocNum: 0, @@ -35,10 +34,10 @@ func TestChunkContentCoder(t *testing.T) { docNums: []uint64{0}, vals: [][]byte{[]byte("bleve")}, // 1 chunk, chunk-0 length 11(b), value - expected: string([]byte{0x1, 0x0, 0x5, 0x5, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65, + expected: []byte{0x1, 0x0, 0x5, 0x5, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65, 0xa, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}), + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, }, { maxDocNum: 1, @@ -49,17 +48,18 @@ func TestChunkContentCoder(t *testing.T) { []byte("scorch"), }, - expected: string([]byte{0x1, 0x0, 0x6, 0x6, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, + expected: []byte{0x1, 0x0, 0x6, 0x6, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, 0x65, 0x1, 0x1, 0x6, 0x6, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68, 0xb, 0x16, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}), + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}, }, } for _, test := range tests { - cic := newChunkedContentCoder(test.chunkSize, test.maxDocNum) + var actual bytes.Buffer + cic := newChunkedContentCoder(test.chunkSize, test.maxDocNum, &actual, false) for i, docNum := range test.docNums { err := cic.Add(docNum, test.vals[i]) if err != nil { @@ -67,14 +67,59 @@ func TestChunkContentCoder(t *testing.T) { } } _ = cic.Close() - var actual bytes.Buffer - _, err := cic.Write(&actual) + _, err := cic.Write() if err != nil { t.Fatalf("error writing: %v", err) } - if !reflect.DeepEqual(test.expected, string(actual.Bytes())) { - t.Errorf("got:%s, expected:%s", string(actual.Bytes()), test.expected) + if !bytes.Equal(test.expected, actual.Bytes()) { + t.Errorf("got:%s, expected:%s", string(actual.Bytes()), string(test.expected)) } } } + +func 
TestChunkedContentCoders(t *testing.T) { + maxDocNum := uint64(5) + chunkSize := uint64(1) + docNums := []uint64{0, 1, 2, 3, 4, 5} + vals := [][]byte{ + []byte("scorch"), + []byte("does"), + []byte("better"), + []byte("than"), + []byte("upside"), + []byte("down"), + } + + var actual1, actual2 bytes.Buffer + // chunkedContentCoder that writes out at the end + cic1 := newChunkedContentCoder(chunkSize, maxDocNum, &actual1, false) + // chunkedContentCoder that writes out in chunks + cic2 := newChunkedContentCoder(chunkSize, maxDocNum, &actual2, true) + + for i, docNum := range docNums { + err := cic1.Add(docNum, vals[i]) + if err != nil { + t.Fatalf("error adding to intcoder: %v", err) + } + err = cic2.Add(docNum, vals[i]) + if err != nil { + t.Fatalf("error adding to intcoder: %v", err) + } + } + _ = cic1.Close() + _ = cic2.Close() + + _, err := cic1.Write() + if err != nil { + t.Fatalf("error writing: %v", err) + } + _, err = cic2.Write() + if err != nil { + t.Fatalf("error writing: %v", err) + } + + if !bytes.Equal(actual1.Bytes(), actual2.Bytes()) { + t.Errorf("%s != %s", string(actual1.Bytes()), string(actual2.Bytes())) + } +} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index fb64a5a26..cd3e76199 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -356,8 +356,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, rv[fieldID] = dictOffset + // get the field doc value offset (start) + fieldDvLocsStart[fieldID] = uint64(w.Count()) + // update the field doc values - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1) + fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) for docNum, docTerms := range docTermMap { if len(docTerms) > 0 { err = fdvEncoder.Add(uint64(docNum), docTerms) @@ -371,11 +374,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, return nil, 0, err } - // get the 
field doc value offset (start) - fieldDvLocsStart[fieldID] = uint64(w.Count()) - // persist the doc value details for this field - _, err = fdvEncoder.Write(w) + _, err = fdvEncoder.Write() if err != nil { return nil, 0, err } diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 7b022474e..7471cf277 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -592,7 +592,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) - fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) + fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) var docTermMap [][]byte @@ -744,7 +744,7 @@ func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err fdvOffsetsStart[fieldID] = uint64(s.w.Count()) - _, err = fdvEncoder.Write(s.w) + _, err = fdvEncoder.Write() if err != nil { return 0, nil, err } From 3079fdc1a715643b7cf597cd61bf8a2ef6cfcd6a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 19 Apr 2018 07:46:08 -0700 Subject: [PATCH 382/728] fixes #893 - clone zap docValueReader before using --- index/scorch/segment/zap/docvalues.go | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index c87d920b5..bc5d1cac6 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -53,6 +53,22 @@ func (di *docValueReader) size() int { len(di.curChunkData) } +func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { + if rv == nil { + rv = &docValueReader{} + } + + rv.field = di.field + rv.curChunkNum = math.MaxUint64 + rv.chunkOffsets = di.chunkOffsets // immutable, so it's 
sharable + rv.dvDataLoc = di.dvDataLoc + rv.curChunkHeader = nil + rv.curChunkData = nil + rv.uncompressed = nil + + return rv +} + func (di *docValueReader) fieldName() string { return di.field } @@ -186,6 +202,7 @@ func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { // DocumentFieldTermVisitable interface func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, visitor index.DocumentFieldTermVisitor) error { + var dvIterClone *docValueReader fieldIDPlus1 := uint16(0) ok := true for _, field := range fields { @@ -197,15 +214,17 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin if dvIter, exists := s.fieldDvReaders[fieldIDPlus1-1]; exists && dvIter != nil { + dvIterClone = dvIter.cloneInto(dvIterClone) + // check if the chunk is already loaded - if docInChunk != dvIter.curChunkNumber() { - err := dvIter.loadDvChunk(docInChunk, s) + if docInChunk != dvIterClone.curChunkNumber() { + err := dvIterClone.loadDvChunk(docInChunk, s) if err != nil { continue } } - _ = dvIter.visitDocValues(localDocNum, visitor) + _ = dvIterClone.visitDocValues(localDocNum, visitor) } } return nil From 13ddd75be803d5b6c493c86b603f61c84c09f479 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 18 Apr 2018 17:05:18 -0700 Subject: [PATCH 383/728] [3/3] Optimization for reducing merger overhead from doc-values Part3: Getting rid of docTermMap during the merge operation, instead fetch all valid docNums and their corresponding values from the segmentBases via docValueReaders. 
--- index/scorch/segment/zap/docvalues.go | 31 +++++++++- index/scorch/segment/zap/merge.go | 83 ++++++++++++++------------- 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index bc5d1cac6..c73c01215 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -34,10 +34,11 @@ func init() { reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) } +type docNumTermsVisitor func(docNum uint64, terms []byte) error + type docValueReader struct { field string curChunkNum uint64 - numChunks uint64 chunkOffsets []uint64 dvDataLoc uint64 curChunkHeader []MetaData @@ -151,6 +152,34 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error return nil } +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index cd3e76199..7f8f23f77 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -175,12 +175,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder 
:= newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - // docTermMap is keyed by docNum, where the array impl provides - // better memory usage behavior than a sparse-friendlier hashmap - // for when docs have much structural similarity (i.e., every doc - // has a given field) - var docTermMap [][]byte - var vellumBuf bytes.Buffer newVellum, err := vellum.New(&vellumBuf, nil) if err != nil { @@ -198,6 +192,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var dicts []*Dictionary var itrs []vellum.Iterator + var segmentsInFocus []*SegmentBase + for segmentI, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { @@ -217,19 +213,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } dicts = append(dicts, dict) itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) } } } - if uint64(cap(docTermMap)) < newSegDocCount { - docTermMap = make([][]byte, newSegDocCount) - } else { - docTermMap = docTermMap[0:newSegDocCount] - for docNum := range docTermMap { // reset the docTermMap - docTermMap[docNum] = docTermMap[docNum][:0] - } - } - var prevTerm []byte newRoaring.Clear() @@ -309,11 +297,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // can optimize by copying freq/norm/loc bytes directly lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, docTermMap) + tfEncoder, locEncoder) } else { lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( fieldsMap, term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, docTermMap, bufLoc) + tfEncoder, locEncoder, bufLoc) } if err != nil { return nil, 0, err @@ -361,27 +349,49 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // update the field doc values fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) - 
for docNum, docTerms := range docTermMap { - if len(docTerms) > 0 { - err = fdvEncoder.Add(uint64(docNum), docTerms) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) if err != nil { return nil, 0, err } } } - err = fdvEncoder.Close() - if err != nil { - return nil, 0, err - } - // persist the doc value details for this field - _, err = fdvEncoder.Write() - if err != nil { - return nil, 0, err - } + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } - // get the field doc value offset (end) - fieldDvLocsEnd[fieldID] = uint64(w.Count()) + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } // reset vellum buffer and vellum builder vellumBuf.Reset() @@ -412,8 +422,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, - bufLoc []uint64) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( lastDocNum uint64, lastFreq uint64, lastNorm 
uint64, bufLocOut []uint64, err error) { next, err := postItr.Next() for next != nil && err == nil { @@ -467,9 +476,6 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } } - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm @@ -482,7 +488,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := postItr.nextBytes() @@ -505,9 +511,6 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, } } - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm From 542efc16c6d8eb59685cd24ed302969b23a8d17d Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 19 Apr 2018 10:18:40 -0400 Subject: [PATCH 384/728] alternate disjunction searcher impl using heap introduce an alternate disjunction searcher implementation that uses a heap to track searchers. when you have a disjunction of many other searchers, this should be more efficient by performing much fewer key comparisions. 
currently a cut-over from slice to heap is set at 10, this can be adjusted by setting the package-level variable named DisjunctionHeapTakeover --- search/searcher/search_disjunction.go | 302 ++--------------- search/searcher/search_disjunction_heap.go | 342 ++++++++++++++++++++ search/searcher/search_disjunction_slice.go | 298 +++++++++++++++++ 3 files changed, 660 insertions(+), 282 deletions(-) create mode 100644 search/searcher/search_disjunction_heap.go create mode 100644 search/searcher/search_disjunction_slice.go diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index b75041371..16df7ca36 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -1,4 +1,4 @@ -// Copyright (c) 2014 Couchbase, Inc. +// Copyright (c) 2018 Couchbase, Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. @@ -16,308 +16,46 @@ package searcher import ( "fmt" - "math" - "reflect" - "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" - "github.com/blevesearch/bleve/search/scorer" - "github.com/blevesearch/bleve/size" ) -var reflectStaticSizeDisjunctionSearcher int - -func init() { - var ds DisjunctionSearcher - reflectStaticSizeDisjunctionSearcher = int(reflect.TypeOf(ds).Size()) -} - // DisjunctionMaxClauseCount is a compile time setting that applications can // adjust to non-zero value to cause the DisjunctionSearcher to return an // error instead of exeucting searches when the size exceeds this value. 
var DisjunctionMaxClauseCount = 0 -type DisjunctionSearcher struct { - indexReader index.IndexReader - searchers OrderedSearcherList - numSearchers int - queryNorm float64 - currs []*search.DocumentMatch - scorer *scorer.DisjunctionQueryScorer - min int - matching []*search.DocumentMatch - matchingIdxs []int - initialized bool -} - -func tooManyClauses(count int) bool { - if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { - return true - } - return false -} - -func tooManyClausesErr() error { - return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", - DisjunctionMaxClauseCount) -} +// DisjunctionHeapTakeover is a compile time setting that applications can +// adjust to control when the DisjunctionSearcher will switch from a simple +// slice implementation to a heap implementation. +var DisjunctionHeapTakeover = 10 func NewDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( - *DisjunctionSearcher, error) { - return newDisjunctionSearcher(indexReader, qsearchers, min, options, - true) + search.Searcher, error) { + return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) } func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, - limit bool) ( - *DisjunctionSearcher, error) { - if limit && tooManyClauses(len(qsearchers)) { - return nil, tooManyClausesErr() - } - // build the downstream searchers - searchers := make(OrderedSearcherList, len(qsearchers)) - for i, searcher := range qsearchers { - searchers[i] = searcher - } - // sort the searchers - sort.Sort(sort.Reverse(searchers)) - // build our searcher - rv := DisjunctionSearcher{ - indexReader: indexReader, - searchers: searchers, - numSearchers: len(searchers), - currs: make([]*search.DocumentMatch, len(searchers)), - scorer: scorer.NewDisjunctionQueryScorer(options), - min: int(min), - matching: 
make([]*search.DocumentMatch, len(searchers)), - matchingIdxs: make([]int, len(searchers)), - } - rv.computeQueryNorm() - return &rv, nil -} - -func (s *DisjunctionSearcher) Size() int { - sizeInBytes := reflectStaticSizeDisjunctionSearcher + size.SizeOfPtr + - s.scorer.Size() - - for _, entry := range s.searchers { - sizeInBytes += entry.Size() - } - - for _, entry := range s.currs { - if entry != nil { - sizeInBytes += entry.Size() - } - } - - for _, entry := range s.matching { - if entry != nil { - sizeInBytes += entry.Size() - } - } - - sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt - - return sizeInBytes -} - -func (s *DisjunctionSearcher) computeQueryNorm() { - // first calculate sum of squared weights - sumOfSquaredWeights := 0.0 - for _, searcher := range s.searchers { - sumOfSquaredWeights += searcher.Weight() - } - // now compute query norm from this - s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) - // finally tell all the downstream searchers the norm - for _, searcher := range s.searchers { - searcher.SetQueryNorm(s.queryNorm) - } -} - -func (s *DisjunctionSearcher) initSearchers(ctx *search.SearchContext) error { - var err error - // get all searchers pointing at their first match - for i, searcher := range s.searchers { - if s.currs[i] != nil { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return err - } - } - - err = s.updateMatches() - if err != nil { - return err - } - - s.initialized = true - return nil -} - -func (s *DisjunctionSearcher) updateMatches() error { - matching := s.matching[:0] - matchingIdxs := s.matchingIdxs[:0] - - for i := 0; i < len(s.currs); i++ { - curr := s.currs[i] - if curr == nil { - continue - } - - if len(matching) > 0 { - cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) - if cmp > 0 { - continue - } - - if cmp < 0 { - matching = matching[:0] - matchingIdxs = matchingIdxs[:0] - } - } - - matching = append(matching, curr) - matchingIdxs = 
append(matchingIdxs, i) - } - - s.matching = matching - s.matchingIdxs = matchingIdxs - - return nil -} - -func (s *DisjunctionSearcher) Weight() float64 { - var rv float64 - for _, searcher := range s.searchers { - rv += searcher.Weight() - } - return rv -} - -func (s *DisjunctionSearcher) SetQueryNorm(qnorm float64) { - for _, searcher := range s.searchers { - searcher.SetQueryNorm(qnorm) - } -} - -func (s *DisjunctionSearcher) Next(ctx *search.SearchContext) ( - *search.DocumentMatch, error) { - if !s.initialized { - err := s.initSearchers(ctx) - if err != nil { - return nil, err - } - } - var err error - var rv *search.DocumentMatch - - found := false - for !found && len(s.matching) > 0 { - if len(s.matching) >= s.min { - found = true - // score this match - rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) - } - - // invoke next on all the matching searchers - for _, i := range s.matchingIdxs { - searcher := s.searchers[i] - if s.currs[i] != rv { - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Next(ctx) - if err != nil { - return nil, err - } - } - - err = s.updateMatches() - if err != nil { - return nil, err - } - } - return rv, nil -} - -func (s *DisjunctionSearcher) Advance(ctx *search.SearchContext, - ID index.IndexInternalID) (*search.DocumentMatch, error) { - if !s.initialized { - err := s.initSearchers(ctx) - if err != nil { - return nil, err - } - } - // get all searchers pointing at their first match - var err error - for i, searcher := range s.searchers { - if s.currs[i] != nil { - if s.currs[i].IndexInternalID.Compare(ID) >= 0 { - continue - } - ctx.DocumentMatchPool.Put(s.currs[i]) - } - s.currs[i], err = searcher.Advance(ctx, ID) - if err != nil { - return nil, err - } + limit bool) (search.Searcher, error) { + if len(qsearchers) > DisjunctionHeapTakeover { + return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, + true) } - - err = s.updateMatches() - if err != nil { - return nil, err 
- } - - return s.Next(ctx) -} - -func (s *DisjunctionSearcher) Count() uint64 { - // for now return a worst case - var sum uint64 - for _, searcher := range s.searchers { - sum += searcher.Count() - } - return sum -} - -func (s *DisjunctionSearcher) Close() (rv error) { - for _, searcher := range s.searchers { - err := searcher.Close() - if err != nil && rv == nil { - rv = err - } - } - return rv -} - -func (s *DisjunctionSearcher) Min() int { - return s.min + return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options, + true) } -func (s *DisjunctionSearcher) DocumentMatchPoolSize() int { - rv := len(s.currs) - for _, s := range s.searchers { - rv += s.DocumentMatchPoolSize() +func tooManyClauses(count int) bool { + if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { + return true } - return rv + return false } -// a disjunction searcher implements the index.Optimizable interface -// but only activates on an edge case where the disjunction is a -// wrapper around a single Optimizable child searcher -func (s *DisjunctionSearcher) Optimize(kind string, octx index.OptimizableContext) ( - index.OptimizableContext, error) { - if len(s.searchers) == 1 { - o, ok := s.searchers[0].(index.Optimizable) - if ok { - return o.Optimize(kind, octx) - } - } - - return octx, nil +func tooManyClausesErr() error { + return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", + DisjunctionMaxClauseCount) } diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go new file mode 100644 index 000000000..6414da2cc --- /dev/null +++ b/search/searcher/search_disjunction_heap.go @@ -0,0 +1,342 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package searcher + +import ( + "bytes" + "container/heap" + "math" + "reflect" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDisjunctionHeapSearcher int +var reflectStaticSizeSearcherCurr int + +func init() { + var dhs DisjunctionHeapSearcher + reflectStaticSizeDisjunctionHeapSearcher = int(reflect.TypeOf(dhs).Size()) + + var sc SearcherCurr + reflectStaticSizeSearcherCurr = int(reflect.TypeOf(sc).Size()) +} + +type SearcherCurr struct { + searcher search.Searcher + curr *search.DocumentMatch +} + +type DisjunctionHeapSearcher struct { + indexReader index.IndexReader + + numSearchers int + scorer *scorer.DisjunctionQueryScorer + min int + queryNorm float64 + initialized bool + searchers []search.Searcher + heap []*SearcherCurr + + matching []*search.DocumentMatch + matchingCurrs []*SearcherCurr +} + +func newDisjunctionHeapSearcher(indexReader index.IndexReader, + searchers []search.Searcher, min float64, options search.SearcherOptions, + limit bool) ( + *DisjunctionHeapSearcher, error) { + if limit && tooManyClauses(len(searchers)) { + return nil, tooManyClausesErr() + } + + // build our searcher + rv := DisjunctionHeapSearcher{ + indexReader: indexReader, + searchers: searchers, + numSearchers: len(searchers), + scorer: scorer.NewDisjunctionQueryScorer(options), + min: int(min), + matching: make([]*search.DocumentMatch, len(searchers)), + matchingCurrs: make([]*SearcherCurr, len(searchers)), + heap: 
make([]*SearcherCurr, 0, len(searchers)), + } + rv.computeQueryNorm() + return &rv, nil +} + +func (s *DisjunctionHeapSearcher) Size() int { + sizeInBytes := reflectStaticSizeDisjunctionHeapSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.matching { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + // for matchingCurrs and heap, just use static size * len + // since searchers and document matches already counted above + sizeInBytes += len(s.matchingCurrs) * reflectStaticSizeSearcherCurr + sizeInBytes += len(s.heap) * reflectStaticSizeSearcherCurr + + return sizeInBytes +} + +func (s *DisjunctionHeapSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *DisjunctionHeapSearcher) initSearchers(ctx *search.SearchContext) error { + // alloc a single block of SearcherCurrs + block := make([]SearcherCurr, len(s.searchers)) + + // get all searchers pointing at their first match + for i, searcher := range s.searchers { + curr, err := searcher.Next(ctx) + if err != nil { + return err + } + if curr != nil { + block[i].searcher = searcher + block[i].curr = curr + heap.Push(s, &block[i]) + } + } + + err := s.updateMatches() + if err != nil { + return err + } + s.initialized = true + return nil +} + +func (s *DisjunctionHeapSearcher) updateMatches() error { + matching := s.matching[:0] + matchingCurrs := s.matchingCurrs[:0] + + if len(s.heap) > 0 { + + // top of the heap is our next hit + next := heap.Pop(s).(*SearcherCurr) + matching = append(matching, next.curr) + matchingCurrs = 
append(matchingCurrs, next) + + // now as long as top of heap matches, keep popping + for len(s.heap) > 0 && bytes.Compare(next.curr.IndexInternalID, s.heap[0].curr.IndexInternalID) == 0 { + next = heap.Pop(s).(*SearcherCurr) + matching = append(matching, next.curr) + matchingCurrs = append(matchingCurrs, next) + } + } + + s.matching = matching + s.matchingCurrs = matchingCurrs + + return nil +} + +func (s *DisjunctionHeapSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *DisjunctionHeapSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *DisjunctionHeapSearcher) Next(ctx *search.SearchContext) ( + *search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + + var rv *search.DocumentMatch + found := false + for !found && len(s.matching) > 0 { + if len(s.matching) >= s.min { + found = true + // score this match + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + } + + // invoke next on all the matching searchers + for _, matchingCurr := range s.matchingCurrs { + if matchingCurr.curr != rv { + ctx.DocumentMatchPool.Put(matchingCurr.curr) + } + curr, err := matchingCurr.searcher.Next(ctx) + if err != nil { + return nil, err + } + if curr != nil { + matchingCurr.curr = curr + heap.Push(s, matchingCurr) + } + } + + err := s.updateMatches() + if err != nil { + return nil, err + } + } + + return rv, nil +} + +func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, + ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + + // if there is anything in matching, toss it back onto the heap + for _, matchingCurr := range s.matchingCurrs { + heap.Push(s, matchingCurr) + } + s.matching = s.matching[:0] + 
s.matchingCurrs = s.matchingCurrs[:0] + + // get all searchers pointing at their first match + for i, searcherCurr := range s.heap { + if searcherCurr.searcher != nil { + if searcherCurr.curr.IndexInternalID.Compare(ID) >= 0 { + continue + } + ctx.DocumentMatchPool.Put(searcherCurr.curr) + } + curr, err := searcherCurr.searcher.Advance(ctx, ID) + if err != nil { + return nil, err + } + searcherCurr.curr = curr + heap.Fix(s, i) + } + // now remove any nil values (at top of heap) + for len(s.heap) > 0 && s.heap[0].curr == nil { + heap.Pop(s) + } + + err := s.updateMatches() + if err != nil { + return nil, err + } + + return s.Next(ctx) +} + +func (s *DisjunctionHeapSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *DisjunctionHeapSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *DisjunctionHeapSearcher) Min() int { + return s.min +} + +func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { + rv := len(s.searchers) + for _, s := range s.searchers { + rv += s.DocumentMatchPoolSize() + } + return rv +} + +// a disjunction searcher implements the index.Optimizable interface +// but only activates on an edge case where the disjunction is a +// wrapper around a single Optimizable child searcher +func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if len(s.searchers) == 1 { + o, ok := s.searchers[0].(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + } + + return octx, nil +} + +// heap impl + +func (s *DisjunctionHeapSearcher) Len() int { return len(s.heap) } + +func (s *DisjunctionHeapSearcher) Less(i, j int) bool { + if s.heap[i].curr == nil { + return true + } else if s.heap[j].curr == nil { + return false + } + return 
bytes.Compare(s.heap[i].curr.IndexInternalID, s.heap[j].curr.IndexInternalID) < 0 +} + +func (s *DisjunctionHeapSearcher) Swap(i, j int) { + s.heap[i], s.heap[j] = s.heap[j], s.heap[i] +} + +func (s *DisjunctionHeapSearcher) Push(x interface{}) { + s.heap = append(s.heap, x.(*SearcherCurr)) +} + +func (s *DisjunctionHeapSearcher) Pop() interface{} { + old := s.heap + n := len(old) + x := old[n-1] + s.heap = old[0 : n-1] + return x +} diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go new file mode 100644 index 000000000..e3efdf2a7 --- /dev/null +++ b/search/searcher/search_disjunction_slice.go @@ -0,0 +1,298 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package searcher + +import ( + "math" + "reflect" + "sort" + + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/scorer" + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeDisjunctionSliceSearcher int + +func init() { + var ds DisjunctionSliceSearcher + reflectStaticSizeDisjunctionSliceSearcher = int(reflect.TypeOf(ds).Size()) +} + +type DisjunctionSliceSearcher struct { + indexReader index.IndexReader + searchers OrderedSearcherList + numSearchers int + queryNorm float64 + currs []*search.DocumentMatch + scorer *scorer.DisjunctionQueryScorer + min int + matching []*search.DocumentMatch + matchingIdxs []int + initialized bool +} + +func newDisjunctionSliceSearcher(indexReader index.IndexReader, + qsearchers []search.Searcher, min float64, options search.SearcherOptions, + limit bool) ( + *DisjunctionSliceSearcher, error) { + if limit && tooManyClauses(len(qsearchers)) { + return nil, tooManyClausesErr() + } + // build the downstream searchers + searchers := make(OrderedSearcherList, len(qsearchers)) + for i, searcher := range qsearchers { + searchers[i] = searcher + } + // sort the searchers + sort.Sort(sort.Reverse(searchers)) + // build our searcher + rv := DisjunctionSliceSearcher{ + indexReader: indexReader, + searchers: searchers, + numSearchers: len(searchers), + currs: make([]*search.DocumentMatch, len(searchers)), + scorer: scorer.NewDisjunctionQueryScorer(options), + min: int(min), + matching: make([]*search.DocumentMatch, len(searchers)), + matchingIdxs: make([]int, len(searchers)), + } + rv.computeQueryNorm() + return &rv, nil +} + +func (s *DisjunctionSliceSearcher) Size() int { + sizeInBytes := reflectStaticSizeDisjunctionSliceSearcher + size.SizeOfPtr + + s.scorer.Size() + + for _, entry := range s.searchers { + sizeInBytes += entry.Size() + } + + for _, entry := range s.currs { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + for _, entry := range 
s.matching { + if entry != nil { + sizeInBytes += entry.Size() + } + } + + sizeInBytes += len(s.matchingIdxs) * size.SizeOfInt + + return sizeInBytes +} + +func (s *DisjunctionSliceSearcher) computeQueryNorm() { + // first calculate sum of squared weights + sumOfSquaredWeights := 0.0 + for _, searcher := range s.searchers { + sumOfSquaredWeights += searcher.Weight() + } + // now compute query norm from this + s.queryNorm = 1.0 / math.Sqrt(sumOfSquaredWeights) + // finally tell all the downstream searchers the norm + for _, searcher := range s.searchers { + searcher.SetQueryNorm(s.queryNorm) + } +} + +func (s *DisjunctionSliceSearcher) initSearchers(ctx *search.SearchContext) error { + var err error + // get all searchers pointing at their first match + for i, searcher := range s.searchers { + if s.currs[i] != nil { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return err + } + } + + err = s.updateMatches() + if err != nil { + return err + } + + s.initialized = true + return nil +} + +func (s *DisjunctionSliceSearcher) updateMatches() error { + matching := s.matching[:0] + matchingIdxs := s.matchingIdxs[:0] + + for i := 0; i < len(s.currs); i++ { + curr := s.currs[i] + if curr == nil { + continue + } + + if len(matching) > 0 { + cmp := curr.IndexInternalID.Compare(matching[0].IndexInternalID) + if cmp > 0 { + continue + } + + if cmp < 0 { + matching = matching[:0] + matchingIdxs = matchingIdxs[:0] + } + } + + matching = append(matching, curr) + matchingIdxs = append(matchingIdxs, i) + } + + s.matching = matching + s.matchingIdxs = matchingIdxs + + return nil +} + +func (s *DisjunctionSliceSearcher) Weight() float64 { + var rv float64 + for _, searcher := range s.searchers { + rv += searcher.Weight() + } + return rv +} + +func (s *DisjunctionSliceSearcher) SetQueryNorm(qnorm float64) { + for _, searcher := range s.searchers { + searcher.SetQueryNorm(qnorm) + } +} + +func (s *DisjunctionSliceSearcher) Next(ctx 
*search.SearchContext) ( + *search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + var err error + var rv *search.DocumentMatch + + found := false + for !found && len(s.matching) > 0 { + if len(s.matching) >= s.min { + found = true + // score this match + rv = s.scorer.Score(ctx, s.matching, len(s.matching), s.numSearchers) + } + + // invoke next on all the matching searchers + for _, i := range s.matchingIdxs { + searcher := s.searchers[i] + if s.currs[i] != rv { + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Next(ctx) + if err != nil { + return nil, err + } + } + + err = s.updateMatches() + if err != nil { + return nil, err + } + } + return rv, nil +} + +func (s *DisjunctionSliceSearcher) Advance(ctx *search.SearchContext, + ID index.IndexInternalID) (*search.DocumentMatch, error) { + if !s.initialized { + err := s.initSearchers(ctx) + if err != nil { + return nil, err + } + } + // get all searchers pointing at their first match + var err error + for i, searcher := range s.searchers { + if s.currs[i] != nil { + if s.currs[i].IndexInternalID.Compare(ID) >= 0 { + continue + } + ctx.DocumentMatchPool.Put(s.currs[i]) + } + s.currs[i], err = searcher.Advance(ctx, ID) + if err != nil { + return nil, err + } + } + + err = s.updateMatches() + if err != nil { + return nil, err + } + + return s.Next(ctx) +} + +func (s *DisjunctionSliceSearcher) Count() uint64 { + // for now return a worst case + var sum uint64 + for _, searcher := range s.searchers { + sum += searcher.Count() + } + return sum +} + +func (s *DisjunctionSliceSearcher) Close() (rv error) { + for _, searcher := range s.searchers { + err := searcher.Close() + if err != nil && rv == nil { + rv = err + } + } + return rv +} + +func (s *DisjunctionSliceSearcher) Min() int { + return s.min +} + +func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { + rv := len(s.currs) + for _, s := range s.searchers { + rv 
+= s.DocumentMatchPoolSize() + } + return rv +} + +// a disjunction searcher implements the index.Optimizable interface +// but only activates on an edge case where the disjunction is a +// wrapper around a single Optimizable child searcher +func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableContext) ( + index.OptimizableContext, error) { + if len(s.searchers) == 1 { + o, ok := s.searchers[0].(index.Optimizable) + if ok { + return o.Optimize(kind, octx) + } + } + + return octx, nil +} From 4bf27cb44d758427a24a9851afdd78aa60820264 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 20 Apr 2018 11:42:26 -0700 Subject: [PATCH 385/728] persistSnapshotMaybeMerge() merges to directly to file The persister goroutine can decide to merge its incoming input segments before persistence... and before this commit, that merging would have been entirely in-memory, leading to a lot of memory usage. After this change, the persistSnapshotMaybeMerge(), which calls mergeSegmentBases(), performs the merging directly to file. 
--- index/scorch/merge.go | 48 ++++++++++--------------------- index/scorch/persister.go | 2 +- index/scorch/segment/zap/merge.go | 15 ++++++---- 3 files changed, 26 insertions(+), 39 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 73351aa4c..41b734aaf 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "encoding/json" "fmt" "os" @@ -195,7 +194,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, fileMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, 1024) + newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) @@ -274,19 +273,20 @@ type segmentMerge struct { // into the root func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, - chunkFactor uint32) (uint64, *IndexSnapshot, uint64, error) { + chunkFactor uint32) (*IndexSnapshot, uint64, error) { atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) - var br bytes.Buffer - - cr := zap.NewCountHashWriter(&br) - memMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotMemMergeZapBeg, 1) - newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, - docValueOffset, dictLocs, fieldsInv, fieldsMap, err := - zap.MergeToWriter(sbs, sbsDrops, chunkFactor, cr) + + newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) + filename := zapFileName(newSegmentID) + path := s.path + string(os.PathSeparator) + filename + + newDocNums, _, err := + zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor) + atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) memMergeZapTime := uint64(time.Since(memMergeZapStartTime)) @@ -297,31 +297,13 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, if err != nil { 
atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return 0, nil, 0, err - } - - sb, err := zap.InitSegmentBase(br.Bytes(), cr.Sum32(), chunkFactor, - fieldsMap, fieldsInv, numDocs, storedIndexOffset, fieldsIndexOffset, - docValueOffset, dictLocs) - if err != nil { - atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return 0, nil, 0, err - } - - newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) - - filename := zapFileName(newSegmentID) - path := s.path + string(os.PathSeparator) + filename - err = zap.PersistSegmentBase(sb, path) - if err != nil { - atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return 0, nil, 0, err + return nil, 0, err } segment, err := zap.Open(path) if err != nil { atomic.AddUint64(&s.stats.TotMemMergeErr, 1) - return 0, nil, 0, err + return nil, 0, err } // update persisted stats @@ -345,16 +327,16 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, select { // send to introducer case <-s.closeCh: _ = segment.DecRef() - return 0, nil, 0, ErrClosed + return nil, 0, ErrClosed case s.merges <- sm: } select { // wait for introduction to complete case <-s.closeCh: - return 0, nil, 0, ErrClosed + return nil, 0, ErrClosed case newSnapshot := <-sm.notify: atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) - return numDocs, newSnapshot, newSegmentID, nil + return newSnapshot, newSegmentID, nil } } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 76fd746d5..cbc24cdb7 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -244,7 +244,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( return false, nil } - _, newSnapshot, newSegmentID, err := s.mergeSegmentBases( + newSnapshot, newSegmentID, err := s.mergeSegmentBases( snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) if err != nil { return false, err diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 7f8f23f77..dab09f6b3 100644 
--- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -37,6 +37,16 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, + chunkFactor uint32) ([][]uint64, uint64, error) { + segmentBases := make([]*SegmentBase, len(segments)) + for segmenti, segment := range segments { + segmentBases[segmenti] = &segment.SegmentBase + } + + return MergeSegmentBases(segmentBases, drops, path, chunkFactor) +} + +func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, chunkFactor uint32) ([][]uint64, uint64, error) { flag := os.O_RDWR | os.O_CREATE @@ -50,11 +60,6 @@ func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, _ = os.Remove(path) } - segmentBases := make([]*SegmentBase, len(segments)) - for segmenti, segment := range segments { - segmentBases[segmenti] = &segment.SegmentBase - } - // buffer the output br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) From 5d592a302fd6caa682740d17b1c01e971ae7406f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 20 Apr 2018 14:36:28 -0700 Subject: [PATCH 386/728] Revert "Merge pull request #900 from abhinavdangeti/merger-overhead-doc-values" This reverts commit 2d5ae21650e05d26a3876b318b860546ab970a9b, reversing changes made to cd840e0996e29962922be75c47014fad6e857078. 
panic: runtime error: slice bounds out of range github.com/blevesearch/bleve/index/scorch/segment/zap.(*docValueReader).loadDvChunk /Users/steveyen/dev/couchbase-server.vulcan/godeps/src/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go:149 github.com/blevesearch/bleve/index/scorch/segment/zap.(*docValueReader).iterateAllDocValues /Users/steveyen/dev/couchbase-server.vulcan/godeps/src/github.com/blevesearch/bleve/index/scorch/segment/zap/docvalues.go:157 --- index/scorch/segment/zap/docvalues.go | 31 +--------- index/scorch/segment/zap/merge.go | 83 +++++++++++++-------------- 2 files changed, 41 insertions(+), 73 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index c73c01215..bc5d1cac6 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -34,11 +34,10 @@ func init() { reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) } -type docNumTermsVisitor func(docNum uint64, terms []byte) error - type docValueReader struct { field string curChunkNum uint64 + numChunks uint64 chunkOffsets []uint64 dvDataLoc uint64 curChunkHeader []MetaData @@ -152,34 +151,6 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error return nil } -func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { - for i := 0; i < len(di.chunkOffsets); i++ { - err := di.loadDvChunk(uint64(i), s) - if err != nil { - return err - } - - // uncompress the already loaded data - uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) - if err != nil { - return err - } - di.uncompressed = uncompressed - - start := uint64(0) - for _, entry := range di.curChunkHeader { - err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) - if err != nil { - return err - } - - start = entry.DocDvOffset - } - } - - return nil -} - func (di *docValueReader) visitDocValues(docNum uint64, 
visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index dab09f6b3..faeb9768c 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -180,6 +180,12 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) + // docTermMap is keyed by docNum, where the array impl provides + // better memory usage behavior than a sparse-friendlier hashmap + // for when docs have much structural similarity (i.e., every doc + // has a given field) + var docTermMap [][]byte + var vellumBuf bytes.Buffer newVellum, err := vellum.New(&vellumBuf, nil) if err != nil { @@ -197,8 +203,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var dicts []*Dictionary var itrs []vellum.Iterator - var segmentsInFocus []*SegmentBase - for segmentI, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { @@ -218,11 +222,19 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } dicts = append(dicts, dict) itrs = append(itrs, itr) - segmentsInFocus = append(segmentsInFocus, segment) } } } + if uint64(cap(docTermMap)) < newSegDocCount { + docTermMap = make([][]byte, newSegDocCount) + } else { + docTermMap = docTermMap[0:newSegDocCount] + for docNum := range docTermMap { // reset the docTermMap + docTermMap[docNum] = docTermMap[docNum][:0] + } + } + var prevTerm []byte newRoaring.Clear() @@ -302,11 +314,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // can optimize by copying freq/norm/loc bytes directly lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder) + tfEncoder, locEncoder, 
docTermMap) } else { lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( fieldsMap, term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, bufLoc) + tfEncoder, locEncoder, docTermMap, bufLoc) } if err != nil { return nil, 0, err @@ -354,50 +366,28 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // update the field doc values fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) - - fdvReadersAvailable := false - var dvIterClone *docValueReader - for segmentI, segment := range segmentsInFocus { - fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) - if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && - dvIter != nil { - fdvReadersAvailable = true - dvIterClone = dvIter.cloneInto(dvIterClone) - err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { - if newDocNums[segmentI][docNum] == docDropped { - return nil - } - err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) - if err != nil { - return err - } - return nil - }) + for docNum, docTerms := range docTermMap { + if len(docTerms) > 0 { + err = fdvEncoder.Add(uint64(docNum), docTerms) if err != nil { return nil, 0, err } } } + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } - if fdvReadersAvailable { - err = fdvEncoder.Close() - if err != nil { - return nil, 0, err - } - - // persist the doc value details for this field - _, err = fdvEncoder.Write() - if err != nil { - return nil, 0, err - } - - // get the field doc value offset (end) - fieldDvLocsEnd[fieldID] = uint64(w.Count()) - } else { - fieldDvLocsStart[fieldID] = fieldNotUninverted - fieldDvLocsEnd[fieldID] = fieldNotUninverted + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err } + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + // reset vellum buffer and vellum builder 
vellumBuf.Reset() err = newVellum.Reset(&vellumBuf) @@ -427,7 +417,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, + bufLoc []uint64) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { next, err := postItr.Next() for next != nil && err == nil { @@ -481,6 +472,9 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } } + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm @@ -493,7 +487,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := postItr.nextBytes() @@ -516,6 +510,9 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, } } + docTermMap[hitNewDocNum] = + append(append(docTermMap[hitNewDocNum], term...), termSeparator) + lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm From 3e4c63a620243a5ea963597de9ffcbdd8d37d901 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 21 Apr 2018 12:45:04 -0700 Subject: [PATCH 387/728] Revert "Revert "Merge pull request #900 from abhinavdangeti/merger-overhead-doc-values"" This reverts commit 5d592a302fd6caa682740d17b1c01e971ae7406f. 
--- index/scorch/segment/zap/docvalues.go | 31 +++++++++- index/scorch/segment/zap/merge.go | 83 ++++++++++++++------------- 2 files changed, 73 insertions(+), 41 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index bc5d1cac6..c73c01215 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -34,10 +34,11 @@ func init() { reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) } +type docNumTermsVisitor func(docNum uint64, terms []byte) error + type docValueReader struct { field string curChunkNum uint64 - numChunks uint64 chunkOffsets []uint64 dvDataLoc uint64 curChunkHeader []MetaData @@ -151,6 +152,34 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error return nil } +func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { + for i := 0; i < len(di.chunkOffsets); i++ { + err := di.loadDvChunk(uint64(i), s) + if err != nil { + return err + } + + // uncompress the already loaded data + uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) + if err != nil { + return err + } + di.uncompressed = uncompressed + + start := uint64(0) + for _, entry := range di.curChunkHeader { + err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) + if err != nil { + return err + } + + start = entry.DocDvOffset + } + } + + return nil +} + func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index faeb9768c..dab09f6b3 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -180,12 +180,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) locEncoder 
:= newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - // docTermMap is keyed by docNum, where the array impl provides - // better memory usage behavior than a sparse-friendlier hashmap - // for when docs have much structural similarity (i.e., every doc - // has a given field) - var docTermMap [][]byte - var vellumBuf bytes.Buffer newVellum, err := vellum.New(&vellumBuf, nil) if err != nil { @@ -203,6 +197,8 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var dicts []*Dictionary var itrs []vellum.Iterator + var segmentsInFocus []*SegmentBase + for segmentI, segment := range segments { dict, err2 := segment.dictionary(fieldName) if err2 != nil { @@ -222,19 +218,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } dicts = append(dicts, dict) itrs = append(itrs, itr) + segmentsInFocus = append(segmentsInFocus, segment) } } } - if uint64(cap(docTermMap)) < newSegDocCount { - docTermMap = make([][]byte, newSegDocCount) - } else { - docTermMap = docTermMap[0:newSegDocCount] - for docNum := range docTermMap { // reset the docTermMap - docTermMap[docNum] = docTermMap[docNum][:0] - } - } - var prevTerm []byte newRoaring.Clear() @@ -314,11 +302,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // can optimize by copying freq/norm/loc bytes directly lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, docTermMap) + tfEncoder, locEncoder) } else { lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( fieldsMap, term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, docTermMap, bufLoc) + tfEncoder, locEncoder, bufLoc) } if err != nil { return nil, 0, err @@ -366,27 +354,49 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, // update the field doc values fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) - 
for docNum, docTerms := range docTermMap { - if len(docTerms) > 0 { - err = fdvEncoder.Add(uint64(docNum), docTerms) + + fdvReadersAvailable := false + var dvIterClone *docValueReader + for segmentI, segment := range segmentsInFocus { + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) + if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && + dvIter != nil { + fdvReadersAvailable = true + dvIterClone = dvIter.cloneInto(dvIterClone) + err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { + if newDocNums[segmentI][docNum] == docDropped { + return nil + } + err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) + if err != nil { + return err + } + return nil + }) if err != nil { return nil, 0, err } } } - err = fdvEncoder.Close() - if err != nil { - return nil, 0, err - } - // persist the doc value details for this field - _, err = fdvEncoder.Write() - if err != nil { - return nil, 0, err - } + if fdvReadersAvailable { + err = fdvEncoder.Close() + if err != nil { + return nil, 0, err + } - // get the field doc value offset (end) - fieldDvLocsEnd[fieldID] = uint64(w.Count()) + // persist the doc value details for this field + _, err = fdvEncoder.Write() + if err != nil { + return nil, 0, err + } + + // get the field doc value offset (end) + fieldDvLocsEnd[fieldID] = uint64(w.Count()) + } else { + fieldDvLocsStart[fieldID] = fieldNotUninverted + fieldDvLocsEnd[fieldID] = fieldNotUninverted + } // reset vellum buffer and vellum builder vellumBuf.Reset() @@ -417,8 +427,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte, - bufLoc []uint64) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( lastDocNum uint64, lastFreq uint64, lastNorm 
uint64, bufLocOut []uint64, err error) { next, err := postItr.Next() for next != nil && err == nil { @@ -472,9 +481,6 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po } } - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm @@ -487,7 +493,7 @@ func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *Po func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, docTermMap [][]byte) ( + tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := postItr.nextBytes() @@ -510,9 +516,6 @@ func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, } } - docTermMap[hitNewDocNum] = - append(append(docTermMap[hitNewDocNum], term...), termSeparator) - lastDocNum = hitNewDocNum lastFreq = nextFreq lastNorm = nextNorm From c623c57df87b51b47fb42f92470b10f4be555c52 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 21 Apr 2018 12:49:42 -0700 Subject: [PATCH 388/728] fixes #906 - docvalues handles empty chunk --- index/scorch/segment/zap/docvalues.go | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index c73c01215..971b78b78 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -125,6 +125,14 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error // reside for the given docNum destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] 
+ di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } + destChunkDataLoc += start curChunkEnd += end @@ -158,6 +166,9 @@ func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTerm if err != nil { return err } + if di.curChunkData == nil || len(di.curChunkHeader) <= 0 { + continue + } // uncompress the already loaded data uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) @@ -184,7 +195,7 @@ func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum start, end := di.getDocValueLocs(docNum) - if start == math.MaxUint64 || end == math.MaxUint64 { + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { return nil } From 76584c7038dee90b805dda1f8c7b0a3e127be310 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 20 Apr 2018 16:55:43 -0700 Subject: [PATCH 389/728] MB-28847: Account for bytes of an allocated buffer while building new segment Account for these bytes in the memory used by the scorch index. 
--- index/scorch/scorch.go | 18 +++++++++++++----- index/scorch/segment/zap/build_test.go | 12 ++++++------ index/scorch/segment/zap/dict_test.go | 4 ++-- index/scorch/segment/zap/merge_test.go | 22 +++++++++++----------- index/scorch/segment/zap/new.go | 6 +++--- index/scorch/segment/zap/segment_test.go | 14 +++++++------- 6 files changed, 42 insertions(+), 34 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 14796c5e8..398451570 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -76,10 +76,12 @@ type Scorch struct { } type internalStats struct { - persistEpoch uint64 - persistSnapshotSize uint64 - mergeEpoch uint64 - mergeSnapshotSize uint64 + persistEpoch uint64 + persistSnapshotSize uint64 + mergeEpoch uint64 + mergeSnapshotSize uint64 + newSegBufBytesAdded uint64 + newSegBufBytesRemoved uint64 } func NewScorch(storeName string, @@ -308,11 +310,13 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { s.fireEvent(EventKindBatchIntroductionStart, 0) var newSegment segment.Segment + var bufBytes uint64 if len(analysisResults) > 0 { - newSegment, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) + newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) if err != nil { return err } + atomic.AddUint64(&s.iStats.newSegBufBytesAdded, bufBytes) } else { atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } @@ -330,6 +334,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotIndexedPlainTextBytes, numPlainTextBytes) } + atomic.AddUint64(&s.iStats.newSegBufBytesRemoved, bufBytes) atomic.AddUint64(&s.stats.TotIndexTime, uint64(time.Since(indexStart))) return err @@ -523,6 +528,9 @@ func (s *Scorch) MemoryUsed() uint64 { memUsed += mergeSnapshotSize } + memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) - + atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved)) + return memUsed } diff --git 
a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go index 5b6ffd559..9d1b584f2 100644 --- a/index/scorch/segment/zap/build_test.go +++ b/index/scorch/segment/zap/build_test.go @@ -26,7 +26,7 @@ import ( func TestBuild(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - sb, err := buildTestSegment() + sb, _, err := buildTestSegment() if err != nil { t.Fatal(err) } @@ -36,7 +36,7 @@ func TestBuild(t *testing.T) { } } -func buildTestSegment() (*SegmentBase, error) { +func buildTestSegment() (*SegmentBase, uint64, error) { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -125,19 +125,19 @@ func buildTestSegment() (*SegmentBase, error) { return AnalysisResultsToSegmentBase(results, 1024) } -func buildTestSegmentMulti() (*SegmentBase, error) { +func buildTestSegmentMulti() (*SegmentBase, uint64, error) { results := buildTestAnalysisResultsMulti() return AnalysisResultsToSegmentBase(results, 1024) } -func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, error) { +func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, uint64, error) { results := buildTestAnalysisResultsMulti() return AnalysisResultsToSegmentBase(results, chunkFactor) } -func buildTestSegmentMultiWithDifferentFields(includeDocA, includeDocB bool) (*SegmentBase, error) { +func buildTestSegmentMultiWithDifferentFields(includeDocA, includeDocB bool) (*SegmentBase, uint64, error) { results := buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB) return AnalysisResultsToSegmentBase(results, 1024) @@ -550,7 +550,7 @@ func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( } } - sb, err := AnalysisResultsToSegmentBase(results, chunkFactor) + sb, _, err := AnalysisResultsToSegmentBase(results, chunkFactor) return sb, fields, err } diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index b70f2adf7..1a8ce22c5 100644 --- 
a/index/scorch/segment/zap/dict_test.go +++ b/index/scorch/segment/zap/dict_test.go @@ -24,7 +24,7 @@ import ( "github.com/blevesearch/bleve/index" ) -func buildTestSegmentForDict() (*SegmentBase, error) { +func buildTestSegmentForDict() (*SegmentBase, uint64, error) { doc := &document.Document{ ID: "a", Fields: []document.Field{ @@ -105,7 +105,7 @@ func TestDictionary(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegmentForDict() + testSeg, _, _ := buildTestSegmentForDict() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 6b168c907..cd21ecb66 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -33,13 +33,13 @@ func TestMerge(t *testing.T) { _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") - testSeg, _ := buildTestSegmentMulti() + testSeg, _, _ := buildTestSegmentMulti() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } - testSeg2, _ := buildTestSegmentMulti2() + testSeg2, _, _ := buildTestSegmentMulti2() err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatal(err) @@ -120,7 +120,7 @@ func TestMergeWithEmptySegmentsFirst(t *testing.T) { func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegmentMulti() + testSeg, _, _ := buildTestSegmentMulti() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) @@ -147,7 +147,7 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) _ = os.RemoveAll("/tmp/" + fname) - emptySegment, _ := AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) + emptySegment, _, _ := AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) err = 
PersistSegmentBase(emptySegment, "/tmp/"+fname) if err != nil { t.Fatal(err) @@ -461,7 +461,7 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { _ = os.RemoveAll("/tmp/scorch.zap") _ = os.RemoveAll("/tmp/scorch2.zap") - testSeg, _ := buildTestSegmentMulti() + testSeg, _, _ := buildTestSegmentMulti() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) @@ -477,7 +477,7 @@ func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { } }() - testSeg2, _ := buildTestSegmentMulti2() + testSeg2, _, _ := buildTestSegmentMulti2() err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatal(err) @@ -564,7 +564,7 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* _ = os.RemoveAll("/tmp/" + fname) - testSeg, _ := buildTestSegmentMultiHelper(docIds) + testSeg, _, _ := buildTestSegmentMultiHelper(docIds) err := PersistSegmentBase(testSeg, "/tmp/"+fname) if err != nil { t.Fatal(err) @@ -615,11 +615,11 @@ func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop [ testMergeWithSelf(t, segm.(*Segment), expectedNumDocs) } -func buildTestSegmentMulti2() (*SegmentBase, error) { +func buildTestSegmentMulti2() (*SegmentBase, uint64, error) { return buildTestSegmentMultiHelper([]string{"c", "d"}) } -func buildTestSegmentMultiHelper(docIds []string) (*SegmentBase, error) { +func buildTestSegmentMultiHelper(docIds []string) (*SegmentBase, uint64, error) { doc := &document.Document{ ID: "c", Fields: []document.Field{ @@ -785,13 +785,13 @@ func TestMergeBytesWritten(t *testing.T) { _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") - testSeg, _ := buildTestSegmentMulti() + testSeg, _, _ := buildTestSegmentMulti() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatal(err) } - testSeg2, _ := buildTestSegmentMulti2() + testSeg2, _, _ := buildTestSegmentMulti2() err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if 
err != nil { t.Fatal(err) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 7471cf277..a76a8f6af 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -32,7 +32,7 @@ import ( // AnalysisResultsToSegmentBase produces an in-memory zap-encoded // SegmentBase from analysis results func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, - chunkFactor uint32) (*SegmentBase, error) { + chunkFactor uint32) (*SegmentBase, uint64, error) { s := interimPool.Get().(*interim) var br bytes.Buffer @@ -52,7 +52,7 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, err := s.convert() if err != nil { - return nil, err + return nil, uint64(0), err } sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, @@ -65,7 +65,7 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, interimPool.Put(s) } - return sb, err + return sb, uint64(len(br.Bytes())), err } var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 27f87faff..be7a39e14 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -28,7 +28,7 @@ import ( func TestOpen(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegment() + testSeg, _, _ := buildTestSegment() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) @@ -328,7 +328,7 @@ func TestOpen(t *testing.T) { func TestOpenMulti(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegmentMulti() + testSeg, _, _ := buildTestSegmentMulti() err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) @@ -428,7 +428,7 @@ func TestOpenMulti(t *testing.T) { func 
TestOpenMultiWithTwoChunks(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegmentMultiWithChunkFactor(1) + testSeg, _, _ := buildTestSegmentMultiWithChunkFactor(1) err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) @@ -523,7 +523,7 @@ func TestOpenMultiWithTwoChunks(t *testing.T) { func TestSegmentVisitableDocValueFieldsList(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, _ := buildTestSegmentMultiWithChunkFactor(1) + testSeg, _, _ := buildTestSegmentMultiWithChunkFactor(1) err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) @@ -603,7 +603,7 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { func TestSegmentDocsWithNonOverlappingFields(t *testing.T) { _ = os.RemoveAll("/tmp/scorch.zap") - testSeg, err := buildTestSegmentMultiWithDifferentFields(true, true) + testSeg, _, err := buildTestSegmentMultiWithDifferentFields(true, true) if err != nil { t.Fatalf("error building segment: %v", err) } @@ -653,13 +653,13 @@ func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { _ = os.RemoveAll("/tmp/scorch2.zap") _ = os.RemoveAll("/tmp/scorch3.zap") - testSeg1, _ := buildTestSegmentMultiWithDifferentFields(true, false) + testSeg1, _, _ := buildTestSegmentMultiWithDifferentFields(true, false) err := PersistSegmentBase(testSeg1, "/tmp/scorch1.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) } - testSeg2, _ := buildTestSegmentMultiWithDifferentFields(false, true) + testSeg2, _, _ := buildTestSegmentMultiWithDifferentFields(false, true) err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") if err != nil { t.Fatalf("error persisting segment: %v", err) From e12b653120e6cd702e65fa28ab8a70791219e5a0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 24 Apr 2018 10:16:11 -0700 Subject: [PATCH 390/728] more knobs for estimating new segment buffer size --- 
index/scorch/segment/zap/new.go | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index a76a8f6af..22b69913e 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -29,6 +29,10 @@ import ( "github.com/golang/snappy" ) +var NewSegmentBufferNumResultsBump int = 100 +var NewSegmentBufferNumResultsFactor float64 = 1.0 +var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 + // AnalysisResultsToSegmentBase produces an in-memory zap-encoded // SegmentBase from analysis results func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, @@ -41,8 +45,11 @@ func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, // size, but note that the interim instance comes from a // global interimPool, so multiple scorch instances indexing // different docs can lead to low quality estimates - avgBytesPerDoc := s.lastOutSize / s.lastNumDocs - br.Grow(avgBytesPerDoc * (len(results) + 1)) + estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * + NewSegmentBufferNumResultsFactor) + estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * + NewSegmentBufferAvgBytesPerDocFactor) + br.Grow(estimateAvgBytesPerDoc * estimateNumResults) } s.results = results From 414274b49b0e96dfe9470868992f836ea10e5da1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 24 Apr 2018 10:16:57 -0700 Subject: [PATCH 391/728] go fmt --- index/scorch/segment/zap/docvalues.go | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 971b78b78..9c3815068 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -125,13 +125,13 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error // reside for the given docNum destChunkDataLoc, curChunkEnd := di.dvDataLoc, 
di.dvDataLoc start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) - if start >= end { - di.curChunkHeader = di.curChunkHeader[:0] - di.curChunkData = nil - di.curChunkNum = chunkNumber - di.uncompressed = di.uncompressed[:0] - return nil - } + if start >= end { + di.curChunkHeader = di.curChunkHeader[:0] + di.curChunkData = nil + di.curChunkNum = chunkNumber + di.uncompressed = di.uncompressed[:0] + return nil + } destChunkDataLoc += start curChunkEnd += end @@ -166,9 +166,9 @@ func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTerm if err != nil { return err } - if di.curChunkData == nil || len(di.curChunkHeader) <= 0 { - continue - } + if di.curChunkData == nil || len(di.curChunkHeader) <= 0 { + continue + } // uncompress the already loaded data uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) @@ -195,7 +195,7 @@ func (di *docValueReader) visitDocValues(docNum uint64, visitor index.DocumentFieldTermVisitor) error { // binary search the term locations for the docNum start, end := di.getDocValueLocs(docNum) - if start == math.MaxUint64 || end == math.MaxUint64 || start == end { + if start == math.MaxUint64 || end == math.MaxUint64 || start == end { return nil } From 06f7c58a3562345b50a2874e2575ebfc579b427e Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 24 Apr 2018 12:16:25 -0700 Subject: [PATCH 392/728] Include missing initialization for size entry of integers --- size/sizes.go | 46 ++++++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/size/sizes.go b/size/sizes.go index 4ba544a71..0990bf86e 100644 --- a/size/sizes.go +++ b/size/sizes.go @@ -19,28 +19,30 @@ import ( ) func init() { - var a bool - SizeOfBool = int(reflect.TypeOf(a).Size()) - var b float32 - SizeOfFloat32 = int(reflect.TypeOf(b).Size()) - var c float64 - SizeOfFloat64 = int(reflect.TypeOf(c).Size()) - var d map[int]int - SizeOfMap = 
int(reflect.TypeOf(d).Size()) - var e *int - SizeOfPtr = int(reflect.TypeOf(e).Size()) - var f []int - SizeOfSlice = int(reflect.TypeOf(f).Size()) - var g string - SizeOfString = int(reflect.TypeOf(g).Size()) - var h uint8 - SizeOfUint8 = int(reflect.TypeOf(h).Size()) - var i uint16 - SizeOfUint16 = int(reflect.TypeOf(i).Size()) - var j uint32 - SizeOfUint32 = int(reflect.TypeOf(j).Size()) - var k uint64 - SizeOfUint64 = int(reflect.TypeOf(k).Size()) + var b bool + SizeOfBool = int(reflect.TypeOf(b).Size()) + var f32 float32 + SizeOfFloat32 = int(reflect.TypeOf(f32).Size()) + var f64 float64 + SizeOfFloat64 = int(reflect.TypeOf(f64).Size()) + var i int + SizeOfInt = int(reflect.TypeOf(i).Size()) + var m map[int]int + SizeOfMap = int(reflect.TypeOf(m).Size()) + var ptr *int + SizeOfPtr = int(reflect.TypeOf(ptr).Size()) + var slice []int + SizeOfSlice = int(reflect.TypeOf(slice).Size()) + var str string + SizeOfString = int(reflect.TypeOf(str).Size()) + var u8 uint8 + SizeOfUint8 = int(reflect.TypeOf(u8).Size()) + var u16 uint16 + SizeOfUint16 = int(reflect.TypeOf(u16).Size()) + var u32 uint32 + SizeOfUint32 = int(reflect.TypeOf(u32).Size()) + var u64 uint64 + SizeOfUint64 = int(reflect.TypeOf(u64).Size()) } var SizeOfBool int From 3c30bc2d476083802ed11044c004edb44f19eaa4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 24 Apr 2018 15:24:14 -0400 Subject: [PATCH 393/728] account for memory allocated during text analysis --- analysis/freq.go | 41 +++++++++++++++++++++++++++++++++++++++++ index/analysis.go | 19 +++++++++++++++++++ index/scorch/scorch.go | 10 ++++++++++ 3 files changed, 70 insertions(+) diff --git a/analysis/freq.go b/analysis/freq.go index e1ca2cd6f..198c149b2 100644 --- a/analysis/freq.go +++ b/analysis/freq.go @@ -14,6 +14,22 @@ package analysis +import ( + "reflect" + + "github.com/blevesearch/bleve/size" +) + +var reflectStaticSizeTokenLocation int +var reflectStaticSizeTokenFreq int + +func init() { + var tl TokenLocation + 
reflectStaticSizeTokenLocation = int(reflect.TypeOf(tl).Size()) + var tf TokenFreq + reflectStaticSizeTokenFreq = int(reflect.TypeOf(tf).Size()) +} + // TokenLocation represents one occurrence of a term at a particular location in // a field. Start, End and Position have the same meaning as in analysis.Token. // Field and ArrayPositions identify the field value in the source document. @@ -26,6 +42,12 @@ type TokenLocation struct { Position int } +func (tl *TokenLocation) Size() int { + rv := reflectStaticSizeTokenLocation + rv += len(tl.ArrayPositions) * size.SizeOfUint64 + return rv +} + // TokenFreq represents all the occurrences of a term in all fields of a // document. type TokenFreq struct { @@ -34,6 +56,15 @@ type TokenFreq struct { frequency int } +func (tf *TokenFreq) Size() int { + rv := reflectStaticSizeTokenFreq + rv += len(tf.Term) + for _, loc := range tf.Locations { + rv += loc.Size() + } + return rv +} + func (tf *TokenFreq) Frequency() int { return tf.frequency } @@ -42,6 +73,16 @@ func (tf *TokenFreq) Frequency() int { // fields. 
type TokenFrequencies map[string]*TokenFreq +func (tfs TokenFrequencies) Size() int { + rv := size.SizeOfMap + rv += len(tfs) * (size.SizeOfString + size.SizeOfPtr) + for k, v := range tfs { + rv += len(k) + rv += v.Size() + } + return rv +} + func (tfs TokenFrequencies) MergeAll(remoteField string, other TokenFrequencies) { // walk the new token frequencies for tfk, tf := range other { diff --git a/index/analysis.go b/index/analysis.go index 840dad97a..82883af01 100644 --- a/index/analysis.go +++ b/index/analysis.go @@ -15,10 +15,20 @@ package index import ( + "reflect" + "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/size" ) +var reflectStaticSizeAnalysisResult int + +func init() { + var ar AnalysisResult + reflectStaticSizeAnalysisResult = int(reflect.TypeOf(ar).Size()) +} + type IndexRow interface { KeySize() int KeyTo([]byte) (int, error) @@ -39,6 +49,15 @@ type AnalysisResult struct { Length []int } +func (a *AnalysisResult) Size() int { + rv := reflectStaticSizeAnalysisResult + for _, analyzedI := range a.Analyzed { + rv += analyzedI.Size() + } + rv += len(a.Length) * size.SizeOfInt + return rv +} + type AnalysisWork struct { i Index d *document.Document diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 398451570..31d31642a 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -82,6 +82,8 @@ type internalStats struct { mergeSnapshotSize uint64 newSegBufBytesAdded uint64 newSegBufBytesRemoved uint64 + analysisBytesAdded uint64 + analysisBytesRemoved uint64 } func NewScorch(storeName string, @@ -295,12 +297,17 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { // wait for analysis result analysisResults := make([]*index.AnalysisResult, int(numUpdates)) var itemsDeQueued uint64 + var totalAnalysisSize int for itemsDeQueued < numUpdates { result := <-resultChan + resultSize := result.Size() + atomic.AddUint64(&s.iStats.analysisBytesAdded, uint64(resultSize)) 
+ totalAnalysisSize += resultSize analysisResults[itemsDeQueued] = result itemsDeQueued++ } close(resultChan) + defer atomic.AddUint64(&s.iStats.analysisBytesRemoved, uint64(totalAnalysisSize)) atomic.AddUint64(&s.stats.TotAnalysisTime, uint64(time.Since(start))) @@ -531,6 +538,9 @@ func (s *Scorch) MemoryUsed() uint64 { memUsed += (atomic.LoadUint64(&s.iStats.newSegBufBytesAdded) - atomic.LoadUint64(&s.iStats.newSegBufBytesRemoved)) + memUsed += (atomic.LoadUint64(&s.iStats.analysisBytesAdded) - + atomic.LoadUint64(&s.iStats.analysisBytesRemoved)) + return memUsed } From d6e1f4557305a24c23888267ce3bac039f7977b5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 24 Apr 2018 14:41:14 -0700 Subject: [PATCH 394/728] use cap(mem) for SegmentBase.Size() memory accounting --- index/scorch/segment/zap/segment.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 3bae48ddb..4a8a37988 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -108,7 +108,7 @@ func (sb *SegmentBase) Size() int { func (sb *SegmentBase) updateSize() { sizeInBytes := reflectStaticSizeSegmentBase + - len(sb.mem) + cap(sb.mem) // fieldsMap for k, _ := range sb.fieldsMap { @@ -163,7 +163,7 @@ func (s *Segment) Size() int { sizeInBytes += 16 // do not include the mmap'ed part - return sizeInBytes + s.SegmentBase.Size() - len(s.mem) + return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) } func (s *Segment) AddRef() { From fca187c89cfa96e848141c449b7c2d83cc7d9d59 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 26 Apr 2018 08:39:55 -0400 Subject: [PATCH 395/728] disable disjunction heap optimization due to reported problems --- search/searcher/search_disjunction.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 16df7ca36..9c1f1d1f9 100644 --- 
a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -16,6 +16,7 @@ package searcher import ( "fmt" + "math" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -29,7 +30,7 @@ var DisjunctionMaxClauseCount = 0 // DisjunctionHeapTakeover is a compile time setting that applications can // adjust to control when the DisjunctionSearcher will switch from a simple // slice implementation to a heap implementation. -var DisjunctionHeapTakeover = 10 +var DisjunctionHeapTakeover = math.MaxInt64 func NewDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( From 8b92794f4c08934629e86a805e8f49b0a069f74f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 26 Apr 2018 18:18:23 -0400 Subject: [PATCH 396/728] fix disjunction heap implementation and re-enable previously we incorreclty attempted to heap.Fix while traversing the heap in Advance() the fix also eliminates a full traversal of the heap performing comparisons, by taking advantage of the fact that heap elements are already ordered. so in addition to being correct, this should also be faster. --- search/searcher/search_disjunction.go | 3 +-- search/searcher/search_disjunction_heap.go | 27 +++++++++++----------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 9c1f1d1f9..16df7ca36 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -16,7 +16,6 @@ package searcher import ( "fmt" - "math" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" @@ -30,7 +29,7 @@ var DisjunctionMaxClauseCount = 0 // DisjunctionHeapTakeover is a compile time setting that applications can // adjust to control when the DisjunctionSearcher will switch from a simple // slice implementation to a heap implementation. 
-var DisjunctionHeapTakeover = math.MaxInt64 +var DisjunctionHeapTakeover = 10 func NewDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index 6414da2cc..ffa373d2d 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -238,25 +238,26 @@ func (s *DisjunctionHeapSearcher) Advance(ctx *search.SearchContext, s.matching = s.matching[:0] s.matchingCurrs = s.matchingCurrs[:0] - // get all searchers pointing at their first match - for i, searcherCurr := range s.heap { - if searcherCurr.searcher != nil { - if searcherCurr.curr.IndexInternalID.Compare(ID) >= 0 { - continue - } - ctx.DocumentMatchPool.Put(searcherCurr.curr) - } + // find all searchers that actually need to be advanced + // advance them, using s.matchingCurrs as temp storage + for len(s.heap) > 0 && bytes.Compare(s.heap[0].curr.IndexInternalID, ID) < 0 { + searcherCurr := heap.Pop(s).(*SearcherCurr) + ctx.DocumentMatchPool.Put(searcherCurr.curr) curr, err := searcherCurr.searcher.Advance(ctx, ID) if err != nil { return nil, err } - searcherCurr.curr = curr - heap.Fix(s, i) + if curr != nil { + searcherCurr.curr = curr + s.matchingCurrs = append(s.matchingCurrs, searcherCurr) + } } - // now remove any nil values (at top of heap) - for len(s.heap) > 0 && s.heap[0].curr == nil { - heap.Pop(s) + // now all of the searchers that we advanced have to be pushed back + for _, matchingCurr := range s.matchingCurrs { + heap.Push(s, matchingCurr) } + // reset our temp space + s.matchingCurrs = s.matchingCurrs[:0] err := s.updateMatches() if err != nil { From 96dad7fda05b25b250b4ffe0e593d4e293b96e21 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 27 Apr 2018 15:47:31 +0530 Subject: [PATCH 397/728] dvReader improvement -attempt to improve the reuse of dvReaders -avoid the 
redundant dvCache prep checks --- index/index.go | 6 ++ index/scorch/segment/segment.go | 20 +++++- index/scorch/segment/zap/docvalues.go | 52 +++++++++++++--- index/scorch/segment/zap/segment_test.go | 4 +- index/scorch/snapshot_index.go | 78 ++++++++++++++++++++---- index/upsidedown/index_reader.go | 14 +++++ search/collector/search_test.go | 13 ++++ search/collector/topn.go | 20 ++++-- 8 files changed, 177 insertions(+), 30 deletions(-) diff --git a/index/index.go b/index/index.go index 42a452cde..7c4edcb8d 100644 --- a/index/index.go +++ b/index/index.go @@ -80,6 +80,8 @@ type IndexReader interface { Document(id string) (*document.Document, error) DocumentVisitFieldTerms(id IndexInternalID, fields []string, visitor DocumentFieldTermVisitor) error + DocValueReader(fields []string) (DocValueReader, error) + Fields() ([]string, error) GetInternal(key []byte) ([]byte, error) @@ -302,3 +304,7 @@ type OptimizableContext interface { // finished or completed via the Finish() method. Finish() error } + +type DocValueReader interface { + VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error +} diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 1dd89b763..c2d7ce913 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -111,10 +111,28 @@ type Location interface { // postings or other indexed values. type DocumentFieldTermVisitable interface { VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error + visitor index.DocumentFieldTermVisitor, optional DocVisitState) (DocVisitState, error) // VisitableDocValueFields implementation should return // the list of fields which are document value persisted and // therefore visitable by the above VisitDocumentFieldTerms method. 
VisitableDocValueFields() ([]string, error) } + +type DocVisitState interface { + State() *FieldDocValueState + SetState(*FieldDocValueState) +} + +// FieldDocValueState represents the state details, +// which intents to save the redundant dvCache preparations +type FieldDocValueState struct { + DvFieldsAllPersisted bool + DvFieldsPending []string + DvCachePrepared bool + DvSegment DocumentFieldTermVisitable +} + +func (fdvs *FieldDocValueState) CurrentSegment() DocumentFieldTermVisitable { + return fdvs.DvSegment +} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 9c3815068..f37185152 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -23,6 +23,7 @@ import ( "sort" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" "github.com/golang/snappy" ) @@ -36,6 +37,19 @@ func init() { type docNumTermsVisitor func(docNum uint64, terms []byte) error +type docVisitState struct { + dvrs map[uint16]*docValueReader + state *segment.FieldDocValueState +} + +func (dvs *docVisitState) SetState(state *segment.FieldDocValueState) { + dvs.state = state +} + +func (dvs *docVisitState) State() *segment.FieldDocValueState { + return dvs.state +} + type docValueReader struct { field string curChunkNum uint64 @@ -238,13 +252,29 @@ func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { return math.MaxUint64, math.MaxUint64 } +func (s *Segment) CurrentSegment() segment.DocumentFieldTermVisitable { + return s +} + // VisitDocumentFieldTerms is an implementation of the // DocumentFieldTermVisitable interface -func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor) error { - var dvIterClone *docValueReader - fieldIDPlus1 := uint16(0) - ok := true +func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, + visitor 
index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( + segment.DocVisitState, error) { + dvs, ok := dvsIn.(*docVisitState) + if !ok || dvs == nil { + dvs = &docVisitState{ + dvrs: make(map[uint16]*docValueReader, len(fields)), + state: &segment.FieldDocValueState{DvSegment: s}, + } + } else { + if dvs.state.DvSegment != s { + dvs.state = &segment.FieldDocValueState{DvSegment: s} + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + } + } + + var fieldIDPlus1 uint16 for _, field := range fields { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { continue @@ -254,20 +284,22 @@ func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []strin if dvIter, exists := s.fieldDvReaders[fieldIDPlus1-1]; exists && dvIter != nil { - dvIterClone = dvIter.cloneInto(dvIterClone) + if _, ok := dvs.dvrs[fieldIDPlus1-1]; !ok { + dvs.dvrs[fieldIDPlus1-1] = dvIter.cloneInto(dvs.dvrs[fieldIDPlus1-1]) + } // check if the chunk is already loaded - if docInChunk != dvIterClone.curChunkNumber() { - err := dvIterClone.loadDvChunk(docInChunk, s) + if docInChunk != dvs.dvrs[fieldIDPlus1-1].curChunkNumber() { + err := dvs.dvrs[fieldIDPlus1-1].loadDvChunk(docInChunk, &s.SegmentBase) if err != nil { continue } } - _ = dvIterClone.visitDocValues(localDocNum, visitor) + _ = dvs.dvrs[fieldIDPlus1-1].visitDocValues(localDocNum, visitor) } } - return nil + return dvs, nil } // VisitableDocValueFields returns the list of fields with diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index be7a39e14..22b8af23f 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -581,9 +581,9 @@ func TestSegmentVisitableDocValueFieldsList(t *testing.T) { } fieldTerms := make(index.FieldTerms) - err = zaps.VisitDocumentFieldTerms(0, fields, func(field string, term []byte) { + _, err = zaps.VisitDocumentFieldTerms(0, fields, func(field string, term []byte) { fieldTerms[field] = 
append(fieldTerms[field], string(term)) - }) + }, nil) if err != nil { t.Error(err) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2664fe425..ea73811d4 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -496,30 +496,64 @@ func docInternalToNumber(in index.IndexInternalID) (uint64, error) { func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, fields []string, visitor index.DocumentFieldTermVisitor) error { + _, err := i.documentVisitFieldTerms(id, fields, visitor, nil) + return err +} + +func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, + fields []string, visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) ( + segment.DocVisitState, error) { docNum, err := docInternalToNumber(id) if err != nil { - return err + return nil, err } segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) if segmentIndex >= len(i.segment) { - return nil + return nil, nil } ss := i.segment[segmentIndex] if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { + var dvState *segment.FieldDocValueState + if dvs == nil { + dvState = &segment.FieldDocValueState{} + } else { + dvState = dvs.State() + // for a new segment, need to recheck the dvCache preparations + if zaps != dvState.CurrentSegment() { + dvState = &segment.FieldDocValueState{} + } + } + + // if all fields are dv persisted + if dvState.DvFieldsAllPersisted { + return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) + } + + // if the dvCache is already prepared for pending fields + if dvState.DvCachePrepared { + visitDocumentFieldCacheTerms(localDocNum, dvState.DvFieldsPending, ss, visitor) + return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) + } + // get the list of doc value persisted fields pFields, err := zaps.VisitableDocValueFields() if err != nil { - return err + return nil, err } // assort the fields for which terms look up have 
to // be performed runtime dvPendingFields := extractDvPendingFields(fields, pFields) + // all fields are doc value persisted if len(dvPendingFields) == 0 { - // all fields are doc value persisted - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + dvs, err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) + state := dvs.State() + state.DvFieldsAllPersisted = true + state.DvFieldsPending = nil + dvs.SetState(state) + return dvs, err } // concurrently trigger the runtime doc value preparations for @@ -528,29 +562,33 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, go func() { defer close(errCh) - err := ss.cachedDocs.prepareFields(fields, ss) + err := ss.cachedDocs.prepareFields(dvPendingFields, ss) if err != nil { errCh <- err } }() - // visit the persisted dv while the cache preparation is in progress - err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor) + // visit the requested persisted dv while the cache preparation in progress + dvs, err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) if err != nil { - return err + return nil, err } // err out if fieldCache preparation failed err = <-errCh if err != nil { - return err + return nil, err } + state := dvs.State() + state.DvCachePrepared = true + state.DvFieldsPending = dvPendingFields + dvs.SetState(state) visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) - return nil + return dvs, nil } - return prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) + return dvs, prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) } func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, @@ -599,6 +637,22 @@ func extractDvPendingFields(requestedFields, persistedFields []string) []string return rv } +func (i *IndexSnapshot) DocValueReader(fields []string) (index.DocValueReader, error) { + return &DocValueReader{i: i, fields: fields}, nil +} + +type DocValueReader 
struct { + i *IndexSnapshot + fields []string + dvs segment.DocVisitState +} + +func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, + visitor index.DocumentFieldTermVisitor) (err error) { + dvr.dvs, err = dvr.i.documentVisitFieldTerms(id, dvr.fields, visitor, dvr.dvs) + return err +} + func (i *IndexSnapshot) DumpAll() chan interface{} { rv := make(chan interface{}) go func() { diff --git a/index/upsidedown/index_reader.go b/index/upsidedown/index_reader.go index e045f67c7..ea7243eaa 100644 --- a/index/upsidedown/index_reader.go +++ b/index/upsidedown/index_reader.go @@ -210,3 +210,17 @@ func incrementBytes(in []byte) []byte { } return rv } + +func (i *IndexReader) DocValueReader(fields []string) (index.DocValueReader, error) { + return &DocValueReader{i: i, fields: fields}, nil +} + +type DocValueReader struct { + i *IndexReader + fields []string +} + +func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, + visitor index.DocumentFieldTermVisitor) error { + return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor) +} diff --git a/search/collector/search_test.go b/search/collector/search_test.go index 3ba71c1d1..233bc9711 100644 --- a/search/collector/search_test.go +++ b/search/collector/search_test.go @@ -161,3 +161,16 @@ func (sr *stubReader) DumpFields() chan interface{} { func (sr *stubReader) Close() error { return nil } + +func (sr *stubReader) DocValueReader(fields []string) (index.DocValueReader, error) { + return &DocValueReader{i: sr, fields: fields}, nil +} + +type DocValueReader struct { + i *stubReader + fields []string +} + +func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, visitor index.DocumentFieldTermVisitor) error { + return dvr.i.DocumentVisitFieldTerms(id, dvr.fields, visitor) +} diff --git a/search/collector/topn.go b/search/collector/topn.go index 4efe3ef24..8d63685cf 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -67,6 +67,8 @@ type TopNCollector struct { cachedDesc 
[]bool lowestMatchOutsideResults *search.DocumentMatch + updateFieldVisitor index.DocumentFieldTermVisitor + dvReader index.DocValueReader } // CheckDoneEvery controls how frequently we check the context deadline @@ -140,6 +142,11 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), } + hc.dvReader, err = reader.DocValueReader(hc.neededFields) + if err != nil { + return err + } + select { case <-ctx.Done(): return ctx.Err() @@ -248,13 +255,16 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc hc.facetsBuilder.StartDoc() } - err := reader.DocumentVisitFieldTerms(d.IndexInternalID, hc.neededFields, func(field string, term []byte) { - if hc.facetsBuilder != nil { - hc.facetsBuilder.UpdateVisitor(field, term) + if hc.updateFieldVisitor == nil { + hc.updateFieldVisitor = func(field string, term []byte) { + if hc.facetsBuilder != nil { + hc.facetsBuilder.UpdateVisitor(field, term) + } + hc.sort.UpdateVisitor(field, term) } - hc.sort.UpdateVisitor(field, term) - }) + } + err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() } From 14008f445a82c20a916df57a9cbc4e4f6d5075c5 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 27 Apr 2018 20:39:21 -0400 Subject: [PATCH 398/728] fix geo searches hitting max clause limit geo queries are supposed to execute without considering the max disjunction clauses limit, but a recent refactoring introduced this bug, causing them to run with the limit enforced. 
--- search/searcher/search_disjunction.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 16df7ca36..bbf7b4bbc 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -42,10 +42,10 @@ func newDisjunctionSearcher(indexReader index.IndexReader, limit bool) (search.Searcher, error) { if len(qsearchers) > DisjunctionHeapTakeover { return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, - true) + limit) } return newDisjunctionSliceSearcher(indexReader, qsearchers, min, options, - true) + limit) } func tooManyClauses(count int) bool { From 285ec7a656f30335eea577e13fa3293866fff417 Mon Sep 17 00:00:00 2001 From: Cesar Alvernaz Date: Sun, 29 Apr 2018 17:52:23 +0100 Subject: [PATCH 399/728] use filepath.Join instead, safer and cross-platform --- index_meta.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/index_meta.go b/index_meta.go index 95592a65d..2614292b7 100644 --- a/index_meta.go +++ b/index_meta.go @@ -18,7 +18,8 @@ import ( "encoding/json" "io/ioutil" "os" - + "path/filepath" + "github.com/blevesearch/bleve/index/upsidedown" ) @@ -92,5 +93,5 @@ func (i *indexMeta) Save(path string) (err error) { } func indexMetaPath(path string) string { - return path + string(os.PathSeparator) + metaFilename + return filepath.Join(path, metaFilename) } From 0a3f3e44224c1fc8f8b0e8777b7554abb8113d3b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 9 Apr 2018 18:40:08 -0700 Subject: [PATCH 400/728] scorch optimize via vellum.FST.Reader() API --- index/scorch/segment/zap/dict.go | 13 +++++++------ index/scorch/segment/zap/segment.go | 4 ++++ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index b0664534d..a6fe58cdd 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -28,10 +28,11 
@@ import ( // Dictionary is the zap representation of the term dictionary type Dictionary struct { - sb *SegmentBase - field string - fieldID uint16 - fst *vellum.FST + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST + fstReader *vellum.Reader } // PostingsList returns the postings list for the specified term @@ -46,14 +47,14 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - if d.fst == nil { + if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { return emptyPostingsList, nil } return d.postingsListInit(rv, except), nil } - postingsOffset, exists, err := d.fst.Get(term) + postingsOffset, exists, err := d.fstReader.Get(term) if err != nil { return nil, fmt.Errorf("vellum err: %v", err) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 4a8a37988..7d25d5f87 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -265,6 +265,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { if err != nil { return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum Reader err: %v", field, err) + } } } } From 23aaeb730497c01ec0dfab0f05cc6adb62097b1e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 9 Apr 2018 18:56:51 -0700 Subject: [PATCH 401/728] scorch optimize ExternalID() with Segment.DocID() API The previous implementation of ExternalID(), which used the generic VisitDocument() API to access the _id field (with extra closure allocations), has now been replaced in this optimization by invoking a new Segment.DocID() method. 
--- index/scorch/segment/empty.go | 4 ++++ index/scorch/segment/segment.go | 3 +++ index/scorch/segment/zap/segment.go | 24 ++++++++++++++++++++++++ index/scorch/snapshot_index.go | 19 +++++-------------- index/scorch/snapshot_segment.go | 4 ++++ 5 files changed, 40 insertions(+), 14 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 95f6d8bc8..0489c8218 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -29,6 +29,10 @@ func (e *EmptySegment) VisitDocument(num uint64, visitor DocumentFieldValueVisit return nil } +func (e *EmptySegment) DocID(num uint64) ([]byte, error) { + return nil, nil +} + func (e *EmptySegment) Count() uint64 { return 0 } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 1dd89b763..1620530a4 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -28,6 +28,9 @@ type Segment interface { Dictionary(field string) (TermDictionary, error) VisitDocument(num uint64, visitor DocumentFieldValueVisitor) error + + DocID(num uint64) ([]byte, error) + Count() uint64 DocNumbers([]string) (*roaring.Bitmap, error) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 4a8a37988..874a9df66 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -368,6 +368,30 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal return nil } +// DocID returns the value of the _id field for the given docNum +func (s *SegmentBase) DocID(num uint64) ([]byte, error) { + if num >= s.numDocs { + return nil, nil + } + + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + + meta, compressed := s.getDocStoredMetaAndCompressed(num) + + vdc.reader.Reset(meta) + + // handle _id field special case + idFieldValLen, err := binary.ReadUvarint(&vdc.reader) + if err != nil { + return nil, err + } + idFieldVal := compressed[:idFieldValLen] + + 
visitDocumentCtxPool.Put(vdc) + + return idFieldVal, nil +} + // Count returns the number of documents in this segment. func (s *SegmentBase) Count() uint64 { return s.numDocs diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2664fe425..d9251ca5e 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -357,24 +357,15 @@ func (i *IndexSnapshot) ExternalID(id index.IndexInternalID) (string, error) { } segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) - var found bool - var rv string - err = i.segment[segmentIndex].VisitDocument(localDocNum, func(field string, typ byte, value []byte, pos []uint64) bool { - if field == "_id" { - found = true - rv = string(value) - return false - } - return true - }) + v, err := i.segment[segmentIndex].DocID(localDocNum) if err != nil { return "", err } - - if found { - return rv, nil + if v == nil { + return "", fmt.Errorf("document number %d not found", docNum) } - return "", fmt.Errorf("document number %d not found", docNum) + + return string(v), nil } func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err error) { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 44aafa523..4053244da 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -101,6 +101,10 @@ func (s *SegmentSnapshot) VisitDocument(num uint64, visitor segment.DocumentFiel return s.segment.VisitDocument(num, visitor) } +func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) { + return s.segment.DocID(num) +} + func (s *SegmentSnapshot) Count() uint64 { rv := s.segment.Count() From 7c996711e7dadf65200295244305ba3a695be1c5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 30 Apr 2018 14:00:58 -0700 Subject: [PATCH 402/728] Micro optimizations while estimating mem needed for search --- index_impl.go | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 
deletions(-) diff --git a/index_impl.go b/index_impl.go index b5373ff0d..c969f3758 100644 --- a/index_impl.go +++ b/index_impl.go @@ -368,6 +368,25 @@ func (i *indexImpl) Search(req *SearchRequest) (sr *SearchResult, err error) { return i.SearchInContext(context.Background(), req) } +var documentMatchEmptySize int +var searchContextEmptySize int +var facetResultEmptySize int +var documentEmptySize int + +func init() { + var dm search.DocumentMatch + documentMatchEmptySize = dm.Size() + + var sc search.SearchContext + searchContextEmptySize = sc.Size() + + var fr search.FacetResult + facetResultEmptySize = fr.Size() + + var d document.Document + documentEmptySize = d.Size() +} + // memNeededForSearch is a helper function that returns an estimate of RAM // needed to execute a search request. func memNeededForSearch(req *SearchRequest, @@ -385,35 +404,27 @@ func memNeededForSearch(req *SearchRequest, // overhead, size in bytes from collector estimate += topnCollector.Size() - var dm search.DocumentMatch - sizeOfDocumentMatch := dm.Size() - // pre-allocing DocumentMatchPool - var sc search.SearchContext - estimate += sc.Size() + numDocMatches*sizeOfDocumentMatch + estimate += searchContextEmptySize + numDocMatches*documentMatchEmptySize // searcher overhead estimate += searcher.Size() // overhead from results, lowestMatchOutsideResults - estimate += (numDocMatches + 1) * sizeOfDocumentMatch + estimate += (numDocMatches + 1) * documentMatchEmptySize // additional overhead from SearchResult - var sr SearchResult - estimate += sr.Size() + estimate += reflectStaticSizeSearchResult + reflectStaticSizeSearchStatus // overhead from facet results if req.Facets != nil { - var fr search.FacetResult - estimate += len(req.Facets) * fr.Size() + estimate += len(req.Facets) * facetResultEmptySize } // highlighting, store - var d document.Document if len(req.Fields) > 0 || req.Highlight != nil { - for i := 0; i < (req.Size + req.From); i++ { // size + from => number of hits - estimate 
+= (req.Size + req.From) * d.Size() - } + // Size + From => number of hits + estimate += (req.Size + req.From) * documentEmptySize } return uint64(estimate) From a8bd3e641e1ead048ded267094922f69b68ca44b Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 1 May 2018 13:45:34 +0530 Subject: [PATCH 403/728] refactoring the dvReader reuse --- index/scorch/segment/segment.go | 15 ------- index/scorch/segment/zap/docvalues.go | 63 ++++++++++++--------------- index/scorch/snapshot_index.go | 33 +------------- search/collector/topn.go | 16 +++---- 4 files changed, 37 insertions(+), 90 deletions(-) diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index c2d7ce913..9b8d2aa47 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -120,19 +120,4 @@ type DocumentFieldTermVisitable interface { } type DocVisitState interface { - State() *FieldDocValueState - SetState(*FieldDocValueState) -} - -// FieldDocValueState represents the state details, -// which intents to save the redundant dvCache preparations -type FieldDocValueState struct { - DvFieldsAllPersisted bool - DvFieldsPending []string - DvCachePrepared bool - DvSegment DocumentFieldTermVisitable -} - -func (fdvs *FieldDocValueState) CurrentSegment() DocumentFieldTermVisitable { - return fdvs.DvSegment } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index f37185152..d28964978 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -38,16 +38,8 @@ func init() { type docNumTermsVisitor func(docNum uint64, terms []byte) error type docVisitState struct { - dvrs map[uint16]*docValueReader - state *segment.FieldDocValueState -} - -func (dvs *docVisitState) SetState(state *segment.FieldDocValueState) { - dvs.state = state -} - -func (dvs *docVisitState) State() *segment.FieldDocValueState { - return dvs.state + dvrs map[uint16]*docValueReader + segment *Segment } type 
docValueReader struct { @@ -252,10 +244,6 @@ func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { return math.MaxUint64, math.MaxUint64 } -func (s *Segment) CurrentSegment() segment.DocumentFieldTermVisitable { - return s -} - // VisitDocumentFieldTerms is an implementation of the // DocumentFieldTermVisitable interface func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, @@ -263,40 +251,47 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, segment.DocVisitState, error) { dvs, ok := dvsIn.(*docVisitState) if !ok || dvs == nil { - dvs = &docVisitState{ - dvrs: make(map[uint16]*docValueReader, len(fields)), - state: &segment.FieldDocValueState{DvSegment: s}, - } + dvs = &docVisitState{} } else { - if dvs.state.DvSegment != s { - dvs.state = &segment.FieldDocValueState{DvSegment: s} - dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + if dvs.segment != s { + dvs.segment = s + dvs.dvrs = nil } } var fieldIDPlus1 uint16 + if dvs.dvrs == nil { + dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) + for _, field := range fields { + if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { + continue + } + fieldID := fieldIDPlus1 - 1 + if dvIter, exists := s.fieldDvReaders[fieldID]; exists && + dvIter != nil { + dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) + } + } + } + + // find the chunkNumber where the docValues are stored + docInChunk := localDocNum / uint64(s.chunkFactor) + var dvr *docValueReader for _, field := range fields { if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { continue } - // find the chunkNumber where the docValues are stored - docInChunk := localDocNum / uint64(s.chunkFactor) - - if dvIter, exists := s.fieldDvReaders[fieldIDPlus1-1]; exists && - dvIter != nil { - if _, ok := dvs.dvrs[fieldIDPlus1-1]; !ok { - dvs.dvrs[fieldIDPlus1-1] = dvIter.cloneInto(dvs.dvrs[fieldIDPlus1-1]) - } - + fieldID := fieldIDPlus1 - 1 + if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != 
nil { // check if the chunk is already loaded - if docInChunk != dvs.dvrs[fieldIDPlus1-1].curChunkNumber() { - err := dvs.dvrs[fieldIDPlus1-1].loadDvChunk(docInChunk, &s.SegmentBase) + if docInChunk != dvr.curChunkNumber() { + err := dvr.loadDvChunk(docInChunk, &s.SegmentBase) if err != nil { - continue + return dvs, err } } - _ = dvs.dvrs[fieldIDPlus1-1].visitDocValues(localDocNum, visitor) + _ = dvr.visitDocValues(localDocNum, visitor) } } return dvs, nil diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index ea73811d4..6de5e14fd 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -516,28 +516,6 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { - var dvState *segment.FieldDocValueState - if dvs == nil { - dvState = &segment.FieldDocValueState{} - } else { - dvState = dvs.State() - // for a new segment, need to recheck the dvCache preparations - if zaps != dvState.CurrentSegment() { - dvState = &segment.FieldDocValueState{} - } - } - - // if all fields are dv persisted - if dvState.DvFieldsAllPersisted { - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) - } - - // if the dvCache is already prepared for pending fields - if dvState.DvCachePrepared { - visitDocumentFieldCacheTerms(localDocNum, dvState.DvFieldsPending, ss, visitor) - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) - } - // get the list of doc value persisted fields pFields, err := zaps.VisitableDocValueFields() if err != nil { @@ -548,12 +526,7 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, dvPendingFields := extractDvPendingFields(fields, pFields) // all fields are doc value persisted if len(dvPendingFields) == 0 { - dvs, err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) - state := dvs.State() - state.DvFieldsAllPersisted = 
true - state.DvFieldsPending = nil - dvs.SetState(state) - return dvs, err + return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) } // concurrently trigger the runtime doc value preparations for @@ -579,10 +552,6 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, if err != nil { return nil, err } - state := dvs.State() - state.DvCachePrepared = true - state.DvFieldsPending = dvPendingFields - dvs.SetState(state) visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) return dvs, nil diff --git a/search/collector/topn.go b/search/collector/topn.go index 8d63685cf..4b2682da0 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -147,6 +147,13 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, return err } + hc.updateFieldVisitor = func(field string, term []byte) { + if hc.facetsBuilder != nil { + hc.facetsBuilder.UpdateVisitor(field, term) + } + hc.sort.UpdateVisitor(field, term) + } + select { case <-ctx.Done(): return ctx.Err() @@ -255,15 +262,6 @@ func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.Doc hc.facetsBuilder.StartDoc() } - if hc.updateFieldVisitor == nil { - hc.updateFieldVisitor = func(field string, term []byte) { - if hc.facetsBuilder != nil { - hc.facetsBuilder.UpdateVisitor(field, term) - } - hc.sort.UpdateVisitor(field, term) - } - } - err := hc.dvReader.VisitDocValues(d.IndexInternalID, hc.updateFieldVisitor) if hc.facetsBuilder != nil { hc.facetsBuilder.EndDoc() From ebdbbffd06d86d411f1ba4fbbc10c94370e6ddb4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 3 May 2018 12:54:36 -0400 Subject: [PATCH 404/728] limit fuzzy edit distance to 2 --- search/searcher/search_fuzzy.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 41ad804f1..575e3c140 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ 
-15,13 +15,22 @@ package searcher import ( + "fmt" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) +var MaxFuzziness = 2 + func NewFuzzySearcher(indexReader index.IndexReader, term string, prefix, fuzziness int, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + + if fuzziness > MaxFuzziness { + return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness) + } + // Note: we don't byte slice the term for a prefix because of runes. prefixTerm := "" for i, r := range term { From e52660b99369b0e79f3de17434f5c7594aa61986 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 3 May 2018 13:32:06 -0400 Subject: [PATCH 405/728] correctly return dictionary iterator creation errors in Next() --- index/scorch/segment/zap/dict.go | 6 ++-- index/scorch/segment/zap/dict_test.go | 40 +++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index a6fe58cdd..736fa59f6 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -270,10 +270,10 @@ type DictionaryIterator struct { // Next returns the next entry in the dictionary func (i *DictionaryIterator) Next() (*index.DictEntry, error) { - if i.itr == nil || i.err == vellum.ErrIteratorDone { - return nil, nil - } else if i.err != nil { + if i.err != nil && i.err != vellum.ErrIteratorDone { return nil, i.err + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, nil } term, postingsOffset := i.itr.Current() i.entry.Term = string(term) diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index 1a8ce22c5..8cbd5710b 100644 --- a/index/scorch/segment/zap/dict_test.go +++ b/index/scorch/segment/zap/dict_test.go @@ -178,3 +178,43 @@ func TestDictionary(t *testing.T) { t.Errorf("expected: %v, got: %v", expected, got) } } + +func TestDictionaryError(t *testing.T) { + + _ = 
os.RemoveAll("/tmp/scorch.zap") + + testSeg, _, _ := buildTestSegmentForDict() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + + itr := dict.FuzzyIterator("summer", 5) + if itr == nil { + t.Fatalf("got nil itr") + } + nxt, err := itr.Next() + if nxt != nil { + t.Fatalf("expected nil next") + } + if err == nil { + t.Fatalf("expected error from iterator") + } + +} From b1b570b3d9b0e359833ea7f0819bd7ab87a08490 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 3 May 2018 14:18:13 -0400 Subject: [PATCH 406/728] return err and close properly for fuzzy and regexp search --- search/searcher/search_fuzzy.go | 7 ++++++- search/searcher/search_regexp.go | 3 +++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 575e3c140..b812f4840 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -65,12 +65,17 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if err != nil { return rv, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() tfd, err := fieldDict.Next() for err == nil && tfd != nil { rv = append(rv, tfd.Term) tfd, err = fieldDict.Next() } - return rv, nil + return rv, err } fieldDict, err = indexReader.FieldDict(field) } diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index b88eb3eb5..ad417a056 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -47,6 +47,9 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern 
*regexp.Regexp, candidateTerms = append(candidateTerms, tfd.Term) tfd, err = fieldDict.Next() } + if err != nil { + return nil, err + } } else { prefixTerm, complete := pattern.LiteralPrefix() if complete { From 4d0e0fe826886e61a76221aed6fd9bad082386b2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 10:55:02 -0700 Subject: [PATCH 407/728] scorch refactor visitDocumentFieldCacheTerms for lock protection This change refactors visitDocumentFieldCacheTerms() into a method of cachedDocs so that it can acquire the appropriate locks. --- index/scorch/snapshot_index.go | 25 ++----------------------- index/scorch/snapshot_segment.go | 24 ++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 23 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index edb986d0a..be71836d0 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -15,7 +15,6 @@ package scorch import ( - "bytes" "container/heap" "encoding/binary" "fmt" @@ -544,7 +543,7 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, return nil, err } - visitDocumentFieldCacheTerms(localDocNum, dvPendingFields, ss, visitor) + ss.cachedDocs.visitDoc(localDocNum, dvPendingFields, visitor) return dvs, nil } @@ -558,30 +557,10 @@ func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, return err } - visitDocumentFieldCacheTerms(localDocNum, fields, ss, visitor) + ss.cachedDocs.visitDoc(localDocNum, fields, visitor) return nil } -func visitDocumentFieldCacheTerms(localDocNum uint64, fields []string, - ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) { - - for _, field := range fields { - if cachedFieldDocs, exists := ss.cachedDocs.cache[field]; exists { - if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { - for { - i := bytes.Index(tlist, TermSeparatorSplitSlice) - if i < 0 { - break - } - visitor(field, tlist[0:i]) - tlist = tlist[i+1:] - } - } - } - } - -} - func 
extractDvPendingFields(requestedFields, persistedFields []string) []string { removeMap := make(map[string]struct{}, len(persistedFields)) for _, str := range persistedFields { diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 4053244da..7f7300109 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -15,10 +15,12 @@ package scorch import ( + "bytes" "sync" "sync/atomic" "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" ) @@ -270,3 +272,25 @@ func (c *cachedDocs) updateSizeLOCKED() { } atomic.StoreUint64(&c.size, uint64(sizeInBytes)) } + +func (c *cachedDocs) visitDoc(localDocNum uint64, + fields []string, visitor index.DocumentFieldTermVisitor) { + c.m.Lock() + + for _, field := range fields { + if cachedFieldDocs, exists := c.cache[field]; exists { + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { + for { + i := bytes.Index(tlist, TermSeparatorSplitSlice) + if i < 0 { + break + } + visitor(field, tlist[0:i]) + tlist = tlist[i+1:] + } + } + } + } + + c.m.Unlock() +} From f8373bd328b9aa44d827c244ed268d66fab2dbe0 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 13:04:13 -0700 Subject: [PATCH 408/728] rename cachedFieldDocs.prepareField() to singular --- index/scorch/snapshot_segment.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 7f7300109..4e05f5153 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -168,7 +168,7 @@ type cachedFieldDocs struct { size uint64 } -func (cfd *cachedFieldDocs) prepareFields(field string, ss *SegmentSnapshot) { +func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { defer close(cfd.readyCh) cfd.size += uint64(size.SizeOfUint64) /* size field */ @@ -224,6 +224,7 @@ type 
cachedDocs struct { func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { c.m.Lock() + if c.cache == nil { c.cache = make(map[string]*cachedFieldDocs, len(ss.Fields())) } @@ -236,7 +237,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e docs: make(map[uint64][]byte), } - go c.cache[field].prepareFields(field, ss) + go c.cache[field].prepareField(field, ss) } } @@ -250,6 +251,7 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e } c.m.Lock() } + c.updateSizeLOCKED() c.m.Unlock() From 7ec3ad9e8ae005b3a123a0cd08b9f6484ea3259f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 14:04:11 -0700 Subject: [PATCH 409/728] scorch documentVisitFieldTerms() avoids unneeded prepareFields() This optimization checks whether the cachedDocs already has all the wanted fields cached already (from a previous invocation) before spawning a goroutine to prepareFields(). --- index/scorch/snapshot_index.go | 94 +++++++++++++++----------------- index/scorch/snapshot_segment.go | 18 +++++- 2 files changed, 61 insertions(+), 51 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index be71836d0..2525f9899 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -491,13 +491,13 @@ func (i *IndexSnapshot) DocumentVisitFieldTerms(id index.IndexInternalID, } func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, - fields []string, visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) ( - segment.DocVisitState, error) { - + fields []string, visitor index.DocumentFieldTermVisitor, + dvs segment.DocVisitState) (segment.DocVisitState, error) { docNum, err := docInternalToNumber(id) if err != nil { return nil, err } + segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) if segmentIndex >= len(i.segment) { return nil, nil @@ -505,75 +505,50 @@ func (i *IndexSnapshot) 
documentVisitFieldTerms(id index.IndexInternalID, ss := i.segment[segmentIndex] - if zaps, ok := ss.segment.(segment.DocumentFieldTermVisitable); ok { - // get the list of doc value persisted fields - pFields, err := zaps.VisitableDocValueFields() + var vFields []string // fields that are visitable via the segment + + ssv, ssvOk := ss.segment.(segment.DocumentFieldTermVisitable) + if ssvOk && ssv != nil { + vFields, err = ssv.VisitableDocValueFields() if err != nil { return nil, err } - // assort the fields for which terms look up have to - // be performed runtime - dvPendingFields := extractDvPendingFields(fields, pFields) - // all fields are doc value persisted - if len(dvPendingFields) == 0 { - return zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) - } + } + + // cFields represents the fields that we'll need from the cachedDocs + cFields := subtractStrings(fields, vFields) - // concurrently trigger the runtime doc value preparations for - // pending fields as well as the visit of the persisted doc values - errCh := make(chan error, 1) + var errCh chan error + + if !ss.cachedDocs.hasFields(cFields) { + errCh = make(chan error, 1) go func() { - defer close(errCh) - err := ss.cachedDocs.prepareFields(dvPendingFields, ss) + err := ss.cachedDocs.prepareFields(cFields, ss) if err != nil { errCh <- err } + close(errCh) }() + } - // visit the requested persisted dv while the cache preparation in progress - dvs, err = zaps.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) + if ssvOk && ssv != nil && len(vFields) > 0 { + dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) if err != nil { return nil, err } + } - // err out if fieldCache preparation failed + if errCh != nil { err = <-errCh if err != nil { return nil, err } - - ss.cachedDocs.visitDoc(localDocNum, dvPendingFields, visitor) - return dvs, nil } - return dvs, prepareCacheVisitDocumentFieldTerms(localDocNum, fields, ss, visitor) -} + 
ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) -func prepareCacheVisitDocumentFieldTerms(localDocNum uint64, fields []string, - ss *SegmentSnapshot, visitor index.DocumentFieldTermVisitor) error { - err := ss.cachedDocs.prepareFields(fields, ss) - if err != nil { - return err - } - - ss.cachedDocs.visitDoc(localDocNum, fields, visitor) - return nil -} - -func extractDvPendingFields(requestedFields, persistedFields []string) []string { - removeMap := make(map[string]struct{}, len(persistedFields)) - for _, str := range persistedFields { - removeMap[str] = struct{}{} - } - - rv := make([]string, 0, len(requestedFields)) - for _, s := range requestedFields { - if _, ok := removeMap[s]; !ok { - rv = append(rv, s) - } - } - return rv + return dvs, nil } func (i *IndexSnapshot) DocValueReader(fields []string) (index.DocValueReader, error) { @@ -615,3 +590,22 @@ func (i *IndexSnapshot) DumpFields() chan interface{} { }() return rv } + +// subtractStrings returns set a minus elements of set b. 
+func subtractStrings(a, b []string) []string { + if len(b) <= 0 { + return a + } + + rv := make([]string, 0, len(a)) +OUTER: + for _, as := range a { + for _, bs := range b { + if as == bs { + continue OUTER + } + } + rv = append(rv, as) + } + return rv +} diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 4e05f5153..7ee43fcf9 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -108,7 +108,6 @@ func (s *SegmentSnapshot) DocID(num uint64) ([]byte, error) { } func (s *SegmentSnapshot) Count() uint64 { - rv := s.segment.Count() if s.deleted != nil { rv -= s.deleted.GetCardinality() @@ -258,6 +257,23 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e return nil } +// hasFields returns true if the cache has all the given fields +func (c *cachedDocs) hasFields(fields []string) bool { + c.m.Lock() +OUTER: + for _, field := range fields { + for f := range c.cache { + if f == field { + continue OUTER + } + } + c.m.Unlock() + return false // found a field not in cache + } + c.m.Unlock() + return true +} + func (c *cachedDocs) Size() int { return int(atomic.LoadUint64(&c.size)) } From 3afbaf3731b6d3d7cd136b59944772d8eebc0e16 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 14:14:30 -0700 Subject: [PATCH 410/728] scorch optimize zap segment.VisitableDocValueFields() --- index/scorch/segment/zap/docvalues.go | 9 +-------- index/scorch/segment/zap/segment.go | 8 +++++++- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index d28964978..96dd5ab1a 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -301,12 +301,5 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, // persisted doc value terms ready to be visitable using the // VisitDocumentFieldTerms method. 
func (s *Segment) VisitableDocValueFields() ([]string, error) { - rv := make([]string, 0, len(s.fieldDvReaders)) - for fieldID, field := range s.fieldsInv { - if dvIter, ok := s.fieldDvReaders[uint16(fieldID)]; ok && - dvIter != nil { - rv = append(rv, field) - } - } - return rv, nil + return s.fieldDvNames, nil } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 5f9a562f5..ed09d149d 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -99,6 +99,7 @@ type SegmentBase struct { docValueOffset uint64 dictLocs []uint64 fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field + fieldDvNames []string // field names cached in fieldDvReaders size uint64 } @@ -528,7 +529,12 @@ func (s *SegmentBase) loadDvReaders() error { } read += uint64(n) - s.fieldDvReaders[uint16(fieldID)], _ = s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if fieldDvReader != nil { + s.fieldDvReaders[uint16(fieldID)] = fieldDvReader + s.fieldDvNames = append(s.fieldDvNames, field) + } } + return nil } From 03dcd2e2f5b06016668b37d7f61b838d2d2c62a6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 14:28:08 -0700 Subject: [PATCH 411/728] scorch zap optimize docValueReader curChunkHeader slice reuse --- index/scorch/segment/zap/docvalues.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 96dd5ab1a..72ce1248f 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -69,9 +69,9 @@ func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { rv.curChunkNum = math.MaxUint64 rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable rv.dvDataLoc = di.dvDataLoc - rv.curChunkHeader = nil + rv.curChunkHeader = rv.curChunkHeader[:0] 
rv.curChunkData = nil - rv.uncompressed = nil + rv.uncompressed = rv.uncompressed[:0] return rv } @@ -150,7 +150,11 @@ func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error chunkMetaLoc := destChunkDataLoc + uint64(read) offset := uint64(0) - di.curChunkHeader = make([]MetaData, int(numDocs)) + if cap(di.curChunkHeader) < int(numDocs) { + di.curChunkHeader = make([]MetaData, int(numDocs)) + } else { + di.curChunkHeader = di.curChunkHeader[:int(numDocs)] + } for i := 0; i < int(numDocs); i++ { di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) offset += uint64(read) From a0c5b7fc401620fb071a64dcaa9f9ecbc7d82957 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 15:41:52 -0700 Subject: [PATCH 412/728] optimize cachedDocs.hasFields() Hat tip to Abhinav D. for optimization idea! --- index/scorch/snapshot_segment.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 7ee43fcf9..98084a980 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -260,15 +260,11 @@ func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) e // hasFields returns true if the cache has all the given fields func (c *cachedDocs) hasFields(fields []string) bool { c.m.Lock() -OUTER: for _, field := range fields { - for f := range c.cache { - if f == field { - continue OUTER - } + if _, exists := c.cache[field]; !exists { + c.m.Unlock() + return false // found a field not in cache } - c.m.Unlock() - return false // found a field not in cache } c.m.Unlock() return true From c3a911da80c2cdeebc2e219be8bf868d07b2ca34 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 16:01:08 -0700 Subject: [PATCH 413/728] scorch optimize DocValueReader as its fields are stable Since the wanted fields are immutable during the lifetime of a 
DocValueReader, we can optimize as the cached fields doesn't need to be recomputed on every document that's visited. --- index/scorch/snapshot_index.go | 74 +++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 19 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 2525f9899..4d5cfc7ba 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -503,6 +503,16 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, return nil, nil } + _, dvs, err = i.documentVisitFieldTermsOnSegment( + segmentIndex, localDocNum, fields, nil, visitor, dvs) + + return dvs, err +} + +func (i *IndexSnapshot) documentVisitFieldTermsOnSegment( + segmentIndex int, localDocNum uint64, fields []string, cFields []string, + visitor index.DocumentFieldTermVisitor, dvs segment.DocVisitState) ( + cFieldsOut []string, dvsOut segment.DocVisitState, err error) { ss := i.segment[segmentIndex] var vFields []string // fields that are visitable via the segment @@ -511,59 +521,85 @@ func (i *IndexSnapshot) documentVisitFieldTerms(id index.IndexInternalID, if ssvOk && ssv != nil { vFields, err = ssv.VisitableDocValueFields() if err != nil { - return nil, err + return nil, nil, err } } - // cFields represents the fields that we'll need from the cachedDocs - cFields := subtractStrings(fields, vFields) - var errCh chan error - if !ss.cachedDocs.hasFields(cFields) { - errCh = make(chan error, 1) + // cFields represents the fields that we'll need from the + // cachedDocs, and might be optionally be provided by the caller, + // if the caller happens to know we're on the same segmentIndex + // from a previous invocation + if cFields == nil { + cFields = subtractStrings(fields, vFields) - go func() { - err := ss.cachedDocs.prepareFields(cFields, ss) - if err != nil { - errCh <- err - } - close(errCh) - }() + if !ss.cachedDocs.hasFields(cFields) { + errCh = make(chan error, 1) + + go func() { + err := 
ss.cachedDocs.prepareFields(cFields, ss) + if err != nil { + errCh <- err + } + close(errCh) + }() + } } if ssvOk && ssv != nil && len(vFields) > 0 { dvs, err = ssv.VisitDocumentFieldTerms(localDocNum, fields, visitor, dvs) if err != nil { - return nil, err + return nil, nil, err } } if errCh != nil { err = <-errCh if err != nil { - return nil, err + return nil, nil, err } } ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) - return dvs, nil + return cFields, dvs, nil } -func (i *IndexSnapshot) DocValueReader(fields []string) (index.DocValueReader, error) { - return &DocValueReader{i: i, fields: fields}, nil +func (i *IndexSnapshot) DocValueReader(fields []string) ( + index.DocValueReader, error) { + return &DocValueReader{i: i, fields: fields, currSegmentIndex: -1}, nil } type DocValueReader struct { i *IndexSnapshot fields []string dvs segment.DocVisitState + + currSegmentIndex int + currCachedFields []string } func (dvr *DocValueReader) VisitDocValues(id index.IndexInternalID, visitor index.DocumentFieldTermVisitor) (err error) { - dvr.dvs, err = dvr.i.documentVisitFieldTerms(id, dvr.fields, visitor, dvr.dvs) + docNum, err := docInternalToNumber(id) + if err != nil { + return err + } + + segmentIndex, localDocNum := dvr.i.segmentIndexAndLocalDocNumFromGlobal(docNum) + if segmentIndex >= len(dvr.i.segment) { + return nil + } + + if dvr.currSegmentIndex != segmentIndex { + dvr.currSegmentIndex = segmentIndex + dvr.currCachedFields = nil + } + + dvr.currCachedFields, dvr.dvs, err = dvr.i.documentVisitFieldTermsOnSegment( + dvr.currSegmentIndex, localDocNum, dvr.fields, dvr.currCachedFields, visitor, dvr.dvs) + return err } From 46cdfad30f9c7247d2cd49861752618cf006a529 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 4 May 2018 19:02:54 -0700 Subject: [PATCH 414/728] scorch invokes cachedDocs.visitDoc() only if needed Hat tip to opimization feedback from Sreekanth S. 
--- index/scorch/snapshot_index.go | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4d5cfc7ba..0f8c21371 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -561,7 +561,9 @@ func (i *IndexSnapshot) documentVisitFieldTermsOnSegment( } } - ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) + if len(cFields) > 0 { + ss.cachedDocs.visitDoc(localDocNum, cFields, visitor) + } return cFields, dvs, nil } From 0f9eebda37f2011c7b9996160a7d2c90a79ea98e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 5 May 2018 11:21:00 -0700 Subject: [PATCH 415/728] optimize sort via values slice reuse --- search/sort.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/search/sort.go b/search/sort.go index 6afc9789b..70fb969be 100644 --- a/search/sort.go +++ b/search/sort.go @@ -359,7 +359,7 @@ func (s *SortField) UpdateVisitor(field string, term []byte) { func (s *SortField) Value(i *DocumentMatch) string { iTerms := s.filterTermsByType(s.values) iTerm := s.filterTermsByMode(iTerms) - s.values = nil + s.values = s.values[:0] return iTerm } @@ -619,7 +619,7 @@ func (s *SortGeoDistance) UpdateVisitor(field string, term []byte) { func (s *SortGeoDistance) Value(i *DocumentMatch) string { iTerms := s.filterTermsByType(s.values) iTerm := s.filterTermsByMode(iTerms) - s.values = nil + s.values = s.values[:0] if iTerm == "" { return maxDistance From 1408317e35f590736fbcaf81cc2693c1a3b6d60b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 5 May 2018 11:36:21 -0700 Subject: [PATCH 416/728] optimize SortField by delaying string conversion --- numeric/prefix_coded.go | 4 ++++ search/sort.go | 33 ++++++++++++++++++++------------- 2 files changed, 24 insertions(+), 13 deletions(-) diff --git a/numeric/prefix_coded.go b/numeric/prefix_coded.go index 4200c23bb..76ea001ba 100644 --- a/numeric/prefix_coded.go +++ b/numeric/prefix_coded.go @@ -77,6 +77,10 @@ 
func (p PrefixCoded) Int64() (int64, error) { } func ValidPrefixCodedTerm(p string) (bool, int) { + return ValidPrefixCodedTermBytes([]byte(p)) +} + +func ValidPrefixCodedTermBytes(p []byte) (bool, int) { if len(p) > 0 { if p[0] < ShiftStartInt64 || p[0] > ShiftStartInt64+63 { return false, 0 diff --git a/search/sort.go b/search/sort.go index 70fb969be..78d38d857 100644 --- a/search/sort.go +++ b/search/sort.go @@ -15,6 +15,7 @@ package search import ( + "bytes" "encoding/json" "fmt" "math" @@ -342,14 +343,14 @@ type SortField struct { Type SortFieldType Mode SortFieldMode Missing SortFieldMissing - values []string + values [][]byte } // UpdateVisitor notifies this sort field that in this document // this field has the specified term func (s *SortField) UpdateVisitor(field string, term []byte) { if field == s.Field { - s.values = append(s.values, string(term)) + s.values = append(s.values, term) } } @@ -368,17 +369,17 @@ func (s *SortField) Descending() bool { return s.Desc } -func (s *SortField) filterTermsByMode(terms []string) string { +func (s *SortField) filterTermsByMode(terms [][]byte) string { if len(terms) == 1 || (len(terms) > 1 && s.Mode == SortFieldDefault) { - return terms[0] + return string(terms[0]) } else if len(terms) > 1 { switch s.Mode { case SortFieldMin: - sort.Strings(terms) - return terms[0] + sort.Sort(BytesSlice(terms)) + return string(terms[0]) case SortFieldMax: - sort.Strings(terms) - return terms[len(terms)-1] + sort.Sort(BytesSlice(terms)) + return string(terms[len(terms)-1]) } } @@ -400,13 +401,13 @@ func (s *SortField) filterTermsByMode(terms []string) string { // return only the terms which had shift of 0 // if we are in explicit number or date mode, return only valid // prefix coded numbers with shift of 0 -func (s *SortField) filterTermsByType(terms []string) []string { +func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { stype := s.Type if stype == SortFieldAuto { allTermsPrefixCoded := true - var termsWithShiftZero 
[]string + var termsWithShiftZero [][]byte for _, term := range terms { - valid, shift := numeric.ValidPrefixCodedTerm(term) + valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { termsWithShiftZero = append(termsWithShiftZero, term) } else if !valid { @@ -417,9 +418,9 @@ func (s *SortField) filterTermsByType(terms []string) []string { terms = termsWithShiftZero } } else if stype == SortFieldAsNumber || stype == SortFieldAsDate { - var termsWithShiftZero []string + var termsWithShiftZero [][]byte for _, term := range terms { - valid, shift := numeric.ValidPrefixCodedTerm(term) + valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { termsWithShiftZero = append(termsWithShiftZero, term) } @@ -700,3 +701,9 @@ func (s *SortGeoDistance) Copy() SearchSort { rv := *s return &rv } + +type BytesSlice [][]byte + +func (p BytesSlice) Len() int { return len(p) } +func (p BytesSlice) Less(i, j int) bool { return bytes.Compare(p[i], p[j]) < 0 } +func (p BytesSlice) Swap(i, j int) { p[i], p[j] = p[j], p[i] } From 44e0a2957167084db62fda268afc9c86992dc47e Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 5 May 2018 13:29:59 -0700 Subject: [PATCH 417/728] optimize SortField termsWithShiftZero to reuse slice --- search/sort.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/search/sort.go b/search/sort.go index 78d38d857..e17f70787 100644 --- a/search/sort.go +++ b/search/sort.go @@ -344,6 +344,7 @@ type SortField struct { Mode SortFieldMode Missing SortFieldMissing values [][]byte + tmp [][]byte } // UpdateVisitor notifies this sort field that in this document @@ -405,7 +406,7 @@ func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { stype := s.Type if stype == SortFieldAuto { allTermsPrefixCoded := true - var termsWithShiftZero [][]byte + termsWithShiftZero := s.tmp[:0] for _, term := range terms { valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { @@ -416,9 
+417,10 @@ func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { } if allTermsPrefixCoded { terms = termsWithShiftZero + s.tmp = termsWithShiftZero[:0] } } else if stype == SortFieldAsNumber || stype == SortFieldAsDate { - var termsWithShiftZero [][]byte + termsWithShiftZero := s.tmp[:0] for _, term := range terms { valid, shift := numeric.ValidPrefixCodedTermBytes(term) if valid && shift == 0 { @@ -426,6 +428,7 @@ func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { } } terms = termsWithShiftZero + s.tmp = termsWithShiftZero[:0] } return terms } From 1387c024723b947a7bd22828e59dea2d1fc4333b Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 7 May 2018 16:57:07 -0700 Subject: [PATCH 418/728] MB-29554: Handle case when MemoryUsed is invoked after index is closed T1: runtime.sigpanic() /home/couchbase/.cbdepscache/exploded/x86_64/go-1.9.6/go/src/runtime/signal_unix.go:367 +0x17c fp=0xc4315bc928 sp=0xc4315bc8d8 pc=0x44432c github.com/blevesearch/bleve/index/scorch.(*IndexSnapshot).AddRef(0x0) /home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/snapshot_index.go:77 +0x22 fp=0xc4315bc948 sp=0xc4315bc928 pc=0x6012c2 github.com/blevesearch/bleve/index/scorch.(*Scorch).currentSnapshot(0xc420364380, 0xc4314ca180) /home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/scorch.go:427 +0x53 fp=0xc4315bc970 sp=0xc4315bc948 pc=0x5ffc03 github.com/blevesearch/bleve/index/scorch.(*Scorch).MemoryUsed(0xc420364380, 0x0) /home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/scorch.go:512 +0x38 fp=0xc4315bc9a0 sp=0xc4315bc970 pc=0x600dd8 main.scorchSize(0xdbf6c0, 0xc420364380, 0x6812d7) /home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/cbft/cmd/cbft/app_herder.go:296 +0x3c fp=0xc4315bc9c8 sp=0xc4315bc9a0 pc=0xbc97cc 
main.(*appHerder).indexingMemoryLOCKED(0xc42023a690, 0xc42023a6b4) /home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/cbft/cmd/cbft/app_herder.go:122 +0x9f fp=0xc4315bca60 sp=0xc4315bc9c8 pc=0xbc87bf T2: sync.runtime_SemacquireMutex(0xc42023a6b4, 0x0) /home/couchbase/.cbdepscache/exploded/x86_64/go-1.9.6/go/src/runtime/sema.go:71 +0x3d fp=0xc420339600 sp=0xc4203395d8 pc=0x44062d sync.(*Mutex).Lock(0xc42023a6b0) /home/couchbase/.cbdepscache/exploded/x86_64/go-1.9.6/go/src/sync/mutex.go:134 +0xee fp=0xc420339638 sp=0xc420339600 pc=0x466f7e main.(*appHerder).onClose(0xc42023a690, 0xdbf6c0, 0xc420364380) /home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/cbft/cmd/cbft/app_herder.go:83 +0x3a fp=0xc420339698 sp=0xc420339638 pc=0xbc844a main.(*appHerder).onScorchEvent(0xc42023a690, 0x2, 0xc420364380, 0xd06f1a6) /home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/cbft/cmd/cbft/app_herder.go:302 +0xc2 fp=0xc4203396c8 sp=0xc420339698 pc=0xbc98d2 main.(*appHerder).ScorchHerderOnEvent.func1(0x2, 0xc420364380, 0xd06f1a6) /home/couchbase/jenkins/workspace/couchbase-server-unix/goproj/src/github.com/couchbase/cbft/cmd/cbft/app_herder.go:292 +0x48 fp=0xc4203396f8 sp=0xc4203396c8 pc=0xbd9a98 github.com/blevesearch/bleve/index/scorch.(*Scorch).fireEvent(0xc420364380, 0x2, 0xd06f1a6) /home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/scorch.go:122 +0x55 fp=0xc420339720 sp=0xc4203396f8 pc=0x5fd9b5 github.com/blevesearch/bleve/index/scorch.(*Scorch).Close.func1(0xc420364380, 0xbeb41277d6e9dc06, 0x29ace50f52, 0x144bc40) /home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/scorch.go:224 +0x63 fp=0xc420339750 sp=0xc420339720 pc=0x60b253 github.com/blevesearch/bleve/index/scorch.(*Scorch).Close(0xc420364380, 0x0, 0x0) 
/home/couchbase/jenkins/workspace/couchbase-server-unix/godeps/src/github.com/blevesearch/bleve/index/scorch/scorch.go:244 +0xde fp=0xc4203397a0 sp=0xc420339750 pc=0x5fe3ae --- index/scorch/scorch.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 31d31642a..fe4f71b87 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -424,7 +424,9 @@ func (s *Scorch) Reader() (index.IndexReader, error) { func (s *Scorch) currentSnapshot() *IndexSnapshot { s.rootLock.RLock() rv := s.root - rv.AddRef() + if rv != nil { + rv.AddRef() + } s.rootLock.RUnlock() return rv } @@ -508,14 +510,18 @@ func (s *Scorch) AddEligibleForRemoval(epoch uint64) { s.rootLock.Unlock() } -func (s *Scorch) MemoryUsed() uint64 { +func (s *Scorch) MemoryUsed() (memUsed uint64) { indexSnapshot := s.currentSnapshot() + if indexSnapshot == nil { + return + } + defer func() { _ = indexSnapshot.Close() }() // Account for current root snapshot overhead - memUsed := uint64(indexSnapshot.Size()) + memUsed += uint64(indexSnapshot.Size()) // Account for snapshot that the persister may be working on persistEpoch := atomic.LoadUint64(&s.iStats.persistEpoch) From 5f3c372f877f06ac695d7a22e112e3906aa7b3b9 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 10 May 2018 17:19:04 -0700 Subject: [PATCH 419/728] Fix bug in bleve's zap stored command, formatting + Handle the _id field special case + Format the output for better readability --- cmd/bleve/cmd/scorch/info.go | 2 +- cmd/bleve/cmd/scorch/snapshot.go | 2 +- cmd/bleve/cmd/zap/dict.go | 6 +++--- cmd/bleve/cmd/zap/explore.go | 2 +- cmd/bleve/cmd/zap/stored.go | 9 +++++++-- 5 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cmd/bleve/cmd/scorch/info.go b/cmd/bleve/cmd/scorch/info.go index 2b4674f06..31e6481f9 100644 --- a/cmd/bleve/cmd/scorch/info.go +++ b/cmd/bleve/cmd/scorch/info.go @@ -37,7 +37,7 @@ var infoCmd = &cobra.Command{ return err } - 
fmt.Printf("count: %d\n", count) + fmt.Printf("doc count: %d\n", count) // var numSnapshots int // var rootSnapshot uint64 diff --git a/cmd/bleve/cmd/scorch/snapshot.go b/cmd/bleve/cmd/scorch/snapshot.go index bb035ce59..df13d4901 100644 --- a/cmd/bleve/cmd/scorch/snapshot.go +++ b/cmd/bleve/cmd/scorch/snapshot.go @@ -35,7 +35,7 @@ var snapshotCmd = &cobra.Command{ return err } for _, snapshotEpoch := range snapshotEpochs { - fmt.Printf("%d\n", snapshotEpoch) + fmt.Printf("snapshot epoch: %d\n", snapshotEpoch) } } else if len(args) < 3 { snapshotEpoch, err := strconv.ParseUint(args[1], 10, 64) diff --git a/cmd/bleve/cmd/zap/dict.go b/cmd/bleve/cmd/zap/dict.go index e80be3601..35952f5f5 100644 --- a/cmd/bleve/cmd/zap/dict.go +++ b/cmd/bleve/cmd/zap/dict.go @@ -45,8 +45,8 @@ var dictCmd = &cobra.Command{ vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) fmt.Printf("vellum length: %d\n", vellumLen) fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] - fmt.Printf("raw vellum data % x\n", fstBytes) - fmt.Printf("dictionary:\n\n") + fmt.Printf("raw vellum data:\n % x\n", fstBytes) + fmt.Printf("dictionary:\n") if fstBytes != nil { fst, err := vellum.Load(fstBytes) if err != nil { @@ -63,7 +63,7 @@ var dictCmd = &cobra.Command{ extra = fmt.Sprintf("-- docNum: %d, norm: %f", docNum, norm) } - fmt.Printf("%s - %d (%x) %s\n", currTerm, currVal, currVal, extra) + fmt.Printf(" %s - %d (%x) %s\n", currTerm, currVal, currVal, extra) err = itr.Next() } if err != nil && err != vellum.ErrIteratorDone { diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index 0c2471edc..d22fa8ce1 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -46,7 +46,7 @@ var exploreCmd = &cobra.Command{ vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) fmt.Printf("vellum length: %d\n", vellumLen) fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] - fmt.Printf("raw vellum data % 
x\n", fstBytes) + fmt.Printf("raw vellum data:\n % x\n", fstBytes) if len(args) >= 3 { if fstBytes != nil { diff --git a/cmd/bleve/cmd/zap/stored.go b/cmd/bleve/cmd/zap/stored.go index ba1143cb1..28d62c0cb 100644 --- a/cmd/bleve/cmd/zap/stored.go +++ b/cmd/bleve/cmd/zap/stored.go @@ -58,11 +58,16 @@ var storedCmd = &cobra.Command{ fmt.Printf("Raw meta: % x\n", meta) raw := data[storedStartAddr+n+metaLen : storedStartAddr+n+metaLen+dataLen] fmt.Printf("Raw data (len %d): % x\n", len(raw), raw) - uncompressed, err := snappy.Decode(nil, raw) + + // handle _id field special case + idFieldValLen, _ := binary.Uvarint(meta) + fmt.Printf("Raw _id (len %d): % x\n", idFieldValLen, raw[:idFieldValLen]) + fmt.Printf("Raw fields (len %d): % x\n", dataLen-idFieldValLen, raw[idFieldValLen:]) + uncompressed, err := snappy.Decode(nil, raw[idFieldValLen:]) if err != nil { panic(err) } - fmt.Printf("Uncompressed data (len %d): % x\n", len(uncompressed), uncompressed) + fmt.Printf("Uncompressed fields (len %d): % x\n", len(uncompressed), uncompressed) return nil }, From f9bb6c0575e4a494638e61ac4af9c9cd07e2ba9b Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 16 May 2018 15:46:48 +0530 Subject: [PATCH 420/728] MB-29516 - geo search adopts DocValueReader migrate geo search use of DocumentVisitFieldTerms to DocValueReader --- search/searcher/search_geoboundingbox.go | 36 ++++++++++++---------- search/searcher/search_geopointdistance.go | 35 ++++++++++++--------- 2 files changed, 40 insertions(+), 31 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index f8b1b4cf7..289e41678 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -40,6 +40,11 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, minLon, minLat, maxLon, maxLat, checkBoundaries) var onBoundarySearcher search.Searcher + dvReader, err := 
indexReader.DocValueReader([]string{field}) + if err != nil { + return nil, err + } + if len(onBoundaryTerms) > 0 { rawOnBoundarySearcher, err := NewMultiTermSearcherBytes(indexReader, onBoundaryTerms, field, boost, options, false) @@ -48,7 +53,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, } // add filter to check points near the boundary onBoundarySearcher = NewFilteringSearcher(rawOnBoundarySearcher, - buildRectFilter(indexReader, field, minLon, minLat, maxLon, maxLat)) + buildRectFilter(dvReader, field, minLon, minLat, maxLon, maxLat)) openedSearchers = append(openedSearchers, onBoundarySearcher) } @@ -144,26 +149,25 @@ func relateAndRecurse(start, end uint64, res uint, return nil, nil } -func buildRectFilter(indexReader index.IndexReader, field string, +func buildRectFilter(dvReader index.DocValueReader, field string, minLon, minLat, maxLon, maxLat float64) FilterFunc { return func(d *search.DocumentMatch) bool { var lon, lat float64 var found bool - err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID, - []string{field}, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - var i64 int64 - i64, err = prefixCoded.Int64() - if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) - found = true - } + err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + var i64 int64 + i64, err = prefixCoded.Int64() + if err == nil { + lon = geo.MortonUnhashLon(uint64(i64)) + lat = geo.MortonUnhashLat(uint64(i64)) + found = true } - }) + } + }) if err == nil && found { return geo.BoundingBoxContains(lon, lat, minLon, minLat, maxLon, maxLat) diff --git 
a/search/searcher/search_geopointdistance.go b/search/searcher/search_geopointdistance.go index fd559766f..a15c194e8 100644 --- a/search/searcher/search_geopointdistance.go +++ b/search/searcher/search_geopointdistance.go @@ -39,9 +39,14 @@ func NewGeoPointDistanceSearcher(indexReader index.IndexReader, centerLon, return nil, err } + dvReader, err := indexReader.DocValueReader([]string{field}) + if err != nil { + return nil, err + } + // wrap it in a filtering searcher which checks the actual distance return NewFilteringSearcher(boxSearcher, - buildDistFilter(indexReader, field, centerLon, centerLat, dist)), nil + buildDistFilter(dvReader, field, centerLon, centerLat, dist)), nil } // boxSearcher builds a searcher for the described bounding box @@ -87,25 +92,25 @@ func boxSearcher(indexReader index.IndexReader, return boxSearcher, nil } -func buildDistFilter(indexReader index.IndexReader, field string, +func buildDistFilter(dvReader index.DocValueReader, field string, centerLon, centerLat, maxDist float64) FilterFunc { return func(d *search.DocumentMatch) bool { var lon, lat float64 var found bool - err := indexReader.DocumentVisitFieldTerms(d.IndexInternalID, - []string{field}, func(field string, term []byte) { - // only consider the values which are shifted 0 - prefixCoded := numeric.PrefixCoded(term) - shift, err := prefixCoded.Shift() - if err == nil && shift == 0 { - i64, err := prefixCoded.Int64() - if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) - found = true - } + + err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { + // only consider the values which are shifted 0 + prefixCoded := numeric.PrefixCoded(term) + shift, err := prefixCoded.Shift() + if err == nil && shift == 0 { + i64, err := prefixCoded.Int64() + if err == nil { + lon = geo.MortonUnhashLon(uint64(i64)) + lat = geo.MortonUnhashLat(uint64(i64)) + found = true } - }) + } + }) if err == nil && found { dist := 
geo.Haversin(lon, lat, centerLon, centerLat) if dist <= maxDist/1000 { From a009a463cfa800b0ef8ee46b9a436e23e2bbde45 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 17 May 2018 11:43:18 -0700 Subject: [PATCH 421/728] errCheck to ignore fmt pkg during checks that travis-ci performs --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index 775fed3a9..934e86268 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,7 @@ script: - gvt restore - go test -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) - - errcheck $(go list ./... | grep -v vendor/) + - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/) - docs/project-code-coverage.sh - docs/build_children.sh From 02405fd47bb60f1d633678c38c988eba80a63dd1 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 15 May 2018 19:55:37 -0700 Subject: [PATCH 422/728] MB-29654: Copy value, arrayPositions into the interim structure(s) Copy value, array positions in the VisitDocument callback to preserve them beyond the scope of it's callback. 
--- index/scorch/segment/zap/merge.go | 10 ++++++++-- index/scorch/segment/zap/merge_test.go | 8 ++++---- index/scorch/segment/zap/segment.go | 10 ++++++++-- index/scorch/snapshot_index.go | 17 +++++++++++------ 4 files changed, 31 insertions(+), 14 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index dab09f6b3..0e6e52962 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -601,6 +601,9 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, docNumOffsets := make([]uint64, newSegDocCount) + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + // for each segment for segI, segment := range segments { segNewDocNums := make([]uint64, segment.numDocs) @@ -645,11 +648,14 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, typs[i] = typs[i][:0] poss[i] = poss[i][:0] } - err := segment.VisitDocument(docNum, func(field string, typ byte, value []byte, pos []uint64) bool { + err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { fieldID := int(fieldsMap[field]) - 1 vals[fieldID] = append(vals[fieldID], value) typs[fieldID] = append(typs[fieldID], typ) - poss[fieldID] = append(poss[fieldID], pos) + + // MB-29654: copy array positions to preserve them beyond the scope of this callback + poss[fieldID] = append(poss[fieldID], append([]uint64(nil), pos...)) + return true }) if err != nil { diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index cd21ecb66..163663a49 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -406,8 +406,8 @@ func compareSegments(a, b *Segment) string { err = a.VisitDocument(apitrn.Number(), func(field string, typ byte, value []byte, pos []uint64) bool { afields[field+"-typ"] = typ - afields[field+"-value"] = value - afields[field+"-pos"] = pos 
+ afields[field+"-value"] = append([]byte(nil), value...) + afields[field+"-pos"] = append([]uint64(nil), pos...) return true }) if err != nil { @@ -417,8 +417,8 @@ func compareSegments(a, b *Segment) string { err = b.VisitDocument(bpitrn.Number(), func(field string, typ byte, value []byte, pos []uint64) bool { bfields[field+"-typ"] = typ - bfields[field+"-value"] = value - bfields[field+"-pos"] = pos + bfields[field+"-value"] = append([]byte(nil), value...) + bfields[field+"-pos"] = append([]uint64(nil), pos...) return true }) if err != nil { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index ed09d149d..08d714f39 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -295,10 +295,17 @@ var visitDocumentCtxPool = sync.Pool{ // VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - // first make sure this is a valid number in this segment if num < s.numDocs { vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitDocument(vdc, num, visitor) + } + return nil +} +func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, visitor segment.DocumentFieldValueVisitor) error { + // first make sure this is a valid number in this segment + if num < s.numDocs { meta, compressed := s.getDocStoredMetaAndCompressed(num) vdc.reader.Reset(meta) @@ -368,7 +375,6 @@ func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldVal } vdc.buf = uncompressed - visitDocumentCtxPool.Put(vdc) } return nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 0f8c21371..da1558593 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -313,21 +313,26 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) 
{ segmentIndex, localDocNum := i.segmentIndexAndLocalDocNumFromGlobal(docNum) rv = document.NewDocument(id) - err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, value []byte, pos []uint64) bool { + err = i.segment[segmentIndex].VisitDocument(localDocNum, func(name string, typ byte, val []byte, pos []uint64) bool { if name == "_id" { return true } + + // MB-29654: copy value, array positions to preserve them beyond the scope of this callback + value := append([]byte(nil), val...) + arrayPos := append([]uint64(nil), pos...) + switch typ { case 't': - rv.AddField(document.NewTextField(name, pos, value)) + rv.AddField(document.NewTextField(name, arrayPos, value)) case 'n': - rv.AddField(document.NewNumericFieldFromBytes(name, pos, value)) + rv.AddField(document.NewNumericFieldFromBytes(name, arrayPos, value)) case 'd': - rv.AddField(document.NewDateTimeFieldFromBytes(name, pos, value)) + rv.AddField(document.NewDateTimeFieldFromBytes(name, arrayPos, value)) case 'b': - rv.AddField(document.NewBooleanFieldFromBytes(name, pos, value)) + rv.AddField(document.NewBooleanFieldFromBytes(name, arrayPos, value)) case 'g': - rv.AddField(document.NewGeoPointFieldFromBytes(name, pos, value)) + rv.AddField(document.NewGeoPointFieldFromBytes(name, arrayPos, value)) } return true From 4bd452b1db9d0113c9be1b6585fea5fdaccf7b73 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 16 May 2018 18:14:51 -0700 Subject: [PATCH 423/728] Avoid going to the allocator everytime for accomodating array positions During the merge operation create a buffer to copy array positions into, and re-use this buffer for all fields of all documents in all segments. 
--- index/scorch/segment/zap/merge.go | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 0e6e52962..04bf75399 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -599,6 +599,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, typs := make([][]byte, len(fieldsInv)) poss := make([][][]uint64, len(fieldsInv)) + var posBuf []uint64 + docNumOffsets := make([]uint64, newSegDocCount) vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) @@ -642,6 +644,8 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, metaBuf.Reset() data = data[:0] + posTemp := posBuf + // collect all the data for i := 0; i < len(fieldsInv); i++ { vals[i] = vals[i][:0] @@ -654,7 +658,17 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, typs[fieldID] = append(typs[fieldID], typ) // MB-29654: copy array positions to preserve them beyond the scope of this callback - poss[fieldID] = append(poss[fieldID], append([]uint64(nil), pos...)) + var curPos []uint64 + if len(pos) > 0 { + if cap(posTemp) < len(pos) { + posBuf = make([]uint64, len(pos)*len(fieldsInv)) + posTemp = posBuf + } + curPos = posTemp[0:len(pos)] + copy(curPos, pos) + posTemp = posTemp[len(pos):] + } + poss[fieldID] = append(poss[fieldID], curPos) return true }) From 21a00e203ec3278f3787aecdaf57828e8a59844b Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 17 May 2018 11:00:20 -0700 Subject: [PATCH 424/728] Remove extra check within zap's VisitDocument(..) This public API invokes the internal API: visitDocument(..) which performs the check for the validity of the docNum. 
--- index/scorch/segment/zap/merge.go | 2 +- index/scorch/segment/zap/segment.go | 12 +++++------- index/scorch/snapshot_index.go | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 04bf75399..c735caad3 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -657,7 +657,7 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, vals[fieldID] = append(vals[fieldID], value) typs[fieldID] = append(typs[fieldID], typ) - // MB-29654: copy array positions to preserve them beyond the scope of this callback + // copy array positions to preserve them beyond the scope of this callback var curPos []uint64 if len(pos) > 0 { if cap(posTemp) < len(pos) { diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 08d714f39..80798093d 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -295,15 +295,13 @@ var visitDocumentCtxPool = sync.Pool{ // VisitDocument invokes the DocFieldValueVistor for each stored field // for the specified doc number func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - if num < s.numDocs { - vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) - defer visitDocumentCtxPool.Put(vdc) - return s.visitDocument(vdc, num, visitor) - } - return nil + vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) + defer visitDocumentCtxPool.Put(vdc) + return s.visitDocument(vdc, num, visitor) } -func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, visitor segment.DocumentFieldValueVisitor) error { +func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, + visitor segment.DocumentFieldValueVisitor) error { // first make sure this is a valid number in this segment if num < s.numDocs { meta, compressed := s.getDocStoredMetaAndCompressed(num) diff --git 
a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index da1558593..6b615cd17 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -318,7 +318,7 @@ func (i *IndexSnapshot) Document(id string) (rv *document.Document, err error) { return true } - // MB-29654: copy value, array positions to preserve them beyond the scope of this callback + // copy value, array positions to preserve them beyond the scope of this callback value := append([]byte(nil), val...) arrayPos := append([]uint64(nil), pos...) From 9f8b8787616228a716bf424cfa99b6e8b5d4056f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 18 May 2018 21:05:27 +0530 Subject: [PATCH 425/728] MB-29576 - Indexing blocked on scorch More stats to keep track of last merged, persisted and current root epoch. --- index/scorch/introducer.go | 5 +++++ index/scorch/merge.go | 2 ++ index/scorch/persister.go | 2 ++ index/scorch/stats.go | 4 ++++ 4 files changed, 13 insertions(+) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index fb6afd5de..6989bbc9d 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -202,6 +202,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() @@ -265,6 +266,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { s.rootLock.Lock() rootPrev := s.root s.root = newIndexSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) s.rootLock.Unlock() if rootPrev != nil { @@ -369,6 +371,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { s.nextSnapshotEpoch++ rootPrev := s.root s.root = newSnapshot + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() @@ -430,6 +433,8 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { // swap in new 
snapshot rootPrev := s.root s.root = newSnapshot + + atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) // release lock s.rootLock.Unlock() diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 41b734aaf..171f33ae8 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -72,6 +72,8 @@ OUTER: } lastEpochMergePlanned = ourSnapshot.epoch + atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) + s.fireEvent(EventKindMergerProgress, time.Since(startTime)) } _ = ourSnapshot.DecRef() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index cbc24cdb7..c822ad0b5 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -109,6 +109,8 @@ OUTER: continue OUTER } + atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) + lastPersistedEpoch = ourSnapshot.epoch for _, ew := range persistWatchers { close(ew.notifyCh) diff --git a/index/scorch/stats.go b/index/scorch/stats.go index e9bcd91d2..d4e07f6b4 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -33,6 +33,10 @@ type Stats struct { TotBatchIntroTime uint64 MaxBatchIntroTime uint64 + CurRootEpoch uint64 + LastPersistedEpoch uint64 + LastMergedEpoch uint64 + TotOnErrors uint64 TotAnalysisTime uint64 From 477688c147c679e43f471d36286c1e462295554b Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 21 May 2018 12:37:22 -0700 Subject: [PATCH 426/728] MB-29763: Revert "scorch optimize via vellum.FST.Reader() API" This reverts commit 0a3f3e44224c1fc8f8b0e8777b7554abb8113d3b. 
--- index/scorch/segment/zap/dict.go | 13 ++++++------- index/scorch/segment/zap/segment.go | 4 ---- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 736fa59f6..c73cc6e5b 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -28,11 +28,10 @@ import ( // Dictionary is the zap representation of the term dictionary type Dictionary struct { - sb *SegmentBase - field string - fieldID uint16 - fst *vellum.FST - fstReader *vellum.Reader + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST } // PostingsList returns the postings list for the specified term @@ -47,14 +46,14 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - if d.fstReader == nil { + if d.fst == nil { if rv == nil || rv == emptyPostingsList { return emptyPostingsList, nil } return d.postingsListInit(rv, except), nil } - postingsOffset, exists, err := d.fstReader.Get(term) + postingsOffset, exists, err := d.fst.Get(term) if err != nil { return nil, fmt.Errorf("vellum err: %v", err) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 80798093d..0fd4e57c4 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -266,10 +266,6 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { if err != nil { return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } - rv.fstReader, err = rv.fst.Reader() - if err != nil { - return nil, fmt.Errorf("dictionary field %s vellum Reader err: %v", field, err) - } } } } From 6638c5ccff18af37d8be33c011265fbd2b45e625 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 25 May 2018 10:05:54 -0700 Subject: [PATCH 427/728] cachedDocs.visitDoc() check whether docs map is ready for use Before this 
change, the cachedDocs.visitDoc() method was accessing the docs map before the readyCh was closed. Without this check, another goroutine that's executing the cachedFieldDocs.prepareField() method might be concurrently populating and modifying the docs map. See also: https://issues.couchbase.com/browse/MB-29844 --- index/scorch/snapshot_segment.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 98084a980..90dbcb494 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -293,6 +293,10 @@ func (c *cachedDocs) visitDoc(localDocNum uint64, for _, field := range fields { if cachedFieldDocs, exists := c.cache[field]; exists { + c.m.Unlock() + <-cachedFieldDocs.readyCh + c.m.Lock() + if tlist, exists := cachedFieldDocs.docs[localDocNum]; exists { for { i := bytes.Index(tlist, TermSeparatorSplitSlice) From 323f83aed5898ba819bcc210249fca7bba877e28 Mon Sep 17 00:00:00 2001 From: Ben Gadbois Date: Wed, 30 May 2018 21:51:02 +0200 Subject: [PATCH 428/728] Fix spelling and typo mistakes --- analysis/token/snowball/snowball.go | 4 ++-- analysis/token/unique/unique.go | 2 +- cmd/bleve/cmd/zap/dict.go | 2 +- cmd/bleve/cmd/zap/explore.go | 2 +- geo/parse_test.go | 2 +- index/scorch/README.md | 2 +- index/scorch/segment/zap/README.md | 2 +- index/scorch/segment/zap/posting.go | 16 ++++++++-------- index/scorch/segment/zap/segment.go | 2 +- index_alias_impl_test.go | 6 +++--- index_test.go | 2 +- mapping/document.go | 2 +- mapping/mapping_test.go | 6 +++--- numeric/bin.go | 2 +- search/searcher/search_phrase.go | 4 ++-- search/searcher/search_term_range_test.go | 4 ++-- 16 files changed, 30 insertions(+), 30 deletions(-) diff --git a/analysis/token/snowball/snowball.go b/analysis/token/snowball/snowball.go index ae876137a..99dbf4202 100644 --- a/analysis/token/snowball/snowball.go +++ b/analysis/token/snowball/snowball.go @@ -26,12 +26,12 @@ import ( const Name = 
"stemmer_snowball" type SnowballStemmer struct { - langauge string + language string } func NewSnowballStemmer(language string) *SnowballStemmer { return &SnowballStemmer{ - langauge: language, + language: language, } } diff --git a/analysis/token/unique/unique.go b/analysis/token/unique/unique.go index f0d96c504..c60e8c979 100644 --- a/analysis/token/unique/unique.go +++ b/analysis/token/unique/unique.go @@ -21,7 +21,7 @@ import ( const Name = "unique" -// UniqueTermFilter retains only the tokens which mark the first occurence of +// UniqueTermFilter retains only the tokens which mark the first occurrence of // a term. Tokens whose term appears in a preceding token are dropped. type UniqueTermFilter struct{} diff --git a/cmd/bleve/cmd/zap/dict.go b/cmd/bleve/cmd/zap/dict.go index 35952f5f5..2c60d31da 100644 --- a/cmd/bleve/cmd/zap/dict.go +++ b/cmd/bleve/cmd/zap/dict.go @@ -38,7 +38,7 @@ var dictCmd = &cobra.Command{ addr, err := segment.DictAddr(args[1]) if err != nil { - return fmt.Errorf("error determing address: %v", err) + return fmt.Errorf("error determining address: %v", err) } fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go index d22fa8ce1..deac086cb 100644 --- a/cmd/bleve/cmd/zap/explore.go +++ b/cmd/bleve/cmd/zap/explore.go @@ -39,7 +39,7 @@ var exploreCmd = &cobra.Command{ addr, err := segment.DictAddr(args[1]) if err != nil { - return fmt.Errorf("error determing address: %v", err) + return fmt.Errorf("error determining address: %v", err) } fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) diff --git a/geo/parse_test.go b/geo/parse_test.go index 4d4a36d5d..4cbf66dde 100644 --- a/geo/parse_test.go +++ b/geo/parse_test.go @@ -87,7 +87,7 @@ func TestExtractGeoPoint(t *testing.T) { lat: 7.5, success: true, }, - // struct with lng alterante + // struct with lng alternate { in: struct { Lng float64 diff --git a/index/scorch/README.md 
b/index/scorch/README.md index 861335a1b..9794aed70 100644 --- a/index/scorch/README.md +++ b/index/scorch/README.md @@ -302,7 +302,7 @@ Map local bitsets into global number space (global meaning cross-segment but sti IndexSnapshot already should have mapping something like: 0 - Offset 0 1 - Offset 3 (because segment 0 had 3 docs) -2 - Offset 4 (becuase segment 1 had 1 doc) +2 - Offset 4 (because segment 1 had 1 doc) This maps to search result bitset: diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index 41f5902a0..872e86c84 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -109,7 +109,7 @@ If you know the doc number you're interested in, this format lets you jump to th - remember the start position for this posting list - write freq/norm details offset (remembered from previous, as varint uint64) - write location details offset (remembered from previous, as varint uint64) - - write location bitmap offset (remembered from pervious, as varint uint64) + - write location bitmap offset (remembered from previous, as varint uint64) - write length of encoded roaring bitmap - write the serialized roaring bitmap data diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 7bfbe0bef..71d41a826 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -94,7 +94,7 @@ func under32Bits(x uint64) bool { const docNum1HitFinished = math.MaxUint64 -// PostingsList is an in-memory represenation of a postings list +// PostingsList is an in-memory representation of a postings list type PostingsList struct { sb *SegmentBase postingsOffset uint64 @@ -733,7 +733,7 @@ func (p *Posting) Number() uint64 { return p.docNum } -// Frequency returns the frequence of occurance of this term in this doc/field +// Frequency returns the frequencies of occurrence of this term in this doc/field func (p *Posting) Frequency() uint64 { return p.freq } @@ 
-743,12 +743,12 @@ func (p *Posting) Norm() float64 { return float64(p.norm) } -// Locations returns the location information for each occurance +// Locations returns the location information for each occurrence func (p *Posting) Locations() []segment.Location { return p.locs } -// Location represents the location of a single occurance +// Location represents the location of a single occurrence type Location struct { field string pos uint64 @@ -769,22 +769,22 @@ func (l *Location) Field() string { return l.field } -// Start returns the start byte offset of this occurance +// Start returns the start byte offset of this occurrence func (l *Location) Start() uint64 { return l.start } -// End returns the end byte offset of this occurance +// End returns the end byte offset of this occurrence func (l *Location) End() uint64 { return l.end } -// Pos returns the 1-based phrase position of this occurance +// Pos returns the 1-based phrase position of this occurrence func (l *Location) Pos() uint64 { return l.pos } -// ArrayPositions returns the array position vector associated with this occurance +// ArrayPositions returns the array position vector associated with this occurrence func (l *Location) ArrayPositions() []uint64 { return l.ap } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 0fd4e57c4..62933daf3 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -211,7 +211,7 @@ func (s *Segment) loadConfig() error { } func (s *SegmentBase) loadFields() error { - // NOTE for now we assume the fields index immediately preceeds + // NOTE for now we assume the fields index immediately precedes // the footer, and if this changes, need to adjust accordingly (or // store explicit length), where s.mem was sliced from s.mm in Open(). 
fieldsIndexEnd := uint64(len(s.mem)) diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 9599b89d6..6e25157de 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -515,7 +515,7 @@ func TestIndexAliasMulti(t *testing.T) { if err != nil { t.Error(err) } - // cheat and ensure that Took field matches since it invovles time + // cheat and ensure that Took field matches since it involves time expected.Took = results.Took if !reflect.DeepEqual(results, expected) { t.Errorf("expected %#v, got %#v", expected, results) @@ -599,7 +599,7 @@ func TestMultiSearchNoError(t *testing.T) { if err != nil { t.Error(err) } - // cheat and ensure that Took field matches since it invovles time + // cheat and ensure that Took field matches since it involves time expected.Took = results.Took if !reflect.DeepEqual(results, expected) { t.Errorf("expected %#v, got %#v", expected, results) @@ -1229,7 +1229,7 @@ func TestMultiSearchCustomSort(t *testing.T) { if err != nil { t.Error(err) } - // cheat and ensure that Took field matches since it invovles time + // cheat and ensure that Took field matches since it involves time expected.Took = results.Took if !reflect.DeepEqual(results, expected) { t.Errorf("expected %v, got %v", expected, results) diff --git a/index_test.go b/index_test.go index 69ca61a98..604328a41 100644 --- a/index_test.go +++ b/index_test.go @@ -1509,7 +1509,7 @@ func TestSearchTimeout(t *testing.T) { } }() - // first run a search with an absurdly long timeout (should succeeed) + // first run a search with an absurdly long timeout (should succeed) ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() query := NewTermQuery("water") diff --git a/mapping/document.go b/mapping/document.go index 6ec0c66bb..0ebeb5856 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -324,7 +324,7 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { } func (dm *DocumentMapping) walkDocument(data 
interface{}, path []string, indexes []uint64, context *walkContext) { - // allow default "json" tag to be overriden + // allow default "json" tag to be overridden structTagKey := dm.StructTagKey if structTagKey == "" { structTagKey = "json" diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 1a7709049..a13a90b8b 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -950,7 +950,7 @@ func TestMappingForTextMarshaler(t *testing.T) { }, } - // first verify that when using a mapping that doesn't explicity + // first verify that when using a mapping that doesn't explicitly // map the stuct field as text, then we traverse inside the struct // and do our best m := NewIndexMapping() @@ -970,7 +970,7 @@ func TestMappingForTextMarshaler(t *testing.T) { t.Errorf("expected field value to be '%s', got: '%s'", tm.Marshalable.Extra, string(doc.Fields[0].Value())) } - // now verify that when a mapping explicity + // now verify that when a mapping explicitly m = NewIndexMapping() txt := NewTextFieldMapping() m.DefaultMapping.AddFieldMappingsAt("Marshalable", txt) @@ -1004,7 +1004,7 @@ func TestMappingForNilTextMarshaler(t *testing.T) { Marshalable: nil, } - // now verify that when a mapping explicity + // now verify that when a mapping explicitly m := NewIndexMapping() txt := NewTextFieldMapping() m.DefaultMapping.AddFieldMappingsAt("Marshalable", txt) diff --git a/numeric/bin.go b/numeric/bin.go index cd71392dc..368952a2c 100644 --- a/numeric/bin.go +++ b/numeric/bin.go @@ -14,7 +14,7 @@ var interleaveShift = []uint{1, 2, 4, 8, 16} // Interleave the first 32 bits of each uint64 // apdated from org.apache.lucene.util.BitUtil -// whcih was adapted from: +// which was adapted from: // http://graphics.stanford.edu/~seander/bithacks.html#InterleaveBMN func Interleave(v1, v2 uint64) uint64 { v1 = (v1 | (v1 << interleaveShift[4])) & interleaveMagic[4] diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 
3711da063..08eb13338 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -210,7 +210,7 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, return nil, nil } -// checkCurrMustMatch is soley concerned with determining if the DocumentMatch +// checkCurrMustMatch is solely concerned with determining if the DocumentMatch // pointed to by s.currMust (which satisifies the pre-condition searcher) // also satisfies the phase constraints. if so, it returns a DocumentMatch // for this document, otherwise nil @@ -241,7 +241,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D return nil } -// checkCurrMustMatchField is soley concerned with determining if one +// checkCurrMustMatchField is solely concerned with determining if one // particular field within the currMust DocumentMatch Locations // satisfies the phase constraints (possibly more than once). if so, // the matching field term locations are appended to the provided diff --git a/search/searcher/search_term_range_test.go b/search/searcher/search_term_range_test.go index f84e28902..cd4e89114 100644 --- a/search/searcher/search_term_range_test.go +++ b/search/searcher/search_term_range_test.go @@ -94,7 +94,7 @@ func TestTermRangeSearch(t *testing.T) { inclusiveMax: true, want: nil, }, - // max nil sees everyting after marty + // max nil sees everything after marty { min: []byte("marty"), max: nil, @@ -103,7 +103,7 @@ func TestTermRangeSearch(t *testing.T) { inclusiveMax: true, want: []string{"1", "2", "4"}, }, - // min nil sees everyting before ravi + // min nil sees everything before ravi { min: nil, max: []byte("ravi"), From 193c43ecddce54283db96db0c103131f0fc558e3 Mon Sep 17 00:00:00 2001 From: Ben Gadbois Date: Wed, 30 May 2018 22:04:55 +0200 Subject: [PATCH 429/728] Fix another spelling for CI --- analysis/token/snowball/snowball.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/analysis/token/snowball/snowball.go b/analysis/token/snowball/snowball.go index 99dbf4202..6a672c2b2 100644 --- a/analysis/token/snowball/snowball.go +++ b/analysis/token/snowball/snowball.go @@ -39,7 +39,7 @@ func (s *SnowballStemmer) Filter(input analysis.TokenStream) analysis.TokenStrea for _, token := range input { // if it is not a protected keyword, stem it if !token.KeyWord { - stemmed, _ := snowball.Stem(string(token.Term), s.langauge, true) + stemmed, _ := snowball.Stem(string(token.Term), s.language, true) token.Term = []byte(stemmed) } } From d5224d90871d1edc7528cba5092f24010860509a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 25 May 2018 11:40:05 -0700 Subject: [PATCH 430/728] Scorch optimize via vellum.FST.Reader() API + This change sets up a vellum.Reader (which carries a prealloc'ed fstState) for the term dictionaries. + However the Reader cannot be used concurrently, so rather than re-using a fieldDicts initialized for the IndexSnapshot, re-use one that is set up for TermFieldReaders which are recycled. 
--- index/scorch/segment/zap/dict.go | 13 ++++++------ index/scorch/segment/zap/segment.go | 4 ++++ index/scorch/snapshot_index.go | 32 ++++++++++------------------- 3 files changed, 22 insertions(+), 27 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index c73cc6e5b..736fa59f6 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -28,10 +28,11 @@ import ( // Dictionary is the zap representation of the term dictionary type Dictionary struct { - sb *SegmentBase - field string - fieldID uint16 - fst *vellum.FST + sb *SegmentBase + field string + fieldID uint16 + fst *vellum.FST + fstReader *vellum.Reader } // PostingsList returns the postings list for the specified term @@ -46,14 +47,14 @@ func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, } func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - if d.fst == nil { + if d.fstReader == nil { if rv == nil || rv == emptyPostingsList { return emptyPostingsList, nil } return d.postingsListInit(rv, except), nil } - postingsOffset, exists, err := d.fst.Get(term) + postingsOffset, exists, err := d.fstReader.Get(term) if err != nil { return nil, fmt.Errorf("vellum err: %v", err) } diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 62933daf3..8c6de211a 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -266,6 +266,10 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { if err != nil { return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } } } } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6b615cd17..21e19123d 100644 --- a/index/scorch/snapshot_index.go +++ 
b/index/scorch/snapshot_index.go @@ -59,9 +59,8 @@ type IndexSnapshot struct { m sync.Mutex // Protects the fields that follow. refs int64 - m2 sync.Mutex // Protects the fields that follow. - fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's - fieldDicts map[string][]segment.TermDictionary // keyed by field, recycled dicts + m2 sync.Mutex // Protects the fields that follow. + fieldTFRs map[string][]*IndexSnapshotTermFieldReader // keyed by field, recycled TFR's } func (i *IndexSnapshot) Segments() []*SegmentSnapshot { @@ -394,7 +393,7 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) { - rv, dicts := i.allocTermFieldReaderDicts(field) + rv := i.allocTermFieldReaderDicts(field) rv.term = term rv.field = field @@ -412,20 +411,19 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, rv.currPosting = nil rv.currID = rv.currID[:0] - if dicts == nil { - dicts = make([]segment.TermDictionary, len(i.segment)) + if rv.dicts == nil { + rv.dicts = make([]segment.TermDictionary, len(i.segment)) for i, segment := range i.segment { dict, err := segment.Dictionary(field) if err != nil { return nil, err } - dicts[i] = dict + rv.dicts[i] = dict } } - rv.dicts = dicts for i := range i.segment { - pl, err := dicts[i].PostingsList(term, nil, rv.postings[i]) + pl, err := rv.dicts[i].PostingsList(term, nil, rv.postings[i]) if err != nil { return nil, err } @@ -436,25 +434,21 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, return rv, nil } -func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) ( - tfr *IndexSnapshotTermFieldReader, dicts []segment.TermDictionary) { +func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnapshotTermFieldReader) { i.m2.Lock() - if 
i.fieldDicts != nil { - dicts = i.fieldDicts[field] - } if i.fieldTFRs != nil { tfrs := i.fieldTFRs[field] last := len(tfrs) - 1 if last >= 0 { - rv := tfrs[last] + tfr = tfrs[last] tfrs[last] = nil i.fieldTFRs[field] = tfrs[:last] i.m2.Unlock() - return rv, dicts + return } } i.m2.Unlock() - return &IndexSnapshotTermFieldReader{}, dicts + return &IndexSnapshotTermFieldReader{} } func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { @@ -463,10 +457,6 @@ func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} } i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) - if i.fieldDicts == nil { - i.fieldDicts = map[string][]segment.TermDictionary{} - } - i.fieldDicts[tfr.field] = tfr.dicts i.m2.Unlock() } From 74b27a75a730249856522c91be962f2ea68658f6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 26 Jun 2018 08:51:37 -0700 Subject: [PATCH 431/728] NewTermSearcher converts term to bytes only once --- search/searcher/search_term.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 4fee58bbf..97b7dbb90 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -38,7 +38,8 @@ type TermSearcher struct { } func NewTermSearcher(indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - reader, err := indexReader.TermFieldReader([]byte(term), field, true, true, options.IncludeTermVectors) + termBytes := []byte(term) + reader, err := indexReader.TermFieldReader(termBytes, field, true, true, options.IncludeTermVectors) if err != nil { return nil, err } @@ -47,7 +48,7 @@ func NewTermSearcher(indexReader index.IndexReader, term string, field string, b _ = reader.Close() return nil, err } - scorer := scorer.NewTermQueryScorer([]byte(term), field, boost, count, 
reader.Count(), options) + scorer := scorer.NewTermQueryScorer(termBytes, field, boost, count, reader.Count(), options) return &TermSearcher{ indexReader: indexReader, reader: reader, From 027187d999549bb29f41edeb3d8a139ff4e381a8 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 26 Jun 2018 16:03:11 -0700 Subject: [PATCH 432/728] MB-30252: Omit default_analyzer from the json encoding when unset --- mapping/document.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapping/document.go b/mapping/document.go index 6ec0c66bb..ec4b8bf64 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -42,7 +42,7 @@ type DocumentMapping struct { Dynamic bool `json:"dynamic"` Properties map[string]*DocumentMapping `json:"properties,omitempty"` Fields []*FieldMapping `json:"fields,omitempty"` - DefaultAnalyzer string `json:"default_analyzer"` + DefaultAnalyzer string `json:"default_analyzer,omitempty"` // StructTagKey overrides "json" when looking for field names in struct tags StructTagKey string `json:"struct_tag_key,omitempty"` From ba11b6df3810344e592ceb2cbc3f745ab706b96d Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 10 Jul 2018 15:02:52 -0700 Subject: [PATCH 433/728] Fix to boolean searcher's Advance() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit + Advance nested searchers within a boolean searcher only if the cursor is trailing the ID being looked up. + Unit test + Fixes https://github.com/blevesearch/bleve/issues/954 Here's an example bug scenario with nested boolean searchers .. conjunctionSearcher [a] => booleanSearcher [b] Must => conjunctionSearcher [d] => disjunctionSearcher [f] => disjunctionSearcher => termSearcher => disjunctionSearcher => termSearcher => booleanSearcher [c] Must => conjunctionSearcher [e] => termSearcher Consider there to be docs ranging from 1 to 100 in the index and these are the expected matches, for the query: 1, 11, 16, 21,.. 
On the first Next() call by the collector .. [d] - initSearchers .. sets local cursor of [f] to 1 - moves local cursor of disjunction cursor to 11 [b] - init: local cursor of [d] set to 1 - moves [d]’s local cursor of [f] to 12 - moves local cursor of [d] to 11 [c] - init: local cursor of [e] set to 1 - moves local cursor of [e] to 11 [a] - initSearchers: sets local cursors of [b], [c] to 1 - moves [b]’s local cursor of [d] to 12 - moves [d]’s local cursor of [f] to 16 - moves [c]’s local cursor of [e] to 16 - updates local cursor of [b] to 11 - updates local cursor of [c] to 11 (Note that at this point [b]’s local cursor to [d] and [c]’s local cursor to [e] are pointing to different documents, owing to the different nesting of searchers underneath) On the second Next() call by the collector .. [a] .. maxID: 11 - moves [b]’s local cursor of [d] to 16 - moves [d]’s local cursor of [f] to 17 - moves [c]’s local cursor of [e] to 21 - updates local cursor of [b] to 12 - updates local cursor of [c] to 16 On the third Next() call by the collector .. [a] .. maxID: 16 - sees that maxID 16 > 12 (cursor position of [b]) - invokes advanceChild([b]) .. --> this causes [b]’s local cursor to move from 16 to 17 and this prevents the match:16 from being collected. 
--- search/searcher/search_boolean.go | 43 +++++----- search_test.go | 128 ++++++++++++++++++++++++++++++ 2 files changed, 153 insertions(+), 18 deletions(-) diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index f7ee2cd83..f9684af29 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -332,31 +332,38 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } var err error + // Advance nested searcher(s) only if the cursor is trailing the lookup ID if s.mustSearcher != nil { - if s.currMust != nil { - ctx.DocumentMatchPool.Put(s.currMust) - } - s.currMust, err = s.mustSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + if s.currMust == nil || s.currMust.IndexInternalID.Compare(ID) < 0 { + if s.currMust != nil { + ctx.DocumentMatchPool.Put(s.currMust) + } + s.currMust, err = s.mustSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } } if s.shouldSearcher != nil { - if s.currShould != nil { - ctx.DocumentMatchPool.Put(s.currShould) - } - s.currShould, err = s.shouldSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + if s.currShould == nil || s.currShould.IndexInternalID.Compare(ID) < 0 { + if s.currShould != nil { + ctx.DocumentMatchPool.Put(s.currShould) + } + s.currShould, err = s.shouldSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } } if s.mustNotSearcher != nil { - if s.currMustNot != nil { - ctx.DocumentMatchPool.Put(s.currMustNot) - } - s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 { + if s.currMustNot != nil { + ctx.DocumentMatchPool.Put(s.currMustNot) + } + s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } } diff --git a/search_test.go b/search_test.go index 87a718285..7fcc71d6e 100644 --- a/search_test.go +++ b/search_test.go @@ -17,12 
+17,21 @@ package bleve import ( "encoding/json" "fmt" + "os" "reflect" + "strconv" "strings" "testing" "time" + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/analysis/analyzer/custom" + "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/tokenizer/single" + "github.com/blevesearch/bleve/analysis/tokenizer/whitespace" + "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/query" ) func TestSearchResultString(t *testing.T) { @@ -414,3 +423,122 @@ func TestMemoryNeededForSearchResult(t *testing.T) { t.Errorf("estimate not what is expected: %v != %v", estimate, expect) } } + +// https://github.com/blevesearch/bleve/issues/954 +func TestNestedBooleanSearchers(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + // create an index with a custom analyzer + idxMapping := NewIndexMapping() + if err := idxMapping.AddCustomAnalyzer("3xbla", map[string]interface{}{ + "type": custom.Name, + "tokenizer": whitespace.Name, + "token_filters": []interface{}{lowercase.Name, "stop_en"}, + }); err != nil { + t.Fatal(err) + } + + idxMapping.DefaultAnalyzer = "3xbla" + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + + // create and insert documents as a batch + batch := idx.NewBatch() + matches := 0 + for i := 0; i < 100; i++ { + hostname := fmt.Sprintf("planner_hostname_%d", i%5) + metadata := map[string]string{"region": fmt.Sprintf("planner_us-east-%d", i%5)} + + // Expected matches + if (hostname == "planner_hostname_1" || hostname == "planner_hostname_2") && + metadata["region"] == "planner_us-east-1" { + matches++ + } + + doc := document.NewDocument(strconv.Itoa(i)) + doc.Fields = []document.Field{ + document.NewTextFieldCustom("hostname", []uint64{}, []byte(hostname), + document.IndexField, + &analysis.Analyzer{ + Tokenizer: single.NewSingleTokenTokenizer(), + 
TokenFilters: []analysis.TokenFilter{ + lowercase.NewLowerCaseFilter(), + }, + }, + ), + } + for k, v := range metadata { + doc.AddField(document.NewTextFieldWithIndexingOptions( + fmt.Sprintf("metadata.%s", k), []uint64{}, []byte(v), document.IndexField)) + } + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + que, err := query.ParseQuery([]byte( + `{ + "conjuncts": [ + { + "must": { + "conjuncts": [ + { + "disjuncts": [ + { + "match": "planner_hostname_1", + "field": "hostname" + }, + { + "match": "planner_hostname_2", + "field": "hostname" + } + ] + } + ] + } + }, + { + "must": { + "conjuncts": [ + { + "match": "planner_us-east-1", + "field": "metadata.region" + } + ] + } + } + ] + }`, + )) + if err != nil { + t.Fatal(err) + } + + req := NewSearchRequest(que) + req.Size = 100 + req.Fields = []string{"hostname", "metadata.region"} + searchResults, err := idx.Search(req) + if err != nil { + t.Fatal(err) + } + if matches != len(searchResults.Hits) { + t.Fatalf("Unexpected result set, %v != %v", matches, len(searchResults.Hits)) + } +} From 17f59588a39d25babbf4e04fe3bc79fc183a26e5 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 12 Jul 2018 15:09:38 -0700 Subject: [PATCH 434/728] optimize scorch FieldDictFuzzy to check for tooManyClauses earlier This should not affect functionality, as the fuzzy searcher uses a disjunction searcher underneath the hood, and the disjunction searcher later on checks for the tooManyClauses() case. But, the idea is that the tooManyClauses() check should be handled the same whether or not the IndexReaderFuzzy interface is implemented by the index-reader, and if we can bail earlier, then we waste less resources. 
Also, this commit also checks for error more consistently. --- search/searcher/search_fuzzy.go | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index b812f4840..1ce3ba71d 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -53,6 +53,7 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv []string, err error) { rv = make([]string, 0) + var fieldDict index.FieldDict if len(prefixTerm) > 0 { fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) @@ -63,7 +64,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness) if err != nil { - return rv, err + return nil, err } defer func() { if cerr := fieldDict.Close(); cerr != nil && err == nil { @@ -73,12 +74,19 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, tfd, err := fieldDict.Next() for err == nil && tfd != nil { rv = append(rv, tfd.Term) + if tooManyClauses(len(rv)) { + return nil, tooManyClausesErr() + } tfd, err = fieldDict.Next() } return rv, err } + fieldDict, err = indexReader.FieldDict(field) } + if err != nil { + return nil, err + } defer func() { if cerr := fieldDict.Close(); cerr != nil && err == nil { err = cerr @@ -95,7 +103,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr() + return nil, tooManyClausesErr() } } tfd, err = fieldDict.Next() From 269785dea09a720fc5c6acd6f120d77cb58a403d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 13 Jul 2018 15:09:45 -0700 Subject: [PATCH 435/728] scorch root Add/DecRef() 
codepath scrubbing --- index/scorch/introducer.go | 9 +++++++++ index/scorch/merge.go | 4 ++-- index/scorch/persister.go | 9 ++++++--- index/scorch/scorch.go | 4 ++-- 4 files changed, 19 insertions(+), 7 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 6989bbc9d..9f42a5b36 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -107,8 +107,11 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { s.rootLock.RLock() root := s.root + root.AddRef() s.rootLock.RUnlock() + defer func() { _ = root.DecRef() }() + nsegs := len(root.segment) // prepare new index snapshot @@ -221,10 +224,13 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { s.rootLock.Lock() root := s.root + root.AddRef() nextSnapshotEpoch := s.nextSnapshotEpoch s.nextSnapshotEpoch++ s.rootLock.Unlock() + defer func() { _ = root.DecRef() }() + newIndexSnapshot := &IndexSnapshot{ parent: s, epoch: nextSnapshotEpoch, @@ -282,8 +288,11 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { s.rootLock.RLock() root := s.root + root.AddRef() s.rootLock.RUnlock() + defer func() { _ = root.DecRef() }() + newSnapshot := &IndexSnapshot{ parent: s, internal: root.internal, diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 171f33ae8..38646bf0b 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -46,12 +46,12 @@ OUTER: default: // check to see if there is a new snapshot to persist - s.rootLock.RLock() + s.rootLock.Lock() ourSnapshot := s.root ourSnapshot.AddRef() atomic.StoreUint64(&s.iStats.mergeSnapshotSize, uint64(ourSnapshot.Size())) atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) - s.rootLock.RUnlock() + s.rootLock.Unlock() if ourSnapshot.epoch != lastEpochMergePlanned { startTime := time.Now() diff --git a/index/scorch/persister.go b/index/scorch/persister.go index c822ad0b5..bb40c33f8 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -521,11 +521,14 
@@ func (s *Scorch) loadFromBolt() error { s.nextSegmentID++ s.rootLock.Lock() s.nextSnapshotEpoch = snapshotEpoch + 1 - if s.root != nil { - _ = s.root.DecRef() - } + rootPrev := s.root s.root = indexSnapshot s.rootLock.Unlock() + + if rootPrev != nil { + _ = rootPrev.DecRef() + } + foundRoot = true } return nil diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index fe4f71b87..357836524 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -370,6 +370,8 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, root.AddRef() s.rootLock.RUnlock() + defer func() { _ = root.DecRef() }() + for _, seg := range root.segment { delta, err := seg.segment.DocNumbers(ids) if err != nil { @@ -378,8 +380,6 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, introduction.obsoletes[seg.id] = delta } - _ = root.DecRef() - introStartTime := time.Now() s.introductions <- introduction From a899d29c1b2baa769d836511352993c54ec0b04f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 16 Jul 2018 13:25:24 +0530 Subject: [PATCH 436/728] MB-30342 - ref_count leak from FieldDict Attempt to fix the ref_count leak by closing the field dictionary for term_prefix and term_range searches --- search/searcher/search_term_prefix.go | 8 ++++++++ search/searcher/search_term_range.go | 6 ++++++ 2 files changed, 14 insertions(+) diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go index 05d092249..c49788c71 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -27,6 +27,11 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, if err != nil { return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() var terms []string tfd, err := fieldDict.Next() @@ -34,6 +39,9 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, terms = append(terms, 
tfd.Term) tfd, err = fieldDict.Next() } + if err != nil { + return nil, err + } return NewMultiTermSearcher(indexReader, terms, field, boost, options, true) } diff --git a/search/searcher/search_term_range.go b/search/searcher/search_term_range.go index 267c681b4..90be1e11a 100644 --- a/search/searcher/search_term_range.go +++ b/search/searcher/search_term_range.go @@ -48,6 +48,12 @@ func NewTermRangeSearcher(indexReader index.IndexReader, return nil, err } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + var terms []string tfd, err := fieldDict.Next() for err == nil && tfd != nil { From 84502bf2c73fd9a85477d7018220e0af6a69374e Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 11 Jul 2018 14:40:35 +0530 Subject: [PATCH 437/728] MB-29923 - high memory consumption in scorch The fix aims to loosen the persister-merger lock stepping and also introduces a configurable wait in the persister work loop to favour healthier in-memory segment merges. --- index/scorch/persister.go | 75 ++++++++++++++++++++++++++++++------- index/scorch/scorch.go | 18 ++++++--- index/scorch/scorch_test.go | 5 +++ test/versus_test.go | 6 +++ 4 files changed, 85 insertions(+), 19 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index c822ad0b5..e7f591671 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -17,6 +17,7 @@ package scorch import ( "bytes" "encoding/binary" + "encoding/json" "fmt" "io/ioutil" "log" @@ -35,16 +36,22 @@ import ( var DefaultChunkFactor uint32 = 1024 -// Arbitrary number, need to make it configurable. -// Lower values like 10/making persister really slow -// doesn't work well as it is creating more files to -// persist for in next persist iteration and spikes the # FDs. -// Ideal value should let persister also proceed at -// an optimum pace so that the merger can skip -// many intermediate snapshots. -// This needs to be based on empirical data. 
-// TODO - may need to revisit this approach/value. -var epochDistance = uint64(5) +var DefaultPersisterNapTimeMSec int = 2000 // ms + +var DefaultPersisterNapUnderNumFiles int = 1000 + +type persisterOptions struct { + // PersisterNapTimeMSec controls the wait/delay injected into + // persistence workloop to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapTimeMSec int + + // PersisterNapTimeMSec > 0, and the number of files is less than + // PersisterNapUnderNumFiles, then the persister will sleep + // PersisterNapTimeMSec amount of time to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapUnderNumFiles int +} type notificationChan chan struct{} @@ -54,6 +61,13 @@ func (s *Scorch) persisterLoop() { var persistWatchers []*epochWatcher var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher + po, err := s.parsePersisterOptions() + if err != nil { + s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) + s.asyncTasks.Done() + return + } + OUTER: for { atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) @@ -69,7 +83,7 @@ OUTER: lastMergedEpoch = ew.epoch } lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - lastMergedEpoch, persistWatchers) + lastMergedEpoch, persistWatchers, po) var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -180,14 +194,26 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, } func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, - persistWatchers []*epochWatcher) (uint64, []*epochWatcher) { + persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { // first, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + // check the merger lag by counting the segment files on disk, + // On finding fewer files on disk, persister takes a short pause + // for sufficient 
in-memory segments to pile up for the next + // memory merge cum persist loop. + // On finding too many files on disk, persister pause until the merger + // catches up to reduce the segment file count under the threshold. + numFilesOnDisk, _ := s.diskFileStats() + if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && + po.PersisterNapTimeMSec > 0 { + time.Sleep(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)) + return lastMergedEpoch, persistWatchers + } + OUTER: - // check for slow merger and await until the merger catch up - for lastPersistedEpoch > lastMergedEpoch+epochDistance { + for numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) select { @@ -202,11 +228,32 @@ OUTER: // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + + numFilesOnDisk, _ = s.diskFileStats() } return lastMergedEpoch, persistWatchers } +func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { + po := persisterOptions{ + PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, + PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, + } + if v, ok := s.config["scorchPersisterOptions"]; ok { + b, err := json.Marshal(v) + if err != nil { + return &po, err + } + + err = json.Unmarshal(b, &po) + if err != nil { + return &po, err + } + } + return &po, nil +} + func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { persisted, err := s.persistSnapshotMaybeMerge(snapshot) if err != nil { diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index fe4f71b87..50208311d 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -434,9 +434,9 @@ func (s *Scorch) currentSnapshot() *IndexSnapshot { func (s *Scorch) Stats() json.Marshaler { return &s.stats } -func (s *Scorch) StatsMap() map[string]interface{} { - m := s.stats.ToMap() +func (s *Scorch) diskFileStats() (uint64, uint64) { + var numFilesOnDisk, 
numBytesUsedDisk uint64 if s.path != "" { finfos, err := ioutil.ReadDir(s.path) if err == nil { @@ -447,11 +447,19 @@ func (s *Scorch) StatsMap() map[string]interface{} { numFilesOnDisk++ } } - - m["CurOnDiskBytes"] = numBytesUsedDisk - m["CurOnDiskFiles"] = numFilesOnDisk } } + return numFilesOnDisk, numBytesUsedDisk +} + +func (s *Scorch) StatsMap() map[string]interface{} { + m := s.stats.ToMap() + + numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() + if numFilesOnDisk > 0 || numBytesUsedDisk > 0 { + m["CurOnDiskBytes"] = numBytesUsedDisk + m["CurOnDiskFiles"] = numFilesOnDisk + } // TODO: consider one day removing these backwards compatible // names for apps using the old names diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index cf784755d..adcabd22f 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -33,6 +33,11 @@ import ( "github.com/blevesearch/bleve/mapping" ) +func init() { + // override for tests + DefaultPersisterNapTimeMSec = 1 +} + func DestroyTest() error { return os.RemoveAll("/tmp/bleve-scorch-test") } diff --git a/test/versus_test.go b/test/versus_test.go index dbc7dd752..10faa311e 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -41,6 +41,12 @@ import ( // go test -v -run TestScorchVersusUpsideDownBolt ./test // VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test // + +func init() { + // override for tests + scorch.DefaultPersisterNapTimeMSec = 1 +} + func TestScorchVersusUpsideDownBoltAll(t *testing.T) { (&VersusTest{ t: t, From a7e64a5289dbffe898ab09ac5fa28b7c8535b90f Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 25 Jul 2018 16:09:56 -0700 Subject: [PATCH 438/728] [Scorch] Fix bug in estimating: numFilesOnDisk, numBytesUsedDisk --- index/scorch/scorch.go | 1 - 1 file changed, 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index c22b8d401..644a5ea3d 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ 
-440,7 +440,6 @@ func (s *Scorch) diskFileStats() (uint64, uint64) { if s.path != "" { finfos, err := ioutil.ReadDir(s.path) if err == nil { - var numFilesOnDisk, numBytesUsedDisk uint64 for _, finfo := range finfos { if !finfo.IsDir() { numBytesUsedDisk += uint64(finfo.Size()) From e4b290d350a7bcb636f4498e828ef42de4b3d98d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 28 Jul 2018 12:19:37 +0530 Subject: [PATCH 439/728] MB-30694: reflect.Value.Type on zero Value panics Adding checks to validate the geo point values while parsing --- geo/parse.go | 3 +++ mapping/document.go | 4 ++++ mapping/reflect.go | 3 +++ 3 files changed, 10 insertions(+) diff --git a/geo/parse.go b/geo/parse.go index 04a57538d..703ff6718 100644 --- a/geo/parse.go +++ b/geo/parse.go @@ -113,6 +113,9 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { // extract numeric value (if possible) and returns a float64 func extractNumericVal(v interface{}) (float64, bool) { val := reflect.ValueOf(v) + if !val.IsValid() { + return 0, false + } typ := val.Type() switch typ.Kind() { case reflect.Float32, reflect.Float64: diff --git a/mapping/document.go b/mapping/document.go index 6898e54e8..cc3582cad 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -331,6 +331,10 @@ func (dm *DocumentMapping) walkDocument(data interface{}, path []string, indexes } val := reflect.ValueOf(data) + if !val.IsValid() { + return + } + typ := val.Type() switch typ.Kind() { case reflect.Map: diff --git a/mapping/reflect.go b/mapping/reflect.go index 3068b1906..6500a7059 100644 --- a/mapping/reflect.go +++ b/mapping/reflect.go @@ -35,6 +35,9 @@ func lookupPropertyPath(data interface{}, path string) interface{} { func lookupPropertyPathPart(data interface{}, part string) interface{} { val := reflect.ValueOf(data) + if !val.IsValid() { + return nil + } typ := val.Type() switch typ.Kind() { case reflect.Map: From aa949f448b5442a2d329c64baf887310b2ee5157 Mon Sep 17 00:00:00 2001 
From: Sreekanth Sivasankaran Date: Sun, 29 Jul 2018 18:56:54 +0530 Subject: [PATCH 440/728] adding UTs and extra checks --- geo/parse.go | 8 ++++++-- geo/parse_test.go | 17 +++++++++++++++++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/geo/parse.go b/geo/parse.go index 703ff6718..8dfc6eed2 100644 --- a/geo/parse.go +++ b/geo/parse.go @@ -36,10 +36,14 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { var foundLon, foundLat bool thingVal := reflect.ValueOf(thing) + if !thingVal.IsValid() { + return lon, lat, false + } + thingTyp := thingVal.Type() // is it a slice - if thingVal.IsValid() && thingVal.Kind() == reflect.Slice { + if thingVal.Kind() == reflect.Slice { // must be length 2 if thingVal.Len() == 2 { first := thingVal.Index(0) @@ -68,7 +72,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { } // now try reflection on struct fields - if thingVal.IsValid() && thingVal.Kind() == reflect.Struct { + if thingVal.Kind() == reflect.Struct { for i := 0; i < thingVal.NumField(); i++ { fieldName := thingTyp.Field(i).Name if strings.HasPrefix(strings.ToLower(fieldName), "lon") { diff --git a/geo/parse_test.go b/geo/parse_test.go index 4cbf66dde..c7d088d93 100644 --- a/geo/parse_test.go +++ b/geo/parse_test.go @@ -141,6 +141,23 @@ func TestExtractGeoPoint(t *testing.T) { lat: 5.9, success: true, }, + // values are nil (not supported) + { + in: map[string]interface{}{ + "lat": nil, + "lon": nil, + }, + lon: 0, + lat: 0, + success: false, + }, + // input is nil + { + in: nil, + lon: 0, + lat: 0, + success: false, + }, } for _, test := range tests { From 1d7c871429c3e77f2532f4d59071e492abf523fb Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 30 Jul 2018 10:43:16 -0700 Subject: [PATCH 441/728] MB-30616: Revert "MB-29923 - high memory consumption in scorch" Noticed operational deadlocks by stalling the persister to allow more in-memory merges. 
This reverts commit 84502bf2c73fd9a85477d7018220e0af6a69374e. --- index/scorch/persister.go | 75 +++++++------------------------------ index/scorch/scorch.go | 19 +++------- index/scorch/scorch_test.go | 5 --- test/versus_test.go | 6 --- 4 files changed, 20 insertions(+), 85 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 96b4c2ec8..bb40c33f8 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -17,7 +17,6 @@ package scorch import ( "bytes" "encoding/binary" - "encoding/json" "fmt" "io/ioutil" "log" @@ -36,22 +35,16 @@ import ( var DefaultChunkFactor uint32 = 1024 -var DefaultPersisterNapTimeMSec int = 2000 // ms - -var DefaultPersisterNapUnderNumFiles int = 1000 - -type persisterOptions struct { - // PersisterNapTimeMSec controls the wait/delay injected into - // persistence workloop to improve the chances for - // a healthier and heavier in-memory merging - PersisterNapTimeMSec int - - // PersisterNapTimeMSec > 0, and the number of files is less than - // PersisterNapUnderNumFiles, then the persister will sleep - // PersisterNapTimeMSec amount of time to improve the chances for - // a healthier and heavier in-memory merging - PersisterNapUnderNumFiles int -} +// Arbitrary number, need to make it configurable. +// Lower values like 10/making persister really slow +// doesn't work well as it is creating more files to +// persist for in next persist iteration and spikes the # FDs. +// Ideal value should let persister also proceed at +// an optimum pace so that the merger can skip +// many intermediate snapshots. +// This needs to be based on empirical data. +// TODO - may need to revisit this approach/value. 
+var epochDistance = uint64(5) type notificationChan chan struct{} @@ -61,13 +54,6 @@ func (s *Scorch) persisterLoop() { var persistWatchers []*epochWatcher var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher - po, err := s.parsePersisterOptions() - if err != nil { - s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) - s.asyncTasks.Done() - return - } - OUTER: for { atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) @@ -83,7 +69,7 @@ OUTER: lastMergedEpoch = ew.epoch } lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - lastMergedEpoch, persistWatchers, po) + lastMergedEpoch, persistWatchers) var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -194,26 +180,14 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, } func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, - persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { + persistWatchers []*epochWatcher) (uint64, []*epochWatcher) { // first, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - // check the merger lag by counting the segment files on disk, - // On finding fewer files on disk, persister takes a short pause - // for sufficient in-memory segments to pile up for the next - // memory merge cum persist loop. - // On finding too many files on disk, persister pause until the merger - // catches up to reduce the segment file count under the threshold. 
- numFilesOnDisk, _ := s.diskFileStats() - if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && - po.PersisterNapTimeMSec > 0 { - time.Sleep(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)) - return lastMergedEpoch, persistWatchers - } - OUTER: - for numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { + // check for slow merger and await until the merger catch up + for lastPersistedEpoch > lastMergedEpoch+epochDistance { atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) select { @@ -228,32 +202,11 @@ OUTER: // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - - numFilesOnDisk, _ = s.diskFileStats() } return lastMergedEpoch, persistWatchers } -func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { - po := persisterOptions{ - PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, - PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, - } - if v, ok := s.config["scorchPersisterOptions"]; ok { - b, err := json.Marshal(v) - if err != nil { - return &po, err - } - - err = json.Unmarshal(b, &po) - if err != nil { - return &po, err - } - } - return &po, nil -} - func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { persisted, err := s.persistSnapshotMaybeMerge(snapshot) if err != nil { diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 644a5ea3d..357836524 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -434,30 +434,23 @@ func (s *Scorch) currentSnapshot() *IndexSnapshot { func (s *Scorch) Stats() json.Marshaler { return &s.stats } +func (s *Scorch) StatsMap() map[string]interface{} { + m := s.stats.ToMap() -func (s *Scorch) diskFileStats() (uint64, uint64) { - var numFilesOnDisk, numBytesUsedDisk uint64 if s.path != "" { finfos, err := ioutil.ReadDir(s.path) if err == nil { + var numFilesOnDisk, numBytesUsedDisk uint64 for _, finfo := range finfos { if !finfo.IsDir() { numBytesUsedDisk += 
uint64(finfo.Size()) numFilesOnDisk++ } } - } - } - return numFilesOnDisk, numBytesUsedDisk -} -func (s *Scorch) StatsMap() map[string]interface{} { - m := s.stats.ToMap() - - numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() - if numFilesOnDisk > 0 || numBytesUsedDisk > 0 { - m["CurOnDiskBytes"] = numBytesUsedDisk - m["CurOnDiskFiles"] = numFilesOnDisk + m["CurOnDiskBytes"] = numBytesUsedDisk + m["CurOnDiskFiles"] = numFilesOnDisk + } } // TODO: consider one day removing these backwards compatible diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index adcabd22f..cf784755d 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -33,11 +33,6 @@ import ( "github.com/blevesearch/bleve/mapping" ) -func init() { - // override for tests - DefaultPersisterNapTimeMSec = 1 -} - func DestroyTest() error { return os.RemoveAll("/tmp/bleve-scorch-test") } diff --git a/test/versus_test.go b/test/versus_test.go index 10faa311e..dbc7dd752 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -41,12 +41,6 @@ import ( // go test -v -run TestScorchVersusUpsideDownBolt ./test // VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test // - -func init() { - // override for tests - scorch.DefaultPersisterNapTimeMSec = 1 -} - func TestScorchVersusUpsideDownBoltAll(t *testing.T) { (&VersusTest{ t: t, From 6795aad1ccc6feedc1ee40c2005bcee228364a83 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 31 Jul 2018 16:36:25 +0530 Subject: [PATCH 442/728] replacing maps with slice for faster lookups --- search/facets_builder.go | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/search/facets_builder.go b/search/facets_builder.go index c5d41e2d3..7fc0bedf3 100644 --- a/search/facets_builder.go +++ b/search/facets_builder.go @@ -54,14 +54,14 @@ type FacetBuilder interface { type FacetsBuilder struct { indexReader index.IndexReader - facets map[string]FacetBuilder + facetNames 
[]string + facets []FacetBuilder fields []string } func NewFacetsBuilder(indexReader index.IndexReader) *FacetsBuilder { return &FacetsBuilder{ indexReader: indexReader, - facets: make(map[string]FacetBuilder, 0), } } @@ -69,8 +69,7 @@ func (fb *FacetsBuilder) Size() int { sizeInBytes := reflectStaticSizeFacetsBuilder + size.SizeOfPtr for k, v := range fb.facets { - sizeInBytes += size.SizeOfString + len(k) + - v.Size() + sizeInBytes += size.SizeOfString + v.Size() + len(fb.facetNames[k]) } for _, entry := range fb.fields { @@ -81,7 +80,8 @@ func (fb *FacetsBuilder) Size() int { } func (fb *FacetsBuilder) Add(name string, facetBuilder FacetBuilder) { - fb.facets[name] = facetBuilder + fb.facetNames = append(fb.facetNames, name) + fb.facets = append(fb.facets, facetBuilder) fb.fields = append(fb.fields, facetBuilder.Field()) } @@ -333,9 +333,9 @@ func (fr FacetResults) Fixup(name string, size int) { func (fb *FacetsBuilder) Results() FacetResults { fr := make(FacetResults) - for facetName, facetBuilder := range fb.facets { + for i, facetBuilder := range fb.facets { facetResult := facetBuilder.Result() - fr[facetName] = facetResult + fr[fb.facetNames[i]] = facetResult } return fr } From a05fcc02e479be66289bf20324f314121064d02a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 8 Aug 2018 18:28:40 -0700 Subject: [PATCH 443/728] optimize scorch/zap dict PrefixIterator() This optimization avoids creating a regexp for the scorch/zap dictionary prefix iterator. 
See also: https://issues.couchbase.com/browse/MB-30263 --- index/scorch/segment/zap/dict.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 736fa59f6..8e1a03f9b 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -122,16 +122,14 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { d: d, } + kBeg := []byte(prefix) + kEnd := incrementBytes(kBeg) + if d.fst != nil { - r, err := regexp.New(prefix + ".*") + itr, err := d.fst.Iterator(kBeg, kEnd) if err == nil { - itr, err := d.fst.Search(r, nil, nil) - if err == nil { - rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { - rv.err = err - } - } else { + rv.itr = itr + } else if err != nil && err != vellum.ErrIteratorDone { rv.err = err } } @@ -139,6 +137,18 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { return rv } +func incrementBytes(in []byte) []byte { + rv := make([]byte, len(in)) + copy(rv, in) + for i := len(rv) - 1; i >= 0; i-- { + rv[i] = rv[i] + 1 + if rv[i] != 0 { + return rv // didn't overflow, so stop + } + } + return nil // overflowed +} + // RangeIterator returns an iterator which only visits terms between the // start and end terms. NOTE: bleve.index API specifies the end is inclusive. func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { From 0453284a97c2c0b75b5ac609edb2d3f491a47b56 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 10 Aug 2018 10:21:27 -0700 Subject: [PATCH 444/728] RegexpSearcher takes index.Regexp interface instead of *regexp.Regexp This level of indirection allows for alternate regexp implementations. 
--- index/index.go | 11 +++++++++++ search/searcher/search_regexp.go | 6 ++---- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/index/index.go b/index/index.go index 7c4edcb8d..dd5271cf7 100644 --- a/index/index.go +++ b/index/index.go @@ -98,6 +98,17 @@ type IndexReader interface { Close() error } +// The Regexp interface defines the subset of the regexp.Regexp API +// methods that are used by bleve indexes, allowing callers to pass in +// alternate implementations. +type Regexp interface { + FindStringIndex(s string) (loc []int) + + LiteralPrefix() (prefix string, complete bool) + + String() string +} + type IndexReaderRegexp interface { FieldDictRegexp(field string, regex []byte) (FieldDict, error) } diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index ad417a056..a55909bbc 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -15,8 +15,6 @@ package searcher import ( - "regexp" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -26,7 +24,7 @@ import ( // matching the entire term. The provided regexp SHOULD NOT start with ^ // or end with $ as this can intefere with the implementation. Separately, // matches will be checked to ensure they match the entire term. 
-func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, +func NewRegexpSearcher(indexReader index.IndexReader, pattern index.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { var candidateTerms []string @@ -70,7 +68,7 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern *regexp.Regexp, } func findRegexpCandidateTerms(indexReader index.IndexReader, - pattern *regexp.Regexp, field, prefixTerm string) (rv []string, err error) { + pattern index.Regexp, field, prefixTerm string) (rv []string, err error) { rv = make([]string, 0) var fieldDict index.FieldDict if len(prefixTerm) > 0 { From 96657413a75bb36fb12dc30400c1042a97b5b670 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 10 Aug 2018 10:38:21 -0700 Subject: [PATCH 445/728] index API & scorch uses index.Regexp instead string'ified regexp In this optimization and bleve "non-porcelain" index API change, the index.IndexReaderRegexp API is changed to accept an index.Regexp instance instead of the string representation of a regexp. This allows scorch to leverage the LiteralPrefix() information of the regexp instance (which is not implemented by the vellum.regexp API), so that the FST dictionary searches can be more selective by invoking... d.fst.Search(r, prefixBeg, prefixEnd) instead of the previous... 
d.fst.Search(r, nil, nil) See also: https://issues.couchbase.com/browse/MB-30264 --- index/index.go | 2 +- index/scorch/segment/empty.go | 2 +- index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/dict.go | 17 ++++++++++++++--- index/scorch/snapshot_index.go | 4 ++-- index/scorch/snapshot_segment.go | 2 +- index_meta.go | 2 +- search/searcher/search_regexp.go | 2 +- 8 files changed, 22 insertions(+), 11 deletions(-) diff --git a/index/index.go b/index/index.go index dd5271cf7..d734c1db2 100644 --- a/index/index.go +++ b/index/index.go @@ -110,7 +110,7 @@ type Regexp interface { } type IndexReaderRegexp interface { - FieldDictRegexp(field string, regex []byte) (FieldDict, error) + FieldDictRegexp(field string, regex Regexp) (FieldDict, error) } type IndexReaderFuzzy interface { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 0489c8218..59b2af4f0 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -80,7 +80,7 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) RegexpIterator(start string) DictionaryIterator { +func (e *EmptyDictionary) RegexpIterator(r index.Regexp) DictionaryIterator { return &EmptyDictionaryIterator{} } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 3fc315995..bc85aebc0 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -51,7 +51,7 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator - RegexpIterator(regex string) DictionaryIterator + RegexpIterator(regex index.Regexp) DictionaryIterator FuzzyIterator(term string, fuzziness int) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go 
index 8e1a03f9b..ab4c8311e 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -178,15 +178,26 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator // RegexpIterator returns an iterator which only visits terms having the // the specified regex -func (d *Dictionary) RegexpIterator(regex string) segment.DictionaryIterator { +func (d *Dictionary) RegexpIterator(rIn index.Regexp) segment.DictionaryIterator { + prefixTerm, complete := rIn.LiteralPrefix() + if complete { + return d.PrefixIterator(prefixTerm) + } + rv := &DictionaryIterator{ d: d, } if d.fst != nil { - r, err := regexp.New(regex) + r, err := regexp.New(rIn.String()) if err == nil { - itr, err2 := d.fst.Search(r, nil, nil) + var prefixBeg, prefixEnd []byte + if prefixTerm != "" { + prefixBeg = []byte(prefixTerm) + prefixEnd = incrementBytes(prefixEnd) + } + + itr, err2 := d.fst.Search(r, prefixBeg, prefixEnd) if err2 == nil { rv.itr = itr } else if err2 != nil && err2 != vellum.ErrIteratorDone { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 21e19123d..dc2ace54c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -179,9 +179,9 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, } func (i *IndexSnapshot) FieldDictRegexp(field string, - termRegex []byte) (index.FieldDict, error) { + termRegex index.Regexp) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.RegexpIterator(string(termRegex)) + return i.RegexpIterator(termRegex) }) } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 90dbcb494..5b51981c1 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -52,7 +52,7 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } -func (s 
*SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator { +func (s *SegmentDictionarySnapshot) RegexpIterator(regex index.Regexp) segment.DictionaryIterator { return s.d.RegexpIterator(regex) } diff --git a/index_meta.go b/index_meta.go index 2614292b7..d814799a8 100644 --- a/index_meta.go +++ b/index_meta.go @@ -19,7 +19,7 @@ import ( "io/ioutil" "os" "path/filepath" - + "github.com/blevesearch/bleve/index/upsidedown" ) diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index a55909bbc..dc8573148 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -29,7 +29,7 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern index.Regexp, search.Searcher, error) { var candidateTerms []string if ir, ok := indexReader.(index.IndexReaderRegexp); ok { - fieldDict, err := ir.FieldDictRegexp(field, []byte(pattern.String())) + fieldDict, err := ir.FieldDictRegexp(field, pattern) if err != nil { return nil, err } From 44023bec88c9a74603abb766d4d8c5c5e51e84a2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 10 Aug 2018 16:14:45 -0700 Subject: [PATCH 446/728] fixed 1-hit scorch optimization freq/norm & added scorch regexp test The "1-hit" encoding optimization for scorch was incorrectly returning from the postings list iterator constructor too early, before setting the includeFreqNorm & includeLocs flags. This was caught while refactoring the existing regexp searcher unit test from upside-down to also test scorch. 
--- index/scorch/segment/zap/posting.go | 5 ++- search/searcher/base_test.go | 31 +++++++++++++-- search/searcher/search_regexp_test.go | 54 +++++++++++++++++++-------- 3 files changed, 70 insertions(+), 20 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 71d41a826..0ac7938e1 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -188,7 +188,10 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv.buf = buf } + rv.postings = p + rv.includeFreqNorm = includeFreq || includeNorm + rv.includeLocs = includeLocs if p.normBits1Hit != 0 { // "1-hit" encoding @@ -211,7 +214,6 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, var read int // prepare the freq chunk details - rv.includeFreqNorm = includeFreq || includeNorm if rv.includeFreqNorm { var numFreqChunks uint64 numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) @@ -229,7 +231,6 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, } // prepare the loc chunk details - rv.includeLocs = includeLocs if rv.includeLocs { n = 0 var numLocChunks uint64 diff --git a/search/searcher/base_test.go b/search/searcher/base_test.go index c47cd18c1..425d6703c 100644 --- a/search/searcher/base_test.go +++ b/search/searcher/base_test.go @@ -15,6 +15,7 @@ package searcher import ( + "io/ioutil" "math" "regexp" @@ -22,6 +23,7 @@ import ( regexpTokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch" "github.com/blevesearch/bleve/index/store/gtreap" "github.com/blevesearch/bleve/index/upsidedown" ) @@ -29,9 +31,12 @@ import ( var twoDocIndex index.Index //= upside_down.NewUpsideDownCouch(inmem.MustOpen()) func init() { + twoDocIndex = initTwoDocUpsideDown() +} + +func 
initTwoDocUpsideDown() index.Index { analysisQueue := index.NewAnalysisQueue(1) - var err error - twoDocIndex, err = upsidedown.NewUpsideDownCouch( + twoDocIndex, err := upsidedown.NewUpsideDownCouch( gtreap.Name, map[string]interface{}{ "path": "", @@ -39,7 +44,27 @@ func init() { if err != nil { panic(err) } - err = twoDocIndex.Open() + initTwoDocs(twoDocIndex) + return twoDocIndex +} + +func initTwoDocScorch() index.Index { + analysisQueue := index.NewAnalysisQueue(1) + dir, _ := ioutil.TempDir("", "scorchTwoDoc") + twoDocIndex, err := scorch.NewScorch( + scorch.Name, + map[string]interface{}{ + "path": dir, + }, analysisQueue) + if err != nil { + panic(err) + } + initTwoDocs(twoDocIndex) + return twoDocIndex +} + +func initTwoDocs(twoDocIndex index.Index) { + err := twoDocIndex.Open() if err != nil { panic(err) } diff --git a/search/searcher/search_regexp_test.go b/search/searcher/search_regexp_test.go index daa5d9c14..2de1162a5 100644 --- a/search/searcher/search_regexp_test.go +++ b/search/searcher/search_regexp_test.go @@ -15,6 +15,8 @@ package searcher import ( + "encoding/binary" + "fmt" "regexp" "testing" @@ -22,8 +24,28 @@ import ( "github.com/blevesearch/bleve/search" ) -func TestRegexpSearch(t *testing.T) { +func TestRegexpSearchUpsideDown(t *testing.T) { + twoDocIndex := initTwoDocUpsideDown() + testRegexpSearch(t, twoDocIndex, + func(id int) index.IndexInternalID { + return index.IndexInternalID(fmt.Sprintf("%d", id)) + }) + _ = twoDocIndex.Close() +} + +func TestRegexpSearchScorch(t *testing.T) { + twoDocIndex := initTwoDocScorch() + testRegexpSearch(t, twoDocIndex, + func(id int) index.IndexInternalID { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, uint64(id)) + return index.IndexInternalID(buf) + }) + _ = twoDocIndex.Close() +} +func testRegexpSearch(t *testing.T, twoDocIndex index.Index, + internalIDMaker func(int) index.IndexInternalID) { twoDocIndexReader, err := twoDocIndex.Reader() if err != nil { t.Error(err) @@ -58,27 +80,27 @@ 
func TestRegexpSearch(t *testing.T) { } tests := []struct { - searcher search.Searcher - results []*search.DocumentMatch + searcher search.Searcher + expecteds []*search.DocumentMatch }{ { searcher: regexpSearcher, - results: []*search.DocumentMatch{ + expecteds: []*search.DocumentMatch{ { - IndexInternalID: index.IndexInternalID("1"), + IndexInternalID: internalIDMaker(1), Score: 1.916290731874155, }, }, }, { searcher: regexpSearcherCo, - results: []*search.DocumentMatch{ + expecteds: []*search.DocumentMatch{ { - IndexInternalID: index.IndexInternalID("2"), + IndexInternalID: internalIDMaker(2), Score: 0.33875554280828685, }, { - IndexInternalID: index.IndexInternalID("3"), + IndexInternalID: internalIDMaker(3), Score: 0.33875554280828685, }, }, @@ -99,12 +121,14 @@ func TestRegexpSearch(t *testing.T) { next, err := test.searcher.Next(ctx) i := 0 for err == nil && next != nil { - if i < len(test.results) { - if !next.IndexInternalID.Equals(test.results[i].IndexInternalID) { - t.Errorf("expected result %d to have id %s got %s for test %d", i, test.results[i].IndexInternalID, next.IndexInternalID, testIndex) + if i < len(test.expecteds) { + if !next.IndexInternalID.Equals(test.expecteds[i].IndexInternalID) { + t.Errorf("test %d, expected result %d to have id %s got %s, next: %#v", + testIndex, i, test.expecteds[i].IndexInternalID, next.IndexInternalID, next) } - if next.Score != test.results[i].Score { - t.Errorf("expected result %d to have score %v got %v for test %d", i, test.results[i].Score, next.Score, testIndex) + if next.Score != test.expecteds[i].Score { + t.Errorf("test %d, expected result %d to have score %v got %v,next: %#v", + testIndex, i, test.expecteds[i].Score, next.Score, next) t.Logf("scoring explanation: %s", next.Expl) } } @@ -115,8 +139,8 @@ func TestRegexpSearch(t *testing.T) { if err != nil { t.Fatalf("error iterating searcher: %v for test %d", err, testIndex) } - if len(test.results) != i { - t.Errorf("expected %d results got %d for test %d", 
len(test.results), i, testIndex) + if len(test.expecteds) != i { + t.Errorf("expected %d results got %d for test %d", len(test.expecteds), i, testIndex) } } } From d4c0a3e0221c5530ed8b9c772243360e26294942 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 13 Aug 2018 16:11:50 +0530 Subject: [PATCH 447/728] MB-29923 - high memory usage in scorch This change includes, -persister nap to favour in-memory merging -skip in-memory merging on memory pressure -skip persister pause on memory pressure -checks during persister wait to guard against an already advanced merger --- index/scorch/persister.go | 95 +++++++++++++++++++++++++++++-------- index/scorch/scorch.go | 43 ++++++++++++++--- index/scorch/scorch_test.go | 5 ++ test/versus_test.go | 6 +++ 4 files changed, 123 insertions(+), 26 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index bb40c33f8..0de78c41b 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -17,6 +17,7 @@ package scorch import ( "bytes" "encoding/binary" + "encoding/json" "fmt" "io/ioutil" "log" @@ -35,16 +36,22 @@ import ( var DefaultChunkFactor uint32 = 1024 -// Arbitrary number, need to make it configurable. -// Lower values like 10/making persister really slow -// doesn't work well as it is creating more files to -// persist for in next persist iteration and spikes the # FDs. -// Ideal value should let persister also proceed at -// an optimum pace so that the merger can skip -// many intermediate snapshots. -// This needs to be based on empirical data. -// TODO - may need to revisit this approach/value. 
-var epochDistance = uint64(5) +var DefaultPersisterNapTimeMSec int = 2000 // ms + +var DefaultPersisterNapUnderNumFiles int = 1000 + +type persisterOptions struct { + // PersisterNapTimeMSec controls the wait/delay injected into + // persistence workloop to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapTimeMSec int + + // PersisterNapTimeMSec > 0, and the number of files is less than + // PersisterNapUnderNumFiles, then the persister will sleep + // PersisterNapTimeMSec amount of time to improve the chances for + // a healthier and heavier in-memory merging + PersisterNapUnderNumFiles int +} type notificationChan chan struct{} @@ -54,6 +61,13 @@ func (s *Scorch) persisterLoop() { var persistWatchers []*epochWatcher var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher + po, err := s.parsePersisterOptions() + if err != nil { + s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) + s.asyncTasks.Done() + return + } + OUTER: for { atomic.AddUint64(&s.stats.TotPersistLoopBeg, 1) @@ -69,7 +83,7 @@ OUTER: lastMergedEpoch = ew.epoch } lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, - lastMergedEpoch, persistWatchers) + lastMergedEpoch, persistWatchers, po) var ourSnapshot *IndexSnapshot var ourPersisted []chan error @@ -180,14 +194,31 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, } func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, - persistWatchers []*epochWatcher) (uint64, []*epochWatcher) { + persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { // first, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + // check the merger lag by counting the segment files on disk, + // On finding fewer files on disk, persister takes a short pause + // for sufficient in-memory segments to pile up for the next + // 
memory merge cum persist loop. + // On finding too many files on disk, persister pause until the merger + // catches up to reduce the segment file count under the threshold. + // But if there is a memory pressue, then skip this sleep maneuver. + numFilesOnDisk, _ := s.diskFileStats() + if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && + po.PersisterNapTimeMSec > 0 && s.paused() == 0 { + select { + case <-s.closeCh: + case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): + } + return lastMergedEpoch, persistWatchers + } + OUTER: - // check for slow merger and await until the merger catch up - for lastPersistedEpoch > lastMergedEpoch+epochDistance { + for numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) && + lastMergedEpoch < lastPersistedEpoch { atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) select { @@ -202,18 +233,42 @@ OUTER: // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + + numFilesOnDisk, _ = s.diskFileStats() } return lastMergedEpoch, persistWatchers } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { - persisted, err := s.persistSnapshotMaybeMerge(snapshot) - if err != nil { - return err +func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { + po := persisterOptions{ + PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, + PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, } - if persisted { - return nil + if v, ok := s.config["scorchPersisterOptions"]; ok { + b, err := json.Marshal(v) + if err != nil { + return &po, err + } + + err = json.Unmarshal(b, &po) + if err != nil { + return &po, err + } + } + return &po, nil +} + +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { + // perform in-memory merging only when there is no memory pressure + if s.paused() == 0 { + persisted, err := s.persistSnapshotMaybeMerge(snapshot) + if err != nil { + return err + } + if persisted { + return 
nil + } } return s.persistSnapshotDirect(snapshot) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 357836524..3980a8a82 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -73,6 +73,10 @@ type Scorch struct { onAsyncError func(err error) iStats internalStats + + pauseLock sync.RWMutex + + pauseCount uint64 } type internalStats struct { @@ -117,9 +121,30 @@ func NewScorch(storeName string, return rv, nil } +func (s *Scorch) paused() uint64 { + s.pauseLock.Lock() + pc := s.pauseCount + s.pauseLock.Unlock() + return pc +} + +func (s *Scorch) incrPause() { + s.pauseLock.Lock() + s.pauseCount++ + s.pauseLock.Unlock() +} + +func (s *Scorch) decrPause() { + s.pauseLock.Lock() + s.pauseCount-- + s.pauseLock.Unlock() +} + func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { if s.onEvent != nil { + s.incrPause() s.onEvent(Event{Kind: kind, Scorch: s, Duration: dur}) + s.decrPause() } } @@ -434,24 +459,30 @@ func (s *Scorch) currentSnapshot() *IndexSnapshot { func (s *Scorch) Stats() json.Marshaler { return &s.stats } -func (s *Scorch) StatsMap() map[string]interface{} { - m := s.stats.ToMap() +func (s *Scorch) diskFileStats() (uint64, uint64) { + var numFilesOnDisk, numBytesUsedDisk uint64 if s.path != "" { finfos, err := ioutil.ReadDir(s.path) if err == nil { - var numFilesOnDisk, numBytesUsedDisk uint64 for _, finfo := range finfos { if !finfo.IsDir() { numBytesUsedDisk += uint64(finfo.Size()) numFilesOnDisk++ } } - - m["CurOnDiskBytes"] = numBytesUsedDisk - m["CurOnDiskFiles"] = numFilesOnDisk } } + return numFilesOnDisk, numBytesUsedDisk +} + +func (s *Scorch) StatsMap() map[string]interface{} { + m := s.stats.ToMap() + + numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() + + m["CurOnDiskBytes"] = numBytesUsedDisk + m["CurOnDiskFiles"] = numFilesOnDisk // TODO: consider one day removing these backwards compatible // names for apps using the old names diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go 
index cf784755d..adcabd22f 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -33,6 +33,11 @@ import ( "github.com/blevesearch/bleve/mapping" ) +func init() { + // override for tests + DefaultPersisterNapTimeMSec = 1 +} + func DestroyTest() error { return os.RemoveAll("/tmp/bleve-scorch-test") } diff --git a/test/versus_test.go b/test/versus_test.go index dbc7dd752..10faa311e 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -41,6 +41,12 @@ import ( // go test -v -run TestScorchVersusUpsideDownBolt ./test // VERBOSE=1 FOCUS=Trista go test -v -run TestScorchVersusUpsideDownBolt ./test // + +func init() { + // override for tests + scorch.DefaultPersisterNapTimeMSec = 1 +} + func TestScorchVersusUpsideDownBoltAll(t *testing.T) { (&VersusTest{ t: t, From 11596bf28dd099f05b4854ffa642d16f116a2d8f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 13 Aug 2018 12:54:03 -0700 Subject: [PATCH 448/728] NewRegexpStringSearcher() API allows scorch to parse regexp string The additional NewRegexpStringSearcher() API is introduced in this optimization, where this new constructor takes the raw regexp string as input, allowing the raw regexp string to be passed down to advanced index implementations (like scorch), to avoid extra regexp string parsings. Lower-level index interfaces are also correspondingly modified to take a regexp string as input. Of note, there are some future TODO's found as part of this commit... * syntax.Regexp supports a Simplify() API that should be explored. * the LiteralPrefix() optimization is no longer supported with this commit and should be revisited. From the golang regexp source code, the LiteralPrefix() computation given an input *syntax.Regexp appears to be non-trivial. 
See also: https://issues.couchbase.com/browse/MB-30264 --- index/index.go | 2 +- index/scorch/segment/empty.go | 2 +- index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/dict.go | 27 ++++++++--- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_segment.go | 2 +- search/searcher/search_regexp.go | 79 ++++++++++++++++++++------------ 7 files changed, 75 insertions(+), 41 deletions(-) diff --git a/index/index.go b/index/index.go index d734c1db2..2b577e2e1 100644 --- a/index/index.go +++ b/index/index.go @@ -110,7 +110,7 @@ type Regexp interface { } type IndexReaderRegexp interface { - FieldDictRegexp(field string, regex Regexp) (FieldDict, error) + FieldDictRegexp(field string, regex string) (FieldDict, error) } type IndexReaderFuzzy interface { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 59b2af4f0..83968a11d 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -80,7 +80,7 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) RegexpIterator(r index.Regexp) DictionaryIterator { +func (e *EmptyDictionary) RegexpIterator(r string) DictionaryIterator { return &EmptyDictionaryIterator{} } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index bc85aebc0..3fc315995 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -51,7 +51,7 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator - RegexpIterator(regex index.Regexp) DictionaryIterator + RegexpIterator(regex string) DictionaryIterator FuzzyIterator(term string, fuzziness int) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index ab4c8311e..522731fb3 
100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -17,6 +17,7 @@ package zap import ( "bytes" "fmt" + "regexp/syntax" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" @@ -178,18 +179,30 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator // RegexpIterator returns an iterator which only visits terms having the // the specified regex -func (d *Dictionary) RegexpIterator(rIn index.Regexp) segment.DictionaryIterator { - prefixTerm, complete := rIn.LiteralPrefix() - if complete { - return d.PrefixIterator(prefixTerm) - } - +func (d *Dictionary) RegexpIterator(expr string) segment.DictionaryIterator { rv := &DictionaryIterator{ d: d, } + parsed, err := syntax.Parse(expr, syntax.Perl) + if err != nil { + rv.err = err + return rv + } + + // TODO: potential optimization where syntax.Regexp supports a Simplify() API? + + var prefixTerm string + /* TODO: potential optimization of (re-)supporting LiteralPrefix(), + even perhaps a brute-force, naive prefix detection? 
+ prefixTerm, complete := rIn.LiteralPrefix() + if complete { + return d.PrefixIterator(prefixTerm) + } + */ + if d.fst != nil { - r, err := regexp.New(rIn.String()) + r, err := regexp.NewParsedWithLimit(expr, parsed, regexp.DefaultLimit) if err == nil { var prefixBeg, prefixEnd []byte if prefixTerm != "" { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index dc2ace54c..48fa1faa1 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -179,7 +179,7 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, } func (i *IndexSnapshot) FieldDictRegexp(field string, - termRegex index.Regexp) (index.FieldDict, error) { + termRegex string) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.RegexpIterator(termRegex) }) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 5b51981c1..90dbcb494 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -52,7 +52,7 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } -func (s *SegmentDictionarySnapshot) RegexpIterator(regex index.Regexp) segment.DictionaryIterator { +func (s *SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator { return s.d.RegexpIterator(regex) } diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index dc8573148..299d9cdbe 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -15,10 +15,52 @@ package searcher import ( + "regexp" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) +// NewRegexpStringSearcher is similar to NewRegexpSearcher, but +// additionally optimizes for index readers that handle regexp's. 
+func NewRegexpStringSearcher(indexReader index.IndexReader, pattern string, + field string, boost float64, options search.SearcherOptions) ( + search.Searcher, error) { + ir, ok := indexReader.(index.IndexReaderRegexp) + if !ok { + r, err := regexp.Compile(pattern) + if err != nil { + return nil, err + } + + return NewRegexpSearcher(indexReader, r, field, boost, options) + } + + fieldDict, err := ir.FieldDictRegexp(field, pattern) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr + } + }() + + var candidateTerms []string + + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + candidateTerms = append(candidateTerms, tfd.Term) + tfd, err = fieldDict.Next() + } + if err != nil { + return nil, err + } + + return NewMultiTermSearcher(indexReader, candidateTerms, field, boost, + options, true) +} + // NewRegexpSearcher creates a searcher which will match documents that // contain terms which match the pattern regexp. The match must be EXACT // matching the entire term. 
The provided regexp SHOULD NOT start with ^ @@ -28,39 +70,18 @@ func NewRegexpSearcher(indexReader index.IndexReader, pattern index.Regexp, field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { var candidateTerms []string - if ir, ok := indexReader.(index.IndexReaderRegexp); ok { - fieldDict, err := ir.FieldDictRegexp(field, pattern) - if err != nil { - return nil, err - } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr - } - }() - // enumerate the terms and check against regexp - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - candidateTerms = append(candidateTerms, tfd.Term) - tfd, err = fieldDict.Next() - } + prefixTerm, complete := pattern.LiteralPrefix() + if complete { + // there is no pattern + candidateTerms = []string{prefixTerm} + } else { + var err error + candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, + prefixTerm) if err != nil { return nil, err } - } else { - prefixTerm, complete := pattern.LiteralPrefix() - if complete { - // there is no pattern - candidateTerms = []string{prefixTerm} - } else { - var err error - candidateTerms, err = findRegexpCandidateTerms(indexReader, pattern, field, - prefixTerm) - if err != nil { - return nil, err - } - } } return NewMultiTermSearcher(indexReader, candidateTerms, field, boost, From 538612dbad353165f839631ae558e64ca928cc40 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 13 Aug 2018 13:15:30 -0700 Subject: [PATCH 449/728] regexp & wildcard queries use NewRegexpStringSearcher() API This commit is mainly a scorch targetted optimization, moving regexp string parsing from the regexp/wildcard query tier to the index tier for scorch. Additionally, the upside_down index backend should not have any regexp or wildcard query performance impact with this change. 
However, there is one behavior change, in that regexp input validation is now delayed until the searcher constructor, turning Validate() until a no-op. See also: https://issues.couchbase.com/browse/MB-30264 --- search/query/regexp.go | 37 +++++++++++-------------------------- search/query/wildcard.go | 23 +++++------------------ 2 files changed, 16 insertions(+), 44 deletions(-) diff --git a/search/query/regexp.go b/search/query/regexp.go index 09544fcf1..0c87a6f92 100644 --- a/search/query/regexp.go +++ b/search/query/regexp.go @@ -15,7 +15,6 @@ package query import ( - "regexp" "strings" "github.com/blevesearch/bleve/index" @@ -28,7 +27,6 @@ type RegexpQuery struct { Regexp string `json:"regexp"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` - compiled *regexp.Regexp } // NewRegexpQuery creates a new Query which finds @@ -64,33 +62,20 @@ func (q *RegexpQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, opti if q.FieldVal == "" { field = m.DefaultSearchField() } - err := q.compile() - if err != nil { - return nil, err + + // require that pattern NOT be anchored to start and end of term. 
+ // do not attempt to remove trailing $, its presence is not + // known to interfere with LiteralPrefix() the way ^ does + // and removing $ introduces possible ambiguities with escaped \$, \\$, etc + actualRegexp := q.Regexp + if strings.HasPrefix(actualRegexp, "^") { + actualRegexp = actualRegexp[1:] // remove leading ^ } - return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) + return searcher.NewRegexpStringSearcher(i, actualRegexp, field, + q.BoostVal.Value(), options) } func (q *RegexpQuery) Validate() error { - return q.compile() -} - -func (q *RegexpQuery) compile() error { - if q.compiled == nil { - // require that pattern NOT be anchored to start and end of term - actualRegexp := q.Regexp - if strings.HasPrefix(actualRegexp, "^") { - actualRegexp = actualRegexp[1:] // remove leading ^ - } - // do not attempt to remove trailing $, it's presence is not - // known to interfere with LiteralPrefix() the way ^ does - // and removing $ introduces possible ambiguities with escaped \$, \\$, etc - var err error - q.compiled, err = regexp.Compile(actualRegexp) - if err != nil { - return err - } - } - return nil + return nil // real validation delayed until searcher constructor } diff --git a/search/query/wildcard.go b/search/query/wildcard.go index 7fd7482c4..747dfe76f 100644 --- a/search/query/wildcard.go +++ b/search/query/wildcard.go @@ -15,7 +15,6 @@ package query import ( - "regexp" "strings" "github.com/blevesearch/bleve/index" @@ -47,7 +46,6 @@ type WildcardQuery struct { Wildcard string `json:"wildcard"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` - compiled *regexp.Regexp } // NewWildcardQuery creates a new Query which finds @@ -83,24 +81,13 @@ func (q *WildcardQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, op if q.FieldVal == "" { field = m.DefaultSearchField() } - if q.compiled == nil { - var err error - q.compiled, err = q.convertToRegexp() - if err != nil { - return nil, 
err - } - } - return searcher.NewRegexpSearcher(i, q.compiled, field, q.BoostVal.Value(), options) -} + regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) -func (q *WildcardQuery) Validate() error { - var err error - q.compiled, err = q.convertToRegexp() - return err + return searcher.NewRegexpStringSearcher(i, regexpString, field, + q.BoostVal.Value(), options) } -func (q *WildcardQuery) convertToRegexp() (*regexp.Regexp, error) { - regexpString := wildcardRegexpReplacer.Replace(q.Wildcard) - return regexp.Compile(regexpString) +func (q *WildcardQuery) Validate() error { + return nil // real validation delayed until searcher constructor } From aeb6995a8b4d4adab2e7bfd0c74bd275eca86f98 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 13 Aug 2018 15:07:30 -0700 Subject: [PATCH 450/728] added unit test for NewRegexpStringSearcher() See also: https://issues.couchbase.com/browse/MB-30264 --- search/searcher/search_regexp_test.go | 77 +++++++++++++++++---------- 1 file changed, 49 insertions(+), 28 deletions(-) diff --git a/search/searcher/search_regexp_test.go b/search/searcher/search_regexp_test.go index 2de1162a5..0c29068bd 100644 --- a/search/searcher/search_regexp_test.go +++ b/search/searcher/search_regexp_test.go @@ -26,58 +26,79 @@ import ( func TestRegexpSearchUpsideDown(t *testing.T) { twoDocIndex := initTwoDocUpsideDown() - testRegexpSearch(t, twoDocIndex, - func(id int) index.IndexInternalID { - return index.IndexInternalID(fmt.Sprintf("%d", id)) - }) + testRegexpSearch(t, twoDocIndex, internalIDMakerUpsideDown, searcherMaker) + _ = twoDocIndex.Close() +} + +func TestRegexpStringSearchUpsideDown(t *testing.T) { + twoDocIndex := initTwoDocUpsideDown() + testRegexpSearch(t, twoDocIndex, internalIDMakerUpsideDown, searcherStringMaker) _ = twoDocIndex.Close() } func TestRegexpSearchScorch(t *testing.T) { twoDocIndex := initTwoDocScorch() - testRegexpSearch(t, twoDocIndex, - func(id int) index.IndexInternalID { - buf := make([]byte, 8) - 
binary.BigEndian.PutUint64(buf, uint64(id)) - return index.IndexInternalID(buf) - }) + testRegexpSearch(t, twoDocIndex, internalIDMakerScorch, searcherMaker) _ = twoDocIndex.Close() } -func testRegexpSearch(t *testing.T, twoDocIndex index.Index, - internalIDMaker func(int) index.IndexInternalID) { - twoDocIndexReader, err := twoDocIndex.Reader() - if err != nil { - t.Error(err) - } - defer func() { - err := twoDocIndexReader.Close() - if err != nil { - t.Fatal(err) - } - }() +func TestRegexpStringSearchScorch(t *testing.T) { + twoDocIndex := initTwoDocScorch() + testRegexpSearch(t, twoDocIndex, internalIDMakerScorch, searcherStringMaker) + _ = twoDocIndex.Close() +} - explainTrue := search.SearcherOptions{Explain: true} +func internalIDMakerUpsideDown(id int) index.IndexInternalID { + return index.IndexInternalID(fmt.Sprintf("%d", id)) +} + +func internalIDMakerScorch(id int) index.IndexInternalID { + buf := make([]byte, 8) + binary.BigEndian.PutUint64(buf, uint64(id)) + return index.IndexInternalID(buf) +} - pattern, err := regexp.Compile("ma.*") +func searcherMaker(t *testing.T, ir index.IndexReader, re, field string) search.Searcher { + pattern, err := regexp.Compile(re) if err != nil { t.Fatal(err) } - regexpSearcher, err := NewRegexpSearcher(twoDocIndexReader, pattern, "name", 1.0, explainTrue) + regexpSearcher, err := NewRegexpSearcher(ir, pattern, field, 1.0, + search.SearcherOptions{Explain: true}) if err != nil { t.Fatal(err) } - patternCo, err := regexp.Compile("co.*") + return regexpSearcher +} + +func searcherStringMaker(t *testing.T, ir index.IndexReader, re, field string) search.Searcher { + regexpSearcher, err := NewRegexpStringSearcher(ir, re, field, 1.0, + search.SearcherOptions{Explain: true}) if err != nil { t.Fatal(err) } - regexpSearcherCo, err := NewRegexpSearcher(twoDocIndexReader, patternCo, "desc", 1.0, explainTrue) + return regexpSearcher +} + +func testRegexpSearch(t *testing.T, twoDocIndex index.Index, + internalIDMaker func(int) 
index.IndexInternalID, + searcherMaker func(t *testing.T, ir index.IndexReader, re, field string) search.Searcher) { + twoDocIndexReader, err := twoDocIndex.Reader() if err != nil { - t.Fatal(err) + t.Error(err) } + defer func() { + err := twoDocIndexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + regexpSearcher := searcherMaker(t, twoDocIndexReader, "ma.*", "name") + regexpSearcherCo := searcherMaker(t, twoDocIndexReader, "co.*", "desc") tests := []struct { searcher search.Searcher From e7a78b6e98cbe18203c20ecb970f45282686c00f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 13 Aug 2018 15:57:44 -0700 Subject: [PATCH 451/728] scorch regexp leverages naive regexp literal prefix This commit re-introduces a literal prefix optimization to the scorch regexp dictionary iterator, albeit with only a simplified detection of a literal prefix from the regexp syntax parse tree. In contrast, the golang standard library's implementation of the literal prefix detection first converts the regexp syntax parse tree to a state machine (or "prog"), performs one-pass conversions/optimizations, and then examines that for any potential literal prefix in a more advanced approach. See also: https://issues.couchbase.com/browse/MB-30264 --- index/scorch/segment/zap/dict.go | 11 ++--- index/scorch/segment/zap/regexp.go | 38 +++++++++++++++++ index/scorch/segment/zap/regexp_test.go | 56 +++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 8 deletions(-) create mode 100644 index/scorch/segment/zap/regexp.go create mode 100644 index/scorch/segment/zap/regexp_test.go diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 522731fb3..3a9945fd4 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -191,15 +191,10 @@ func (d *Dictionary) RegexpIterator(expr string) segment.DictionaryIterator { } // TODO: potential optimization where syntax.Regexp supports a Simplify() API? 
+ // TODO: potential optimization where the literal prefix represents the, + // entire regexp, allowing us to use PrefixIterator(prefixTerm)? - var prefixTerm string - /* TODO: potential optimization of (re-)supporting LiteralPrefix(), - even perhaps a brute-force, naive prefix detection? - prefixTerm, complete := rIn.LiteralPrefix() - if complete { - return d.PrefixIterator(prefixTerm) - } - */ + prefixTerm := LiteralPrefix(parsed) if d.fst != nil { r, err := regexp.NewParsedWithLimit(expr, parsed, regexp.DefaultLimit) diff --git a/index/scorch/segment/zap/regexp.go b/index/scorch/segment/zap/regexp.go new file mode 100644 index 000000000..65c1beeb5 --- /dev/null +++ b/index/scorch/segment/zap/regexp.go @@ -0,0 +1,38 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "regexp/syntax" +) + +// Returns the literal prefix given the parse tree for a regexp +func LiteralPrefix(s *syntax.Regexp) string { + // traverse the left-most branch in the parse tree as long as the + // node represents a concatenation + for s != nil && s.Op == syntax.OpConcat { + if len(s.Sub) < 1 { + return "" + } + + s = s.Sub[0] + } + + if s.Op == syntax.OpLiteral { + return string(s.Rune) + } + + return "" // no literal prefix +} diff --git a/index/scorch/segment/zap/regexp_test.go b/index/scorch/segment/zap/regexp_test.go new file mode 100644 index 000000000..83e1388dc --- /dev/null +++ b/index/scorch/segment/zap/regexp_test.go @@ -0,0 +1,56 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package zap + +import ( + "regexp/syntax" + "testing" +) + +func TestLiteralPrefix(t *testing.T) { + tests := []struct { + input, expected string + }{ + {"", ""}, + {"hello", "hello"}, + {"hello.?", "hello"}, + {"hello$", "hello"}, + {`[h][e][l][l][o].*world`, "hello"}, + {`[h-h][e-e][l-l][l-l][o-o].*world`, "hello"}, + {".*", ""}, + {"h.*", "h"}, + {"h.?", "h"}, + {"h[a-z]", "h"}, + {`h\s`, "h"}, + {`(hello)world`, ""}, + {`日本語`, "日本語"}, + {`日本語\w`, "日本語"}, + {`^hello`, ""}, + {`^`, ""}, + {`$`, ""}, + } + + for i, test := range tests { + s, err := syntax.Parse(test.input, syntax.Perl) + if err != nil { + t.Fatalf("expected no syntax.Parse error, got: %v", err) + } + + got := LiteralPrefix(s) + if test.expected != got { + t.Fatalf("test: %d, %+v, got: %s", i, test, got) + } + } +} From 7ec9fe0868ab735d605e49ebe11f86d25e22c427 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 14 Aug 2018 12:18:53 -0700 Subject: [PATCH 452/728] Update vendor/manifest to include vellum --- vendor/manifest | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/vendor/manifest b/vendor/manifest index 1883de76e..d45734e9e 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -132,6 +132,14 @@ "branch": "master", "path": "/unicode/norm", "notests": true + }, + { + "importpath": "github.com/couchbase/vellum", + "repository": "https://github.com/couchbase/vellum", + "vcs": "git", + "revision": "dc6110ee42850a6553b6469bf0dc03383900353d", + "branch": "master", + "notests": true } ] } From 0ba6248ccd51804e61ad79349ad4047469c1ee70 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 15 Aug 2018 00:08:50 -0700 Subject: [PATCH 453/728] optimize scorch regexp & fuzzy parsing across multiple segments The recent, previous optimization for regexp searches for scorch took 1 step forwards and N steps backwards, in that although it removed an extra regexp/syntax re-parsing, it inefficiently pushed the work of regexp/syntax and vellum/regexp handling too far down to the zap layer. 
That matters especially in the case when there's a large number (N) of zap segments in an index snapshot, meaning that conversion from regexp/syntax to vellum/regexp would happen (inefficiently) N times! That is, each zap segment would be independently re-performing the same conversion work. Instead, this commit more efficiently performs that conversion just once, at the index/scorch level, and reuses that resulting automaton across all N segments of the index snapshot. The literal prefix computation is similarly handled now just once and reused across all N segments of the index snapshot. Finally, the fuzzy levenshtein dict iteration follows the same pattern as regexp, so this optimization approach is now also used to handle levenshtein automaton iteration. Because both regexp and levenshtein are automatons, the previous two methods of RegexpIterator() and FuzzyIterator() have now been collapsed to a single segment/Segment.AutomatonIterator() method. --- index/scorch/segment/empty.go | 9 +-- index/scorch/segment/{zap => }/regexp.go | 39 +++++++++- index/scorch/segment/{zap => }/regexp_test.go | 2 +- index/scorch/segment/segment.go | 5 +- index/scorch/segment/zap/dict.go | 78 ++----------------- index/scorch/segment/zap/dict_test.go | 67 +++++++++++++++- index/scorch/snapshot_index.go | 18 ++++- index/scorch/snapshot_segment.go | 11 +-- 8 files changed, 137 insertions(+), 92 deletions(-) rename index/scorch/segment/{zap => }/regexp.go (53%) rename index/scorch/segment/{zap => }/regexp_test.go (98%) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 83968a11d..af50d0aaf 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -17,6 +17,7 @@ package segment import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" ) type EmptySegment struct{} @@ -80,12 +81,8 @@ func (e *EmptyDictionary) RangeIterator(start, end string) DictionaryIterator { return 
&EmptyDictionaryIterator{} } -func (e *EmptyDictionary) RegexpIterator(r string) DictionaryIterator { - return &EmptyDictionaryIterator{} -} - -func (e *EmptyDictionary) FuzzyIterator(term string, - fuzziness int) DictionaryIterator { +func (e *EmptyDictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) DictionaryIterator { return &EmptyDictionaryIterator{} } diff --git a/index/scorch/segment/zap/regexp.go b/index/scorch/segment/regexp.go similarity index 53% rename from index/scorch/segment/zap/regexp.go rename to index/scorch/segment/regexp.go index 65c1beeb5..3aa151d64 100644 --- a/index/scorch/segment/zap/regexp.go +++ b/index/scorch/segment/regexp.go @@ -12,12 +12,37 @@ // See the License for the specific language governing permissions and // limitations under the License. -package zap +package segment import ( "regexp/syntax" + + "github.com/couchbase/vellum/regexp" ) +func ParseRegexp(pattern string) (a *regexp.Regexp, prefixBeg, prefixEnd []byte, err error) { + // TODO: potential optimization where syntax.Regexp supports a Simplify() API? 
+ + parsed, err := syntax.Parse(pattern, syntax.Perl) + if err != nil { + return nil, nil, nil, err + } + + re, err := regexp.NewParsedWithLimit(pattern, parsed, regexp.DefaultLimit) + if err != nil { + return nil, nil, nil, err + } + + prefix := LiteralPrefix(parsed) + if prefix != "" { + prefixBeg := []byte(prefix) + prefixEnd := IncrementBytes(prefixBeg) + return re, prefixBeg, prefixEnd, nil + } + + return re, nil, nil, nil +} + // Returns the literal prefix given the parse tree for a regexp func LiteralPrefix(s *syntax.Regexp) string { // traverse the left-most branch in the parse tree as long as the @@ -36,3 +61,15 @@ func LiteralPrefix(s *syntax.Regexp) string { return "" // no literal prefix } + +func IncrementBytes(in []byte) []byte { + rv := make([]byte, len(in)) + copy(rv, in) + for i := len(rv) - 1; i >= 0; i-- { + rv[i] = rv[i] + 1 + if rv[i] != 0 { + return rv // didn't overflow, so stop + } + } + return nil // overflowed +} diff --git a/index/scorch/segment/zap/regexp_test.go b/index/scorch/segment/regexp_test.go similarity index 98% rename from index/scorch/segment/zap/regexp_test.go rename to index/scorch/segment/regexp_test.go index 83e1388dc..b4731d6b8 100644 --- a/index/scorch/segment/zap/regexp_test.go +++ b/index/scorch/segment/regexp_test.go @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-package zap +package segment import ( "regexp/syntax" diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 3fc315995..28a879949 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -17,6 +17,7 @@ package segment import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum" ) // DocumentFieldValueVisitor defines a callback to be visited for each @@ -51,8 +52,8 @@ type TermDictionary interface { Iterator() DictionaryIterator PrefixIterator(prefix string) DictionaryIterator RangeIterator(start, end string) DictionaryIterator - RegexpIterator(regex string) DictionaryIterator - FuzzyIterator(term string, fuzziness int) DictionaryIterator + AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 3a9945fd4..219bf1526 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -17,14 +17,11 @@ package zap import ( "bytes" "fmt" - "regexp/syntax" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" - "github.com/couchbase/vellum/levenshtein" - "github.com/couchbase/vellum/regexp" ) // Dictionary is the zap representation of the term dictionary @@ -124,7 +121,7 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { } kBeg := []byte(prefix) - kEnd := incrementBytes(kBeg) + kEnd := segment.IncrementBytes(kBeg) if d.fst != nil { itr, err := d.fst.Iterator(kBeg, kEnd) @@ -138,18 +135,6 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { return rv } -func incrementBytes(in []byte) []byte { - rv := make([]byte, len(in)) - copy(rv, in) - for i := len(rv) - 1; i >= 0; i-- { - rv[i] 
= rv[i] + 1 - if rv[i] != 0 { - return rv // didn't overflow, so stop - } - } - return nil // overflowed -} - // RangeIterator returns an iterator which only visits terms between the // start and end terms. NOTE: bleve.index API specifies the end is inclusive. func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { @@ -177,66 +162,19 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator return rv } -// RegexpIterator returns an iterator which only visits terms having the -// the specified regex -func (d *Dictionary) RegexpIterator(expr string) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - parsed, err := syntax.Parse(expr, syntax.Perl) - if err != nil { - rv.err = err - return rv - } - - // TODO: potential optimization where syntax.Regexp supports a Simplify() API? - // TODO: potential optimization where the literal prefix represents the, - // entire regexp, allowing us to use PrefixIterator(prefixTerm)? 
- - prefixTerm := LiteralPrefix(parsed) - - if d.fst != nil { - r, err := regexp.NewParsedWithLimit(expr, parsed, regexp.DefaultLimit) - if err == nil { - var prefixBeg, prefixEnd []byte - if prefixTerm != "" { - prefixBeg = []byte(prefixTerm) - prefixEnd = incrementBytes(prefixEnd) - } - - itr, err2 := d.fst.Search(r, prefixBeg, prefixEnd) - if err2 == nil { - rv.itr = itr - } else if err2 != nil && err2 != vellum.ErrIteratorDone { - rv.err = err2 - } - } else { - rv.err = err - } - } - - return rv -} - -// FuzzyIterator returns an iterator which only visits terms having the -// the specified edit/levenshtein distance -func (d *Dictionary) FuzzyIterator(term string, - fuzziness int) segment.DictionaryIterator { +// AutomatonIterator returns an iterator which only visits terms +// having the the vellum automaton and start/end key range +func (d *Dictionary) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { rv := &DictionaryIterator{ d: d, } if d.fst != nil { - la, err := levenshtein.New(term, fuzziness) + itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) if err == nil { - itr, err2 := d.fst.Search(la, nil, nil) - if err2 == nil { - rv.itr = itr - } else if err2 != nil && err2 != vellum.ErrIteratorDone { - rv.err = err2 - } - } else { + rv.itr = itr + } else if err != nil && err != vellum.ErrIteratorDone { rv.err = err } } diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index 8cbd5710b..b654bf45f 100644 --- a/index/scorch/segment/zap/dict_test.go +++ b/index/scorch/segment/zap/dict_test.go @@ -22,6 +22,7 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/couchbase/vellum/levenshtein" ) func buildTestSegmentForDict() (*SegmentBase, uint64, error) { @@ -205,7 +206,11 @@ func TestDictionaryError(t *testing.T) { t.Fatal(err) } - itr := 
dict.FuzzyIterator("summer", 5) + a, err := levenshtein.New("summer", 2) + if err != nil { + t.Fatal(err) + } + itr := dict.AutomatonIterator(a, nil, nil) if itr == nil { t.Fatalf("got nil itr") } @@ -213,8 +218,64 @@ func TestDictionaryError(t *testing.T) { if nxt != nil { t.Fatalf("expected nil next") } - if err == nil { - t.Fatalf("expected error from iterator") + if err != nil { + t.Fatalf("expected nil error from iterator, got: %v", err) } + a, err = levenshtein.New("cat", 1) // cat & bat + if err != nil { + t.Fatal(err) + } + itr = dict.AutomatonIterator(a, nil, nil) + if itr == nil { + t.Fatalf("got nil itr") + } + for i := 0; i < 2; i++ { + nxt, err = itr.Next() + if nxt == nil || err != nil { + t.Fatalf("expected non-nil next and nil err, got: %v, %v", nxt, err) + } + } + nxt, err = itr.Next() + if nxt != nil || err != nil { + t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) + } + + a, err = levenshtein.New("cat", 2) // cat & bat + if err != nil { + t.Fatal(err) + } + itr = dict.AutomatonIterator(a, nil, nil) + if itr == nil { + t.Fatalf("got nil itr") + } + for i := 0; i < 2; i++ { + nxt, err = itr.Next() + if nxt == nil || err != nil { + t.Fatalf("expected non-nil next and nil err, got: %v, %v", nxt, err) + } + } + nxt, err = itr.Next() + if nxt != nil || err != nil { + t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) + } + + a, err = levenshtein.New("cat", 3) + if err != nil { + t.Fatal(err) + } + itr = dict.AutomatonIterator(a, nil, nil) + if itr == nil { + t.Fatalf("got nil itr") + } + for i := 0; i < 5; i++ { + nxt, err = itr.Next() + if nxt == nil || err != nil { + t.Fatalf("expected non-nil next and nil err, got: %v, %v", nxt, err) + } + } + nxt, err = itr.Next() + if nxt != nil || err != nil { + t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) + } } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 48fa1faa1..4722d9559 100644 --- a/index/scorch/snapshot_index.go +++ 
b/index/scorch/snapshot_index.go @@ -27,6 +27,7 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/couchbase/vellum/levenshtein" ) type asynchSegmentResult struct { @@ -180,15 +181,28 @@ func (i *IndexSnapshot) FieldDictPrefix(field string, func (i *IndexSnapshot) FieldDictRegexp(field string, termRegex string) (index.FieldDict, error) { + // TODO: potential optimization where the literal prefix represents the, + // entire regexp, allowing us to use PrefixIterator(prefixTerm)? + + a, prefixBeg, prefixEnd, err := segment.ParseRegexp(termRegex) + if err != nil { + return nil, err + } + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.RegexpIterator(termRegex) + return i.AutomatonIterator(a, prefixBeg, prefixEnd) }) } func (i *IndexSnapshot) FieldDictFuzzy(field string, term []byte, fuzziness int) (index.FieldDict, error) { + a, err := levenshtein.New(string(term), fuzziness) + if err != nil { + return nil, err + } + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.FuzzyIterator(string(term), fuzziness) + return i.AutomatonIterator(a, nil, nil) }) } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 90dbcb494..1cc8b76c2 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -23,6 +23,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" + "github.com/couchbase/vellum" ) var TermSeparator byte = 0xff @@ -52,13 +53,9 @@ func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.Dic return s.d.RangeIterator(start, end) } -func (s *SegmentDictionarySnapshot) RegexpIterator(regex string) segment.DictionaryIterator { - return s.d.RegexpIterator(regex) -} - -func (s 
*SegmentDictionarySnapshot) FuzzyIterator(term string, - fuzziness int) segment.DictionaryIterator { - return s.d.FuzzyIterator(term, fuzziness) +func (s *SegmentDictionarySnapshot) AutomatonIterator(a vellum.Automaton, + startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { + return s.d.AutomatonIterator(a, startKeyInclusive, endKeyExclusive) } func (s *SegmentDictionarySnapshot) OnlyIterator(onlyTerms [][]byte, From 3cb6fd4c12ca387ddf319efdadeb4bfbeae007ee Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 15 Aug 2018 11:17:43 -0700 Subject: [PATCH 454/728] optimize scorch fuzzy if prefix is available In this commit, the advanced/optional FieldDictFuzzy() method is modified to propagate the prefix string, if available. See also: https://issues.couchbase.com/browse/MB-30456 --- index/index.go | 2 +- index/scorch/snapshot_index.go | 12 ++++++--- search/searcher/search_fuzzy.go | 48 ++++++++++++++++----------------- 3 files changed, 34 insertions(+), 28 deletions(-) diff --git a/index/index.go b/index/index.go index 2b577e2e1..62128c3c2 100644 --- a/index/index.go +++ b/index/index.go @@ -114,7 +114,7 @@ type IndexReaderRegexp interface { } type IndexReaderFuzzy interface { - FieldDictFuzzy(field string, term []byte, fuzziness int) (FieldDict, error) + FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (FieldDict, error) } type IndexReaderOnly interface { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4722d9559..97266a900 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -195,14 +195,20 @@ func (i *IndexSnapshot) FieldDictRegexp(field string, } func (i *IndexSnapshot) FieldDictFuzzy(field string, - term []byte, fuzziness int) (index.FieldDict, error) { - a, err := levenshtein.New(string(term), fuzziness) + term string, fuzziness int, prefix string) (index.FieldDict, error) { + a, err := levenshtein.New(term, fuzziness) if err != nil { return nil, err } + var 
prefixBeg, prefixEnd []byte + if prefix != "" { + prefixBeg = []byte(prefix) + prefixEnd = segment.IncrementBytes(prefixBeg) + } + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.AutomatonIterator(a, nil, nil) + return i.AutomatonIterator(a, prefixBeg, prefixEnd) }) } diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 1ce3ba71d..b99528af4 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -54,34 +54,34 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, fuzziness int, field, prefixTerm string) (rv []string, err error) { rv = make([]string, 0) - var fieldDict index.FieldDict - if len(prefixTerm) > 0 { - fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) - } else { - // in case of advanced reader implementations directly call - // the levenshtein automaton based iterator to collect the - // candidate terms - if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { - fieldDict, err = ir.FieldDictFuzzy(field, []byte(term), fuzziness) - if err != nil { - return nil, err + // in case of advanced reader implementations directly call + // the levenshtein automaton based iterator to collect the + // candidate terms + if ir, ok := indexReader.(index.IndexReaderFuzzy); ok { + fieldDict, err := ir.FieldDictFuzzy(field, term, fuzziness, prefixTerm) + if err != nil { + return nil, err + } + defer func() { + if cerr := fieldDict.Close(); cerr != nil && err == nil { + err = cerr } - defer func() { - if cerr := fieldDict.Close(); cerr != nil && err == nil { - err = cerr - } - }() - tfd, err := fieldDict.Next() - for err == nil && tfd != nil { - rv = append(rv, tfd.Term) - if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr() - } - tfd, err = fieldDict.Next() + }() + tfd, err := fieldDict.Next() + for err == nil && tfd != nil { + rv = append(rv, tfd.Term) + if tooManyClauses(len(rv)) { + return nil, 
tooManyClausesErr() } - return rv, err + tfd, err = fieldDict.Next() } + return rv, err + } + var fieldDict index.FieldDict + if len(prefixTerm) > 0 { + fieldDict, err = indexReader.FieldDictPrefix(field, []byte(prefixTerm)) + } else { fieldDict, err = indexReader.FieldDict(field) } if err != nil { From cdd8ff3d30e8250af22d2cc3c68cce02d917e58c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 14 Aug 2018 18:33:27 -0700 Subject: [PATCH 455/728] MB-30776: [Scorch] Merger's enumerator to not skip empty terms Fixes: https://github.com/blevesearch/bleve/issues/967 --- index/scorch/segment/zap/enumerator.go | 16 ++++--- index/scorch/segment/zap/merge.go | 17 +++---- search_test.go | 63 ++++++++++++++++++++++++++ 3 files changed, 78 insertions(+), 18 deletions(-) diff --git a/index/scorch/segment/zap/enumerator.go b/index/scorch/segment/zap/enumerator.go index 3c708dd57..cd6ff73c7 100644 --- a/index/scorch/segment/zap/enumerator.go +++ b/index/scorch/segment/zap/enumerator.go @@ -46,26 +46,27 @@ func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { for i, itr := range rv.itrs { rv.currKs[i], rv.currVs[i] = itr.Current() } - rv.updateMatches() - if rv.lowK == nil { + rv.updateMatches(false) + if rv.lowK == nil && len(rv.lowIdxs) == 0 { return rv, vellum.ErrIteratorDone } return rv, nil } // updateMatches maintains the low key matches based on the currKs -func (m *enumerator) updateMatches() { +func (m *enumerator) updateMatches(skipEmptyKey bool) { m.lowK = nil m.lowIdxs = m.lowIdxs[:0] m.lowCurr = 0 for i, key := range m.currKs { - if key == nil { + if (key == nil && m.currVs[i] == 0) || // in case of empty iterator + (len(key) == 0 && skipEmptyKey) { // skip empty keys continue } cmp := bytes.Compare(key, m.lowK) - if cmp < 0 || m.lowK == nil { + if cmp < 0 || len(m.lowIdxs) == 0 { // reached a new low m.lowK = key m.lowIdxs = m.lowIdxs[:0] @@ -102,9 +103,10 @@ func (m *enumerator) Next() error { } m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() } - 
m.updateMatches() + // can skip any empty keys encountered at this point + m.updateMatches(true) } - if m.lowK == nil { + if m.lowK == nil && len(m.lowIdxs) == 0 { return vellum.ErrIteratorDone } return nil diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index c735caad3..37e391bab 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -243,10 +243,6 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, } finishTerm := func(term []byte) error { - if term == nil { - return nil - } - tfEncoder.Close() locEncoder.Close() @@ -283,17 +279,16 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, if !bytes.Equal(prevTerm, term) { // if the term changed, write out the info collected // for the previous term - err2 := finishTerm(prevTerm) - if err2 != nil { - return nil, 0, err2 + err = finishTerm(prevTerm) + if err != nil { + return nil, 0, err } } - var err2 error - postings, err2 = dicts[itrI].postingsListFromOffset( + postings, err = dicts[itrI].postingsListFromOffset( postingsOffset, drops[itrI], postings) - if err2 != nil { - return nil, 0, err2 + if err != nil { + return nil, 0, err } postItr = postings.iterator(true, true, true, postItr) diff --git a/search_test.go b/search_test.go index 7fcc71d6e..e7a878653 100644 --- a/search_test.go +++ b/search_test.go @@ -26,10 +26,14 @@ import ( "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/analyzer/custom" + "github.com/blevesearch/bleve/analysis/analyzer/keyword" + "github.com/blevesearch/bleve/analysis/analyzer/standard" "github.com/blevesearch/bleve/analysis/token/lowercase" "github.com/blevesearch/bleve/analysis/tokenizer/single" "github.com/blevesearch/bleve/analysis/tokenizer/whitespace" "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" 
"github.com/blevesearch/bleve/search/query" ) @@ -542,3 +546,62 @@ func TestNestedBooleanSearchers(t *testing.T) { t.Fatalf("Unexpected result set, %v != %v", matches, len(searchResults.Hits)) } } + +func TestSearchScorchOverEmptyKeyword(t *testing.T) { + defaultIndexType := Config.DefaultIndexType + + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + Config.DefaultIndexType = defaultIndexType + }() + + Config.DefaultIndexType = scorch.Name + + dmap := mapping.NewDocumentMapping() + dmap.DefaultAnalyzer = standard.Name + + fm := mapping.NewTextFieldMapping() + fm.Analyzer = keyword.Name + + fm1 := mapping.NewTextFieldMapping() + fm1.Analyzer = standard.Name + + dmap.AddFieldMappingsAt("id", fm) + dmap.AddFieldMappingsAt("name", fm1) + + imap := mapping.NewIndexMapping() + imap.DefaultMapping = dmap + imap.DefaultAnalyzer = standard.Name + + idx, err := New("testidx", imap) + if err != nil { + t.Fatal(err) + } + for i := 0; i < 10; i++ { + err = idx.Index(fmt.Sprint(i), map[string]string{"name": fmt.Sprintf("test%d", i), "id": ""}) + if err != nil { + t.Fatal(err) + } + } + + count, err := idx.DocCount() + if err != nil { + t.Fatal(err) + } + if count != 10 { + t.Fatalf("Unexpected doc count: %v, expected 10", count) + } + + q := query.NewWildcardQuery("test*") + sr := NewSearchRequestOptions(q, 40, 0, false) + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + if res.Total != 10 { + t.Fatalf("Unexpected search hits: %v, expected 10", res.Total) + } +} From af1fc320ab846bf2154db8c1a97928a1c946808a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 16 Aug 2018 13:31:05 +0530 Subject: [PATCH 456/728] unblock merger while persister in nap --- index/scorch/persister.go | 13 +++++++++++-- index/scorch/stats.go | 3 +++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 0de78c41b..5e9450189 100644 --- a/index/scorch/persister.go +++ 
b/index/scorch/persister.go @@ -205,19 +205,28 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastM // memory merge cum persist loop. // On finding too many files on disk, persister pause until the merger // catches up to reduce the segment file count under the threshold. - // But if there is a memory pressue, then skip this sleep maneuver. + // But if there is memory pressure, then skip this sleep maneuvers. numFilesOnDisk, _ := s.diskFileStats() if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && po.PersisterNapTimeMSec > 0 && s.paused() == 0 { select { case <-s.closeCh: case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): + atomic.AddUint64(&s.stats.TotPersisterNapPauseCompleted, 1) + + case ew := <-s.persisterNotifier: + // unblock the merger in meantime + persistWatchers = append(persistWatchers, ew) + lastMergedEpoch = ew.epoch + persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) + atomic.AddUint64(&s.stats.TotPersisterMergerNapBreak, 1) } return lastMergedEpoch, persistWatchers } OUTER: - for numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) && + for po.PersisterNapUnderNumFiles > 0 && + numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) && lastMergedEpoch < lastPersistedEpoch { atomic.AddUint64(&s.stats.TotPersisterSlowMergerPause, 1) diff --git a/index/scorch/stats.go b/index/scorch/stats.go index d4e07f6b4..353641370 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -74,6 +74,9 @@ type Stats struct { TotPersisterSlowMergerPause uint64 TotPersisterSlowMergerResume uint64 + TotPersisterNapPauseCompleted uint64 + TotPersisterMergerNapBreak uint64 + TotFileMergeLoopBeg uint64 TotFileMergeLoopErr uint64 TotFileMergeLoopEnd uint64 From 92d330cd555c971a7acf1356376f1c3ccf85a19d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 18 Aug 2018 23:51:33 +0530 Subject: [PATCH 457/728] MB-30943 - incorrect num_recs_to_persist stat Fixing the 
num_recs_to_persist stat --- index/scorch/introducer.go | 37 ++++++++++++++++++++++++++++++++++++- index/scorch/scorch.go | 1 + index/scorch/stats.go | 2 ++ 3 files changed, 39 insertions(+), 1 deletion(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 9f42a5b36..dae45f331 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -20,6 +20,7 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" + "github.com/blevesearch/bleve/index/scorch/segment/zap" ) type segmentIntroduction struct { @@ -126,6 +127,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // iterate through current segments var running uint64 + var docsToPersistCount uint64 for i := range root.segment { // see if optimistic work included this segment delta, ok := next.obsoletes[root.segment[i].id] @@ -164,8 +166,14 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { newSnapshot.offsets = append(newSnapshot.offsets, running) running += newss.segment.Count() } + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + // append new segment, if any, to end of the new index snapshot if next.data != nil { newSegmentSnapshot := &SegmentSnapshot{ @@ -241,6 +249,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { creator: "introducePersist", } + var docsToPersistCount uint64 for i, segmentSnapshot := range root.segment { // see if this segment has been replaced if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { @@ -260,6 +269,10 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { } else { newIndexSnapshot.segment[i] = root.segment[i] newIndexSnapshot.segment[i].segment.AddRef() + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + } } newIndexSnapshot.offsets[i] = root.offsets[i] } @@ 
-268,6 +281,8 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { newIndexSnapshot.internal[k] = v } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + newIndexSnapshot.updateSize() s.rootLock.Lock() rootPrev := s.root @@ -302,7 +317,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // iterate through current segments newSegmentDeleted := roaring.NewBitmap() - var running uint64 + var running, docsToPersistCount uint64 for i := range root.segment { segmentID := root.segment[i].id if segSnapAtMerge, ok := nextMerge.old[segmentID]; ok { @@ -338,7 +353,12 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { root.segment[i].segment.AddRef() newSnapshot.offsets = append(newSnapshot.offsets, running) running += root.segment[i].segment.Count() + + if isMemorySegment(root.segment[i]) { + docsToPersistCount += root.segment[i].Count() + } } + } // before the newMerge introduction, need to clean the newly @@ -369,8 +389,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { }) newSnapshot.offsets = append(newSnapshot.offsets, running) atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) + + switch nextMerge.new.(type) { + case *zap.SegmentBase: + docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() + } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + newSnapshot.AddRef() // 1 ref for the nextMerge.notify response newSnapshot.updateSize() @@ -393,6 +420,14 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { close(nextMerge.notify) } +func isMemorySegment(s *SegmentSnapshot) bool { + switch s.segment.(type) { + case *zap.SegmentBase: + return true + } + return false +} + func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 3980a8a82..f182e83c4 100644 
--- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -497,6 +497,7 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_plain_text_bytes_indexed"] = m["TotIndexedPlainTextBytes"] m["num_items_introduced"] = m["TotIntroducedItems"] m["num_items_persisted"] = m["TotPersistedItems"] + m["num_recs_to_persist"] = m["TotItemsToPersist"] m["num_bytes_used_disk"] = m["CurOnDiskBytes"] m["num_files_on_disk"] = m["CurOnDiskFiles"] m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 353641370..eb6d946be 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -61,6 +61,8 @@ type Stats struct { TotIntroducedSegmentsBatch uint64 TotIntroducedSegmentsMerge uint64 + TotItemsToPersist uint64 + TotPersistLoopBeg uint64 TotPersistLoopErr uint64 TotPersistLoopProgress uint64 From 7f2f365802df79d4babd43542c7de2b85d7845ed Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sun, 19 Aug 2018 15:26:27 +0530 Subject: [PATCH 458/728] more stats for root segments adding file,mem segment stats for currrent root --- index/scorch/introducer.go | 54 +++++++++++++++++++++++++++++--------- index/scorch/scorch.go | 2 ++ index/scorch/stats.go | 21 ++++++++------- 3 files changed, 55 insertions(+), 22 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index dae45f331..f17b3bc5b 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -127,7 +127,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { // iterate through current segments var running uint64 - var docsToPersistCount uint64 + var docsToPersistCount, memSegments, fileSegments uint64 for i := range root.segment { // see if optimistic work included this segment delta, ok := next.obsoletes[root.segment[i].id] @@ -169,10 +169,15 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if isMemorySegment(root.segment[i]) { docsToPersistCount += 
root.segment[i].Count() + memSegments++ + } else { + fileSegments++ } } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) // append new segment, if any, to end of the new index snapshot if next.data != nil { @@ -249,7 +254,7 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { creator: "introducePersist", } - var docsToPersistCount uint64 + var docsToPersistCount, memSegments, fileSegments uint64 for i, segmentSnapshot := range root.segment { // see if this segment has been replaced if replacement, ok := persist.persisted[segmentSnapshot.id]; ok { @@ -266,12 +271,16 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { // update items persisted incase of a new segment snapshot atomic.AddUint64(&s.stats.TotPersistedItems, newSegmentSnapshot.Count()) atomic.AddUint64(&s.stats.TotPersistedSegments, 1) + fileSegments++ } else { newIndexSnapshot.segment[i] = root.segment[i] newIndexSnapshot.segment[i].segment.AddRef() if isMemorySegment(root.segment[i]) { docsToPersistCount += root.segment[i].Count() + memSegments++ + } else { + fileSegments++ } } newIndexSnapshot.offsets[i] = root.offsets[i] @@ -282,7 +291,8 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) - + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) newIndexSnapshot.updateSize() s.rootLock.Lock() rootPrev := s.root @@ -317,7 +327,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { // iterate through current segments newSegmentDeleted := roaring.NewBitmap() - var running, docsToPersistCount uint64 + var running, docsToPersistCount, memSegments, fileSegments uint64 for i := range root.segment { segmentID := root.segment[i].id if segSnapAtMerge, ok := 
nextMerge.old[segmentID]; ok { @@ -356,6 +366,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { if isMemorySegment(root.segment[i]) { docsToPersistCount += root.segment[i].Count() + memSegments++ + } else { + fileSegments++ } } @@ -393,10 +406,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { switch nextMerge.new.(type) { case *zap.SegmentBase: docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() + memSegments++ + case *zap.Segment: + fileSegments++ } } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) newSnapshot.AddRef() // 1 ref for the nextMerge.notify response @@ -420,14 +438,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { close(nextMerge.notify) } -func isMemorySegment(s *SegmentSnapshot) bool { - switch s.segment.(type) { - case *zap.SegmentBase: - return true - } - return false -} - func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) @@ -453,6 +463,7 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { } s.nextSnapshotEpoch++ + var docsToPersistCount, memSegments, fileSegments uint64 // iterate through segments for i, segmentSnapshot := range revertTo.snapshot.segment { newSnapshot.segment[i] = &SegmentSnapshot{ @@ -467,8 +478,19 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { // remove segment from ineligibleForRemoval map filename := zapFileName(segmentSnapshot.id) delete(s.ineligibleForRemoval, filename) + + if isMemorySegment(segmentSnapshot) { + docsToPersistCount += segmentSnapshot.Count() + memSegments++ + } else { + fileSegments++ + } } + atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) + 
atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) + if revertTo.persisted != nil { s.rootPersisted = append(s.rootPersisted, revertTo.persisted) } @@ -490,3 +512,11 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { return nil } + +func isMemorySegment(s *SegmentSnapshot) bool { + switch s.segment.(type) { + case *zap.SegmentBase: + return true + } + return false +} diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index f182e83c4..39832eb88 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -214,6 +214,8 @@ func (s *Scorch) openBolt() error { } } + atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, uint64(len(s.root.segment))) + s.introductions = make(chan *segmentIntroduction) s.persists = make(chan *persistIntroduction) s.merges = make(chan *segmentMerge) diff --git a/index/scorch/stats.go b/index/scorch/stats.go index eb6d946be..bc1ca4bd8 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -61,8 +61,6 @@ type Stats struct { TotIntroducedSegmentsBatch uint64 TotIntroducedSegmentsMerge uint64 - TotItemsToPersist uint64 - TotPersistLoopBeg uint64 TotPersistLoopErr uint64 TotPersistLoopProgress uint64 @@ -71,6 +69,7 @@ type Stats struct { TotPersistLoopEnd uint64 TotPersistedItems uint64 + TotItemsToPersist uint64 TotPersistedSegments uint64 TotPersisterSlowMergerPause uint64 @@ -96,6 +95,7 @@ type Stats struct { TotFileMergeSegmentsEmpty uint64 TotFileMergeSegments uint64 + TotFileSegmentsAtRoot uint64 TotFileMergeWrittenBytes uint64 TotFileMergeZapBeg uint64 @@ -106,14 +106,15 @@ type Stats struct { TotFileMergeIntroductions uint64 TotFileMergeIntroductionsDone uint64 - TotMemMergeBeg uint64 - TotMemMergeErr uint64 - TotMemMergeDone uint64 - TotMemMergeZapBeg uint64 - TotMemMergeZapEnd uint64 - TotMemMergeZapTime uint64 - MaxMemMergeZapTime uint64 - TotMemMergeSegments uint64 + TotMemMergeBeg uint64 + TotMemMergeErr 
uint64 + TotMemMergeDone uint64 + TotMemMergeZapBeg uint64 + TotMemMergeZapEnd uint64 + TotMemMergeZapTime uint64 + MaxMemMergeZapTime uint64 + TotMemMergeSegments uint64 + TotMemorySegmentsAtRoot uint64 } // atomically populates the returned map From 0da7b8fc580c5e430b793c769fc89a5e64f4e4d4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 22 Aug 2018 17:07:53 -0700 Subject: [PATCH 459/728] remove scorch SegmentDictionarySnapshot wrapper The SegmentDictionarySnapshot wrapper didn't provide a lot of extra functionality, mainly to help construct PostingsLists with the right deleted bitmaps. Removing it helps optimize a little bit on memory allocations. See also: https://issues.couchbase.com/browse/MB-21318 --- index/scorch/snapshot_index.go | 8 +++--- index/scorch/snapshot_segment.go | 45 -------------------------------- 2 files changed, 4 insertions(+), 49 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 97266a900..b42f142ba 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -118,7 +118,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s results := make(chan *asynchSegmentResult) for index, segment := range i.segment { go func(index int, segment *SegmentSnapshot) { - dict, err := segment.Dictionary(field) + dict, err := segment.segment.Dictionary(field) if err != nil { results <- &asynchSegmentResult{err: err} } else { @@ -434,7 +434,7 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, if rv.dicts == nil { rv.dicts = make([]segment.TermDictionary, len(i.segment)) for i, segment := range i.segment { - dict, err := segment.Dictionary(field) + dict, err := segment.segment.Dictionary(field) if err != nil { return nil, err } @@ -442,8 +442,8 @@ func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, } } - for i := range i.segment { - pl, err := rv.dicts[i].PostingsList(term, nil, rv.postings[i]) + for 
i, segment := range i.segment { + pl, err := rv.dicts[i].PostingsList(term, segment.deleted, rv.postings[i]) if err != nil { return nil, err } diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 1cc8b76c2..7672e853b 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -23,46 +23,12 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/size" - "github.com/couchbase/vellum" ) var TermSeparator byte = 0xff var TermSeparatorSplitSlice = []byte{TermSeparator} -type SegmentDictionarySnapshot struct { - s *SegmentSnapshot - d segment.TermDictionary -} - -func (s *SegmentDictionarySnapshot) PostingsList(term []byte, except *roaring.Bitmap, - prealloc segment.PostingsList) (segment.PostingsList, error) { - // TODO: if except is non-nil, perhaps need to OR it with s.s.deleted? - return s.d.PostingsList(term, s.s.deleted, prealloc) -} - -func (s *SegmentDictionarySnapshot) Iterator() segment.DictionaryIterator { - return s.d.Iterator() -} - -func (s *SegmentDictionarySnapshot) PrefixIterator(prefix string) segment.DictionaryIterator { - return s.d.PrefixIterator(prefix) -} - -func (s *SegmentDictionarySnapshot) RangeIterator(start, end string) segment.DictionaryIterator { - return s.d.RangeIterator(start, end) -} - -func (s *SegmentDictionarySnapshot) AutomatonIterator(a vellum.Automaton, - startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { - return s.d.AutomatonIterator(a, startKeyInclusive, endKeyExclusive) -} - -func (s *SegmentDictionarySnapshot) OnlyIterator(onlyTerms [][]byte, - includeCount bool) segment.DictionaryIterator { - return s.d.OnlyIterator(onlyTerms, includeCount) -} - type SegmentSnapshot struct { id uint64 segment segment.Segment @@ -112,17 +78,6 @@ func (s *SegmentSnapshot) Count() uint64 { return rv } -func (s *SegmentSnapshot) Dictionary(field string) (segment.TermDictionary, error) { - 
d, err := s.segment.Dictionary(field) - if err != nil { - return nil, err - } - return &SegmentDictionarySnapshot{ - s: s, - d: d, - }, nil -} - func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { rv, err := s.segment.DocNumbers(docIDs) if err != nil { From c0335041e896d4f4868d401b9192505c54fe9c8b Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 23 Aug 2018 08:12:59 -0700 Subject: [PATCH 460/728] optimize scorch to only recycle current TermFieldReaders When scorch is finished using a TermFieldReader, if the indexSnapshot that it belongs to is obsolete (not the current indexSnapshot), then don't recycle the TermFieldReader so that GC can take care of it sooner. This can matter in a scenario of ongoing mutations (the current indexSnapshot keeps on changing) when there are searches (like prefix & wildcard/regexp) that might involve lots (hundreds or more) of terms per search request, meaning lots of TermFieldReaders can be in play. See also: https://issues.couchbase.com/browse/MB-29923 --- index/scorch/snapshot_index.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index b42f142ba..285412cf3 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -472,6 +472,14 @@ func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnaps } func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { + i.parent.rootLock.RLock() + obsolete := i.parent.root != i + i.parent.rootLock.RUnlock() + if obsolete { + // if we're not the current root (mutations happened), don't bother recycling + return + } + i.m2.Lock() if i.fieldTFRs == nil { i.fieldTFRs = map[string][]*IndexSnapshotTermFieldReader{} From 816b0c6aaea4fa4b9a192f0c630035e774cc8c23 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sun, 26 Aug 2018 23:53:29 +0530 Subject: [PATCH 461/728] adding new batch merge api --- index.go | 10 ++++ 
index/index.go | 20 +++++++ index_test.go | 156 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 186 insertions(+) diff --git a/index.go b/index.go index 197f4d4df..f9462a41d 100644 --- a/index.go +++ b/index.go @@ -119,6 +119,16 @@ func (b *Batch) Reset() { b.internal.Reset() } +func (b *Batch) Merge(o *Batch) { + if o != nil && o.internal != nil { + b.internal.Merge(o.internal) + if o.LastDocSize() > 0 { + b.lastDocSize = o.LastDocSize() + } + b.totalSize = uint64(b.internal.TotalDocSize()) + } +} + // An Index implements all the indexing and searching // capabilities of bleve. An Index can be created // using the New() and Open() methods. diff --git a/index/index.go b/index/index.go index 62128c3c2..a44046134 100644 --- a/index/index.go +++ b/index/index.go @@ -300,6 +300,26 @@ func (b *Batch) Reset() { b.InternalOps = make(map[string][]byte) } +func (b *Batch) Merge(o *Batch) { + for k, v := range o.IndexOps { + b.IndexOps[k] = v + } + for k, v := range o.InternalOps { + b.InternalOps[k] = v + } +} + +func (b *Batch) TotalDocSize() int { + var s int + for k, v := range b.IndexOps { + if v != nil { + s += v.Size() + size.SizeOfString + } + s += len(k) + } + return s +} + // Optimizable represents an optional interface that implementable by // optimizable resources (e.g., TermFieldReaders, Searchers). 
These // optimizable resources are provided the same OptimizableContext diff --git a/index_test.go b/index_test.go index 604328a41..0d81eba40 100644 --- a/index_test.go +++ b/index_test.go @@ -1917,3 +1917,159 @@ func TestSearchQueryCallback(t *testing.T) { t.Fatalf("Expected: %v, Got: %v", expErr, err) } } + +func TestBatchMerge(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + index, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + doca := map[string]interface{}{ + "name": "scorch", + "desc": "gophercon india", + "nation": "india", + } + + batchA := index.NewBatch() + err = batchA.Index("a", doca) + if err != nil { + t.Error(err) + } + batchA.SetInternal([]byte("batchkA"), []byte("batchvA")) + + docb := map[string]interface{}{ + "name": "moss", + "desc": "gophercon MV", + } + + batchB := index.NewBatch() + err = batchB.Index("b", docb) + if err != nil { + t.Error(err) + } + batchB.SetInternal([]byte("batchkB"), []byte("batchvB")) + + docC := map[string]interface{}{ + "name": "blahblah", + "desc": "inProgress", + "country": "usa", + } + + batchC := index.NewBatch() + err = batchC.Index("c", docC) + if err != nil { + t.Error(err) + } + batchC.SetInternal([]byte("batchkC"), []byte("batchvC")) + batchC.SetInternal([]byte("batchkB"), []byte("batchvBNew")) + batchC.Delete("a") + batchC.DeleteInternal([]byte("batchkA")) + + batchA.Merge(batchB) + + if batchA.Size() != 4 { + t.Errorf("expected batch size 4, got %d", batchA.Size()) + } + + batchA.Merge(batchC) + + if batchA.Size() != 6 { + t.Errorf("expected batch size 6, got %d", batchA.Size()) + } + + err = index.Batch(batchA) + if err != nil { + t.Fatal(err) + } + + // close the index, open it again, and try some more things + err = index.Close() + if err != nil { + t.Fatal(err) + } + + index, err = Open("testidx") + if err != nil { + t.Fatal(err) + } + defer func() { + err := index.Close() + if err != nil { + t.Fatal(err) + } 
+ }() + + count, err := index.DocCount() + if err != nil { + t.Fatal(err) + } + if count != 2 { + t.Errorf("expected doc count 2, got %d", count) + } + + doc, err := index.Document("c") + if err != nil { + t.Fatal(err) + } + if doc == nil { + t.Errorf("expected doc not nil, got nil") + } + + val, err := index.GetInternal([]byte("batchkB")) + if err != nil { + t.Fatal(err) + } + if val == nil || string(val) != "batchvBNew" { + t.Errorf("expected val: batchvBNew , got %s", val) + } + + val, err = index.GetInternal([]byte("batchkA")) + if err != nil { + t.Fatal(err) + } + if val != nil { + t.Errorf("expected nil, got %s", val) + } + + foundNameField := false + for _, field := range doc.Fields { + if field.Name() == "name" && string(field.Value()) == "blahblah" { + foundNameField = true + } + } + if !foundNameField { + t.Errorf("expected to find field named 'name' with value 'blahblah'") + } + + fields, err := index.Fields() + if err != nil { + t.Fatal(err) + } + + expectedFields := map[string]bool{ + "_all": false, + "name": false, + "desc": false, + "country": false, + } + if len(fields) < len(expectedFields) { + t.Fatalf("expected %d fields got %d", len(expectedFields), len(fields)) + } + + for _, f := range fields { + expectedFields[f] = true + } + + for ef, efp := range expectedFields { + if !efp { + t.Errorf("field %s is missing", ef) + } + } + +} From bbeefaf3518cc9d8975f963771af33737bef12c3 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 29 Aug 2018 14:24:04 +0530 Subject: [PATCH 462/728] exposing segment,persister stats --- index/scorch/scorch.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 39832eb88..5e56c49b0 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -502,6 +502,10 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_recs_to_persist"] = m["TotItemsToPersist"] m["num_bytes_used_disk"] = m["CurOnDiskBytes"] m["num_files_on_disk"] = 
m["CurOnDiskFiles"] + m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] + m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] + m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"] + m["num_persister_nap_merger_break"] = m["TotPersisterMergerNapBreak"] m["total_compaction_written_bytes"] = m["TotFileMergeWrittenBytes"] return m From aefc7de851b095f845be56dadb1bea7ed1a38a9c Mon Sep 17 00:00:00 2001 From: Oleg Kovalov Date: Sat, 1 Sep 2018 13:25:28 +0200 Subject: [PATCH 463/728] minor code cleanups --- analysis/token/camelcase/parser.go | 8 ++++---- index/scorch/introducer.go | 3 ++- search/collector/heap.go | 4 ++-- search/collector/list.go | 5 ++--- search/collector/slice.go | 4 ++-- search/query/query.go | 12 ++++-------- 6 files changed, 16 insertions(+), 20 deletions(-) diff --git a/analysis/token/camelcase/parser.go b/analysis/token/camelcase/parser.go index d691e5646..ff4ce2fea 100644 --- a/analysis/token/camelcase/parser.go +++ b/analysis/token/camelcase/parser.go @@ -46,11 +46,11 @@ type Parser struct { index int } -func NewParser(len, position, index int) *Parser { +func NewParser(length, position, index int) *Parser { return &Parser{ - bufferLen: len, - buffer: make([]rune, 0, len), - tokens: make([]*analysis.Token, 0, len), + bufferLen: length, + buffer: make([]rune, 0, length), + tokens: make([]*analysis.Token, 0, length), position: position, index: index, } diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index f17b3bc5b..12f27af66 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -517,6 +517,7 @@ func isMemorySegment(s *SegmentSnapshot) bool { switch s.segment.(type) { case *zap.SegmentBase: return true + default: + return false } - return false } diff --git a/search/collector/heap.go b/search/collector/heap.go index bdf72eade..05502d5df 100644 --- a/search/collector/heap.go +++ b/search/collector/heap.go @@ -25,9 +25,9 @@ type collectStoreHeap struct { compare 
collectorCompare } -func newStoreHeap(cap int, compare collectorCompare) *collectStoreHeap { +func newStoreHeap(capacity int, compare collectorCompare) *collectStoreHeap { rv := &collectStoreHeap{ - heap: make(search.DocumentMatchCollection, 0, cap), + heap: make(search.DocumentMatchCollection, 0, capacity), compare: compare, } heap.Init(rv) diff --git a/search/collector/list.go b/search/collector/list.go index ec2f69cb8..f01d205c9 100644 --- a/search/collector/list.go +++ b/search/collector/list.go @@ -25,7 +25,7 @@ type collectStoreList struct { compare collectorCompare } -func newStoreList(cap int, compare collectorCompare) *collectStoreList { +func newStoreList(capacity int, compare collectorCompare) *collectStoreList { rv := &collectStoreList{ results: list.New(), compare: compare, @@ -34,8 +34,7 @@ func newStoreList(cap int, compare collectorCompare) *collectStoreList { return rv } -func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, - size int) *search.DocumentMatch { +func (c *collectStoreList) AddNotExceedingSize(doc *search.DocumentMatch, size int) *search.DocumentMatch { c.add(doc) if c.len() > size { return c.removeLast() diff --git a/search/collector/slice.go b/search/collector/slice.go index 32cb86244..85fe73c40 100644 --- a/search/collector/slice.go +++ b/search/collector/slice.go @@ -21,9 +21,9 @@ type collectStoreSlice struct { compare collectorCompare } -func newStoreSlice(cap int, compare collectorCompare) *collectStoreSlice { +func newStoreSlice(capacity int, compare collectorCompare) *collectStoreSlice { rv := &collectStoreSlice{ - slice: make(search.DocumentMatchCollection, 0, cap), + slice: make(search.DocumentMatchCollection, 0, capacity), compare: compare, } return rv diff --git a/search/query/query.go b/search/query/query.go index 1b0d94c01..c7c1eefb8 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -296,32 +296,28 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { } expand = 
func(query Query) (Query, error) { - switch query.(type) { + switch q := query.(type) { case *QueryStringQuery: - q := query.(*QueryStringQuery) parsed, err := parseQuerySyntax(q.Query) if err != nil { return nil, fmt.Errorf("could not parse '%s': %s", q.Query, err) } return expand(parsed) case *ConjunctionQuery: - q := *query.(*ConjunctionQuery) children, err := expandSlice(q.Conjuncts) if err != nil { return nil, err } q.Conjuncts = children - return &q, nil + return q, nil case *DisjunctionQuery: - q := *query.(*DisjunctionQuery) children, err := expandSlice(q.Disjuncts) if err != nil { return nil, err } q.Disjuncts = children - return &q, nil + return q, nil case *BooleanQuery: - q := *query.(*BooleanQuery) var err error q.Must, err = expand(q.Must) if err != nil { @@ -335,7 +331,7 @@ func expandQuery(m mapping.IndexMapping, query Query) (Query, error) { if err != nil { return nil, err } - return &q, nil + return q, nil default: return query, nil } From f65b131da12d6a66ed77d4dd561123e96692df09 Mon Sep 17 00:00:00 2001 From: Oleg Kovalov Date: Sat, 1 Sep 2018 13:36:29 +0200 Subject: [PATCH 464/728] minor code cleanups --- index/scorch/mergeplan/merge_plan.go | 4 ++-- index/scorch/persister.go | 2 +- index/scorch/segment/zap/docvalues.go | 2 +- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_index_dict.go | 2 +- index/store/metrics/metrics_test.go | 2 +- index/upsidedown/row.go | 2 +- index/upsidedown/upsidedown.go | 2 +- search/scorer/scorer_term_test.go | 2 +- test/versus_test.go | 10 +++++----- 10 files changed, 15 insertions(+), 15 deletions(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index b09e5381e..c2a0d3c64 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -217,14 +217,14 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { if len(roster) > 0 { rosterScore := scoreSegments(roster, o) - if len(bestRoster) <= 0 || rosterScore < 
bestRosterScore { + if len(bestRoster) == 0 || rosterScore < bestRosterScore { bestRoster = roster bestRosterScore = rosterScore } } } - if len(bestRoster) <= 0 { + if len(bestRoster) == 0 { return rv, nil } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 5e9450189..fa6b249f5 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -757,7 +757,7 @@ func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { s.eligibleForRemoval = newEligible s.rootLock.Unlock() - if len(epochsToRemove) <= 0 { + if len(epochsToRemove) == 0 { return 0, nil } diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 72ce1248f..bcc0f9472 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -176,7 +176,7 @@ func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTerm if err != nil { return err } - if di.curChunkData == nil || len(di.curChunkHeader) <= 0 { + if di.curChunkData == nil || len(di.curChunkHeader) == 0 { continue } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 285412cf3..0d312fcca 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -654,7 +654,7 @@ func (i *IndexSnapshot) DumpFields() chan interface{} { // subtractStrings returns set a minus elements of set b. 
func subtractStrings(a, b []string) []string { - if len(b) <= 0 { + if len(b) == 0 { return a } diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 2d229ca0f..abd3bde8c 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -52,7 +52,7 @@ func (i *IndexSnapshotFieldDict) Pop() interface{} { } func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { - if len(i.cursors) <= 0 { + if len(i.cursors) == 0 { return nil, nil } i.entry = i.cursors[0].curr diff --git a/index/store/metrics/metrics_test.go b/index/store/metrics/metrics_test.go index cda7a2aef..fb78514bf 100644 --- a/index/store/metrics/metrics_test.go +++ b/index/store/metrics/metrics_test.go @@ -57,7 +57,7 @@ func TestMetricsStore(t *testing.T) { if err != nil { t.Errorf("expected WriteJSON to be unmarshallable") } - if len(m) <= 0 { + if len(m) == 0 { t.Errorf("expected some entries") } diff --git a/index/upsidedown/row.go b/index/upsidedown/row.go index ba50314cd..531e0a0d3 100644 --- a/index/upsidedown/row.go +++ b/index/upsidedown/row.go @@ -584,7 +584,7 @@ func (tfr *TermFrequencyRow) parseK(key []byte) error { func (tfr *TermFrequencyRow) parseKDoc(key []byte, term []byte) error { tfr.doc = key[3+len(term)+1:] - if len(tfr.doc) <= 0 { + if len(tfr.doc) == 0 { return fmt.Errorf("invalid term frequency key, empty docid") } diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 70e6e457f..6d3738539 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -775,7 +775,7 @@ func (udc *UpsideDownCouch) termVectorsFromTokenFreq(field uint16, tf *analysis. 
} func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) []*index.TermFieldVector { - if len(in) <= 0 { + if len(in) == 0 { return nil } diff --git a/search/scorer/scorer_term_test.go b/search/scorer/scorer_term_test.go index 23d449788..b491ee00c 100644 --- a/search/scorer/scorer_term_test.go +++ b/search/scorer/scorer_term_test.go @@ -157,7 +157,7 @@ func TestTermScorer(t *testing.T) { } actual := scorer.Score(ctx, test.termMatch) actual.Complete(nil) - if len(actual.FieldTermLocations) <= 0 { + if len(actual.FieldTermLocations) == 0 { actual.FieldTermLocations = nil } diff --git a/test/versus_test.go b/test/versus_test.go index 10faa311e..20a85d161 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -253,7 +253,7 @@ func testVersusSearches(vt *VersusTest, searchTemplates []string, idxA, idxB ble // definitely find at least one document. "bodyWord": func(i int) string { body := vt.Bodies[vt.CurAttempt%len(vt.Bodies)] - if len(body) <= 0 { + if len(body) == 0 { return "" } return body[i%len(body)] @@ -324,10 +324,10 @@ func testVersusSearches(vt *VersusTest, searchTemplates []string, idxA, idxB ble hitsB := hitsById(resB) for id, hitA := range hitsA { hitB := hitsB[id] - if len(hitA.FieldTermLocations) <= 0 { + if len(hitA.FieldTermLocations) == 0 { hitA.FieldTermLocations = nil } - if len(hitB.FieldTermLocations) <= 0 { + if len(hitB.FieldTermLocations) == 0 { hitB.FieldTermLocations = nil } if !reflect.DeepEqual(hitA, hitB) { @@ -338,10 +338,10 @@ func testVersusSearches(vt *VersusTest, searchTemplates []string, idxA, idxB ble } for id, hitB := range hitsB { hitA := hitsA[id] - if len(hitA.FieldTermLocations) <= 0 { + if len(hitA.FieldTermLocations) == 0 { hitA.FieldTermLocations = nil } - if len(hitB.FieldTermLocations) <= 0 { + if len(hitB.FieldTermLocations) == 0 { hitB.FieldTermLocations = nil } if !reflect.DeepEqual(hitA, hitB) { From 38f239cf70326c6983e7e09a22a4a3769969f6a6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Thu, 
23 Aug 2018 08:59:55 -0700 Subject: [PATCH 465/728] optimize scorch with sync.Pool of zap Dict/PostingsList/Iterator Some searches like prefix searches can generate many term field readers per search request, and in the face of ongoing mutations, scorch will not recycle those term field readers. This change adds sync pooling or recycling of the constituent parts of a scorch TermFieldReader, which hopefully reduces some garbage and takes some pressure off the memory allocator. --- index/scorch/segment/empty.go | 6 ++ index/scorch/segment/zap/dict.go | 13 ++++- index/scorch/segment/zap/posting.go | 87 +++++++++++++++++++---------- index/scorch/segment/zap/segment.go | 9 ++- index/scorch/snapshot_index.go | 6 +- index/scorch/snapshot_index_tfr.go | 39 ++++++++++++- index/scorch/stats.go | 7 +++ 7 files changed, 126 insertions(+), 41 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index af50d0aaf..a0fe74555 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -91,6 +91,8 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) Recycle() {} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { @@ -116,6 +118,8 @@ func (e *EmptyPostingsList) Count() uint64 { return 0 } +func (e *EmptyPostingsList) Recycle() {} + type EmptyPostingsIterator struct{} func (e *EmptyPostingsIterator) Next() (Posting, error) { @@ -125,3 +129,5 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Size() int { return 0 } + +func (e *EmptyPostingsIterator) Recycle() {} diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 219bf1526..4af29409a 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -17,6 +17,7 @@ package zap import ( "bytes" "fmt" + "sync" "github.com/RoaringBitmap/roaring" 
"github.com/blevesearch/bleve/index" @@ -33,6 +34,8 @@ type Dictionary struct { fstReader *vellum.Reader } +var dictionaryPool = sync.Pool{New: func() interface{} { return &Dictionary{} }} + // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, prealloc segment.PostingsList) (segment.PostingsList, error) { @@ -79,7 +82,7 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { if rv == nil || rv == emptyPostingsList { - rv = &PostingsList{} + rv = postingsListPool.Get().(*PostingsList) } else { postings := rv.postings if postings != nil { @@ -225,6 +228,14 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, return rv } +func (d *Dictionary) Recycle() { + *d = Dictionary{} // clear fields + + // TODO: need vellum API's to allow for recycled or prealloc'ed FST's + + dictionaryPool.Put(d) +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 0ac7938e1..64f223896 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -21,6 +21,7 @@ import ( "io" "math" "reflect" + "sync" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -112,6 +113,8 @@ type PostingsList struct { // represents an immutable, empty postings list var emptyPostingsList = &PostingsList{} +var postingsListPool = sync.Pool{New: func() interface{} { return &PostingsList{} }} + func (p *PostingsList) Size() int { sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr @@ -155,38 +158,9 @@ func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv *PostingsIterator) *PostingsIterator 
{ if rv == nil { - rv = &PostingsIterator{} + rv = postingsIteratorPool.Get().(*PostingsIterator) } else { - freqNormReader := rv.freqNormReader - if freqNormReader != nil { - freqNormReader.Reset([]byte(nil)) - } - - locReader := rv.locReader - if locReader != nil { - locReader.Reset([]byte(nil)) - } - - freqChunkOffsets := rv.freqChunkOffsets[:0] - locChunkOffsets := rv.locChunkOffsets[:0] - - nextLocs := rv.nextLocs[:0] - nextSegmentLocs := rv.nextSegmentLocs[:0] - - buf := rv.buf - - *rv = PostingsIterator{} // clear the struct - - rv.freqNormReader = freqNormReader - rv.locReader = locReader - - rv.freqChunkOffsets = freqChunkOffsets - rv.locChunkOffsets = locChunkOffsets - - rv.nextLocs = nextLocs - rv.nextSegmentLocs = nextSegmentLocs - - rv.buf = buf + rv.Clear() } rv.postings = p @@ -322,6 +296,14 @@ func (rv *PostingsList) init1Hit(fstVal uint64) error { return nil } +func (rv *PostingsList) Recycle() { + if rv != emptyPostingsList { + *rv = PostingsList{} + // TODO: can we also recycle the roaring bitmaps? 
+ postingsListPool.Put(rv) + } +} + // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList @@ -357,6 +339,8 @@ type PostingsIterator struct { var emptyPostingsIterator = &PostingsIterator{} +var postingsIteratorPool = sync.Pool{New: func() interface{} { return &PostingsIterator{} }} + func (i *PostingsIterator) Size() int { sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + len(i.currChunkFreqNorm) + @@ -711,6 +695,47 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return uint64(n), true, nil } +func (rv *PostingsIterator) Clear() { + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.Reset([]byte(nil)) + } + + locReader := rv.locReader + if locReader != nil { + locReader.Reset([]byte(nil)) + } + + freqChunkOffsets := rv.freqChunkOffsets[:0] + locChunkOffsets := rv.locChunkOffsets[:0] + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.freqChunkOffsets = freqChunkOffsets + rv.locChunkOffsets = locChunkOffsets + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf +} + +func (i *PostingsIterator) Recycle() { + if i != emptyPostingsIterator { + i.Clear() + + postingsIteratorPool.Put(i) + } +} + // Posting is a single entry in a postings list type Posting struct { docNum uint64 diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 8c6de211a..b9a328ad4 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -250,11 +250,10 @@ func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { fieldIDPlus1 := sb.fieldsMap[field] if fieldIDPlus1 > 0 { - rv = 
&Dictionary{ - sb: sb, - field: field, - fieldID: fieldIDPlus1 - 1, - } + rv = dictionaryPool.Get().(*Dictionary) + rv.sb = sb + rv.field = field + rv.fieldID = fieldIDPlus1 - 1 dictStart := sb.dictLocs[rv.fieldID] if dictStart > 0 { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 285412cf3..7dab97599 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -471,13 +471,13 @@ func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnaps return &IndexSnapshotTermFieldReader{} } -func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { +func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) bool { i.parent.rootLock.RLock() obsolete := i.parent.root != i i.parent.rootLock.RUnlock() if obsolete { // if we're not the current root (mutations happened), don't bother recycling - return + return false } i.m2.Lock() @@ -486,6 +486,8 @@ func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader } i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) i.m2.Unlock() + + return true } func docNumberToBytes(buf []byte, in uint64) []byte { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 89af3be4c..229869537 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -178,8 +178,43 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Close() error { if i.snapshot != nil { - atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) - i.snapshot.recycleTermFieldReader(i) + stats := &i.snapshot.parent.stats + + atomic.AddUint64(&stats.TotTermSearchersFinished, uint64(1)) + + if i.snapshot.recycleTermFieldReader(i) { + atomic.AddUint64(&stats.TotTermFieldReadersRecycled, uint64(1)) + } else { + atomic.AddUint64(&stats.TotTermFieldReadersGarbaged, uint64(1)) + + // the snapshot couldn't 
recycle the TFR (it was too + // obsolete), so recycle its constituent parts + + for _, x := range i.iterators { + recycle(x) + } + atomic.AddUint64(&stats.TotPostingsIteratorsRecycled, uint64(len(i.iterators))) + + for _, x := range i.postings { + recycle(x) + } + atomic.AddUint64(&stats.TotPostingsListsRecycled, uint64(len(i.postings))) + + for _, x := range i.dicts { + recycle(x) + } + atomic.AddUint64(&stats.TotTermDictionariesRecycled, uint64(len(i.dicts))) + } } return nil } + +type Recycler interface { + Recycle() +} + +func recycle(x interface{}) { + if r, ok := x.(Recycler); ok { + r.Recycle() + } +} diff --git a/index/scorch/stats.go b/index/scorch/stats.go index bc1ca4bd8..ba497de99 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -115,6 +115,13 @@ type Stats struct { MaxMemMergeZapTime uint64 TotMemMergeSegments uint64 TotMemorySegmentsAtRoot uint64 + + TotTermFieldReadersGarbaged uint64 + TotTermFieldReadersRecycled uint64 + + TotTermDictionariesRecycled uint64 + TotPostingsListsRecycled uint64 + TotPostingsIteratorsRecycled uint64 } // atomically populates the returned map From 15e5f6d3059089149b59cd47c5a5b069d3e21d59 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 6 Sep 2018 11:28:37 -0700 Subject: [PATCH 466/728] Invvoke fst/reader's Close() to reuse via sync pool on recycle --- index/scorch/segment/zap/dict.go | 10 +++++++--- vendor/manifest | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 4af29409a..cfbbd2397 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -229,10 +229,14 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, } func (d *Dictionary) Recycle() { - *d = Dictionary{} // clear fields - - // TODO: need vellum API's to allow for recycled or prealloc'ed FST's + if d.fst != nil { + _ = d.fst.Close() + } + if d.fstReader != nil { + d.fstReader.Close() + } + *d = Dictionary{} // 
clear fields dictionaryPool.Put(d) } diff --git a/vendor/manifest b/vendor/manifest index d45734e9e..46755ffd8 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "dc6110ee42850a6553b6469bf0dc03383900353d", + "revision": "7b31f610cdce81be8b91a93cdcb0d6a151c6f8fd", "branch": "master", "notests": true } From cc4fd7137ccfcc34fcf5ae766b0b6cf9fc47f482 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 6 Sep 2018 12:14:28 -0700 Subject: [PATCH 467/728] Check for TooManyClauses as early as possible where applicable Context: + MultiTermSearcher + TermPrefixSearcher --- search/searcher/search_multi_term.go | 8 ++++++++ search/searcher/search_term_prefix.go | 3 +++ 2 files changed, 11 insertions(+) diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index b469beadb..a723aedc5 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -22,6 +22,10 @@ import ( func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { + if limit && tooManyClauses(len(terms)) { + return nil, tooManyClausesErr() + } + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { @@ -46,6 +50,10 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { + if limit && tooManyClauses(len(terms)) { + return nil, tooManyClausesErr() + } + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { diff --git a/search/searcher/search_term_prefix.go 
b/search/searcher/search_term_prefix.go index c49788c71..59db93101 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -37,6 +37,9 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, tfd, err := fieldDict.Next() for err == nil && tfd != nil { terms = append(terms, tfd.Term) + if tooManyClauses(len(terms)) { + return nil, tooManyClausesErr() + } tfd, err = fieldDict.Next() } if err != nil { From 6f6636037e5344564da3eaa75054010440357ceb Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 6 Sep 2018 13:05:22 -0700 Subject: [PATCH 468/728] Update vendor/manifest revision of vellum Point to: 35d9e73 Prateek Rungta | Re-work pooling to have a fixed memory cost --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 46755ffd8..5d1fb4717 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "7b31f610cdce81be8b91a93cdcb0d6a151c6f8fd", + "revision": "35d9e7346a69d2499623063cd39f305599fc7eae", "branch": "master", "notests": true } From 504bcd3a1da5f1001032052a580efd33ce3b2676 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 10 Sep 2018 10:45:06 -0700 Subject: [PATCH 469/728] Revert "Invvoke fst/reader's Close() to reuse via sync pool on recycle" This reverts commit 15e5f6d3059089149b59cd47c5a5b069d3e21d59. 
--- index/scorch/segment/zap/dict.go | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index cfbbd2397..4af29409a 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -229,14 +229,10 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, } func (d *Dictionary) Recycle() { - if d.fst != nil { - _ = d.fst.Close() - } - if d.fstReader != nil { - d.fstReader.Close() - } - *d = Dictionary{} // clear fields + + // TODO: need vellum API's to allow for recycled or prealloc'ed FST's + dictionaryPool.Put(d) } From 13d37d5a469b69a534ddd6096111d242a816eaa9 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 10 Sep 2018 11:45:21 -0700 Subject: [PATCH 470/728] Revert "optimize scorch with sync.Pool of zap Dict/PostingsList/Iterator" + Observed an overall performace regression with this change. This reverts commit 38f239cf70326c6983e7e09a22a4a3769969f6a6. --- index/scorch/segment/empty.go | 6 -- index/scorch/segment/zap/dict.go | 13 +---- index/scorch/segment/zap/posting.go | 87 ++++++++++------------------- index/scorch/segment/zap/segment.go | 9 +-- index/scorch/snapshot_index.go | 6 +- index/scorch/snapshot_index_tfr.go | 39 +------------ index/scorch/stats.go | 7 --- 7 files changed, 41 insertions(+), 126 deletions(-) diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index a0fe74555..af50d0aaf 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -91,8 +91,6 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) Recycle() {} - type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { @@ -118,8 +116,6 @@ func (e *EmptyPostingsList) Count() uint64 { return 0 } -func (e *EmptyPostingsList) Recycle() {} - type EmptyPostingsIterator struct{} func (e *EmptyPostingsIterator) 
Next() (Posting, error) { @@ -129,5 +125,3 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Size() int { return 0 } - -func (e *EmptyPostingsIterator) Recycle() {} diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 4af29409a..219bf1526 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -17,7 +17,6 @@ package zap import ( "bytes" "fmt" - "sync" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" @@ -34,8 +33,6 @@ type Dictionary struct { fstReader *vellum.Reader } -var dictionaryPool = sync.Pool{New: func() interface{} { return &Dictionary{} }} - // PostingsList returns the postings list for the specified term func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, prealloc segment.PostingsList) (segment.PostingsList, error) { @@ -82,7 +79,7 @@ func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roari func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { if rv == nil || rv == emptyPostingsList { - rv = postingsListPool.Get().(*PostingsList) + rv = &PostingsList{} } else { postings := rv.postings if postings != nil { @@ -228,14 +225,6 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, return rv } -func (d *Dictionary) Recycle() { - *d = Dictionary{} // clear fields - - // TODO: need vellum API's to allow for recycled or prealloc'ed FST's - - dictionaryPool.Put(d) -} - // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 64f223896..0ac7938e1 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -21,7 +21,6 @@ import ( "io" "math" "reflect" - "sync" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -113,8 +112,6 @@ type 
PostingsList struct { // represents an immutable, empty postings list var emptyPostingsList = &PostingsList{} -var postingsListPool = sync.Pool{New: func() interface{} { return &PostingsList{} }} - func (p *PostingsList) Size() int { sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr @@ -158,9 +155,38 @@ func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv *PostingsIterator) *PostingsIterator { if rv == nil { - rv = postingsIteratorPool.Get().(*PostingsIterator) + rv = &PostingsIterator{} } else { - rv.Clear() + freqNormReader := rv.freqNormReader + if freqNormReader != nil { + freqNormReader.Reset([]byte(nil)) + } + + locReader := rv.locReader + if locReader != nil { + locReader.Reset([]byte(nil)) + } + + freqChunkOffsets := rv.freqChunkOffsets[:0] + locChunkOffsets := rv.locChunkOffsets[:0] + + nextLocs := rv.nextLocs[:0] + nextSegmentLocs := rv.nextSegmentLocs[:0] + + buf := rv.buf + + *rv = PostingsIterator{} // clear the struct + + rv.freqNormReader = freqNormReader + rv.locReader = locReader + + rv.freqChunkOffsets = freqChunkOffsets + rv.locChunkOffsets = locChunkOffsets + + rv.nextLocs = nextLocs + rv.nextSegmentLocs = nextSegmentLocs + + rv.buf = buf } rv.postings = p @@ -296,14 +322,6 @@ func (rv *PostingsList) init1Hit(fstVal uint64) error { return nil } -func (rv *PostingsList) Recycle() { - if rv != emptyPostingsList { - *rv = PostingsList{} - // TODO: can we also recycle the roaring bitmaps? 
- postingsListPool.Put(rv) - } -} - // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList @@ -339,8 +357,6 @@ type PostingsIterator struct { var emptyPostingsIterator = &PostingsIterator{} -var postingsIteratorPool = sync.Pool{New: func() interface{} { return &PostingsIterator{} }} - func (i *PostingsIterator) Size() int { sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + len(i.currChunkFreqNorm) + @@ -695,47 +711,6 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return uint64(n), true, nil } -func (rv *PostingsIterator) Clear() { - freqNormReader := rv.freqNormReader - if freqNormReader != nil { - freqNormReader.Reset([]byte(nil)) - } - - locReader := rv.locReader - if locReader != nil { - locReader.Reset([]byte(nil)) - } - - freqChunkOffsets := rv.freqChunkOffsets[:0] - locChunkOffsets := rv.locChunkOffsets[:0] - - nextLocs := rv.nextLocs[:0] - nextSegmentLocs := rv.nextSegmentLocs[:0] - - buf := rv.buf - - *rv = PostingsIterator{} // clear the struct - - rv.freqNormReader = freqNormReader - rv.locReader = locReader - - rv.freqChunkOffsets = freqChunkOffsets - rv.locChunkOffsets = locChunkOffsets - - rv.nextLocs = nextLocs - rv.nextSegmentLocs = nextSegmentLocs - - rv.buf = buf -} - -func (i *PostingsIterator) Recycle() { - if i != emptyPostingsIterator { - i.Clear() - - postingsIteratorPool.Put(i) - } -} - // Posting is a single entry in a postings list type Posting struct { docNum uint64 diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index b9a328ad4..8c6de211a 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -250,10 +250,11 @@ func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { fieldIDPlus1 := sb.fieldsMap[field] if fieldIDPlus1 > 0 { - rv = 
dictionaryPool.Get().(*Dictionary) - rv.sb = sb - rv.field = field - rv.fieldID = fieldIDPlus1 - 1 + rv = &Dictionary{ + sb: sb, + field: field, + fieldID: fieldIDPlus1 - 1, + } dictStart := sb.dictLocs[rv.fieldID] if dictStart > 0 { diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 7dab97599..285412cf3 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -471,13 +471,13 @@ func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnaps return &IndexSnapshotTermFieldReader{} } -func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) bool { +func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { i.parent.rootLock.RLock() obsolete := i.parent.root != i i.parent.rootLock.RUnlock() if obsolete { // if we're not the current root (mutations happened), don't bother recycling - return false + return } i.m2.Lock() @@ -486,8 +486,6 @@ func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader } i.fieldTFRs[tfr.field] = append(i.fieldTFRs[tfr.field], tfr) i.m2.Unlock() - - return true } func docNumberToBytes(buf []byte, in uint64) []byte { diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 229869537..89af3be4c 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -178,43 +178,8 @@ func (i *IndexSnapshotTermFieldReader) Count() uint64 { func (i *IndexSnapshotTermFieldReader) Close() error { if i.snapshot != nil { - stats := &i.snapshot.parent.stats - - atomic.AddUint64(&stats.TotTermSearchersFinished, uint64(1)) - - if i.snapshot.recycleTermFieldReader(i) { - atomic.AddUint64(&stats.TotTermFieldReadersRecycled, uint64(1)) - } else { - atomic.AddUint64(&stats.TotTermFieldReadersGarbaged, uint64(1)) - - // the snapshot couldn't recycle the TFR (it was too - // obsolete), so recycle its constituent parts - - for _, x := range i.iterators { - 
recycle(x) - } - atomic.AddUint64(&stats.TotPostingsIteratorsRecycled, uint64(len(i.iterators))) - - for _, x := range i.postings { - recycle(x) - } - atomic.AddUint64(&stats.TotPostingsListsRecycled, uint64(len(i.postings))) - - for _, x := range i.dicts { - recycle(x) - } - atomic.AddUint64(&stats.TotTermDictionariesRecycled, uint64(len(i.dicts))) - } + atomic.AddUint64(&i.snapshot.parent.stats.TotTermSearchersFinished, uint64(1)) + i.snapshot.recycleTermFieldReader(i) } return nil } - -type Recycler interface { - Recycle() -} - -func recycle(x interface{}) { - if r, ok := x.(Recycler); ok { - r.Recycle() - } -} diff --git a/index/scorch/stats.go b/index/scorch/stats.go index ba497de99..bc1ca4bd8 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -115,13 +115,6 @@ type Stats struct { MaxMemMergeZapTime uint64 TotMemMergeSegments uint64 TotMemorySegmentsAtRoot uint64 - - TotTermFieldReadersGarbaged uint64 - TotTermFieldReadersRecycled uint64 - - TotTermDictionariesRecycled uint64 - TotPostingsListsRecycled uint64 - TotPostingsIteratorsRecycled uint64 } // atomically populates the returned map From 424c56dbfe8301f7338e70cdc01c5755827d3c21 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 12 Sep 2018 12:15:58 -0700 Subject: [PATCH 471/728] Update vellum's revision in the vendor/manifest --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 5d1fb4717..72f0b6d7c 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "35d9e7346a69d2499623063cd39f305599fc7eae", + "revision": "01d5c56e609533acd717717c8acc0d2dea6bfb89", "branch": "master", "notests": true } From 478805bc7f6790f6f088c4d49ca0bee9e443ca10 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 17 Sep 2018 16:46:22 -0700 Subject: [PATCH 472/728] Add support for GeoSpatial 
points in string format + Following specification used in elastic search: https://www.elastic.co/guide/en/elasticsearch/reference/current/geo-point.html + String formats allowed: - "lat,lon" - " lat, lon " - "geohash" + All the necessary code to decode geohashes is obtained from: - https://github.com/mmcloughlin/geohash + Also see: https://issues.couchbase.com/browse/MB-30542 --- geo/geohash.go | 174 ++++++++++++++++++++++++++++++++++++++++ geo/parse.go | 32 ++++++++ mapping/document.go | 6 +- mapping/mapping_test.go | 78 ++++++++++++++++-- 4 files changed, 283 insertions(+), 7 deletions(-) create mode 100644 geo/geohash.go diff --git a/geo/geohash.go b/geo/geohash.go new file mode 100644 index 000000000..35db720c0 --- /dev/null +++ b/geo/geohash.go @@ -0,0 +1,174 @@ +// The code here was obtained from: +// https://github.com/mmcloughlin/geohash + +// The MIT License (MIT) +// Copyright (c) 2015 Michael McLoughlin +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: + +// The above copyright notice and this permission notice shall be included in all +// copies or substantial portions of the Software. + +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +// SOFTWARE. + +package geo + +import ( + "math" +) + +// encoding encapsulates an encoding defined by a given base32 alphabet. +type encoding struct { + enc string + dec [256]byte +} + +// newEncoding constructs a new encoding defined by the given alphabet, +// which must be a 32-byte string. +func newEncoding(encoder string) *encoding { + e := new(encoding) + e.enc = encoder + for i := 0; i < len(e.dec); i++ { + e.dec[i] = 0xff + } + for i := 0; i < len(encoder); i++ { + e.dec[encoder[i]] = byte(i) + } + return e +} + +// Decode string into bits of a 64-bit word. The string s may be at most 12 +// characters. +func (e *encoding) decode(s string) uint64 { + x := uint64(0) + for i := 0; i < len(s); i++ { + x = (x << 5) | uint64(e.dec[s[i]]) + } + return x +} + +// Encode bits of 64-bit word into a string. +func (e *encoding) encode(x uint64) string { + b := [12]byte{} + for i := 0; i < 12; i++ { + b[11-i] = e.enc[x&0x1f] + x >>= 5 + } + return string(b[:]) +} + +// Base32Encoding with the Geohash alphabet. +var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz") + +// BoundingBox returns the region encoded by the given string geohash. +func geoBoundingBox(hash string) geoBox { + bits := uint(5 * len(hash)) + inthash := base32encoding.decode(hash) + return geoBoundingBoxIntWithPrecision(inthash, bits) +} + +// Box represents a rectangle in latitude/longitude space. +type geoBox struct { + minLat float64 + maxLat float64 + minLng float64 + maxLng float64 +} + +// Round returns a point inside the box, making an effort to round to minimal +// precision. 
+func (b geoBox) round() (lat, lng float64) { + x := maxDecimalPower(b.maxLat - b.minLat) + lat = math.Ceil(b.minLat/x) * x + x = maxDecimalPower(b.maxLng - b.minLng) + lng = math.Ceil(b.minLng/x) * x + return +} + +// precalculated for performance +var exp232 = math.Exp2(32) + +// errorWithPrecision returns the error range in latitude and longitude for in +// integer geohash with bits of precision. +func errorWithPrecision(bits uint) (latErr, lngErr float64) { + b := int(bits) + latBits := b / 2 + lngBits := b - latBits + latErr = math.Ldexp(180.0, -latBits) + lngErr = math.Ldexp(360.0, -lngBits) + return +} + +// minDecimalPlaces returns the minimum number of decimal places such that +// there must exist an number with that many places within any range of width +// r. This is intended for returning minimal precision coordinates inside a +// box. +func maxDecimalPower(r float64) float64 { + m := int(math.Floor(math.Log10(r))) + return math.Pow10(m) +} + +// Encode the position of x within the range -r to +r as a 32-bit integer. +func encodeRange(x, r float64) uint32 { + p := (x + r) / (2 * r) + return uint32(p * exp232) +} + +// Decode the 32-bit range encoding X back to a value in the range -r to +r. +func decodeRange(X uint32, r float64) float64 { + p := float64(X) / exp232 + x := 2*r*p - r + return x +} + +// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are +// ignored, and may take any value. +func squash(X uint64) uint32 { + X &= 0x5555555555555555 + X = (X | (X >> 1)) & 0x3333333333333333 + X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f + X = (X | (X >> 4)) & 0x00ff00ff00ff00ff + X = (X | (X >> 8)) & 0x0000ffff0000ffff + X = (X | (X >> 16)) & 0x00000000ffffffff + return uint32(X) +} + +// Deinterleave the bits of X into 32-bit words containing the even and odd +// bitlevels of X, respectively. 
+func deinterleave(X uint64) (uint32, uint32) { + return squash(X), squash(X >> 1) +} + +// BoundingBoxIntWithPrecision returns the region encoded by the integer +// geohash with the specified precision. +func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox { + fullHash := hash << (64 - bits) + latInt, lngInt := deinterleave(fullHash) + lat := decodeRange(latInt, 90) + lng := decodeRange(lngInt, 180) + latErr, lngErr := errorWithPrecision(bits) + return geoBox{ + minLat: lat, + maxLat: lat + latErr, + minLng: lng, + maxLng: lng + lngErr, + } +} + +// ---------------------------------------------------------------------- + +// Decode the string geohash to a (lat, lng) point. +func GeoHashDecode(hash string) (lat, lng float64) { + box := geoBoundingBox(hash) + return box.round() +} diff --git a/geo/parse.go b/geo/parse.go index 8dfc6eed2..0511fea7b 100644 --- a/geo/parse.go +++ b/geo/parse.go @@ -16,6 +16,7 @@ package geo import ( "reflect" + "strconv" "strings" ) @@ -24,6 +25,8 @@ import ( // Container: // slice length 2 (GeoJSON) // first element lon, second element lat +// string (coordinates separated by comma, or a geohash) +// first element lat, second element lon // map[string]interface{} // exact keys lat and lon or lng // struct @@ -59,6 +62,35 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { } } + // is it a string + if thingVal.Kind() == reflect.String { + geoStr := thingVal.Interface().(string) + if strings.Contains(geoStr, ",") { + // geo point with coordinates split by comma + points := strings.Split(geoStr, ",") + for i, point := range points { + // trim any leading or trailing white spaces + points[i] = strings.TrimSpace(point) + } + if len(points) == 2 { + var err error + lat, err = strconv.ParseFloat(points[0], 64) + if err == nil { + foundLat = true + } + lon, err = strconv.ParseFloat(points[1], 64) + if err == nil { + foundLon = true + } + } + } else { + // geohash + lat, lon = GeoHashDecode(geoStr) + 
foundLat = true + foundLon = true + } + } + // is it a map if l, ok := thing.(map[string]interface{}); ok { if lval, ok := l["lon"]; ok { diff --git a/mapping/document.go b/mapping/document.go index cc3582cad..f950b59be 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -424,7 +424,11 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, if subDocMapping != nil { // index by explicit mapping for _, fieldMapping := range subDocMapping.Fields { - fieldMapping.processString(propertyValueString, pathString, path, indexes, context) + if fieldMapping.Type == "geopoint" { + fieldMapping.processGeoPoint(property, pathString, path, indexes, context) + } else { + fieldMapping.processString(propertyValueString, pathString, path, indexes, context) + } } } else if closestDocMapping.Dynamic { // automatic indexing behavior diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index a13a90b8b..498fdb4ec 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -18,13 +18,13 @@ import ( "encoding/json" "fmt" "reflect" + "strconv" "testing" "time" "github.com/blevesearch/bleve/analysis/tokenizer/exception" "github.com/blevesearch/bleve/analysis/tokenizer/regexp" "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/numeric" ) var mappingSource = []byte(`{ @@ -870,6 +870,7 @@ func TestMappingForGeo(t *testing.T) { mapping.DefaultMapping = thingMapping geopoints := []interface{}{} + expect := [][]float64{} // to contain expected [lon,lat] for geopoints // geopoint as a struct geopoints = append(geopoints, struct { @@ -882,6 +883,7 @@ func TestMappingForGeo(t *testing.T) { Lat: -90, }, }) + expect = append(expect, []float64{-180, -90}) // geopoint as a map geopoints = append(geopoints, struct { @@ -894,8 +896,9 @@ func TestMappingForGeo(t *testing.T) { "lat": -90, }, }) + expect = append(expect, []float64{-180, -90}) - // geopoint as a slice + // geopoint as a slice, format: {lon, lat} geopoints = 
append(geopoints, struct { Name string `json:"name"` Location []interface{} `json:"location"` @@ -905,6 +908,55 @@ func TestMappingForGeo(t *testing.T) { -180, -90, }, }) + expect = append(expect, []float64{-180, -90}) + + // geopoint as a string, format: "lat,lon" + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location []interface{} `json:"location"` + }{ + Name: "string", + Location: []interface{}{ + "-90,-180", + }, + }) + expect = append(expect, []float64{-180, -90}) + + // geopoint as a string, format: "lat , lon" with leading/trailing whitespaces + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location []interface{} `json:"location"` + }{ + Name: "string", + Location: []interface{}{ + "-90 , -180", + }, + }) + expect = append(expect, []float64{-180, -90}) + + // geopoint as a string - geohash + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location []interface{} `json:"location"` + }{ + Name: "string", + Location: []interface{}{ + "000000000000", + }, + }) + expect = append(expect, []float64{-180, -90}) + + // geopoint as a string - geohash + geopoints = append(geopoints, struct { + Name string `json:"name"` + Location []interface{} `json:"location"` + }{ + Name: "string", + Location: []interface{}{ + "drm3btev3e86", + }, + }) + expect = append(expect, []float64{-71.34, 41.12}) for i, geopoint := range geopoints { doc := document.NewDocument(string(i)) @@ -917,10 +969,24 @@ func TestMappingForGeo(t *testing.T) { for _, f := range doc.Fields { if f.Name() == "location" { foundGeo = true - got := f.Value() - expect := []byte(numeric.MustNewPrefixCodedInt64(0, 0)) - if !reflect.DeepEqual(got, expect) { - t.Errorf("expected geo value: %v, got %v", expect, got) + geoF, ok := f.(*document.GeoPointField) + if !ok { + t.Errorf("expected a geopoint field!") + } + lon, err := geoF.Lon() + if err != nil { + t.Errorf("error in fetching lon, err: %v", err) + } + lat, err := geoF.Lat() + if err != nil { 
+ t.Errorf("error in fetching lat, err: %v", err) + } + // round obtained lon, lat to 2 decimal places + roundLon, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", lon), 64) + roundLat, _ := strconv.ParseFloat(fmt.Sprintf("%.2f", lat), 64) + if roundLon != expect[i][0] || roundLat != expect[i][1] { + t.Errorf("expected geo point: {%v, %v}, got {%v, %v}", + expect[i][0], expect[i][1], lon, lat) } } } From 2e631fb90052bd251a388944111cd0da102ed4cc Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 15 Oct 2018 11:22:13 +0530 Subject: [PATCH 473/728] MB-31405 - higher disk usage in DGM Always perform in-memory segment merging before persisting segments, esp even when the memory pressure is applied. --- index/scorch/persister.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 5e9450189..60c2c4ef1 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -269,15 +269,12 @@ func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { } func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { - // perform in-memory merging only when there is no memory pressure - if s.paused() == 0 { - persisted, err := s.persistSnapshotMaybeMerge(snapshot) - if err != nil { - return err - } - if persisted { - return nil - } + persisted, err := s.persistSnapshotMaybeMerge(snapshot) + if err != nil { + return err + } + if persisted { + return nil } return s.persistSnapshotDirect(snapshot) From f92653698f2903d2e066c3f22794efdc079839c5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 15 Oct 2018 12:57:53 -0700 Subject: [PATCH 474/728] Fix Boolean searcher's Advance + Relates to: https://github.com/blevesearch/bleve/pull/955 + Advance the boolean searcher's cursor only if necessary, the boolean searcher's cursor is tracked by it's currentID. + With this in place, we shouldn't be further checking the tracked cursor positions of the nested searchers. 
+ Unit test + Fixes: https://github.com/blevesearch/bleve/issues/1021 --- search/searcher/search_boolean.go | 31 ++++---- search_test.go | 123 ++++++++++++++++++++++++++++++ 2 files changed, 137 insertions(+), 17 deletions(-) diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index f9684af29..a6f3a150b 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -331,10 +331,10 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } } - var err error - // Advance nested searcher(s) only if the cursor is trailing the lookup ID - if s.mustSearcher != nil { - if s.currMust == nil || s.currMust.IndexInternalID.Compare(ID) < 0 { + // Advance the searcher only if the cursor is trailing the lookup ID + if s.currentID == nil || s.currentID.Compare(ID) < 0 { + var err error + if s.mustSearcher != nil { if s.currMust != nil { ctx.DocumentMatchPool.Put(s.currMust) } @@ -343,9 +343,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter return nil, err } } - } - if s.shouldSearcher != nil { - if s.currShould == nil || s.currShould.IndexInternalID.Compare(ID) < 0 { + if s.shouldSearcher != nil { if s.currShould != nil { ctx.DocumentMatchPool.Put(s.currShould) } @@ -354,9 +352,8 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter return nil, err } } - } - if s.mustNotSearcher != nil { - if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 { + + if s.mustNotSearcher != nil { if s.currMustNot != nil { ctx.DocumentMatchPool.Put(s.currMustNot) } @@ -365,14 +362,14 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter return nil, err } } - } - if s.mustSearcher != nil && s.currMust != nil { - s.currentID = s.currMust.IndexInternalID - } else if s.mustSearcher == nil && s.currShould != nil { - s.currentID = s.currShould.IndexInternalID - } else { - s.currentID = nil + if s.mustSearcher 
!= nil && s.currMust != nil { + s.currentID = s.currMust.IndexInternalID + } else if s.mustSearcher == nil && s.currShould != nil { + s.currentID = s.currShould.IndexInternalID + } else { + s.currentID = nil + } } return s.Next(ctx) diff --git a/search_test.go b/search_test.go index e7a878653..3c9da5f35 100644 --- a/search_test.go +++ b/search_test.go @@ -547,6 +547,129 @@ func TestNestedBooleanSearchers(t *testing.T) { } } +func TestNestedBooleanMustNotSearcher(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + // create an index with default settings + idxMapping := NewIndexMapping() + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + + // create and insert documents as a batch + batch := idx.NewBatch() + + docs := []struct { + id string + hasRole bool + investigationId string + }{ + { + id: "1@1", + hasRole: true, + investigationId: "1", + }, + { + id: "1@2", + hasRole: false, + investigationId: "2", + }, + { + id: "2@1", + hasRole: true, + investigationId: "1", + }, + { + id: "2@2", + hasRole: false, + investigationId: "2", + }, + { + id: "3@1", + hasRole: true, + investigationId: "1", + }, + { + id: "3@2", + hasRole: false, + investigationId: "2", + }, + { + id: "4@1", + hasRole: true, + investigationId: "1", + }, + { + id: "5@1", + hasRole: true, + investigationId: "1", + }, + { + id: "6@1", + hasRole: true, + investigationId: "1", + }, + { + id: "7@1", + hasRole: true, + investigationId: "1", + }, + } + + for i := 0; i < len(docs); i++ { + doc := document.NewDocument(docs[i].id) + doc.Fields = []document.Field{ + document.NewTextField("id", []uint64{}, []byte(docs[i].id)), + document.NewBooleanField("hasRole", []uint64{}, docs[i].hasRole), + document.NewTextField("investigationId", []uint64{}, []byte(docs[i].investigationId)), + } + + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, 
[]string{}, + document.IndexField|document.IncludeTermVectors), + } + + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + tq := NewTermQuery("1") + tq.SetField("investigationId") + // using must not, for cases that the field did not exists at all + hasRole := NewBoolFieldQuery(true) + hasRole.SetField("hasRole") + noRole := NewBooleanQuery() + noRole.AddMustNot(hasRole) + oneRolesOrNoRoles := NewBooleanQuery() + oneRolesOrNoRoles.AddShould(noRole) + oneRolesOrNoRoles.SetMinShould(1) + q := NewConjunctionQuery(tq, oneRolesOrNoRoles) + + sr := NewSearchRequestOptions(q, 100, 0, false) + sr.Fields = []string{"hasRole"} + sr.Highlight = NewHighlight() + + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + if res.Total != 0 { + t.Fatalf("Unexpected result, %v != 0", res.Total) + } +} + func TestSearchScorchOverEmptyKeyword(t *testing.T) { defaultIndexType := Config.DefaultIndexType From de7c7204188a403d73fb2360f0f1a66ea44845d9 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 19 Oct 2018 10:28:25 +0530 Subject: [PATCH 475/728] MB-31660 - Scorch file leaks Faster zap file cleanup on index deletion --- index/scorch/merge.go | 35 ++++++++------- index/scorch/persister.go | 8 ++-- index/scorch/segment/segment.go | 4 ++ index/scorch/segment/zap/merge.go | 54 ++++++++++++++++++++---- index/scorch/segment/zap/merge_test.go | 10 ++--- index/scorch/segment/zap/segment_test.go | 2 +- index/scorch/stats.go | 5 ++- 7 files changed, 81 insertions(+), 37 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 38646bf0b..61abe6951 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -60,7 +60,7 @@ OUTER: err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) if err != nil { atomic.StoreUint64(&s.iStats.mergeEpoch, 0) - if err == ErrClosed { + if err == segment.ErrClosed { // index has been closed _ = ourSnapshot.DecRef() break 
OUTER @@ -187,7 +187,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } var oldNewDocNums map[uint64][]uint64 - var segment segment.Segment + var seg segment.Segment if len(segmentsToMerge) > 0 { filename := zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) @@ -196,7 +196,8 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, fileMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, DefaultChunkFactor) + newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, + DefaultChunkFactor, s.closeCh) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) @@ -209,10 +210,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) + if err == segment.ErrClosed { + return err + } return fmt.Errorf("merging failed: %v", err) } - segment, err = zap.Open(path) + seg, err = zap.Open(path) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) @@ -230,7 +234,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, id: newSegmentID, old: oldMap, oldNewDocNums: oldNewDocNums, - new: segment, + new: seg, notify: make(chan *IndexSnapshot, 1), } notifications = append(notifications, sm.notify) @@ -238,8 +242,8 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // give it to the introducer select { case <-s.closeCh: - _ = segment.Close() - return ErrClosed + _ = seg.Close() + return segment.ErrClosed case s.merges <- sm: atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } @@ -250,7 +254,8 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, for _, notification := range notifications { select { case <-s.closeCh: - return ErrClosed + 
atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) + return segment.ErrClosed case newSnapshot := <-notification: atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) if newSnapshot != nil { @@ -287,7 +292,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, path := s.path + string(os.PathSeparator) + filename newDocNums, _, err := - zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor) + zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh) atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) @@ -302,21 +307,21 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, return nil, 0, err } - segment, err := zap.Open(path) + seg, err := zap.Open(path) if err != nil { atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return nil, 0, err } // update persisted stats - atomic.AddUint64(&s.stats.TotPersistedItems, segment.Count()) + atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) atomic.AddUint64(&s.stats.TotPersistedSegments, 1) sm := &segmentMerge{ id: newSegmentID, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), - new: segment, + new: seg, notify: make(chan *IndexSnapshot, 1), } @@ -328,14 +333,14 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, select { // send to introducer case <-s.closeCh: - _ = segment.DecRef() - return nil, 0, ErrClosed + _ = seg.DecRef() + return nil, 0, segment.ErrClosed case s.merges <- sm: } select { // wait for introduction to complete case <-s.closeCh: - return nil, 0, ErrClosed + return nil, 0, segment.ErrClosed case newSnapshot := <-sm.notify: atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 59e84d42f..01102c2f2 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -112,7 +112,7 @@ OUTER: } if err != nil { atomic.StoreUint64(&s.iStats.persistEpoch, 0) - if err == ErrClosed { + if err == 
segment.ErrClosed { // index has been closed _ = ourSnapshot.DecRef() break OUTER @@ -497,15 +497,13 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { select { case <-s.closeCh: - err = ErrClosed - return err + return segment.ErrClosed case s.persists <- persist: } select { case <-s.closeCh: - err = ErrClosed - return err + return segment.ErrClosed case <-persist.applied: } } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 28a879949..be9142c40 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -15,11 +15,15 @@ package segment import ( + "fmt" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/couchbase/vellum" ) +var ErrClosed = fmt.Errorf("index closed") + // DocumentFieldValueVisitor defines a callback to be visited for each // stored field value. The return value determines if the visitor // should keep going. Returning true continues visiting, false stops. diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 37e391bab..901115898 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -24,6 +24,7 @@ import ( "sort" "github.com/RoaringBitmap/roaring" + seg "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" "github.com/golang/snappy" ) @@ -37,17 +38,17 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32) ([][]uint64, uint64, error) { + chunkFactor uint32, closeCh chan struct{}) ([][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { segmentBases[segmenti] = &segment.SegmentBase } - return MergeSegmentBases(segmentBases, drops, path, chunkFactor) + return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh) } func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, - chunkFactor uint32) ([][]uint64, uint64, error) { + chunkFactor uint32, closeCh chan struct{}) ([][]uint64, uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) @@ -67,7 +68,7 @@ func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, pat cr := NewCountHashWriter(br) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := - MergeToWriter(segmentBases, drops, chunkFactor, cr) + MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) if err != nil { cleanup() return nil, 0, err @@ -102,7 +103,7 @@ func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, pat } func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, - chunkFactor uint32, cr *CountHashWriter) ( + chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) ( newDocNums [][]uint64, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, @@ -114,16 +115,21 @@ func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap = mapFields(fieldsInv) numDocs = computeNewDocCount(segments, drops) + + if isClosed(closeCh) { + return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed + } + if numDocs > 0 { storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, fieldsSame, numDocs, cr) + 
fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err } dictLocs, docValueOffset, err = persistMergedRest(segments, drops, fieldsInv, fieldsMap, fieldsSame, - newDocNums, numDocs, chunkFactor, cr) + newDocNums, numDocs, chunkFactor, cr, closeCh) if err != nil { return nil, 0, 0, 0, 0, nil, nil, nil, err } @@ -165,7 +171,7 @@ func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, - w *CountHashWriter) ([]uint64, uint64, error) { + w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { var bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) var bufLoc []uint64 @@ -200,6 +206,12 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, var segmentsInFocus []*SegmentBase for segmentI, segment := range segments { + + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + dict, err2 := segment.dictionary(fieldName) if err2 != nil { return nil, 0, err2 @@ -277,6 +289,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, term, itrI, postingsOffset := enumerator.Current() if !bytes.Equal(prevTerm, term) { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + // if the term changed, write out the info collected // for the previous term err = finishTerm(prevTerm) @@ -353,6 +370,11 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, fdvReadersAvailable := false var dvIterClone *docValueReader for segmentI, segment := range segmentsInFocus { + // check for the closure in meantime + if isClosed(closeCh) { + return nil, 0, seg.ErrClosed + } + fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) if 
dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && dvIter != nil { @@ -576,7 +598,7 @@ type varintEncoder func(uint64) (int, error) func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, - w *CountHashWriter) (uint64, [][]uint64, error) { + w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { var rv [][]uint64 // The remapped or newDocNums for each segment. var newDocNum uint64 @@ -603,6 +625,11 @@ func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, // for each segment for segI, segment := range segments { + // check for the closure in meantime + if isClosed(closeCh) { + return 0, nil, seg.ErrClosed + } + segNewDocNums := make([]uint64, segment.numDocs) dropsI := drops[segI] @@ -814,3 +841,12 @@ func mergeFields(segments []*SegmentBase) (bool, []string) { return fieldsSame, rv } + +func isClosed(closeCh chan struct{}) bool { + select { + case <-closeCh: + return true + default: + return false + } +} diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 163663a49..276259c71 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -71,7 +71,7 @@ func TestMerge(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -175,7 +175,7 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) drops := make([]*roaring.Bitmap, len(segsToMerge)) - _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024) + _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -217,7 +217,7 @@ func testMergeWithSelf(t 
*testing.T, segCur *Segment, expectedCount uint64) { segsToMerge := make([]*Segment, 1) segsToMerge[0] = segCur - _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024) + _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil) if err != nil { t.Fatal(err) } @@ -589,7 +589,7 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { _ = os.RemoveAll("/tmp/scorch-merged.zap") - _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024) + _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil) if err != nil { t.Fatal(err) } @@ -823,7 +823,7 @@ func TestMergeBytesWritten(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 22b8af23f..623198c63 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -691,7 +691,7 @@ func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { segsToMerge[0] = segment1.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024) + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) if err != nil { t.Fatal(err) } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index bc1ca4bd8..2eb832f2c 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -103,8 +103,9 @@ type Stats struct { TotFileMergeZapTime uint64 MaxFileMergeZapTime uint64 - 
TotFileMergeIntroductions uint64 - TotFileMergeIntroductionsDone uint64 + TotFileMergeIntroductions uint64 + TotFileMergeIntroductionsDone uint64 + TotFileMergeIntroductionsSkipped uint64 TotMemMergeBeg uint64 TotMemMergeErr uint64 From 8b0d5dc7031dc4ee955b89b24f2c74578ebec47c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 25 Oct 2018 11:57:05 -0700 Subject: [PATCH 476/728] [zap cli] Handle case when read chunk is 0 bytes --- cmd/bleve/cmd/zap/docvalue.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go index dcfa58de1..065f078e4 100644 --- a/cmd/bleve/cmd/zap/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -196,6 +196,10 @@ var docvalueCmd = &cobra.Command{ } curChunkSize := chunkLens[docInChunk] + if curChunkSize == 0 { + return nil + } + // read the number of docs reside in the chunk numDocs := uint64(0) numDocs, nread = binary.Uvarint(data[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) From 0954b3af441c3fd7ad1f216e82cb15bc4ca27d6d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 27 Oct 2018 12:46:15 +0530 Subject: [PATCH 477/728] configurable merge of mem segments This change makes the memory merge operations of persister configuable. 
--- index/scorch/persister.go | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 01102c2f2..45bab7ab0 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -21,6 +21,7 @@ import ( "fmt" "io/ioutil" "log" + "math" "os" "path/filepath" "strconv" @@ -40,6 +41,8 @@ var DefaultPersisterNapTimeMSec int = 2000 // ms var DefaultPersisterNapUnderNumFiles int = 1000 +var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64 + type persisterOptions struct { // PersisterNapTimeMSec controls the wait/delay injected into // persistence workloop to improve the chances for @@ -51,6 +54,13 @@ type persisterOptions struct { // PersisterNapTimeMSec amount of time to improve the chances for // a healthier and heavier in-memory merging PersisterNapUnderNumFiles int + + // MemoryPressurePauseThreshold let persister to have a better leeway + // for prudently performing the memory merge of segments on a memory + // pressure situation. Here the config value is an upper threshold + // for the number of paused application threads. The default value would + // be a very high number to always favour the merging of memory segments. 
+ MemoryPressurePauseThreshold uint64 } type notificationChan chan struct{} @@ -103,7 +113,7 @@ OUTER: if ourSnapshot != nil { startTime := time.Now() - err := s.persistSnapshot(ourSnapshot) + err := s.persistSnapshot(ourSnapshot, po) for _, ch := range ourPersisted { if err != nil { ch <- err @@ -251,8 +261,9 @@ OUTER: func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { po := persisterOptions{ - PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, - PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, + PersisterNapTimeMSec: DefaultPersisterNapTimeMSec, + PersisterNapUnderNumFiles: DefaultPersisterNapUnderNumFiles, + MemoryPressurePauseThreshold: DefaultMemoryPressurePauseThreshold, } if v, ok := s.config["scorchPersisterOptions"]; ok { b, err := json.Marshal(v) @@ -268,13 +279,19 @@ func (s *Scorch) parsePersisterOptions() (*persisterOptions, error) { return &po, nil } -func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot) error { - persisted, err := s.persistSnapshotMaybeMerge(snapshot) - if err != nil { - return err - } - if persisted { - return nil +func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, + po *persisterOptions) error { + // Perform in-memory segment merging only when the memory pressure is + // below the configured threshold, else the persister performs the + // direct persistence of segments. + if s.paused() < po.MemoryPressurePauseThreshold { + persisted, err := s.persistSnapshotMaybeMerge(snapshot) + if err != nil { + return err + } + if persisted { + return nil + } } return s.persistSnapshotDirect(snapshot) From e1479789acb991022bf323547f3ecf6a40ed35c2 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Sun, 28 Oct 2018 16:57:08 -0700 Subject: [PATCH 478/728] Prevent boolean must not cursor from incorrectly advancing The currMustNot cursor of the boolean searcher isn't tracked by the currentID. 
This means that during Advance(), we'll need to additionally check to prevent updating the cached cursor ID for the must not searcher. Fixes https://github.com/blevesearch/bleve/issues/1029 --- search/searcher/search_boolean.go | 19 ++-- search_test.go | 141 ++++++++++++++++++++++++++++++ 2 files changed, 154 insertions(+), 6 deletions(-) diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index a6f3a150b..0a223bb0b 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -319,6 +319,7 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch return nil, err } } + return rv, nil } @@ -343,6 +344,7 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter return nil, err } } + if s.shouldSearcher != nil { if s.currShould != nil { ctx.DocumentMatchPool.Put(s.currShould) @@ -354,12 +356,17 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } if s.mustNotSearcher != nil { - if s.currMustNot != nil { - ctx.DocumentMatchPool.Put(s.currMustNot) - } - s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + // Additional check for mustNotSearcher whose cursor isn't tracked by + // currentID to prevent it from moving when the searcher's already + // where it should be. 
+ if s.currMustNot == nil || !s.currMustNot.IndexInternalID.Equals(ID) { + if s.currMustNot != nil { + ctx.DocumentMatchPool.Put(s.currMustNot) + } + s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } } diff --git a/search_test.go b/search_test.go index 3c9da5f35..75aeac713 100644 --- a/search_test.go +++ b/search_test.go @@ -728,3 +728,144 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { t.Fatalf("Unexpected search hits: %v, expected 10", res.Total) } } + +func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { + defaultIndexType := Config.DefaultIndexType + + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + Config.DefaultIndexType = defaultIndexType + }() + + Config.DefaultIndexType = scorch.Name + + // create an index with default settings + idxMapping := NewIndexMapping() + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + + // create and insert documents as a batch + batch := idx.NewBatch() + + doc := document.NewDocument("1-child-0") + doc.Fields = []document.Field{ + document.NewTextField("id", []uint64{}, []byte("1-child-0")), + document.NewBooleanField("hasRole", []uint64{}, false), + document.NewTextField("roles", []uint64{}, []byte("R1")), + document.NewNumericField("type", []uint64{}, 0), + } + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + + docs := []struct { + id string + hasRole bool + typ int + }{ + { + id: "16d6fa37-48fd-4dea-8b3d-a52bddf73951", + hasRole: false, + typ: 9, + }, + { + id: "18fa9eb2-8b1f-46f0-8b56-b4c551213f78", + hasRole: false, + typ: 9, + }, + { + id: "3085855b-d74b-474a-86c3-9bf3e4504382", + hasRole: false, + typ: 9, + }, + { + id: "38ef5d28-0f85-4fb0-8a94-dd20751c3364", + 
hasRole: false, + typ: 9, + }, + } + + for i := 0; i < len(docs); i++ { + doc := document.NewDocument(docs[i].id) + doc.Fields = []document.Field{ + document.NewTextField("id", []uint64{}, []byte(docs[i].id)), + document.NewBooleanField("hasRole", []uint64{}, docs[i].hasRole), + document.NewNumericField("type", []uint64{}, float64(docs[i].typ)), + } + + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + batch = idx.NewBatch() + + // Update 1st doc + doc = document.NewDocument("1-child-0") + doc.Fields = []document.Field{ + document.NewTextField("id", []uint64{}, []byte("1-child-0")), + document.NewBooleanField("hasRole", []uint64{}, false), + document.NewNumericField("type", []uint64{}, 0), + } + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + inclusive := true + val := float64(9) + q := query.NewNumericRangeInclusiveQuery(&val, &val, &inclusive, &inclusive) + q.SetField("type") + initialQuery := query.NewBooleanQuery(nil, nil, []query.Query{q}) + + // using must not, for cases that the field did not exists at all + hasRole := NewBoolFieldQuery(true) + hasRole.SetField("hasRole") + noRole := NewBooleanQuery() + noRole.AddMustNot(hasRole) + + rq := query.NewBooleanQuery([]query.Query{initialQuery, noRole}, nil, nil) + + sr := NewSearchRequestOptions(rq, 100, 0, false) + sr.Fields = []string{"id", "hasRole", "type"} + sr.Highlight = NewHighlight() + + res, err := idx.Search(sr) + if err != nil { 
+ t.Fatal(err) + } + + if res.Total != 1 { + t.Fatalf("Unexpected result, %v != 1", res.Total) + } +} From 7e1adf70140b9e5686fae9e721eff141ab2d63c8 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 5 Nov 2018 15:22:09 -0800 Subject: [PATCH 479/728] Log number of clauses that caused the TooManyClauses error --- search/searcher/search_disjunction.go | 6 +++--- search/searcher/search_disjunction_heap.go | 2 +- search/searcher/search_disjunction_slice.go | 2 +- search/searcher/search_fuzzy.go | 4 ++-- search/searcher/search_multi_term.go | 4 ++-- search/searcher/search_numeric_range.go | 2 +- search/searcher/search_regexp.go | 2 +- search/searcher/search_term_prefix.go | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index bbf7b4bbc..882b02ccb 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -55,7 +55,7 @@ func tooManyClauses(count int) bool { return false } -func tooManyClausesErr() error { - return fmt.Errorf("TooManyClauses[maxClauseCount is set to %d]", - DisjunctionMaxClauseCount) +func tooManyClausesErr(count int) error { + return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]", + count, DisjunctionMaxClauseCount) } diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index ffa373d2d..ec133f1f8 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -62,7 +62,7 @@ func newDisjunctionHeapSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionHeapSearcher, error) { if limit && tooManyClauses(len(searchers)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(searchers)) } // build our searcher diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go index e3efdf2a7..e47f39ad0 100644 --- 
a/search/searcher/search_disjunction_slice.go +++ b/search/searcher/search_disjunction_slice.go @@ -50,7 +50,7 @@ func newDisjunctionSliceSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionSliceSearcher, error) { if limit && tooManyClauses(len(qsearchers)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(qsearchers)) } // build the downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index b99528af4..668d11afc 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -71,7 +71,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, for err == nil && tfd != nil { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(rv)) } tfd, err = fieldDict.Next() } @@ -103,7 +103,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(rv)) } } tfd, err = fieldDict.Next() diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index a723aedc5..c48366ee2 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -23,7 +23,7 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(terms)) } qsearchers := make([]search.Searcher, len(terms)) @@ -51,7 +51,7 @@ func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { if limit && 
tooManyClauses(len(terms)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(terms)) } qsearchers := make([]search.Searcher, len(terms)) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index 1eae7a5ec..e52ef9a82 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -68,7 +68,7 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, return nil, err } if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(terms)) } return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options, diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index 299d9cdbe..4def832c4 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -110,7 +110,7 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr() + return rv, tooManyClausesErr(len(rv)) } } tfd, err = fieldDict.Next() diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go index 59db93101..b5af4631f 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -38,7 +38,7 @@ func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, for err == nil && tfd != nil { terms = append(terms, tfd.Term) if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr() + return nil, tooManyClausesErr(len(terms)) } tfd, err = fieldDict.Next() } From b5e94b7320f3e7e3a4311b5c9447528254683f9f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 8 Nov 2018 12:07:34 +0530 Subject: [PATCH 480/728] persister nap regression with safe batches Re-initialising the default persister nap configs to favour direct persistence without any naps. 
--- index/scorch/persister.go | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 45bab7ab0..8d54b3a70 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -37,8 +37,26 @@ import ( var DefaultChunkFactor uint32 = 1024 -var DefaultPersisterNapTimeMSec int = 2000 // ms - +// DefaultPersisterNapTimeMSec is kept to zero as this helps in direct +// persistence of segments with the default safe batch option. +// If the default safe batch option results in high number of +// files on disk, then users may initialise this configuration parameter +// with higher values so that the persister will nap a bit within it's +// work loop to favour better in-memory merging of segments to result +// in fewer segment files on disk. But that may come with an indexing +// performance overhead. +// Unsafe batch users are advised to override this to higher value +// for better performance especially with high data density. +var DefaultPersisterNapTimeMSec int = 0 // ms + +// DefaultPersisterNapUnderNumFiles helps in controlling the pace of +// persister. At times of a slow merger progress with heavy file merging +// operations, its better to pace down the persister for letting the merger +// to catch up within a range defined by this parameter. +// Fewer files on disk (as per the merge plan) would result in keeping the +// file handle usage under limit, faster disk merger and a healthier index. +// Its been observed that such a loosely sync'ed introducer-persister-merger +// trio results in better overall performance. var DefaultPersisterNapUnderNumFiles int = 1000 var DefaultMemoryPressurePauseThreshold uint64 = math.MaxUint64 From d39180b2f0a10c6a0d83c5c865b76147eee024ff Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 13 Nov 2018 16:21:33 -0800 Subject: [PATCH 481/728] Improving ZAP File Format Documentation. 
--- index/scorch/segment/zap/zap.md | 177 ++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 index/scorch/segment/zap/zap.md diff --git a/index/scorch/segment/zap/zap.md b/index/scorch/segment/zap/zap.md new file mode 100644 index 000000000..bc54af92d --- /dev/null +++ b/index/scorch/segment/zap/zap.md @@ -0,0 +1,177 @@ +# ZAP File Format + +## Legend + +### Sections + + |========| + | | section + |========| + +### Fixed-size fields + + |--------| |----| |--| |-| + | | uint64 | | uint32 | | uint16 | | uint8 + |--------| |----| |--| |-| + +### Varints + + |~~~~~~~~| + | | varint(up to uint64) + |~~~~~~~~| + +### Arbitary-length fields + + |--------...---| + | | arbitrary-length field (string, vellum, roaring bitmap) + |--------...---| + +### Chunked data + + [--------] + [ ] + [--------] + +## Overview + +Footer sectrion describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. + + |==================================================| + | Stored Fields | + |==================================================| + |-----> | Stored Fields Index | + | |==================================================| + | | Dictionaries + Postings + DocValues | + | |==================================================| + | |---> | DocValues Index | + | | |==================================================| + | | | Fields | + | | |==================================================| + | | |-> | Fields Index | + | | | |========|========|========|========|====|====|====| + | | | | D# | SF | F | FDV | CF | V | CC | (Footer) + | | | |========|====|===|====|===|====|===|====|====|====| + | | | | | | + |-+-+-----------------| | | + | |--------------------------| | + |-------------------------------------| + + D#. Number of Docs. + SF. Stored Fields Index Offset. + F. Field Index Offset. + FDV. Field DocValue Offset. + CF. Chunk Factor. + V. Version. + CC. CRC32. 
+ +## Stored Fields + +Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. + + 0 [SF] [SF + D# * 8] + | Stored Fields | Stored Fields Index | + |================================|==================================| + | | | + | |--------------------| ||--------|--------|. . .|--------|| + | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 || + | | |--------------------| ||--------|----|---|. . .|--------|| + | | | | | + |===|============================|==============|===================| + | | + |-------------------------------------------| + +Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. + + Stored Fields Data + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + | MDS | CDS | MD | CD | + |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| + + MDS. Metadata size. + CDS. Compressed data size. + MD. Metadata. + CD. Snappy-compressed data. + +## Fields + +Fields Index section located between addresses `F` and `len(file) - len(footer)` and consist of `uint64` values (`F1`, `F2`, ...) which are offsets to records in Fields section. We have `F# = (len(file) - len(footer) - F) / sizeof(uint64)` fields. + + + (...) [F] [F + F#] + | Fields | Fields Index. | + |================================|================================| + | | | + | |~~~~~~~~|~~~~~~~~|---...---|||--------|--------|...|--------|| + ||->| Dict | Length | Name ||| 0 | 1 | | F# - 1 || + || |~~~~~~~~|~~~~~~~~|---...---|||--------|----|---|...|--------|| + || | | | + ||===============================|==============|=================| + | | + |----------------------------------------------| + + +## Dictionaries + Postings + +Each of fields has its own dictionary, encoded in [Vellum](https://github.com/couchbase/vellum) format. 
Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. + + |================================================================|- Dictionaries + + | | Postings + + | | DocValues + | Freq/Norm (chunked) | + | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | |->[ Freq | Norm (float32 under varint) ] | + | | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | + | | | + | |------------------------------------------------------------| | + | Location Details (chunked) | | + | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | |->[ Size | Pos | Start | End | Arr# | ArrPos | ... ] | | + | | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | + | | | | + | |----------------------| | | + | Postings List | | | + | |~~~~~~~~|~~~~~|~~|~~~~~~~~|-----------...--| | | + | |->| F/N | LD | Length | ROARING BITMAP | | | + | | |~~~~~|~~|~~~~~~~~|~~~~~~~~|-----------...--| | | + | | |----------------------------------------------| | + | |--------------------------------------| | + | Dictionary | | + | |~~~~~~~~|--------------------------|-...-| | + | |->| Length | VELLUM DATA : (TERM -> OFFSET) | | + | | |~~~~~~~~|----------------------------...-| | + | | | + |======|=========================================================|- DocValues Index + | | | + |======|=========================================================|- Fields + | | | + | |~~~~|~~~|~~~~~~~~|---...---| | + | | Dict | Length | Name | | + | |~~~~~~~~|~~~~~~~~|---...---| | + | | + |================================================================| + +## DocValues + +DocValues Index is `F#` pairs of varints, one pair per field. Each pair of varints indicates start and end point of DocValues slice. 
+ + |================================================================| + | |------...--| | + | |->| DocValues |<-| | + | | |------...--| | | + |==|=================|===========================================|- DocValues Index + ||~|~~~~~~~~~|~~~~~~~|~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + || DV1 START | DV1 STOP | . . . . . | DV(F#) START | DV(F#) END || + ||~~~~~~~~~~~|~~~~~~~~~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| + |================================================================| + +DocValues is chunked Snappy-compressed values for each document and field. + + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ] + [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] + +Last 16 bytes are description of chunks. + + |~~~~~~~~~~~~...~|----------------|----------------| + | Chunk Sizes | Chunk Size Arr | Chunk# | + |~~~~~~~~~~~~...~|----------------|----------------| \ No newline at end of file From d0ea32a67c16a368209190dc5ac752a5903e2b1b Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 13 Nov 2018 16:25:44 -0800 Subject: [PATCH 482/728] index/scorch/segment/zap/README.md - removing legacy. --- index/scorch/segment/zap/README.md | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md index 872e86c84..0facb669f 100644 --- a/index/scorch/segment/zap/README.md +++ b/index/scorch/segment/zap/README.md @@ -1,5 +1,7 @@ # zap file format +Advanced ZAP File Format Documentation is [here](zap.md). + The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. 
Current usage: @@ -90,16 +92,6 @@ If you know the doc number you're interested in, this format lets you jump to th If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. -## bitmaps of hits with location info - -- for each posting list - - preparation phase: - - encode roaring bitmap (inidicating which hits have location details indexed) posting list to bytes (so we know the length) - - file writing phase: - - remember the start position for this bitmap - - write length of encoded roaring bitmap - - write the serialized roaring bitmap data - ## postings list section - for each posting list @@ -109,7 +101,6 @@ If you know the doc number you're interested in, this format lets you jump to th - remember the start position for this posting list - write freq/norm details offset (remembered from previous, as varint uint64) - write location details offset (remembered from previous, as varint uint64) - - write location bitmap offset (remembered from previous, as varint uint64) - write length of encoded roaring bitmap - write the serialized roaring bitmap data From ccf5e17bb77855e4ca757c68e29feb4094d23f4c Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Wed, 14 Nov 2018 11:33:30 -0800 Subject: [PATCH 483/728] Typos fix. --- index/scorch/segment/zap/zap.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/index/scorch/segment/zap/zap.md b/index/scorch/segment/zap/zap.md index bc54af92d..d74dc548b 100644 --- a/index/scorch/segment/zap/zap.md +++ b/index/scorch/segment/zap/zap.md @@ -20,7 +20,7 @@ | | varint(up to uint64) |~~~~~~~~| -### Arbitary-length fields +### Arbitrary-length fields |--------...---| | | arbitrary-length field (string, vellum, roaring bitmap) @@ -34,7 +34,7 @@ ## Overview -Footer sectrion describes the configuration of particular ZAP file. 
The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. +Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. |==================================================| | Stored Fields | @@ -174,4 +174,4 @@ Last 16 bytes are description of chunks. |~~~~~~~~~~~~...~|----------------|----------------| | Chunk Sizes | Chunk Size Arr | Chunk# | - |~~~~~~~~~~~~...~|----------------|----------------| \ No newline at end of file + |~~~~~~~~~~~~...~|----------------|----------------| From bd21bf72cd7aaf06e260590a735e4a01a9a1db9d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 19 Nov 2018 15:22:26 +0530 Subject: [PATCH 484/728] fixing go vet errs --- index/scorch/segment/zap/dict.go | 10 +++++----- index/scorch/segment/zap/merge.go | 2 +- index/scorch/segment/zap/merge_test.go | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 219bf1526..2c0e1bf2a 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -105,7 +105,7 @@ func (d *Dictionary) Iterator() segment.DictionaryIterator { itr, err := d.fst.Iterator(nil, nil) if err == nil { rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { + } else if err != vellum.ErrIteratorDone { rv.err = err } } @@ -127,7 +127,7 @@ func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { itr, err := d.fst.Iterator(kBeg, kEnd) if err == nil { rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { + } else if err != vellum.ErrIteratorDone { rv.err = err } } @@ -154,7 +154,7 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator itr, err := d.fst.Iterator([]byte(start), endBytes) if err == nil { rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { + } 
else if err != vellum.ErrIteratorDone { rv.err = err } } @@ -174,7 +174,7 @@ func (d *Dictionary) AutomatonIterator(a vellum.Automaton, itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) if err == nil { rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { + } else if err != vellum.ErrIteratorDone { rv.err = err } } @@ -218,7 +218,7 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, itr, err := d.fst.Search(onlyFST, nil, nil) if err == nil { rv.itr = itr - } else if err != nil && err != vellum.ErrIteratorDone { + } else if err != vellum.ErrIteratorDone { rv.err = err } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 901115898..13e9bf97c 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -329,7 +329,7 @@ func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, err = enumerator.Next() } - if err != nil && err != vellum.ErrIteratorDone { + if err != vellum.ErrIteratorDone { return nil, 0, err } diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 276259c71..db1cfff15 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -342,7 +342,7 @@ func compareSegments(a, b *Segment) string { for { apitrn, aerr := apitr.Next() - bpitrn, aerr := bpitr.Next() + bpitrn, berr := bpitr.Next() if aerr != berr { rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() errors different: %v %v", fieldName, next.Term, aerr, berr)) From ff5953ff61a54a28581a12e4e361e7741a9c4e84 Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Wed, 21 Nov 2018 17:57:05 -0800 Subject: [PATCH 485/728] Add callbacks So we can know when a batch was persisted properly to disk --- index.go | 8 +++++ index/index.go | 13 ++++++++ index/scorch/introducer.go | 5 +++ index/scorch/persister.go | 7 ++++ index/scorch/scorch.go | 6 ++-- index/scorch/scorch_test.go | 59 
++++++++++++++++++++++++++++++++++ index/upsidedown/upsidedown.go | 5 +++ 7 files changed, 101 insertions(+), 2 deletions(-) diff --git a/index.go b/index.go index f9462a41d..a9369f485 100644 --- a/index.go +++ b/index.go @@ -129,6 +129,14 @@ func (b *Batch) Merge(o *Batch) { } } +func (b *Batch) AddCallback(f index.BatchCallbackFunction) { + b.internal.AddCallback(f) +} + +func (b *Batch) Callback() []index.BatchCallbackFunction { + return b.internal.Callback() +} + // An Index implements all the indexing and searching // capabilities of bleve. An Index can be created // using the New() and Open() methods. diff --git a/index/index.go b/index/index.go index a44046134..c4f3d5dfa 100644 --- a/index/index.go +++ b/index/index.go @@ -248,15 +248,19 @@ type DocIDReader interface { Close() error } +type BatchCallbackFunction func(error) + type Batch struct { IndexOps map[string]*document.Document InternalOps map[string][]byte + callBack []BatchCallbackFunction } func NewBatch() *Batch { return &Batch{ IndexOps: make(map[string]*document.Document), InternalOps: make(map[string][]byte), + callBack: nil, } } @@ -276,6 +280,14 @@ func (b *Batch) DeleteInternal(key []byte) { b.InternalOps[string(key)] = nil } +func (b *Batch) AddCallback(f BatchCallbackFunction) { + b.callBack = append(b.callBack, f) +} + +func (b *Batch) Callback() []BatchCallbackFunction { + return b.callBack +} + func (b *Batch) String() string { rv := fmt.Sprintf("Batch (%d ops, %d internal ops)\n", len(b.IndexOps), len(b.InternalOps)) for k, v := range b.IndexOps { @@ -298,6 +310,7 @@ func (b *Batch) String() string { func (b *Batch) Reset() { b.IndexOps = make(map[string]*document.Document) b.InternalOps = make(map[string][]byte) + b.callBack = nil } func (b *Batch) Merge(o *Batch) { diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 12f27af66..fd39d4a94 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -19,6 +19,7 @@ import ( "sync/atomic" 
"github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/zap" ) @@ -32,6 +33,7 @@ type segmentIntroduction struct { applied chan error persisted chan error + callbacks []index.BatchCallbackFunction } type persistIntroduction struct { @@ -213,6 +215,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) } + if next.callbacks != nil { + s.callbacks = append(s.callbacks, next.callbacks...) + } // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch s.nextSnapshotEpoch++ diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 8d54b3a70..1e1b9b42c 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -30,6 +30,7 @@ import ( "time" "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/boltdb/bolt" @@ -115,6 +116,7 @@ OUTER: var ourSnapshot *IndexSnapshot var ourPersisted []chan error + var ourCallbacks []index.BatchCallbackFunction // check to see if there is a new snapshot to persist s.rootLock.Lock() @@ -123,6 +125,8 @@ OUTER: ourSnapshot.AddRef() ourPersisted = s.rootPersisted s.rootPersisted = nil + ourCallbacks = s.callbacks + s.callbacks = nil atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) } @@ -150,6 +154,9 @@ OUTER: atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) continue OUTER } + for i := range ourCallbacks { + ourCallbacks[i](err) + } atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 5e56c49b0..691b9d971 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -54,6 
+54,7 @@ type Scorch struct { rootLock sync.RWMutex root *IndexSnapshot // holds 1 ref-count on the root rootPersisted []chan error // closed when root is persisted + callbacks []index.BatchCallbackFunction nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. @@ -355,7 +356,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } - err = s.prepareSegment(newSegment, ids, batch.InternalOps) + err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.Callback()) if err != nil { if newSegment != nil { _ = newSegment.Close() @@ -375,7 +376,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte) error { + internalOps map[string][]byte, callbacks []index.BatchCallbackFunction) error { // new introduction introduction := &segmentIntroduction{ @@ -385,6 +386,7 @@ func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, obsoletes: make(map[uint64]*roaring.Bitmap), internal: internalOps, applied: make(chan error), + callbacks: callbacks, } if !s.unsafeBatch { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index adcabd22f..1fed924e1 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -15,6 +15,7 @@ package scorch import ( + "fmt" "log" "os" "reflect" @@ -883,6 +884,64 @@ func TestIndexBatch(t *testing.T) { } } +func TestIndexBatchWithCallbacks(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil 
{ + t.Fatal(err) + } + }() + + // Check that callback function works + updated := false + cbErr := fmt.Errorf("") + + batch := index.NewBatch() + doc := document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + batch.AddCallback(func(e error) { + updated = true + cbErr = e + + }) + + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + for i := 0; i < 30; i++ { + if updated { + break + } + time.Sleep(500 * time.Millisecond) + } + if !updated { + t.Fatal("Callback function wasn't called") + } + if cbErr != nil { + t.Fatal("Error wasn't updated properly on callback function") + } + +} + func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { defer func() { err := DestroyTest() diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 6d3738539..c7caa26b5 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -958,6 +958,11 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } else { atomic.AddUint64(&udc.stats.errors, 1) } + + // For sake of completeness + for i := range batch.Callback() { + batch.Callback()[i](err) + } return } From 939c6e2255066a93e6968ff942f8652c5bb2dd37 Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Wed, 21 Nov 2018 18:31:59 -0800 Subject: [PATCH 486/728] trigger tests again --- index/scorch/scorch_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 1fed924e1..13ec719e8 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -885,6 +885,7 @@ func TestIndexBatch(t *testing.T) { } func TestIndexBatchWithCallbacks(t *testing.T) { + defer func() { err := DestroyTest() if err != nil { From a7e3fd0f45728d8ca3d26ca8ea5a02a7d199441e Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Wed, 21 Nov 2018 19:40:35 -0800 Subject: [PATCH 487/728] CR fixes Naming conventions & remove unneeded `if` 
--- index.go | 6 ++-- index/index.go | 16 +++++----- index/scorch/introducer.go | 4 ++- index/scorch/persister.go | 2 +- index/scorch/scorch.go | 6 ++-- index/scorch/scorch_test.go | 58 +++++++++++++++++++++++++++++++++++++ 6 files changed, 76 insertions(+), 16 deletions(-) diff --git a/index.go b/index.go index a9369f485..0747dd65f 100644 --- a/index.go +++ b/index.go @@ -129,12 +129,12 @@ func (b *Batch) Merge(o *Batch) { } } -func (b *Batch) AddCallback(f index.BatchCallbackFunction) { +func (b *Batch) AddCallback(f index.BatchCallback) { b.internal.AddCallback(f) } -func (b *Batch) Callback() []index.BatchCallbackFunction { - return b.internal.Callback() +func (b *Batch) Callbacks() []index.BatchCallback { + return b.internal.Callbacks() } // An Index implements all the indexing and searching diff --git a/index/index.go b/index/index.go index c4f3d5dfa..398c9fc87 100644 --- a/index/index.go +++ b/index/index.go @@ -248,19 +248,19 @@ type DocIDReader interface { Close() error } -type BatchCallbackFunction func(error) +type BatchCallback func(error) type Batch struct { IndexOps map[string]*document.Document InternalOps map[string][]byte - callBack []BatchCallbackFunction + callbacks []BatchCallback } func NewBatch() *Batch { return &Batch{ IndexOps: make(map[string]*document.Document), InternalOps: make(map[string][]byte), - callBack: nil, + callbacks: nil, } } @@ -280,12 +280,12 @@ func (b *Batch) DeleteInternal(key []byte) { b.InternalOps[string(key)] = nil } -func (b *Batch) AddCallback(f BatchCallbackFunction) { - b.callBack = append(b.callBack, f) +func (b *Batch) AddCallback(f BatchCallback) { + b.callbacks = append(b.callbacks, f) } -func (b *Batch) Callback() []BatchCallbackFunction { - return b.callBack +func (b *Batch) Callbacks() []BatchCallback { + return b.callbacks } func (b *Batch) String() string { @@ -310,7 +310,7 @@ func (b *Batch) String() string { func (b *Batch) Reset() { b.IndexOps = make(map[string]*document.Document) b.InternalOps = 
make(map[string][]byte) - b.callBack = nil + b.callbacks = nil } func (b *Batch) Merge(o *Batch) { diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index fd39d4a94..7b012a390 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -33,7 +33,7 @@ type segmentIntroduction struct { applied chan error persisted chan error - callbacks []index.BatchCallbackFunction + callbacks []index.BatchCallback } type persistIntroduction struct { @@ -50,6 +50,7 @@ type snapshotReversion struct { snapshot *IndexSnapshot applied chan error persisted chan error + callbacks []index.BatchCallback } func (s *Scorch) mainLoop() { @@ -218,6 +219,7 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if next.callbacks != nil { s.callbacks = append(s.callbacks, next.callbacks...) } + s.callbacks = append(s.callbacks, next.callbacks...) // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch s.nextSnapshotEpoch++ diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 1e1b9b42c..ebbc240e7 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -116,7 +116,7 @@ OUTER: var ourSnapshot *IndexSnapshot var ourPersisted []chan error - var ourCallbacks []index.BatchCallbackFunction + var ourCallbacks []index.BatchCallback // check to see if there is a new snapshot to persist s.rootLock.Lock() diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 691b9d971..89b3ba408 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -54,7 +54,7 @@ type Scorch struct { rootLock sync.RWMutex root *IndexSnapshot // holds 1 ref-count on the root rootPersisted []chan error // closed when root is persisted - callbacks []index.BatchCallbackFunction + callbacks []index.BatchCallback nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. 
@@ -356,7 +356,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } - err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.Callback()) + err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.Callbacks()) if err != nil { if newSegment != nil { _ = newSegment.Close() @@ -376,7 +376,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte, callbacks []index.BatchCallbackFunction) error { + internalOps map[string][]byte, callbacks []index.BatchCallback) error { // new introduction introduction := &segmentIntroduction{ diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 13ec719e8..e1cfb68ab 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -943,6 +943,64 @@ func TestIndexBatchWithCallbacks(t *testing.T) { } +func TestIndexBatchWithCallbacks(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, testConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // Check that callback function works + updated := false + cbErr := fmt.Errorf("") + + batch := index.NewBatch() + doc := document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + batch.AddCallback(func(e error) { + updated = true + cbErr = e + + }) + + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + for i := 0; i < 30; i++ { + if updated { + break + } + time.Sleep(500 * time.Millisecond) + } + if !updated { + t.Fatal("Callback function wasn't called") + } + if cbErr != nil { + t.Fatal("Error wasn't updated 
properly on callback function") + } + +} + func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { defer func() { err := DestroyTest() From 08c3a849cdc2691bba72164d293d948a8b32f243 Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Wed, 21 Nov 2018 19:44:16 -0800 Subject: [PATCH 488/728] Make sure code compiles (fix upsidedown flow) --- index/upsidedown/upsidedown.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index c7caa26b5..d0a27c000 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -960,8 +960,8 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } // For sake of completeness - for i := range batch.Callback() { - batch.Callback()[i](err) + for i := range batch.Callbacks() { + batch.Callbacks()[i](err) } return } From 364fb19f5f547feb79c83ce03c5dee650d644c19 Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Wed, 21 Nov 2018 21:05:40 -0800 Subject: [PATCH 489/728] bad merge fix --- index/scorch/scorch_test.go | 58 ------------------------------------- 1 file changed, 58 deletions(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index e1cfb68ab..13ec719e8 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -943,64 +943,6 @@ func TestIndexBatchWithCallbacks(t *testing.T) { } -func TestIndexBatchWithCallbacks(t *testing.T) { - defer func() { - err := DestroyTest() - if err != nil { - t.Fatal(err) - } - }() - - analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) - if err != nil { - t.Fatal(err) - } - err = idx.Open() - if err != nil { - t.Fatalf("error opening index: %v", err) - } - defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) - } - }() - - // Check that callback function works - updated := false - cbErr := fmt.Errorf("") - - batch := index.NewBatch() - doc := 
document.NewDocument("3") - doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) - batch.Update(doc) - batch.AddCallback(func(e error) { - updated = true - cbErr = e - - }) - - err = idx.Batch(batch) - if err != nil { - t.Error(err) - } - - for i := 0; i < 30; i++ { - if updated { - break - } - time.Sleep(500 * time.Millisecond) - } - if !updated { - t.Fatal("Callback function wasn't called") - } - if cbErr != nil { - t.Fatal("Error wasn't updated properly on callback function") - } - -} - func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { defer func() { err := DestroyTest() From 51097a76ae6c9a40b729c4df312c116310a0a569 Mon Sep 17 00:00:00 2001 From: Aviad Lichtenstadt Date: Sun, 25 Nov 2018 21:54:22 -0500 Subject: [PATCH 490/728] Fixing bad merge --- index/scorch/introducer.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 7b012a390..836e716c4 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -216,9 +216,6 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) } - if next.callbacks != nil { - s.callbacks = append(s.callbacks, next.callbacks...) - } s.callbacks = append(s.callbacks, next.callbacks...) 
// swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch From 55d3031c6ab8e91e135ad6a0d5e3eb43bd7315c6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 26 Nov 2018 12:35:58 -0800 Subject: [PATCH 491/728] fix doc comment typo --- index/scorch/snapshot_segment.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 7672e853b..0e0c59e9f 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -89,7 +89,7 @@ func (s *SegmentSnapshot) DocNumbers(docIDs []string) (*roaring.Bitmap, error) { return rv, nil } -// DocNumbersLive returns bitsit containing doc numbers for all live docs +// DocNumbersLive returns a bitmap containing doc numbers for all live docs func (s *SegmentSnapshot) DocNumbersLive() *roaring.Bitmap { rv := roaring.NewBitmap() rv.AddRange(0, s.segment.Count()) From 367c7402b36681f493829e049ef77068a4862ce4 Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Thu, 29 Nov 2018 17:25:55 -0800 Subject: [PATCH 492/728] Travis-CI reports sporadic failures on TestRegexpSearchScorch (#1057) * Bugfix: Travis-CI reports sporadic failures on TestRegexpSearchScorch. * Check return values. * TestRegexpSearchScorch - Changing InternalID to docID ('_id'). * Map initializing. 
--- search/searcher/base_test.go | 14 +++---- search/searcher/search_regexp_test.go | 57 ++++++++++++++------------- 2 files changed, 37 insertions(+), 34 deletions(-) diff --git a/search/searcher/base_test.go b/search/searcher/base_test.go index 425d6703c..feb00a6aa 100644 --- a/search/searcher/base_test.go +++ b/search/searcher/base_test.go @@ -15,7 +15,6 @@ package searcher import ( - "io/ioutil" "math" "regexp" @@ -48,9 +47,8 @@ func initTwoDocUpsideDown() index.Index { return twoDocIndex } -func initTwoDocScorch() index.Index { +func initTwoDocScorch(dir string) index.Index { analysisQueue := index.NewAnalysisQueue(1) - dir, _ := ioutil.TempDir("", "scorchTwoDoc") twoDocIndex, err := scorch.NewScorch( scorch.Name, map[string]interface{}{ @@ -68,11 +66,13 @@ func initTwoDocs(twoDocIndex index.Index) { if err != nil { panic(err) } + batch := index.NewBatch() for _, doc := range twoDocIndexDocs { - err := twoDocIndex.Update(doc) - if err != nil { - panic(err) - } + batch.Update(doc) + } + err = twoDocIndex.Batch(batch) + if err != nil { + panic(err) } } diff --git a/search/searcher/search_regexp_test.go b/search/searcher/search_regexp_test.go index 0c29068bd..feb23579c 100644 --- a/search/searcher/search_regexp_test.go +++ b/search/searcher/search_regexp_test.go @@ -17,6 +17,8 @@ package searcher import ( "encoding/binary" "fmt" + "io/ioutil" + "os" "regexp" "testing" @@ -37,13 +39,23 @@ func TestRegexpStringSearchUpsideDown(t *testing.T) { } func TestRegexpSearchScorch(t *testing.T) { - twoDocIndex := initTwoDocScorch() + dir, _ := ioutil.TempDir("", "scorchTwoDoc") + defer func() { + _ = os.RemoveAll(dir) + }() + + twoDocIndex := initTwoDocScorch(dir) testRegexpSearch(t, twoDocIndex, internalIDMakerScorch, searcherMaker) _ = twoDocIndex.Close() } func TestRegexpStringSearchScorch(t *testing.T) { - twoDocIndex := initTwoDocScorch() + dir, _ := ioutil.TempDir("", "scorchTwoDoc") + defer func() { + _ = os.RemoveAll(dir) + }() + + twoDocIndex := 
initTwoDocScorch(dir) testRegexpSearch(t, twoDocIndex, internalIDMakerScorch, searcherStringMaker) _ = twoDocIndex.Close() } @@ -101,29 +113,20 @@ func testRegexpSearch(t *testing.T, twoDocIndex index.Index, regexpSearcherCo := searcherMaker(t, twoDocIndexReader, "co.*", "desc") tests := []struct { - searcher search.Searcher - expecteds []*search.DocumentMatch + searcher search.Searcher + id2score map[string]float64 }{ { searcher: regexpSearcher, - expecteds: []*search.DocumentMatch{ - { - IndexInternalID: internalIDMaker(1), - Score: 1.916290731874155, - }, + id2score: map[string]float64{ + "1": 1.916290731874155, }, }, { searcher: regexpSearcherCo, - expecteds: []*search.DocumentMatch{ - { - IndexInternalID: internalIDMaker(2), - Score: 0.33875554280828685, - }, - { - IndexInternalID: internalIDMaker(3), - Score: 0.33875554280828685, - }, + id2score: map[string]float64{ + "2": 0.33875554280828685, + "3": 0.33875554280828685, }, }, } @@ -142,14 +145,14 @@ func testRegexpSearch(t *testing.T, twoDocIndex index.Index, next, err := test.searcher.Next(ctx) i := 0 for err == nil && next != nil { - if i < len(test.expecteds) { - if !next.IndexInternalID.Equals(test.expecteds[i].IndexInternalID) { - t.Errorf("test %d, expected result %d to have id %s got %s, next: %#v", - testIndex, i, test.expecteds[i].IndexInternalID, next.IndexInternalID, next) - } - if next.Score != test.expecteds[i].Score { + exID, _ := twoDocIndexReader.ExternalID(next.IndexInternalID) + if _, ok := test.id2score[exID]; !ok { + t.Errorf("test %d, found unexpected docID = %v, next = %v", testIndex, exID, next) + } else { + score := test.id2score[exID] + if next.Score != score { t.Errorf("test %d, expected result %d to have score %v got %v,next: %#v", - testIndex, i, test.expecteds[i].Score, next.Score, next) + testIndex, i, score, next.Score, next) t.Logf("scoring explanation: %s", next.Expl) } } @@ -160,8 +163,8 @@ func testRegexpSearch(t *testing.T, twoDocIndex index.Index, if err != nil { 
t.Fatalf("error iterating searcher: %v for test %d", err, testIndex) } - if len(test.expecteds) != i { - t.Errorf("expected %d results got %d for test %d", len(test.expecteds), i, testIndex) + if len(test.id2score) != i { + t.Errorf("expected %d results got %d for test %d", len(test.id2score), i, testIndex) } } } From 84c8ddee12a428200ba00c645b5c38658742aa22 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 3 Dec 2018 10:59:58 +0530 Subject: [PATCH 493/728] Adding error messages for negative fuzziness Querying for negative fuzziness or edit distance is an invalid option and adding error messages for that. This would help further wrinkles down the line due to wrapping of -ve ED to a higher value during the uint typecasting. --- search/searcher/search_fuzzy.go | 4 ++++ search/searcher/search_fuzzy_test.go | 13 +++++++++++++ 2 files changed, 17 insertions(+) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 668d11afc..8176e59b5 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -31,6 +31,10 @@ func NewFuzzySearcher(indexReader index.IndexReader, term string, return nil, fmt.Errorf("fuzziness exceeds max (%d)", MaxFuzziness) } + if fuzziness < 0 { + return nil, fmt.Errorf("invalid fuzziness, negative") + } + // Note: we don't byte slice the term for a prefix because of runes. 
prefixTerm := "" for i, r := range term { diff --git a/search/searcher/search_fuzzy_test.go b/search/searcher/search_fuzzy_test.go index dfac2f468..4a87f139a 100644 --- a/search/searcher/search_fuzzy_test.go +++ b/search/searcher/search_fuzzy_test.go @@ -140,3 +140,16 @@ func TestFuzzySearch(t *testing.T) { } } } + +func TestFuzzySearchLimitErrors(t *testing.T) { + explainTrue := search.SearcherOptions{Explain: true} + _, err := NewFuzzySearcher(nil, "water", 3, 3, "desc", 1.0, explainTrue) + if err == nil { + t.Fatal("`fuzziness exceeds max (2)` error expected") + } + + _, err = NewFuzzySearcher(nil, "water", 3, -1, "desc", 1.0, explainTrue) + if err == nil { + t.Fatal("`invalid fuzziness, negative` error expected") + } +} From 9bb1687420e429f94ffcd4a159ad1c7181772452 Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 4 Dec 2018 11:36:41 -0800 Subject: [PATCH 494/728] DocNumbers - skip segments, that don't include requested ids. --- index/scorch/segment/zap/segment.go | 31 ++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 8c6de211a..1bab522e2 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,12 +418,33 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - for _, id := range ids { - postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) - if err != nil { - return nil, err + + skipCheck := false + sMax := "" + iMin := "" + + sMaxB, err := idDict.fst.GetMaxKey() + if err != nil { + skipCheck = true + } else { + sMax = string(sMaxB) + iMin = ids[0] + for i := 1; i < len(ids); i++ { + if ids[i] < iMin { + iMin = ids[i] + } + } + } + if skipCheck || (iMin <= sMax) { + for _, id := range ids { + if skipCheck || (id <= sMax) { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != 
nil { + return nil, err + } + postingsList.OrInto(rv) + } } - postingsList.OrInto(rv) } } From 66e3c954fc4dd7ff51a9197480ce87dca10bc055 Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 4 Dec 2018 13:56:08 -0800 Subject: [PATCH 495/728] vendor/manifest update - new vellum API. --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 72f0b6d7c..9471f14e2 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "01d5c56e609533acd717717c8acc0d2dea6bfb89", + "revision": "f377ee3282b954c46915d89482bf93288ee7dd12", "branch": "master", "notests": true } From 8395e90b544ddfa13cbe9b112b330a8093bdec2c Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 4 Dec 2018 14:12:07 -0800 Subject: [PATCH 496/728] Fix: DocNumbers must work with empty ids array. --- index/scorch/segment/zap/segment.go | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 1bab522e2..9342dd273 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -423,17 +423,21 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { sMax := "" iMin := "" - sMaxB, err := idDict.fst.GetMaxKey() - if err != nil { - skipCheck = true - } else { - sMax = string(sMaxB) - iMin = ids[0] - for i := 1; i < len(ids); i++ { - if ids[i] < iMin { - iMin = ids[i] + if len(ids) > 0 { + sMaxB, err := idDict.fst.GetMaxKey() + if err != nil { + skipCheck = true + } else { + sMax = string(sMaxB) + iMin = ids[0] + for i := 1; i < len(ids); i++ { + if ids[i] < iMin { + iMin = ids[i] + } } } + } else { + skipCheck = true } if skipCheck || (iMin <= sMax) { for _, id := range ids { From 43e8e27f8e2cf5d076973dbad26f436aaa8a12c6 Mon Sep 17 
00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Tue, 4 Dec 2018 17:10:12 -0800 Subject: [PATCH 497/728] Simplification - Using filteredIds instead of ids. --- index/scorch/segment/zap/segment.go | 39 +++++++++++------------------ 1 file changed, 15 insertions(+), 24 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 9342dd273..606ea0cfa 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,38 +418,29 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - - skipCheck := false + filteredIds := ids[:0] sMax := "" - iMin := "" - if len(ids) > 0 { - sMaxB, err := idDict.fst.GetMaxKey() - if err != nil { - skipCheck = true - } else { - sMax = string(sMaxB) - iMin = ids[0] - for i := 1; i < len(ids); i++ { - if ids[i] < iMin { - iMin = ids[i] - } + sMaxB, err := idDict.fst.GetMaxKey() + if err == nil { + sMax = string(sMaxB) + for _, id := range ids { + if id <= sMax { + filteredIds = append(filteredIds, id) } } } else { - skipCheck = true + filteredIds = ids } - if skipCheck || (iMin <= sMax) { - for _, id := range ids { - if skipCheck || (id <= sMax) { - postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) - if err != nil { - return nil, err - } - postingsList.OrInto(rv) - } + + for _, id := range filteredIds { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err } + postingsList.OrInto(rv) } + } return rv, nil From e3510d8ce59a6dcae03cd454cbf09926f89c20df Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 5 Dec 2018 14:04:58 +0530 Subject: [PATCH 498/728] Fix for Travis CI failuers Adding index close for better cleanup --- search_test.go | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/search_test.go b/search_test.go index 75aeac713..9937eb7b2 100644 --- a/search_test.go +++ b/search_test.go @@ 
-545,6 +545,11 @@ func TestNestedBooleanSearchers(t *testing.T) { if matches != len(searchResults.Hits) { t.Fatalf("Unexpected result set, %v != %v", matches, len(searchResults.Hits)) } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } } func TestNestedBooleanMustNotSearcher(t *testing.T) { @@ -668,6 +673,11 @@ func TestNestedBooleanMustNotSearcher(t *testing.T) { if res.Total != 0 { t.Fatalf("Unexpected result, %v != 0", res.Total) } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } } func TestSearchScorchOverEmptyKeyword(t *testing.T) { @@ -727,6 +737,11 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { if res.Total != 10 { t.Fatalf("Unexpected search hits: %v, expected 10", res.Total) } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } } func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { @@ -868,4 +883,9 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { if res.Total != 1 { t.Fatalf("Unexpected result, %v != 1", res.Total) } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } } From 57aadf7f935b36a3fd7e1669b28d3df8da79fdf0 Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Wed, 5 Dec 2018 11:18:41 -0800 Subject: [PATCH 499/728] Scorch test refactoring (#1055) * Scorch test refactoring. * Go 1.7 support for scorch tests. * Explicit test folder assigning. 
--- index/scorch/event_test.go | 9 +- index/scorch/field_dict_test.go | 14 +- index/scorch/reader_test.go | 34 +++- index/scorch/scorch_test.go | 236 ++++++++++++++++++------- index/scorch/snapshot_rollback_test.go | 13 +- 5 files changed, 225 insertions(+), 81 deletions(-) diff --git a/index/scorch/event_test.go b/index/scorch/event_test.go index 92b49d20d..58ba7c06c 100644 --- a/index/scorch/event_test.go +++ b/index/scorch/event_test.go @@ -22,8 +22,13 @@ import ( ) func TestEventBatchIntroductionStart(t *testing.T) { + testConfig := CreateConfig("TestEventBatchIntroductionStart") + err := InitTest(testConfig) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(testConfig) if err != nil { t.Fatal(err) } @@ -70,4 +75,4 @@ func TestEventBatchIntroductionStart(t *testing.T) { if count != 1 { t.Fatalf("expected to see 1 batch introduction event event, saw %d", count) } -} +} \ No newline at end of file diff --git a/index/scorch/field_dict_test.go b/index/scorch/field_dict_test.go index a25c5c984..92c49890e 100644 --- a/index/scorch/field_dict_test.go +++ b/index/scorch/field_dict_test.go @@ -23,15 +23,21 @@ import ( ) func TestIndexFieldDict(t *testing.T) { + cfg := CreateConfig("TestIndexFieldDict") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -181,4 +187,4 @@ func TestIndexFieldDict(t *testing.T) { if !reflect.DeepEqual(expectedTerms, terms) { t.Errorf("expected %#v, got %#v", expectedTerms, terms) } -} +} \ No newline at end of file diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index 8414cbdc1..b89ba6dd9 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ 
-24,15 +24,21 @@ import ( ) func TestIndexReader(t *testing.T) { + cfg := CreateConfig("TestIndexReader") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() + analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -214,15 +220,20 @@ func TestIndexReader(t *testing.T) { } func TestIndexDocIdReader(t *testing.T) { + cfg := CreateConfig("TestIndexDocIdReader") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -325,15 +336,20 @@ func TestIndexDocIdReader(t *testing.T) { } func TestIndexDocIdOnlyReader(t *testing.T) { + cfg := CreateConfig("TestIndexDocIdOnlyReader") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index adcabd22f..424afbfc6 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -38,12 +38,19 @@ func init() { DefaultPersisterNapTimeMSec = 1 } -func DestroyTest() error { - return os.RemoveAll("/tmp/bleve-scorch-test") +func InitTest(cfg map[string]interface{}) error { + return os.RemoveAll(cfg["path"].(string)) } -var testConfig = map[string]interface{}{ - "path": "/tmp/bleve-scorch-test", +func DestroyTest(cfg 
map[string]interface{}) error { + return os.RemoveAll(cfg["path"].(string)) +} + +func CreateConfig(name string) map[string]interface{} { + // TODO: Use t.Name() when Go 1.7 support terminates. + rv := make(map[string]interface{}) + rv["path"] = os.TempDir() + "/bleve-scorch-test-" + name + return rv } var testAnalyzer = &analysis.Analyzer{ @@ -51,15 +58,20 @@ var testAnalyzer = &analysis.Analyzer{ } func TestIndexOpenReopen(t *testing.T) { + cfg := CreateConfig("TestIndexOpenReopen") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -116,7 +128,7 @@ func TestIndexOpenReopen(t *testing.T) { t.Fatal(err) } - idx, err = NewScorch(Name, testConfig, analysisQueue) + idx, err = NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -150,15 +162,20 @@ func TestIndexOpenReopen(t *testing.T) { } func TestIndexInsert(t *testing.T) { + cfg := CreateConfig("TestIndexInsert") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -216,15 +233,20 @@ func TestIndexInsert(t *testing.T) { } func TestIndexInsertThenDelete(t *testing.T) { + cfg := CreateConfig("TestIndexInsertThenDelete") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err 
:= NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -342,7 +364,7 @@ func TestIndexInsertThenDelete(t *testing.T) { t.Fatal(err) } - idx, err = NewScorch(Name, testConfig, analysisQueue) // reopen + idx, err = NewScorch(Name, cfg, analysisQueue) // reopen if err != nil { t.Fatal(err) } @@ -427,15 +449,20 @@ func TestIndexInsertThenDelete(t *testing.T) { } func TestIndexInsertThenUpdate(t *testing.T) { + cfg := CreateConfig("TestIndexInsertThenUpdate") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -494,15 +521,20 @@ func TestIndexInsertThenUpdate(t *testing.T) { } func TestIndexInsertMultiple(t *testing.T) { + cfg := CreateConfig("TestIndexInsertMultiple") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -561,15 +593,20 @@ func TestIndexInsertMultiple(t *testing.T) { } func TestIndexInsertWithStore(t *testing.T) { + cfg := CreateConfig("TestIndexInsertWithStore") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -578,9 +615,9 @@ func TestIndexInsertWithStore(t *testing.T) { t.Fatalf("error opening index: %v", err) } defer func() { - 
cerr := idx.Close() + err := idx.Close() if err != nil { - t.Fatal(cerr) + t.Fatal(err) } }() @@ -660,15 +697,20 @@ func TestIndexInsertWithStore(t *testing.T) { } func TestIndexInternalCRUD(t *testing.T) { + cfg := CreateConfig("TestIndexInternalCRUD") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -766,15 +808,20 @@ func TestIndexInternalCRUD(t *testing.T) { } func TestIndexBatch(t *testing.T) { + cfg := CreateConfig("TestIndexBatch") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -884,15 +931,20 @@ func TestIndexBatch(t *testing.T) { } func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { + cfg := CreateConfig("TestIndexInsertUpdateDeleteWithMultipleTypesStored") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1102,15 +1154,20 @@ func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { } func TestIndexInsertFields(t *testing.T) { + cfg := CreateConfig("TestIndexInsertFields") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - 
t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1171,15 +1228,20 @@ func TestIndexInsertFields(t *testing.T) { } func TestIndexUpdateComposites(t *testing.T) { + cfg := CreateConfig("TestIndexUpdateComposites") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1248,15 +1310,20 @@ func TestIndexUpdateComposites(t *testing.T) { } func TestIndexTermReaderCompositeFields(t *testing.T) { + cfg := CreateConfig("TestIndexTermReaderCompositeFields") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1313,15 +1380,20 @@ func TestIndexTermReaderCompositeFields(t *testing.T) { } func TestIndexDocumentVisitFieldTerms(t *testing.T) { + cfg := CreateConfig("TestIndexDocumentVisitFieldTerms") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1378,15 +1450,20 @@ func TestIndexDocumentVisitFieldTerms(t *testing.T) { } func TestConcurrentUpdate(t *testing.T) { + cfg := CreateConfig("TestConcurrentUpdate") + 
err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1422,6 +1499,12 @@ func TestConcurrentUpdate(t *testing.T) { if err != nil { log.Fatal(err) } + defer func() { + err := r.Close() + if err != nil { + t.Fatal(err) + } + }() doc, err := r.Document("1") if err != nil { @@ -1434,15 +1517,20 @@ func TestConcurrentUpdate(t *testing.T) { } func TestLargeField(t *testing.T) { + cfg := CreateConfig("TestLargeField") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1496,15 +1584,20 @@ This section needs additional citations for verification. 
Please help improve th There are three characteristics of liquids which are relevant to the discussion of a BLEVE:`) func TestIndexDocumentVisitFieldTermsWithMultipleDocs(t *testing.T) { + cfg := CreateConfig("TestIndexDocumentVisitFieldTermsWithMultipleDocs") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1648,15 +1741,20 @@ func TestIndexDocumentVisitFieldTermsWithMultipleDocs(t *testing.T) { } func TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions(t *testing.T) { + cfg := CreateConfig("TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -1718,6 +1816,19 @@ func TestIndexDocumentVisitFieldTermsWithMultipleFieldOptions(t *testing.T) { func TestAllFieldWithDifferentTermVectorsEnabled(t *testing.T) { // Based on https://github.com/blevesearch/bleve/issues/895 from xeizmendi + cfg := CreateConfig("TestAllFieldWithDifferentTermVectorsEnabled") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + + testConfig := cfg mp := mapping.NewIndexMapping() keywordMapping := mapping.NewTextFieldMapping() @@ -1736,7 +1847,6 @@ func TestAllFieldWithDifferentTermVectorsEnabled(t *testing.T) { mp.DefaultMapping = docMapping - _ = os.RemoveAll(testConfig["path"].(string)) analysisQueue := index.NewAnalysisQueue(1) idx, 
err := NewScorch("storeName", testConfig, analysisQueue) if err != nil { @@ -1747,8 +1857,10 @@ func TestAllFieldWithDifferentTermVectorsEnabled(t *testing.T) { t.Errorf("error opening index: %v", err) } defer func() { - _ = idx.Close() - _ = os.RemoveAll(testConfig["path"].(string)) + err := idx.Close() + if err != nil { + t.Fatal(err) + } }() data := map[string]string{ diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 0065a746d..1fe2a5909 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -22,20 +22,25 @@ import ( ) func TestIndexRollback(t *testing.T) { + cfg := CreateConfig("TestIndexRollback") numSnapshotsToKeepOrig := NumSnapshotsToKeep NumSnapshotsToKeep = 1000 + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } defer func() { NumSnapshotsToKeep = numSnapshotsToKeepOrig - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { - t.Fatal(err) + t.Log(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -212,4 +217,4 @@ func TestIndexRollback(t *testing.T) { if err != nil { t.Fatal(err) } -} +} \ No newline at end of file From 934a36ea597bf6c20b8b05bda553bbf3f1d2d073 Mon Sep 17 00:00:00 2001 From: Dmitriy Kalugin-Balashov Date: Wed, 5 Dec 2018 12:00:55 -0800 Subject: [PATCH 500/728] Refactoring. 
--- index/scorch/segment/zap/segment.go | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 606ea0cfa..4ee6c290b 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,19 +418,18 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - filteredIds := ids[:0] + filteredIds := make([]string, 0, len(ids)) sMax := "" sMaxB, err := idDict.fst.GetMaxKey() - if err == nil { - sMax = string(sMaxB) - for _, id := range ids { - if id <= sMax { - filteredIds = append(filteredIds, id) - } + if err != nil { + return nil, err + } + sMax = string(sMaxB) + for _, id := range ids { + if id <= sMax { + filteredIds = append(filteredIds, id) } - } else { - filteredIds = ids } for _, id := range filteredIds { From 1ef74c961c24647952d6626f40b12034d5ffda7d Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 5 Dec 2018 12:18:41 -0800 Subject: [PATCH 501/728] Formatting: go fmt ./... 
--- index/scorch/event_test.go | 2 +- index/scorch/field_dict_test.go | 3 +-- index/scorch/reader_test.go | 1 - index/scorch/snapshot_rollback_test.go | 2 +- 4 files changed, 3 insertions(+), 5 deletions(-) diff --git a/index/scorch/event_test.go b/index/scorch/event_test.go index 58ba7c06c..9d39332e9 100644 --- a/index/scorch/event_test.go +++ b/index/scorch/event_test.go @@ -75,4 +75,4 @@ func TestEventBatchIntroductionStart(t *testing.T) { if count != 1 { t.Fatalf("expected to see 1 batch introduction event event, saw %d", count) } -} \ No newline at end of file +} diff --git a/index/scorch/field_dict_test.go b/index/scorch/field_dict_test.go index 92c49890e..8cffe9e62 100644 --- a/index/scorch/field_dict_test.go +++ b/index/scorch/field_dict_test.go @@ -35,7 +35,6 @@ func TestIndexFieldDict(t *testing.T) { } }() - analysisQueue := index.NewAnalysisQueue(1) idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { @@ -187,4 +186,4 @@ func TestIndexFieldDict(t *testing.T) { if !reflect.DeepEqual(expectedTerms, terms) { t.Errorf("expected %#v, got %#v", expectedTerms, terms) } -} \ No newline at end of file +} diff --git a/index/scorch/reader_test.go b/index/scorch/reader_test.go index b89ba6dd9..a1d612b5f 100644 --- a/index/scorch/reader_test.go +++ b/index/scorch/reader_test.go @@ -36,7 +36,6 @@ func TestIndexReader(t *testing.T) { } }() - analysisQueue := index.NewAnalysisQueue(1) idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 1fe2a5909..73523a0ba 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -217,4 +217,4 @@ func TestIndexRollback(t *testing.T) { if err != nil { t.Fatal(err) } -} \ No newline at end of file +} From 2ab80c36124329279fda5621bd1213cc2b0a64dd Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 5 Dec 2018 16:37:11 -0800 Subject: [PATCH 502/728] Defer closing the index in 
tests to avoid cascade failures If a test fails, to avoid a subsequent test from failing because of the earlier test not closing the index, defer the closing part. --- search_test.go | 101 ++++++++++++++++++++++++------------------------- 1 file changed, 50 insertions(+), 51 deletions(-) diff --git a/search_test.go b/search_test.go index 9937eb7b2..59ac13402 100644 --- a/search_test.go +++ b/search_test.go @@ -430,13 +430,6 @@ func TestMemoryNeededForSearchResult(t *testing.T) { // https://github.com/blevesearch/bleve/issues/954 func TestNestedBooleanSearchers(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() - // create an index with a custom analyzer idxMapping := NewIndexMapping() if err := idxMapping.AddCustomAnalyzer("3xbla", map[string]interface{}{ @@ -453,6 +446,18 @@ func TestNestedBooleanSearchers(t *testing.T) { t.Fatal(err) } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + // create and insert documents as a batch batch := idx.NewBatch() matches := 0 @@ -545,27 +550,27 @@ func TestNestedBooleanSearchers(t *testing.T) { if matches != len(searchResults.Hits) { t.Fatalf("Unexpected result set, %v != %v", matches, len(searchResults.Hits)) } +} - err = idx.Close() +func TestNestedBooleanMustNotSearcher(t *testing.T) { + // create an index with default settings + idxMapping := NewIndexMapping() + idx, err := New("testidx", idxMapping) if err != nil { t.Fatal(err) } -} -func TestNestedBooleanMustNotSearcher(t *testing.T) { defer func() { - err := os.RemoveAll("testidx") + err = idx.Close() if err != nil { t.Fatal(err) } - }() - // create an index with default settings - idxMapping := NewIndexMapping() - idx, err := New("testidx", idxMapping) - if err != nil { - t.Fatal(err) - } + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() // create and insert documents as a batch batch 
:= idx.NewBatch() @@ -673,24 +678,10 @@ func TestNestedBooleanMustNotSearcher(t *testing.T) { if res.Total != 0 { t.Fatalf("Unexpected result, %v != 0", res.Total) } - - err = idx.Close() - if err != nil { - t.Fatal(err) - } } func TestSearchScorchOverEmptyKeyword(t *testing.T) { defaultIndexType := Config.DefaultIndexType - - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - Config.DefaultIndexType = defaultIndexType - }() - Config.DefaultIndexType = scorch.Name dmap := mapping.NewDocumentMapping() @@ -713,6 +704,20 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { if err != nil { t.Fatal(err) } + + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + Config.DefaultIndexType = defaultIndexType + }() + for i := 0; i < 10; i++ { err = idx.Index(fmt.Sprint(i), map[string]string{"name": fmt.Sprintf("test%d", i), "id": ""}) if err != nil { @@ -737,24 +742,10 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { if res.Total != 10 { t.Fatalf("Unexpected search hits: %v, expected 10", res.Total) } - - err = idx.Close() - if err != nil { - t.Fatal(err) - } } func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { defaultIndexType := Config.DefaultIndexType - - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - Config.DefaultIndexType = defaultIndexType - }() - Config.DefaultIndexType = scorch.Name // create an index with default settings @@ -764,6 +755,19 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { t.Fatal(err) } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + Config.DefaultIndexType = defaultIndexType + }() + // create and insert documents as a batch batch := idx.NewBatch() @@ -883,9 +887,4 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t 
*testing.T) { if res.Total != 1 { t.Fatalf("Unexpected result, %v != 1", res.Total) } - - err = idx.Close() - if err != nil { - t.Fatal(err) - } } From c19cdeeb03bd0f2ef0f4fa4cee097ff012ba85c5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 5 Dec 2018 20:16:51 -0800 Subject: [PATCH 503/728] Update .travis.yml to use supported versions of golang https://golang.org/doc/devel/release.html#policy --- .travis.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.travis.yml b/.travis.yml index 934e86268..fd6e1ba38 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,10 +3,9 @@ sudo: false language: go go: - - 1.7.x - - 1.8.x - - 1.9.x - - "1.10" + - "1.9.x" + - "1.10.x" + - "1.11.x" script: - go get golang.org/x/tools/cmd/cover From a321e9eb77d9db3fb3fd45943dbcc7599b89f165 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 6 Dec 2018 13:04:21 +0530 Subject: [PATCH 504/728] MB-32183 - new Levenshtein automaton Adopting new levenshtein automaton --- index/scorch/snapshot_index.go | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 0d312fcca..4c4d92144 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -27,9 +27,13 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbase/vellum/levenshtein" + "github.com/couchbase/vellum" + lev2 "github.com/couchbase/vellum/levenshtein2" ) +// re usable, threadsafe levenshtein builders +var lb1, lb2 *lev2.LevenshteinAutomatonBuilder + type asynchSegmentResult struct { dictItr segment.DictionaryIterator @@ -46,6 +50,15 @@ var reflectStaticSizeIndexSnapshot int func init() { var is interface{} = IndexSnapshot{} reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) + var err error + lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) + if 
err != nil { + panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) + } + lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true) + if err != nil { + panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) + } } type IndexSnapshot struct { @@ -194,9 +207,19 @@ func (i *IndexSnapshot) FieldDictRegexp(field string, }) } +func (i *IndexSnapshot) getLevAutomaton(term string, + fuzziness uint8) (vellum.Automaton, error) { + if fuzziness == 1 { + return lb1.BuildDfa(term, fuzziness) + } else if fuzziness == 2 { + return lb2.BuildDfa(term, fuzziness) + } + return nil, fmt.Errorf("fuzziness exceeds the max limit") +} + func (i *IndexSnapshot) FieldDictFuzzy(field string, term string, fuzziness int, prefix string) (index.FieldDict, error) { - a, err := levenshtein.New(term, fuzziness) + a, err := i.getLevAutomaton(term, uint8(fuzziness)) if err != nil { return nil, err } From 482566ce61e617fbfdc006408861189a9106b6dc Mon Sep 17 00:00:00 2001 From: Funzinator Date: Thu, 6 Dec 2018 21:10:52 +0100 Subject: [PATCH 505/728] add ASCII Folding Filter fixes #957 --- analysis/char/asciifolding/asciifolding.go | 3570 +++++++++++++++++ .../char/asciifolding/asciifolding_test.go | 56 + 2 files changed, 3626 insertions(+) create mode 100644 analysis/char/asciifolding/asciifolding.go create mode 100644 analysis/char/asciifolding/asciifolding_test.go diff --git a/analysis/char/asciifolding/asciifolding.go b/analysis/char/asciifolding/asciifolding.go new file mode 100644 index 000000000..469102e2c --- /dev/null +++ b/analysis/char/asciifolding/asciifolding.go @@ -0,0 +1,3570 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// converted to Go from Lucene's AsciiFoldingFilter +// https://lucene.apache.org/core/4_0_0/analyzers-common/org/apache/lucene/analysis/miscellaneous/ASCIIFoldingFilter.html + +package asciifolding + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +const Name = "asciifolding" + +type AsciiFoldingFilter struct{} + +func New() *AsciiFoldingFilter { + return &AsciiFoldingFilter{} +} + +func (s *AsciiFoldingFilter) Filter(input []byte) []byte { + if len(input) == 0 { + return input + } + + in := []rune(string(input)) + length := len(in) + + // Worst-case length required if all runes fold to 4 runes + out := make([]rune, length, length*4) + + out = foldToASCII(in, 0, out, 0, length) + return []byte(string(out)) +} + +func AsciiFoldingFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) { + return New(), nil +} + +func init() { + registry.RegisterCharFilter(Name, AsciiFoldingFilterConstructor) +} + +// Converts characters above ASCII to their ASCII equivalents. +// For example, accents are removed from accented characters. 
+func foldToASCII(input []rune, inputPos int, output []rune, outputPos int, length int) []rune { + end := inputPos + length + for pos := inputPos; pos < end; pos++ { + c := input[pos] + + // Quick test: if it's not in range then just keep current character + if c < '\u0080' { + output[outputPos] = c + outputPos++ + } else { + switch c { + case '\u00C0': // À [LATIN CAPITAL LETTER A WITH GRAVE] + fallthrough + case '\u00C1': // Á [LATIN CAPITAL LETTER A WITH ACUTE] + fallthrough + case '\u00C2': //  [LATIN CAPITAL LETTER A WITH CIRCUMFLEX] + fallthrough + case '\u00C3': // à [LATIN CAPITAL LETTER A WITH TILDE] + fallthrough + case '\u00C4': // Ä [LATIN CAPITAL LETTER A WITH DIAERESIS] + fallthrough + case '\u00C5': // Å [LATIN CAPITAL LETTER A WITH RING ABOVE] + fallthrough + case '\u0100': // Ā [LATIN CAPITAL LETTER A WITH MACRON] + fallthrough + case '\u0102': // Ă [LATIN CAPITAL LETTER A WITH BREVE] + fallthrough + case '\u0104': // Ą [LATIN CAPITAL LETTER A WITH OGONEK] + fallthrough + case '\u018F': // Ə http://en.wikipedia.org/wiki/Schwa [LATIN CAPITAL LETTER SCHWA] + fallthrough + case '\u01CD': // Ǎ [LATIN CAPITAL LETTER A WITH CARON] + fallthrough + case '\u01DE': // Ǟ [LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON] + fallthrough + case '\u01E0': // Ǡ [LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON] + fallthrough + case '\u01FA': // Ǻ [LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE] + fallthrough + case '\u0200': // Ȁ [LATIN CAPITAL LETTER A WITH DOUBLE GRAVE] + fallthrough + case '\u0202': // Ȃ [LATIN CAPITAL LETTER A WITH INVERTED BREVE] + fallthrough + case '\u0226': // Ȧ [LATIN CAPITAL LETTER A WITH DOT ABOVE] + fallthrough + case '\u023A': // Ⱥ [LATIN CAPITAL LETTER A WITH STROKE] + fallthrough + case '\u1D00': // ᴀ [LATIN LETTER SMALL CAPITAL A] + fallthrough + case '\u1E00': // Ḁ [LATIN CAPITAL LETTER A WITH RING BELOW] + fallthrough + case '\u1EA0': // Ạ [LATIN CAPITAL LETTER A WITH DOT BELOW] + fallthrough + case '\u1EA2': // Ả [LATIN 
CAPITAL LETTER A WITH HOOK ABOVE] + fallthrough + case '\u1EA4': // Ấ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1EA6': // Ầ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1EA8': // Ẩ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1EAA': // Ẫ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND TILDE] + fallthrough + case '\u1EAC': // Ậ [LATIN CAPITAL LETTER A WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u1EAE': // Ắ [LATIN CAPITAL LETTER A WITH BREVE AND ACUTE] + fallthrough + case '\u1EB0': // Ằ [LATIN CAPITAL LETTER A WITH BREVE AND GRAVE] + fallthrough + case '\u1EB2': // Ẳ [LATIN CAPITAL LETTER A WITH BREVE AND HOOK ABOVE] + fallthrough + case '\u1EB4': // Ẵ [LATIN CAPITAL LETTER A WITH BREVE AND TILDE] + fallthrough + case '\u24B6': // Ⓐ [CIRCLED LATIN CAPITAL LETTER A] + fallthrough + case '\uFF21': // A [FULLWIDTH LATIN CAPITAL LETTER A] + fallthrough + case '\u1EB6': // Ặ [LATIN CAPITAL LETTER A WITH BREVE AND DOT BELOW] + output[outputPos] = 'A' + outputPos++ + + case '\u00E0': // à [LATIN SMALL LETTER A WITH GRAVE] + fallthrough + case '\u00E1': // á [LATIN SMALL LETTER A WITH ACUTE] + fallthrough + case '\u00E2': // â [LATIN SMALL LETTER A WITH CIRCUMFLEX] + fallthrough + case '\u00E3': // ã [LATIN SMALL LETTER A WITH TILDE] + fallthrough + case '\u00E4': // ä [LATIN SMALL LETTER A WITH DIAERESIS] + fallthrough + case '\u00E5': // å [LATIN SMALL LETTER A WITH RING ABOVE] + fallthrough + case '\u0101': // ā [LATIN SMALL LETTER A WITH MACRON] + fallthrough + case '\u0103': // ă [LATIN SMALL LETTER A WITH BREVE] + fallthrough + case '\u0105': // ą [LATIN SMALL LETTER A WITH OGONEK] + fallthrough + case '\u01CE': // ǎ [LATIN SMALL LETTER A WITH CARON] + fallthrough + case '\u01DF': // ǟ [LATIN SMALL LETTER A WITH DIAERESIS AND MACRON] + fallthrough + case '\u01E1': // ǡ [LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON] + fallthrough + case '\u01FB': // ǻ [LATIN SMALL 
LETTER A WITH RING ABOVE AND ACUTE] + fallthrough + case '\u0201': // ȁ [LATIN SMALL LETTER A WITH DOUBLE GRAVE] + fallthrough + case '\u0203': // ȃ [LATIN SMALL LETTER A WITH INVERTED BREVE] + fallthrough + case '\u0227': // ȧ [LATIN SMALL LETTER A WITH DOT ABOVE] + fallthrough + case '\u0250': // ɐ [LATIN SMALL LETTER TURNED A] + fallthrough + case '\u0259': // ə [LATIN SMALL LETTER SCHWA] + fallthrough + case '\u025A': // ɚ [LATIN SMALL LETTER SCHWA WITH HOOK] + fallthrough + case '\u1D8F': // ᶏ [LATIN SMALL LETTER A WITH RETROFLEX HOOK] + fallthrough + case '\u1D95': // ᶕ [LATIN SMALL LETTER SCHWA WITH RETROFLEX HOOK] + fallthrough + case '\u1E01': // ạ [LATIN SMALL LETTER A WITH RING BELOW] + fallthrough + case '\u1E9A': // ả [LATIN SMALL LETTER A WITH RIGHT HALF RING] + fallthrough + case '\u1EA1': // ạ [LATIN SMALL LETTER A WITH DOT BELOW] + fallthrough + case '\u1EA3': // ả [LATIN SMALL LETTER A WITH HOOK ABOVE] + fallthrough + case '\u1EA5': // ấ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1EA7': // ầ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1EA9': // ẩ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1EAB': // ẫ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND TILDE] + fallthrough + case '\u1EAD': // ậ [LATIN SMALL LETTER A WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u1EAF': // ắ [LATIN SMALL LETTER A WITH BREVE AND ACUTE] + fallthrough + case '\u1EB1': // ằ [LATIN SMALL LETTER A WITH BREVE AND GRAVE] + fallthrough + case '\u1EB3': // ẳ [LATIN SMALL LETTER A WITH BREVE AND HOOK ABOVE] + fallthrough + case '\u1EB5': // ẵ [LATIN SMALL LETTER A WITH BREVE AND TILDE] + fallthrough + case '\u1EB7': // ặ [LATIN SMALL LETTER A WITH BREVE AND DOT BELOW] + fallthrough + case '\u2090': // ₐ [LATIN SUBSCRIPT SMALL LETTER A] + fallthrough + case '\u2094': // ₔ [LATIN SUBSCRIPT SMALL LETTER SCHWA] + fallthrough + case '\u24D0': // ⓐ [CIRCLED LATIN SMALL LETTER A] + fallthrough 
+ case '\u2C65': // ⱥ [LATIN SMALL LETTER A WITH STROKE] + fallthrough + case '\u2C6F': // Ɐ [LATIN CAPITAL LETTER TURNED A] + fallthrough + case '\uFF41': // a [FULLWIDTH LATIN SMALL LETTER A] + output[outputPos] = 'a' + outputPos++ + + case '\uA732': // Ꜳ [LATIN CAPITAL LETTER AA] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'A' + outputPos++ + + case '\u00C6': // Æ [LATIN CAPITAL LETTER AE] + fallthrough + case '\u01E2': // Ǣ [LATIN CAPITAL LETTER AE WITH MACRON] + fallthrough + case '\u01FC': // Ǽ [LATIN CAPITAL LETTER AE WITH ACUTE] + fallthrough + case '\u1D01': // ᴁ [LATIN LETTER SMALL CAPITAL AE] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'E' + outputPos++ + + case '\uA734': // Ꜵ [LATIN CAPITAL LETTER AO] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'O' + outputPos++ + + case '\uA736': // Ꜷ [LATIN CAPITAL LETTER AU] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'U' + outputPos++ + + case '\uA738': // Ꜹ [LATIN CAPITAL LETTER AV] + fallthrough + case '\uA73A': // Ꜻ [LATIN CAPITAL LETTER AV WITH HORIZONTAL BAR] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'V' + outputPos++ + + case '\uA73C': // Ꜽ [LATIN CAPITAL LETTER AY] + output = output[:(len(output) + 1)] + output[outputPos] = 'A' + outputPos++ + output[outputPos] = 'Y' + outputPos++ + + case '\u249C': // ⒜ [PARENTHESIZED LATIN SMALL LETTER A] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'a' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\uA733': // ꜳ [LATIN SMALL LETTER AA] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'a' + outputPos++ + + case '\u00E6': // æ [LATIN SMALL LETTER AE] + fallthrough + case 
'\u01E3': // ǣ [LATIN SMALL LETTER AE WITH MACRON] + fallthrough + case '\u01FD': // ǽ [LATIN SMALL LETTER AE WITH ACUTE] + fallthrough + case '\u1D02': // ᴂ [LATIN SMALL LETTER TURNED AE] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'e' + outputPos++ + + case '\uA735': // ꜵ [LATIN SMALL LETTER AO] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'o' + outputPos++ + + case '\uA737': // ꜷ [LATIN SMALL LETTER AU] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'u' + outputPos++ + + case '\uA739': // ꜹ [LATIN SMALL LETTER AV] + fallthrough + case '\uA73B': // ꜻ [LATIN SMALL LETTER AV WITH HORIZONTAL BAR] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'v' + outputPos++ + + case '\uA73D': // ꜽ [LATIN SMALL LETTER AY] + output = output[:(len(output) + 1)] + output[outputPos] = 'a' + outputPos++ + output[outputPos] = 'y' + outputPos++ + + case '\u0181': // Ɓ [LATIN CAPITAL LETTER B WITH HOOK] + fallthrough + case '\u0182': // Ƃ [LATIN CAPITAL LETTER B WITH TOPBAR] + fallthrough + case '\u0243': // Ƀ [LATIN CAPITAL LETTER B WITH STROKE] + fallthrough + case '\u0299': // ʙ [LATIN LETTER SMALL CAPITAL B] + fallthrough + case '\u1D03': // ᴃ [LATIN LETTER SMALL CAPITAL BARRED B] + fallthrough + case '\u1E02': // Ḃ [LATIN CAPITAL LETTER B WITH DOT ABOVE] + fallthrough + case '\u1E04': // Ḅ [LATIN CAPITAL LETTER B WITH DOT BELOW] + fallthrough + case '\u1E06': // Ḇ [LATIN CAPITAL LETTER B WITH LINE BELOW] + fallthrough + case '\u24B7': // Ⓑ [CIRCLED LATIN CAPITAL LETTER B] + fallthrough + case '\uFF22': // B [FULLWIDTH LATIN CAPITAL LETTER B] + output[outputPos] = 'B' + outputPos++ + + case '\u0180': // ƀ [LATIN SMALL LETTER B WITH STROKE] + fallthrough + case '\u0183': // ƃ [LATIN SMALL LETTER B WITH TOPBAR] + fallthrough + case '\u0253': // ɓ [LATIN SMALL LETTER B WITH 
HOOK] + fallthrough + case '\u1D6C': // ᵬ [LATIN SMALL LETTER B WITH MIDDLE TILDE] + fallthrough + case '\u1D80': // ᶀ [LATIN SMALL LETTER B WITH PALATAL HOOK] + fallthrough + case '\u1E03': // ḃ [LATIN SMALL LETTER B WITH DOT ABOVE] + fallthrough + case '\u1E05': // ḅ [LATIN SMALL LETTER B WITH DOT BELOW] + fallthrough + case '\u1E07': // ḇ [LATIN SMALL LETTER B WITH LINE BELOW] + fallthrough + case '\u24D1': // ⓑ [CIRCLED LATIN SMALL LETTER B] + fallthrough + case '\uFF42': // b [FULLWIDTH LATIN SMALL LETTER B] + output[outputPos] = 'b' + outputPos++ + + case '\u249D': // ⒝ [PARENTHESIZED LATIN SMALL LETTER B] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'b' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00C7': // Ç [LATIN CAPITAL LETTER C WITH CEDILLA] + fallthrough + case '\u0106': // Ć [LATIN CAPITAL LETTER C WITH ACUTE] + fallthrough + case '\u0108': // Ĉ [LATIN CAPITAL LETTER C WITH CIRCUMFLEX] + fallthrough + case '\u010A': // Ċ [LATIN CAPITAL LETTER C WITH DOT ABOVE] + fallthrough + case '\u010C': // Č [LATIN CAPITAL LETTER C WITH CARON] + fallthrough + case '\u0187': // Ƈ [LATIN CAPITAL LETTER C WITH HOOK] + fallthrough + case '\u023B': // Ȼ [LATIN CAPITAL LETTER C WITH STROKE] + fallthrough + case '\u0297': // ʗ [LATIN LETTER STRETCHED C] + fallthrough + case '\u1D04': // ᴄ [LATIN LETTER SMALL CAPITAL C] + fallthrough + case '\u1E08': // Ḉ [LATIN CAPITAL LETTER C WITH CEDILLA AND ACUTE] + fallthrough + case '\u24B8': // Ⓒ [CIRCLED LATIN CAPITAL LETTER C] + fallthrough + case '\uFF23': // C [FULLWIDTH LATIN CAPITAL LETTER C] + output[outputPos] = 'C' + outputPos++ + + case '\u00E7': // ç [LATIN SMALL LETTER C WITH CEDILLA] + fallthrough + case '\u0107': // ć [LATIN SMALL LETTER C WITH ACUTE] + fallthrough + case '\u0109': // ĉ [LATIN SMALL LETTER C WITH CIRCUMFLEX] + fallthrough + case '\u010B': // ċ [LATIN SMALL LETTER C WITH DOT ABOVE] + fallthrough + case '\u010D': // č [LATIN 
SMALL LETTER C WITH CARON] + fallthrough + case '\u0188': // ƈ [LATIN SMALL LETTER C WITH HOOK] + fallthrough + case '\u023C': // ȼ [LATIN SMALL LETTER C WITH STROKE] + fallthrough + case '\u0255': // ɕ [LATIN SMALL LETTER C WITH CURL] + fallthrough + case '\u1E09': // ḉ [LATIN SMALL LETTER C WITH CEDILLA AND ACUTE] + fallthrough + case '\u2184': // ↄ [LATIN SMALL LETTER REVERSED C] + fallthrough + case '\u24D2': // ⓒ [CIRCLED LATIN SMALL LETTER C] + fallthrough + case '\uA73E': // Ꜿ [LATIN CAPITAL LETTER REVERSED C WITH DOT] + fallthrough + case '\uA73F': // ꜿ [LATIN SMALL LETTER REVERSED C WITH DOT] + fallthrough + case '\uFF43': // c [FULLWIDTH LATIN SMALL LETTER C] + output[outputPos] = 'c' + outputPos++ + + case '\u249E': // ⒞ [PARENTHESIZED LATIN SMALL LETTER C] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'c' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00D0': // Ð [LATIN CAPITAL LETTER ETH] + fallthrough + case '\u010E': // Ď [LATIN CAPITAL LETTER D WITH CARON] + fallthrough + case '\u0110': // Đ [LATIN CAPITAL LETTER D WITH STROKE] + fallthrough + case '\u0189': // Ɖ [LATIN CAPITAL LETTER AFRICAN D] + fallthrough + case '\u018A': // Ɗ [LATIN CAPITAL LETTER D WITH HOOK] + fallthrough + case '\u018B': // Ƌ [LATIN CAPITAL LETTER D WITH TOPBAR] + fallthrough + case '\u1D05': // ᴅ [LATIN LETTER SMALL CAPITAL D] + fallthrough + case '\u1D06': // ᴆ [LATIN LETTER SMALL CAPITAL ETH] + fallthrough + case '\u1E0A': // Ḋ [LATIN CAPITAL LETTER D WITH DOT ABOVE] + fallthrough + case '\u1E0C': // Ḍ [LATIN CAPITAL LETTER D WITH DOT BELOW] + fallthrough + case '\u1E0E': // Ḏ [LATIN CAPITAL LETTER D WITH LINE BELOW] + fallthrough + case '\u1E10': // Ḑ [LATIN CAPITAL LETTER D WITH CEDILLA] + fallthrough + case '\u1E12': // Ḓ [LATIN CAPITAL LETTER D WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24B9': // Ⓓ [CIRCLED LATIN CAPITAL LETTER D] + fallthrough + case '\uA779': // Ꝺ [LATIN CAPITAL LETTER 
INSULAR D] + fallthrough + case '\uFF24': // D [FULLWIDTH LATIN CAPITAL LETTER D] + output[outputPos] = 'D' + outputPos++ + + case '\u00F0': // ð [LATIN SMALL LETTER ETH] + fallthrough + case '\u010F': // ď [LATIN SMALL LETTER D WITH CARON] + fallthrough + case '\u0111': // đ [LATIN SMALL LETTER D WITH STROKE] + fallthrough + case '\u018C': // ƌ [LATIN SMALL LETTER D WITH TOPBAR] + fallthrough + case '\u0221': // ȡ [LATIN SMALL LETTER D WITH CURL] + fallthrough + case '\u0256': // ɖ [LATIN SMALL LETTER D WITH TAIL] + fallthrough + case '\u0257': // ɗ [LATIN SMALL LETTER D WITH HOOK] + fallthrough + case '\u1D6D': // ᵭ [LATIN SMALL LETTER D WITH MIDDLE TILDE] + fallthrough + case '\u1D81': // ᶁ [LATIN SMALL LETTER D WITH PALATAL HOOK] + fallthrough + case '\u1D91': // ᶑ [LATIN SMALL LETTER D WITH HOOK AND TAIL] + fallthrough + case '\u1E0B': // ḋ [LATIN SMALL LETTER D WITH DOT ABOVE] + fallthrough + case '\u1E0D': // ḍ [LATIN SMALL LETTER D WITH DOT BELOW] + fallthrough + case '\u1E0F': // ḏ [LATIN SMALL LETTER D WITH LINE BELOW] + fallthrough + case '\u1E11': // ḑ [LATIN SMALL LETTER D WITH CEDILLA] + fallthrough + case '\u1E13': // ḓ [LATIN SMALL LETTER D WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24D3': // ⓓ [CIRCLED LATIN SMALL LETTER D] + fallthrough + case '\uA77A': // ꝺ [LATIN SMALL LETTER INSULAR D] + fallthrough + case '\uFF44': // d [FULLWIDTH LATIN SMALL LETTER D] + output[outputPos] = 'd' + outputPos++ + + case '\u01C4': // DŽ [LATIN CAPITAL LETTER DZ WITH CARON] + fallthrough + case '\u01F1': // DZ [LATIN CAPITAL LETTER DZ] + output = output[:(len(output) + 1)] + output[outputPos] = 'D' + outputPos++ + output[outputPos] = 'Z' + outputPos++ + + case '\u01C5': // Dž [LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON] + fallthrough + case '\u01F2': // Dz [LATIN CAPITAL LETTER D WITH SMALL LETTER Z] + output = output[:(len(output) + 1)] + output[outputPos] = 'D' + outputPos++ + output[outputPos] = 'z' + outputPos++ + + case '\u249F': // ⒟ 
[PARENTHESIZED LATIN SMALL LETTER D] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'd' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0238': // ȸ [LATIN SMALL LETTER DB DIGRAPH] + output = output[:(len(output) + 1)] + output[outputPos] = 'd' + outputPos++ + output[outputPos] = 'b' + outputPos++ + + case '\u01C6': // dž [LATIN SMALL LETTER DZ WITH CARON] + fallthrough + case '\u01F3': // dz [LATIN SMALL LETTER DZ] + fallthrough + case '\u02A3': // ʣ [LATIN SMALL LETTER DZ DIGRAPH] + fallthrough + case '\u02A5': // ʥ [LATIN SMALL LETTER DZ DIGRAPH WITH CURL] + output = output[:(len(output) + 1)] + output[outputPos] = 'd' + outputPos++ + output[outputPos] = 'z' + outputPos++ + + case '\u00C8': // È [LATIN CAPITAL LETTER E WITH GRAVE] + fallthrough + case '\u00C9': // É [LATIN CAPITAL LETTER E WITH ACUTE] + fallthrough + case '\u00CA': // Ê [LATIN CAPITAL LETTER E WITH CIRCUMFLEX] + fallthrough + case '\u00CB': // Ë [LATIN CAPITAL LETTER E WITH DIAERESIS] + fallthrough + case '\u0112': // Ē [LATIN CAPITAL LETTER E WITH MACRON] + fallthrough + case '\u0114': // Ĕ [LATIN CAPITAL LETTER E WITH BREVE] + fallthrough + case '\u0116': // Ė [LATIN CAPITAL LETTER E WITH DOT ABOVE] + fallthrough + case '\u0118': // Ę [LATIN CAPITAL LETTER E WITH OGONEK] + fallthrough + case '\u011A': // Ě [LATIN CAPITAL LETTER E WITH CARON] + fallthrough + case '\u018E': // Ǝ [LATIN CAPITAL LETTER REVERSED E] + fallthrough + case '\u0190': // Ɛ [LATIN CAPITAL LETTER OPEN E] + fallthrough + case '\u0204': // Ȅ [LATIN CAPITAL LETTER E WITH DOUBLE GRAVE] + fallthrough + case '\u0206': // Ȇ [LATIN CAPITAL LETTER E WITH INVERTED BREVE] + fallthrough + case '\u0228': // Ȩ [LATIN CAPITAL LETTER E WITH CEDILLA] + fallthrough + case '\u0246': // Ɇ [LATIN CAPITAL LETTER E WITH STROKE] + fallthrough + case '\u1D07': // ᴇ [LATIN LETTER SMALL CAPITAL E] + fallthrough + case '\u1E14': // Ḕ [LATIN CAPITAL LETTER E WITH MACRON AND GRAVE] + 
fallthrough + case '\u1E16': // Ḗ [LATIN CAPITAL LETTER E WITH MACRON AND ACUTE] + fallthrough + case '\u1E18': // Ḙ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX BELOW] + fallthrough + case '\u1E1A': // Ḛ [LATIN CAPITAL LETTER E WITH TILDE BELOW] + fallthrough + case '\u1E1C': // Ḝ [LATIN CAPITAL LETTER E WITH CEDILLA AND BREVE] + fallthrough + case '\u1EB8': // Ẹ [LATIN CAPITAL LETTER E WITH DOT BELOW] + fallthrough + case '\u1EBA': // Ẻ [LATIN CAPITAL LETTER E WITH HOOK ABOVE] + fallthrough + case '\u1EBC': // Ẽ [LATIN CAPITAL LETTER E WITH TILDE] + fallthrough + case '\u1EBE': // Ế [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1EC0': // Ề [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1EC2': // Ể [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1EC4': // Ễ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND TILDE] + fallthrough + case '\u1EC6': // Ệ [LATIN CAPITAL LETTER E WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u24BA': // Ⓔ [CIRCLED LATIN CAPITAL LETTER E] + fallthrough + case '\u2C7B': // ⱻ [LATIN LETTER SMALL CAPITAL TURNED E] + fallthrough + case '\uFF25': // E [FULLWIDTH LATIN CAPITAL LETTER E] + output[outputPos] = 'E' + outputPos++ + + case '\u00E8': // è [LATIN SMALL LETTER E WITH GRAVE] + fallthrough + case '\u00E9': // é [LATIN SMALL LETTER E WITH ACUTE] + fallthrough + case '\u00EA': // ê [LATIN SMALL LETTER E WITH CIRCUMFLEX] + fallthrough + case '\u00EB': // ë [LATIN SMALL LETTER E WITH DIAERESIS] + fallthrough + case '\u0113': // ē [LATIN SMALL LETTER E WITH MACRON] + fallthrough + case '\u0115': // ĕ [LATIN SMALL LETTER E WITH BREVE] + fallthrough + case '\u0117': // ė [LATIN SMALL LETTER E WITH DOT ABOVE] + fallthrough + case '\u0119': // ę [LATIN SMALL LETTER E WITH OGONEK] + fallthrough + case '\u011B': // ě [LATIN SMALL LETTER E WITH CARON] + fallthrough + case '\u01DD': // ǝ [LATIN SMALL LETTER TURNED E] + fallthrough + case '\u0205': // ȅ [LATIN SMALL 
LETTER E WITH DOUBLE GRAVE] + fallthrough + case '\u0207': // ȇ [LATIN SMALL LETTER E WITH INVERTED BREVE] + fallthrough + case '\u0229': // ȩ [LATIN SMALL LETTER E WITH CEDILLA] + fallthrough + case '\u0247': // ɇ [LATIN SMALL LETTER E WITH STROKE] + fallthrough + case '\u0258': // ɘ [LATIN SMALL LETTER REVERSED E] + fallthrough + case '\u025B': // ɛ [LATIN SMALL LETTER OPEN E] + fallthrough + case '\u025C': // ɜ [LATIN SMALL LETTER REVERSED OPEN E] + fallthrough + case '\u025D': // ɝ [LATIN SMALL LETTER REVERSED OPEN E WITH HOOK] + fallthrough + case '\u025E': // ɞ [LATIN SMALL LETTER CLOSED REVERSED OPEN E] + fallthrough + case '\u029A': // ʚ [LATIN SMALL LETTER CLOSED OPEN E] + fallthrough + case '\u1D08': // ᴈ [LATIN SMALL LETTER TURNED OPEN E] + fallthrough + case '\u1D92': // ᶒ [LATIN SMALL LETTER E WITH RETROFLEX HOOK] + fallthrough + case '\u1D93': // ᶓ [LATIN SMALL LETTER OPEN E WITH RETROFLEX HOOK] + fallthrough + case '\u1D94': // ᶔ [LATIN SMALL LETTER REVERSED OPEN E WITH RETROFLEX HOOK] + fallthrough + case '\u1E15': // ḕ [LATIN SMALL LETTER E WITH MACRON AND GRAVE] + fallthrough + case '\u1E17': // ḗ [LATIN SMALL LETTER E WITH MACRON AND ACUTE] + fallthrough + case '\u1E19': // ḙ [LATIN SMALL LETTER E WITH CIRCUMFLEX BELOW] + fallthrough + case '\u1E1B': // ḛ [LATIN SMALL LETTER E WITH TILDE BELOW] + fallthrough + case '\u1E1D': // ḝ [LATIN SMALL LETTER E WITH CEDILLA AND BREVE] + fallthrough + case '\u1EB9': // ẹ [LATIN SMALL LETTER E WITH DOT BELOW] + fallthrough + case '\u1EBB': // ẻ [LATIN SMALL LETTER E WITH HOOK ABOVE] + fallthrough + case '\u1EBD': // ẽ [LATIN SMALL LETTER E WITH TILDE] + fallthrough + case '\u1EBF': // ế [LATIN SMALL LETTER E WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1EC1': // ề [LATIN SMALL LETTER E WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1EC3': // ể [LATIN SMALL LETTER E WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1EC5': // ễ [LATIN SMALL LETTER E WITH CIRCUMFLEX AND TILDE] + fallthrough + 
case '\u1EC7': // ệ [LATIN SMALL LETTER E WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u2091': // ₑ [LATIN SUBSCRIPT SMALL LETTER E] + fallthrough + case '\u24D4': // ⓔ [CIRCLED LATIN SMALL LETTER E] + fallthrough + case '\u2C78': // ⱸ [LATIN SMALL LETTER E WITH NOTCH] + fallthrough + case '\uFF45': // e [FULLWIDTH LATIN SMALL LETTER E] + output[outputPos] = 'e' + outputPos++ + + case '\u24A0': // ⒠ [PARENTHESIZED LATIN SMALL LETTER E] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'e' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0191': // Ƒ [LATIN CAPITAL LETTER F WITH HOOK] + fallthrough + case '\u1E1E': // Ḟ [LATIN CAPITAL LETTER F WITH DOT ABOVE] + fallthrough + case '\u24BB': // Ⓕ [CIRCLED LATIN CAPITAL LETTER F] + fallthrough + case '\uA730': // ꜰ [LATIN LETTER SMALL CAPITAL F] + fallthrough + case '\uA77B': // Ꝼ [LATIN CAPITAL LETTER INSULAR F] + fallthrough + case '\uA7FB': // ꟻ [LATIN EPIGRAPHIC LETTER REVERSED F] + fallthrough + case '\uFF26': // F [FULLWIDTH LATIN CAPITAL LETTER F] + output[outputPos] = 'F' + outputPos++ + + case '\u0192': // ƒ [LATIN SMALL LETTER F WITH HOOK] + fallthrough + case '\u1D6E': // ᵮ [LATIN SMALL LETTER F WITH MIDDLE TILDE] + fallthrough + case '\u1D82': // ᶂ [LATIN SMALL LETTER F WITH PALATAL HOOK] + fallthrough + case '\u1E1F': // ḟ [LATIN SMALL LETTER F WITH DOT ABOVE] + fallthrough + case '\u1E9B': // ẛ [LATIN SMALL LETTER LONG S WITH DOT ABOVE] + fallthrough + case '\u24D5': // ⓕ [CIRCLED LATIN SMALL LETTER F] + fallthrough + case '\uA77C': // ꝼ [LATIN SMALL LETTER INSULAR F] + fallthrough + case '\uFF46': // f [FULLWIDTH LATIN SMALL LETTER F] + output[outputPos] = 'f' + outputPos++ + + case '\u24A1': // ⒡ [PARENTHESIZED LATIN SMALL LETTER F] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'f' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\uFB00': // ff [LATIN SMALL 
LIGATURE FF] + output = output[:(len(output) + 1)] + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'f' + outputPos++ + + case '\uFB03': // ffi [LATIN SMALL LIGATURE FFI] + output = output[:(len(output) + 2)] + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'i' + outputPos++ + + case '\uFB04': // ffl [LATIN SMALL LIGATURE FFL] + output = output[:(len(output) + 2)] + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'l' + outputPos++ + + case '\uFB01': // fi [LATIN SMALL LIGATURE FI] + output = output[:(len(output) + 1)] + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'i' + outputPos++ + + case '\uFB02': // fl [LATIN SMALL LIGATURE FL] + output = output[:(len(output) + 1)] + output[outputPos] = 'f' + outputPos++ + output[outputPos] = 'l' + outputPos++ + + case '\u011C': // Ĝ [LATIN CAPITAL LETTER G WITH CIRCUMFLEX] + fallthrough + case '\u011E': // Ğ [LATIN CAPITAL LETTER G WITH BREVE] + fallthrough + case '\u0120': // Ġ [LATIN CAPITAL LETTER G WITH DOT ABOVE] + fallthrough + case '\u0122': // Ģ [LATIN CAPITAL LETTER G WITH CEDILLA] + fallthrough + case '\u0193': // Ɠ [LATIN CAPITAL LETTER G WITH HOOK] + fallthrough + case '\u01E4': // Ǥ [LATIN CAPITAL LETTER G WITH STROKE] + fallthrough + case '\u01E5': // ǥ [LATIN SMALL LETTER G WITH STROKE] + fallthrough + case '\u01E6': // Ǧ [LATIN CAPITAL LETTER G WITH CARON] + fallthrough + case '\u01E7': // ǧ [LATIN SMALL LETTER G WITH CARON] + fallthrough + case '\u01F4': // Ǵ [LATIN CAPITAL LETTER G WITH ACUTE] + fallthrough + case '\u0262': // ɢ [LATIN LETTER SMALL CAPITAL G] + fallthrough + case '\u029B': // ʛ [LATIN LETTER SMALL CAPITAL G WITH HOOK] + fallthrough + case '\u1E20': // Ḡ [LATIN CAPITAL LETTER G WITH MACRON] + fallthrough + case '\u24BC': // Ⓖ [CIRCLED LATIN CAPITAL LETTER G] + fallthrough + case '\uA77D': // Ᵹ [LATIN CAPITAL LETTER INSULAR G] + fallthrough + case '\uA77E': // Ꝿ 
[LATIN CAPITAL LETTER TURNED INSULAR G] + fallthrough + case '\uFF27': // G [FULLWIDTH LATIN CAPITAL LETTER G] + output[outputPos] = 'G' + outputPos++ + + case '\u011D': // ĝ [LATIN SMALL LETTER G WITH CIRCUMFLEX] + fallthrough + case '\u011F': // ğ [LATIN SMALL LETTER G WITH BREVE] + fallthrough + case '\u0121': // ġ [LATIN SMALL LETTER G WITH DOT ABOVE] + fallthrough + case '\u0123': // ģ [LATIN SMALL LETTER G WITH CEDILLA] + fallthrough + case '\u01F5': // ǵ [LATIN SMALL LETTER G WITH ACUTE] + fallthrough + case '\u0260': // ɠ [LATIN SMALL LETTER G WITH HOOK] + fallthrough + case '\u0261': // ɡ [LATIN SMALL LETTER SCRIPT G] + fallthrough + case '\u1D77': // ᵷ [LATIN SMALL LETTER TURNED G] + fallthrough + case '\u1D79': // ᵹ [LATIN SMALL LETTER INSULAR G] + fallthrough + case '\u1D83': // ᶃ [LATIN SMALL LETTER G WITH PALATAL HOOK] + fallthrough + case '\u1E21': // ḡ [LATIN SMALL LETTER G WITH MACRON] + fallthrough + case '\u24D6': // ⓖ [CIRCLED LATIN SMALL LETTER G] + fallthrough + case '\uA77F': // ꝿ [LATIN SMALL LETTER TURNED INSULAR G] + fallthrough + case '\uFF47': // g [FULLWIDTH LATIN SMALL LETTER G] + output[outputPos] = 'g' + outputPos++ + + case '\u24A2': // ⒢ [PARENTHESIZED LATIN SMALL LETTER G] + output = output[:(len(output) + 1)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'g' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0124': // Ĥ [LATIN CAPITAL LETTER H WITH CIRCUMFLEX] + fallthrough + case '\u0126': // Ħ [LATIN CAPITAL LETTER H WITH STROKE] + fallthrough + case '\u021E': // Ȟ [LATIN CAPITAL LETTER H WITH CARON] + fallthrough + case '\u029C': // ʜ [LATIN LETTER SMALL CAPITAL H] + fallthrough + case '\u1E22': // Ḣ [LATIN CAPITAL LETTER H WITH DOT ABOVE] + fallthrough + case '\u1E24': // Ḥ [LATIN CAPITAL LETTER H WITH DOT BELOW] + fallthrough + case '\u1E26': // Ḧ [LATIN CAPITAL LETTER H WITH DIAERESIS] + fallthrough + case '\u1E28': // Ḩ [LATIN CAPITAL LETTER H WITH CEDILLA] + fallthrough + case '\u1E2A': // Ḫ 
[LATIN CAPITAL LETTER H WITH BREVE BELOW] + fallthrough + case '\u24BD': // Ⓗ [CIRCLED LATIN CAPITAL LETTER H] + fallthrough + case '\u2C67': // Ⱨ [LATIN CAPITAL LETTER H WITH DESCENDER] + fallthrough + case '\u2C75': // Ⱶ [LATIN CAPITAL LETTER HALF H] + fallthrough + case '\uFF28': // H [FULLWIDTH LATIN CAPITAL LETTER H] + output[outputPos] = 'H' + outputPos++ + + case '\u0125': // ĥ [LATIN SMALL LETTER H WITH CIRCUMFLEX] + fallthrough + case '\u0127': // ħ [LATIN SMALL LETTER H WITH STROKE] + fallthrough + case '\u021F': // ȟ [LATIN SMALL LETTER H WITH CARON] + fallthrough + case '\u0265': // ɥ [LATIN SMALL LETTER TURNED H] + fallthrough + case '\u0266': // ɦ [LATIN SMALL LETTER H WITH HOOK] + fallthrough + case '\u02AE': // ʮ [LATIN SMALL LETTER TURNED H WITH FISHHOOK] + fallthrough + case '\u02AF': // ʯ [LATIN SMALL LETTER TURNED H WITH FISHHOOK AND TAIL] + fallthrough + case '\u1E23': // ḣ [LATIN SMALL LETTER H WITH DOT ABOVE] + fallthrough + case '\u1E25': // ḥ [LATIN SMALL LETTER H WITH DOT BELOW] + fallthrough + case '\u1E27': // ḧ [LATIN SMALL LETTER H WITH DIAERESIS] + fallthrough + case '\u1E29': // ḩ [LATIN SMALL LETTER H WITH CEDILLA] + fallthrough + case '\u1E2B': // ḫ [LATIN SMALL LETTER H WITH BREVE BELOW] + fallthrough + case '\u1E96': // ẖ [LATIN SMALL LETTER H WITH LINE BELOW] + fallthrough + case '\u24D7': // ⓗ [CIRCLED LATIN SMALL LETTER H] + fallthrough + case '\u2C68': // ⱨ [LATIN SMALL LETTER H WITH DESCENDER] + fallthrough + case '\u2C76': // ⱶ [LATIN SMALL LETTER HALF H] + fallthrough + case '\uFF48': // h [FULLWIDTH LATIN SMALL LETTER H] + output[outputPos] = 'h' + outputPos++ + + case '\u01F6': // Ƕ http://en.wikipedia.org/wiki/Hwair [LATIN CAPITAL LETTER HWAIR] + output = output[:(len(output) + 1)] + output[outputPos] = 'H' + outputPos++ + output[outputPos] = 'V' + outputPos++ + + case '\u24A3': // ⒣ [PARENTHESIZED LATIN SMALL LETTER H] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 
'h' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0195': // ƕ [LATIN SMALL LETTER HV] + output = output[:(len(output) + 1)] + output[outputPos] = 'h' + outputPos++ + output[outputPos] = 'v' + outputPos++ + + case '\u00CC': // Ì [LATIN CAPITAL LETTER I WITH GRAVE] + fallthrough + case '\u00CD': // Í [LATIN CAPITAL LETTER I WITH ACUTE] + fallthrough + case '\u00CE': // Î [LATIN CAPITAL LETTER I WITH CIRCUMFLEX] + fallthrough + case '\u00CF': // Ï [LATIN CAPITAL LETTER I WITH DIAERESIS] + fallthrough + case '\u0128': // Ĩ [LATIN CAPITAL LETTER I WITH TILDE] + fallthrough + case '\u012A': // Ī [LATIN CAPITAL LETTER I WITH MACRON] + fallthrough + case '\u012C': // Ĭ [LATIN CAPITAL LETTER I WITH BREVE] + fallthrough + case '\u012E': // Į [LATIN CAPITAL LETTER I WITH OGONEK] + fallthrough + case '\u0130': // İ [LATIN CAPITAL LETTER I WITH DOT ABOVE] + fallthrough + case '\u0196': // Ɩ [LATIN CAPITAL LETTER IOTA] + fallthrough + case '\u0197': // Ɨ [LATIN CAPITAL LETTER I WITH STROKE] + fallthrough + case '\u01CF': // Ǐ [LATIN CAPITAL LETTER I WITH CARON] + fallthrough + case '\u0208': // Ȉ [LATIN CAPITAL LETTER I WITH DOUBLE GRAVE] + fallthrough + case '\u020A': // Ȋ [LATIN CAPITAL LETTER I WITH INVERTED BREVE] + fallthrough + case '\u026A': // ɪ [LATIN LETTER SMALL CAPITAL I] + fallthrough + case '\u1D7B': // ᵻ [LATIN SMALL CAPITAL LETTER I WITH STROKE] + fallthrough + case '\u1E2C': // Ḭ [LATIN CAPITAL LETTER I WITH TILDE BELOW] + fallthrough + case '\u1E2E': // Ḯ [LATIN CAPITAL LETTER I WITH DIAERESIS AND ACUTE] + fallthrough + case '\u1EC8': // Ỉ [LATIN CAPITAL LETTER I WITH HOOK ABOVE] + fallthrough + case '\u1ECA': // Ị [LATIN CAPITAL LETTER I WITH DOT BELOW] + fallthrough + case '\u24BE': // Ⓘ [CIRCLED LATIN CAPITAL LETTER I] + fallthrough + case '\uA7FE': // ꟾ [LATIN EPIGRAPHIC LETTER I LONGA] + fallthrough + case '\uFF29': // I [FULLWIDTH LATIN CAPITAL LETTER I] + output[outputPos] = 'I' + outputPos++ + + case '\u00EC': // ì [LATIN SMALL 
LETTER I WITH GRAVE] + fallthrough + case '\u00ED': // í [LATIN SMALL LETTER I WITH ACUTE] + fallthrough + case '\u00EE': // î [LATIN SMALL LETTER I WITH CIRCUMFLEX] + fallthrough + case '\u00EF': // ï [LATIN SMALL LETTER I WITH DIAERESIS] + fallthrough + case '\u0129': // ĩ [LATIN SMALL LETTER I WITH TILDE] + fallthrough + case '\u012B': // ī [LATIN SMALL LETTER I WITH MACRON] + fallthrough + case '\u012D': // ĭ [LATIN SMALL LETTER I WITH BREVE] + fallthrough + case '\u012F': // į [LATIN SMALL LETTER I WITH OGONEK] + fallthrough + case '\u0131': // ı [LATIN SMALL LETTER DOTLESS I] + fallthrough + case '\u01D0': // ǐ [LATIN SMALL LETTER I WITH CARON] + fallthrough + case '\u0209': // ȉ [LATIN SMALL LETTER I WITH DOUBLE GRAVE] + fallthrough + case '\u020B': // ȋ [LATIN SMALL LETTER I WITH INVERTED BREVE] + fallthrough + case '\u0268': // ɨ [LATIN SMALL LETTER I WITH STROKE] + fallthrough + case '\u1D09': // ᴉ [LATIN SMALL LETTER TURNED I] + fallthrough + case '\u1D62': // ᵢ [LATIN SUBSCRIPT SMALL LETTER I] + fallthrough + case '\u1D7C': // ᵼ [LATIN SMALL LETTER IOTA WITH STROKE] + fallthrough + case '\u1D96': // ᶖ [LATIN SMALL LETTER I WITH RETROFLEX HOOK] + fallthrough + case '\u1E2D': // ḭ [LATIN SMALL LETTER I WITH TILDE BELOW] + fallthrough + case '\u1E2F': // ḯ [LATIN SMALL LETTER I WITH DIAERESIS AND ACUTE] + fallthrough + case '\u1EC9': // ỉ [LATIN SMALL LETTER I WITH HOOK ABOVE] + fallthrough + case '\u1ECB': // ị [LATIN SMALL LETTER I WITH DOT BELOW] + fallthrough + case '\u2071': // ⁱ [SUPERSCRIPT LATIN SMALL LETTER I] + fallthrough + case '\u24D8': // ⓘ [CIRCLED LATIN SMALL LETTER I] + fallthrough + case '\uFF49': // i [FULLWIDTH LATIN SMALL LETTER I] + output[outputPos] = 'i' + outputPos++ + + case '\u0132': // IJ [LATIN CAPITAL LIGATURE IJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'I' + outputPos++ + output[outputPos] = 'J' + outputPos++ + + case '\u24A4': // ⒤ [PARENTHESIZED LATIN SMALL LETTER I] + output = output[:(len(output) + 2)] 
+ output[outputPos] = '(' + outputPos++ + output[outputPos] = 'i' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0133': // ij [LATIN SMALL LIGATURE IJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'i' + outputPos++ + output[outputPos] = 'j' + outputPos++ + + case '\u0134': // Ĵ [LATIN CAPITAL LETTER J WITH CIRCUMFLEX] + fallthrough + case '\u0248': // Ɉ [LATIN CAPITAL LETTER J WITH STROKE] + fallthrough + case '\u1D0A': // ᴊ [LATIN LETTER SMALL CAPITAL J] + fallthrough + case '\u24BF': // Ⓙ [CIRCLED LATIN CAPITAL LETTER J] + fallthrough + case '\uFF2A': // J [FULLWIDTH LATIN CAPITAL LETTER J] + output[outputPos] = 'J' + outputPos++ + + case '\u0135': // ĵ [LATIN SMALL LETTER J WITH CIRCUMFLEX] + fallthrough + case '\u01F0': // ǰ [LATIN SMALL LETTER J WITH CARON] + fallthrough + case '\u0237': // ȷ [LATIN SMALL LETTER DOTLESS J] + fallthrough + case '\u0249': // ɉ [LATIN SMALL LETTER J WITH STROKE] + fallthrough + case '\u025F': // ɟ [LATIN SMALL LETTER DOTLESS J WITH STROKE] + fallthrough + case '\u0284': // ʄ [LATIN SMALL LETTER DOTLESS J WITH STROKE AND HOOK] + fallthrough + case '\u029D': // ʝ [LATIN SMALL LETTER J WITH CROSSED-TAIL] + fallthrough + case '\u24D9': // ⓙ [CIRCLED LATIN SMALL LETTER J] + fallthrough + case '\u2C7C': // ⱼ [LATIN SUBSCRIPT SMALL LETTER J] + fallthrough + case '\uFF4A': // j [FULLWIDTH LATIN SMALL LETTER J] + output[outputPos] = 'j' + outputPos++ + + case '\u24A5': // ⒥ [PARENTHESIZED LATIN SMALL LETTER J] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'j' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0136': // Ķ [LATIN CAPITAL LETTER K WITH CEDILLA] + fallthrough + case '\u0198': // Ƙ [LATIN CAPITAL LETTER K WITH HOOK] + fallthrough + case '\u01E8': // Ǩ [LATIN CAPITAL LETTER K WITH CARON] + fallthrough + case '\u1D0B': // ᴋ [LATIN LETTER SMALL CAPITAL K] + fallthrough + case '\u1E30': // Ḱ [LATIN CAPITAL LETTER K WITH ACUTE] + 
fallthrough + case '\u1E32': // Ḳ [LATIN CAPITAL LETTER K WITH DOT BELOW] + fallthrough + case '\u1E34': // Ḵ [LATIN CAPITAL LETTER K WITH LINE BELOW] + fallthrough + case '\u24C0': // Ⓚ [CIRCLED LATIN CAPITAL LETTER K] + fallthrough + case '\u2C69': // Ⱪ [LATIN CAPITAL LETTER K WITH DESCENDER] + fallthrough + case '\uA740': // Ꝁ [LATIN CAPITAL LETTER K WITH STROKE] + fallthrough + case '\uA742': // Ꝃ [LATIN CAPITAL LETTER K WITH DIAGONAL STROKE] + fallthrough + case '\uA744': // Ꝅ [LATIN CAPITAL LETTER K WITH STROKE AND DIAGONAL STROKE] + fallthrough + case '\uFF2B': // K [FULLWIDTH LATIN CAPITAL LETTER K] + output[outputPos] = 'K' + outputPos++ + + case '\u0137': // ķ [LATIN SMALL LETTER K WITH CEDILLA] + fallthrough + case '\u0199': // ƙ [LATIN SMALL LETTER K WITH HOOK] + fallthrough + case '\u01E9': // ǩ [LATIN SMALL LETTER K WITH CARON] + fallthrough + case '\u029E': // ʞ [LATIN SMALL LETTER TURNED K] + fallthrough + case '\u1D84': // ᶄ [LATIN SMALL LETTER K WITH PALATAL HOOK] + fallthrough + case '\u1E31': // ḱ [LATIN SMALL LETTER K WITH ACUTE] + fallthrough + case '\u1E33': // ḳ [LATIN SMALL LETTER K WITH DOT BELOW] + fallthrough + case '\u1E35': // ḵ [LATIN SMALL LETTER K WITH LINE BELOW] + fallthrough + case '\u24DA': // ⓚ [CIRCLED LATIN SMALL LETTER K] + fallthrough + case '\u2C6A': // ⱪ [LATIN SMALL LETTER K WITH DESCENDER] + fallthrough + case '\uA741': // ꝁ [LATIN SMALL LETTER K WITH STROKE] + fallthrough + case '\uA743': // ꝃ [LATIN SMALL LETTER K WITH DIAGONAL STROKE] + fallthrough + case '\uA745': // ꝅ [LATIN SMALL LETTER K WITH STROKE AND DIAGONAL STROKE] + fallthrough + case '\uFF4B': // k [FULLWIDTH LATIN SMALL LETTER K] + output[outputPos] = 'k' + outputPos++ + + case '\u24A6': // ⒦ [PARENTHESIZED LATIN SMALL LETTER K] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'k' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0139': // Ĺ [LATIN CAPITAL LETTER L WITH ACUTE] + 
fallthrough + case '\u013B': // Ļ [LATIN CAPITAL LETTER L WITH CEDILLA] + fallthrough + case '\u013D': // Ľ [LATIN CAPITAL LETTER L WITH CARON] + fallthrough + case '\u013F': // Ŀ [LATIN CAPITAL LETTER L WITH MIDDLE DOT] + fallthrough + case '\u0141': // Ł [LATIN CAPITAL LETTER L WITH STROKE] + fallthrough + case '\u023D': // Ƚ [LATIN CAPITAL LETTER L WITH BAR] + fallthrough + case '\u029F': // ʟ [LATIN LETTER SMALL CAPITAL L] + fallthrough + case '\u1D0C': // ᴌ [LATIN LETTER SMALL CAPITAL L WITH STROKE] + fallthrough + case '\u1E36': // Ḷ [LATIN CAPITAL LETTER L WITH DOT BELOW] + fallthrough + case '\u1E38': // Ḹ [LATIN CAPITAL LETTER L WITH DOT BELOW AND MACRON] + fallthrough + case '\u1E3A': // Ḻ [LATIN CAPITAL LETTER L WITH LINE BELOW] + fallthrough + case '\u1E3C': // Ḽ [LATIN CAPITAL LETTER L WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24C1': // Ⓛ [CIRCLED LATIN CAPITAL LETTER L] + fallthrough + case '\u2C60': // Ⱡ [LATIN CAPITAL LETTER L WITH DOUBLE BAR] + fallthrough + case '\u2C62': // Ɫ [LATIN CAPITAL LETTER L WITH MIDDLE TILDE] + fallthrough + case '\uA746': // Ꝇ [LATIN CAPITAL LETTER BROKEN L] + fallthrough + case '\uA748': // Ꝉ [LATIN CAPITAL LETTER L WITH HIGH STROKE] + fallthrough + case '\uA780': // Ꞁ [LATIN CAPITAL LETTER TURNED L] + fallthrough + case '\uFF2C': // L [FULLWIDTH LATIN CAPITAL LETTER L] + output[outputPos] = 'L' + outputPos++ + + case '\u013A': // ĺ [LATIN SMALL LETTER L WITH ACUTE] + fallthrough + case '\u013C': // ļ [LATIN SMALL LETTER L WITH CEDILLA] + fallthrough + case '\u013E': // ľ [LATIN SMALL LETTER L WITH CARON] + fallthrough + case '\u0140': // ŀ [LATIN SMALL LETTER L WITH MIDDLE DOT] + fallthrough + case '\u0142': // ł [LATIN SMALL LETTER L WITH STROKE] + fallthrough + case '\u019A': // ƚ [LATIN SMALL LETTER L WITH BAR] + fallthrough + case '\u0234': // ȴ [LATIN SMALL LETTER L WITH CURL] + fallthrough + case '\u026B': // ɫ [LATIN SMALL LETTER L WITH MIDDLE TILDE] + fallthrough + case '\u026C': // ɬ [LATIN SMALL LETTER 
L WITH BELT] + fallthrough + case '\u026D': // ɭ [LATIN SMALL LETTER L WITH RETROFLEX HOOK] + fallthrough + case '\u1D85': // ᶅ [LATIN SMALL LETTER L WITH PALATAL HOOK] + fallthrough + case '\u1E37': // ḷ [LATIN SMALL LETTER L WITH DOT BELOW] + fallthrough + case '\u1E39': // ḹ [LATIN SMALL LETTER L WITH DOT BELOW AND MACRON] + fallthrough + case '\u1E3B': // ḻ [LATIN SMALL LETTER L WITH LINE BELOW] + fallthrough + case '\u1E3D': // ḽ [LATIN SMALL LETTER L WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24DB': // ⓛ [CIRCLED LATIN SMALL LETTER L] + fallthrough + case '\u2C61': // ⱡ [LATIN SMALL LETTER L WITH DOUBLE BAR] + fallthrough + case '\uA747': // ꝇ [LATIN SMALL LETTER BROKEN L] + fallthrough + case '\uA749': // ꝉ [LATIN SMALL LETTER L WITH HIGH STROKE] + fallthrough + case '\uA781': // ꞁ [LATIN SMALL LETTER TURNED L] + fallthrough + case '\uFF4C': // l [FULLWIDTH LATIN SMALL LETTER L] + output[outputPos] = 'l' + outputPos++ + + case '\u01C7': // LJ [LATIN CAPITAL LETTER LJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'L' + outputPos++ + output[outputPos] = 'J' + outputPos++ + + case '\u1EFA': // Ỻ [LATIN CAPITAL LETTER MIDDLE-WELSH LL] + output = output[:(len(output) + 1)] + output[outputPos] = 'L' + outputPos++ + output[outputPos] = 'L' + outputPos++ + + case '\u01C8': // Lj [LATIN CAPITAL LETTER L WITH SMALL LETTER J] + output = output[:(len(output) + 1)] + output[outputPos] = 'L' + outputPos++ + output[outputPos] = 'j' + outputPos++ + + case '\u24A7': // ⒧ [PARENTHESIZED LATIN SMALL LETTER L] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'l' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u01C9': // lj [LATIN SMALL LETTER LJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'l' + outputPos++ + output[outputPos] = 'j' + outputPos++ + + case '\u1EFB': // ỻ [LATIN SMALL LETTER MIDDLE-WELSH LL] + output = output[:(len(output) + 1)] + output[outputPos] = 'l' + 
outputPos++ + output[outputPos] = 'l' + outputPos++ + + case '\u02AA': // ʪ [LATIN SMALL LETTER LS DIGRAPH] + output = output[:(len(output) + 1)] + output[outputPos] = 'l' + outputPos++ + output[outputPos] = 's' + outputPos++ + + case '\u02AB': // ʫ [LATIN SMALL LETTER LZ DIGRAPH] + output = output[:(len(output) + 1)] + output[outputPos] = 'l' + outputPos++ + output[outputPos] = 'z' + outputPos++ + + case '\u019C': // Ɯ [LATIN CAPITAL LETTER TURNED M] + fallthrough + case '\u1D0D': // ᴍ [LATIN LETTER SMALL CAPITAL M] + fallthrough + case '\u1E3E': // Ḿ [LATIN CAPITAL LETTER M WITH ACUTE] + fallthrough + case '\u1E40': // Ṁ [LATIN CAPITAL LETTER M WITH DOT ABOVE] + fallthrough + case '\u1E42': // Ṃ [LATIN CAPITAL LETTER M WITH DOT BELOW] + fallthrough + case '\u24C2': // Ⓜ [CIRCLED LATIN CAPITAL LETTER M] + fallthrough + case '\u2C6E': // Ɱ [LATIN CAPITAL LETTER M WITH HOOK] + fallthrough + case '\uA7FD': // ꟽ [LATIN EPIGRAPHIC LETTER INVERTED M] + fallthrough + case '\uA7FF': // ꟿ [LATIN EPIGRAPHIC LETTER ARCHAIC M] + fallthrough + case '\uFF2D': // M [FULLWIDTH LATIN CAPITAL LETTER M] + output[outputPos] = 'M' + outputPos++ + + case '\u026F': // ɯ [LATIN SMALL LETTER TURNED M] + fallthrough + case '\u0270': // ɰ [LATIN SMALL LETTER TURNED M WITH LONG LEG] + fallthrough + case '\u0271': // ɱ [LATIN SMALL LETTER M WITH HOOK] + fallthrough + case '\u1D6F': // ᵯ [LATIN SMALL LETTER M WITH MIDDLE TILDE] + fallthrough + case '\u1D86': // ᶆ [LATIN SMALL LETTER M WITH PALATAL HOOK] + fallthrough + case '\u1E3F': // ḿ [LATIN SMALL LETTER M WITH ACUTE] + fallthrough + case '\u1E41': // ṁ [LATIN SMALL LETTER M WITH DOT ABOVE] + fallthrough + case '\u1E43': // ṃ [LATIN SMALL LETTER M WITH DOT BELOW] + fallthrough + case '\u24DC': // ⓜ [CIRCLED LATIN SMALL LETTER M] + fallthrough + case '\uFF4D': // m [FULLWIDTH LATIN SMALL LETTER M] + output[outputPos] = 'm' + outputPos++ + + case '\u24A8': // ⒨ [PARENTHESIZED LATIN SMALL LETTER M] + output = output[:(len(output) + 2)] + 
output[outputPos] = '(' + outputPos++ + output[outputPos] = 'm' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00D1': // Ñ [LATIN CAPITAL LETTER N WITH TILDE] + fallthrough + case '\u0143': // Ń [LATIN CAPITAL LETTER N WITH ACUTE] + fallthrough + case '\u0145': // Ņ [LATIN CAPITAL LETTER N WITH CEDILLA] + fallthrough + case '\u0147': // Ň [LATIN CAPITAL LETTER N WITH CARON] + fallthrough + case '\u014A': // Ŋ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN CAPITAL LETTER ENG] + fallthrough + case '\u019D': // Ɲ [LATIN CAPITAL LETTER N WITH LEFT HOOK] + fallthrough + case '\u01F8': // Ǹ [LATIN CAPITAL LETTER N WITH GRAVE] + fallthrough + case '\u0220': // Ƞ [LATIN CAPITAL LETTER N WITH LONG RIGHT LEG] + fallthrough + case '\u0274': // ɴ [LATIN LETTER SMALL CAPITAL N] + fallthrough + case '\u1D0E': // ᴎ [LATIN LETTER SMALL CAPITAL REVERSED N] + fallthrough + case '\u1E44': // Ṅ [LATIN CAPITAL LETTER N WITH DOT ABOVE] + fallthrough + case '\u1E46': // Ṇ [LATIN CAPITAL LETTER N WITH DOT BELOW] + fallthrough + case '\u1E48': // Ṉ [LATIN CAPITAL LETTER N WITH LINE BELOW] + fallthrough + case '\u1E4A': // Ṋ [LATIN CAPITAL LETTER N WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24C3': // Ⓝ [CIRCLED LATIN CAPITAL LETTER N] + fallthrough + case '\uFF2E': // N [FULLWIDTH LATIN CAPITAL LETTER N] + output[outputPos] = 'N' + outputPos++ + + case '\u00F1': // ñ [LATIN SMALL LETTER N WITH TILDE] + fallthrough + case '\u0144': // ń [LATIN SMALL LETTER N WITH ACUTE] + fallthrough + case '\u0146': // ņ [LATIN SMALL LETTER N WITH CEDILLA] + fallthrough + case '\u0148': // ň [LATIN SMALL LETTER N WITH CARON] + fallthrough + case '\u0149': // ʼn [LATIN SMALL LETTER N PRECEDED BY APOSTROPHE] + fallthrough + case '\u014B': // ŋ http://en.wikipedia.org/wiki/Eng_(letter) [LATIN SMALL LETTER ENG] + fallthrough + case '\u019E': // ƞ [LATIN SMALL LETTER N WITH LONG RIGHT LEG] + fallthrough + case '\u01F9': // ǹ [LATIN SMALL LETTER N WITH GRAVE] + fallthrough + case 
'\u0235': // ȵ [LATIN SMALL LETTER N WITH CURL] + fallthrough + case '\u0272': // ɲ [LATIN SMALL LETTER N WITH LEFT HOOK] + fallthrough + case '\u0273': // ɳ [LATIN SMALL LETTER N WITH RETROFLEX HOOK] + fallthrough + case '\u1D70': // ᵰ [LATIN SMALL LETTER N WITH MIDDLE TILDE] + fallthrough + case '\u1D87': // ᶇ [LATIN SMALL LETTER N WITH PALATAL HOOK] + fallthrough + case '\u1E45': // ṅ [LATIN SMALL LETTER N WITH DOT ABOVE] + fallthrough + case '\u1E47': // ṇ [LATIN SMALL LETTER N WITH DOT BELOW] + fallthrough + case '\u1E49': // ṉ [LATIN SMALL LETTER N WITH LINE BELOW] + fallthrough + case '\u1E4B': // ṋ [LATIN SMALL LETTER N WITH CIRCUMFLEX BELOW] + fallthrough + case '\u207F': // ⁿ [SUPERSCRIPT LATIN SMALL LETTER N] + fallthrough + case '\u24DD': // ⓝ [CIRCLED LATIN SMALL LETTER N] + fallthrough + case '\uFF4E': // n [FULLWIDTH LATIN SMALL LETTER N] + output[outputPos] = 'n' + outputPos++ + + case '\u01CA': // NJ [LATIN CAPITAL LETTER NJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'N' + outputPos++ + output[outputPos] = 'J' + outputPos++ + + case '\u01CB': // Nj [LATIN CAPITAL LETTER N WITH SMALL LETTER J] + output = output[:(len(output) + 1)] + output[outputPos] = 'N' + outputPos++ + output[outputPos] = 'j' + outputPos++ + + case '\u24A9': // ⒩ [PARENTHESIZED LATIN SMALL LETTER N] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'n' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u01CC': // nj [LATIN SMALL LETTER NJ] + output = output[:(len(output) + 1)] + output[outputPos] = 'n' + outputPos++ + output[outputPos] = 'j' + outputPos++ + + case '\u00D2': // Ò [LATIN CAPITAL LETTER O WITH GRAVE] + fallthrough + case '\u00D3': // Ó [LATIN CAPITAL LETTER O WITH ACUTE] + fallthrough + case '\u00D4': // Ô [LATIN CAPITAL LETTER O WITH CIRCUMFLEX] + fallthrough + case '\u00D5': // Õ [LATIN CAPITAL LETTER O WITH TILDE] + fallthrough + case '\u00D6': // Ö [LATIN CAPITAL LETTER O WITH 
DIAERESIS] + fallthrough + case '\u00D8': // Ø [LATIN CAPITAL LETTER O WITH STROKE] + fallthrough + case '\u014C': // Ō [LATIN CAPITAL LETTER O WITH MACRON] + fallthrough + case '\u014E': // Ŏ [LATIN CAPITAL LETTER O WITH BREVE] + fallthrough + case '\u0150': // Ő [LATIN CAPITAL LETTER O WITH DOUBLE ACUTE] + fallthrough + case '\u0186': // Ɔ [LATIN CAPITAL LETTER OPEN O] + fallthrough + case '\u019F': // Ɵ [LATIN CAPITAL LETTER O WITH MIDDLE TILDE] + fallthrough + case '\u01A0': // Ơ [LATIN CAPITAL LETTER O WITH HORN] + fallthrough + case '\u01D1': // Ǒ [LATIN CAPITAL LETTER O WITH CARON] + fallthrough + case '\u01EA': // Ǫ [LATIN CAPITAL LETTER O WITH OGONEK] + fallthrough + case '\u01EC': // Ǭ [LATIN CAPITAL LETTER O WITH OGONEK AND MACRON] + fallthrough + case '\u01FE': // Ǿ [LATIN CAPITAL LETTER O WITH STROKE AND ACUTE] + fallthrough + case '\u020C': // Ȍ [LATIN CAPITAL LETTER O WITH DOUBLE GRAVE] + fallthrough + case '\u020E': // Ȏ [LATIN CAPITAL LETTER O WITH INVERTED BREVE] + fallthrough + case '\u022A': // Ȫ [LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON] + fallthrough + case '\u022C': // Ȭ [LATIN CAPITAL LETTER O WITH TILDE AND MACRON] + fallthrough + case '\u022E': // Ȯ [LATIN CAPITAL LETTER O WITH DOT ABOVE] + fallthrough + case '\u0230': // Ȱ [LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON] + fallthrough + case '\u1D0F': // ᴏ [LATIN LETTER SMALL CAPITAL O] + fallthrough + case '\u1D10': // ᴐ [LATIN LETTER SMALL CAPITAL OPEN O] + fallthrough + case '\u1E4C': // Ṍ [LATIN CAPITAL LETTER O WITH TILDE AND ACUTE] + fallthrough + case '\u1E4E': // Ṏ [LATIN CAPITAL LETTER O WITH TILDE AND DIAERESIS] + fallthrough + case '\u1E50': // Ṑ [LATIN CAPITAL LETTER O WITH MACRON AND GRAVE] + fallthrough + case '\u1E52': // Ṓ [LATIN CAPITAL LETTER O WITH MACRON AND ACUTE] + fallthrough + case '\u1ECC': // Ọ [LATIN CAPITAL LETTER O WITH DOT BELOW] + fallthrough + case '\u1ECE': // Ỏ [LATIN CAPITAL LETTER O WITH HOOK ABOVE] + fallthrough + case '\u1ED0': // Ố [LATIN 
CAPITAL LETTER O WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1ED2': // Ồ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1ED4': // Ổ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1ED6': // Ỗ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND TILDE] + fallthrough + case '\u1ED8': // Ộ [LATIN CAPITAL LETTER O WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u1EDA': // Ớ [LATIN CAPITAL LETTER O WITH HORN AND ACUTE] + fallthrough + case '\u1EDC': // Ờ [LATIN CAPITAL LETTER O WITH HORN AND GRAVE] + fallthrough + case '\u1EDE': // Ở [LATIN CAPITAL LETTER O WITH HORN AND HOOK ABOVE] + fallthrough + case '\u1EE0': // Ỡ [LATIN CAPITAL LETTER O WITH HORN AND TILDE] + fallthrough + case '\u1EE2': // Ợ [LATIN CAPITAL LETTER O WITH HORN AND DOT BELOW] + fallthrough + case '\u24C4': // Ⓞ [CIRCLED LATIN CAPITAL LETTER O] + fallthrough + case '\uA74A': // Ꝋ [LATIN CAPITAL LETTER O WITH LONG STROKE OVERLAY] + fallthrough + case '\uA74C': // Ꝍ [LATIN CAPITAL LETTER O WITH LOOP] + fallthrough + case '\uFF2F': // O [FULLWIDTH LATIN CAPITAL LETTER O] + output[outputPos] = 'O' + outputPos++ + + case '\u00F2': // ò [LATIN SMALL LETTER O WITH GRAVE] + fallthrough + case '\u00F3': // ó [LATIN SMALL LETTER O WITH ACUTE] + fallthrough + case '\u00F4': // ô [LATIN SMALL LETTER O WITH CIRCUMFLEX] + fallthrough + case '\u00F5': // õ [LATIN SMALL LETTER O WITH TILDE] + fallthrough + case '\u00F6': // ö [LATIN SMALL LETTER O WITH DIAERESIS] + fallthrough + case '\u00F8': // ø [LATIN SMALL LETTER O WITH STROKE] + fallthrough + case '\u014D': // ō [LATIN SMALL LETTER O WITH MACRON] + fallthrough + case '\u014F': // ŏ [LATIN SMALL LETTER O WITH BREVE] + fallthrough + case '\u0151': // ő [LATIN SMALL LETTER O WITH DOUBLE ACUTE] + fallthrough + case '\u01A1': // ơ [LATIN SMALL LETTER O WITH HORN] + fallthrough + case '\u01D2': // ǒ [LATIN SMALL LETTER O WITH CARON] + fallthrough + case '\u01EB': // ǫ [LATIN SMALL LETTER O WITH OGONEK] + 
fallthrough + case '\u01ED': // ǭ [LATIN SMALL LETTER O WITH OGONEK AND MACRON] + fallthrough + case '\u01FF': // ǿ [LATIN SMALL LETTER O WITH STROKE AND ACUTE] + fallthrough + case '\u020D': // ȍ [LATIN SMALL LETTER O WITH DOUBLE GRAVE] + fallthrough + case '\u020F': // ȏ [LATIN SMALL LETTER O WITH INVERTED BREVE] + fallthrough + case '\u022B': // ȫ [LATIN SMALL LETTER O WITH DIAERESIS AND MACRON] + fallthrough + case '\u022D': // ȭ [LATIN SMALL LETTER O WITH TILDE AND MACRON] + fallthrough + case '\u022F': // ȯ [LATIN SMALL LETTER O WITH DOT ABOVE] + fallthrough + case '\u0231': // ȱ [LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON] + fallthrough + case '\u0254': // ɔ [LATIN SMALL LETTER OPEN O] + fallthrough + case '\u0275': // ɵ [LATIN SMALL LETTER BARRED O] + fallthrough + case '\u1D16': // ᴖ [LATIN SMALL LETTER TOP HALF O] + fallthrough + case '\u1D17': // ᴗ [LATIN SMALL LETTER BOTTOM HALF O] + fallthrough + case '\u1D97': // ᶗ [LATIN SMALL LETTER OPEN O WITH RETROFLEX HOOK] + fallthrough + case '\u1E4D': // ṍ [LATIN SMALL LETTER O WITH TILDE AND ACUTE] + fallthrough + case '\u1E4F': // ṏ [LATIN SMALL LETTER O WITH TILDE AND DIAERESIS] + fallthrough + case '\u1E51': // ṑ [LATIN SMALL LETTER O WITH MACRON AND GRAVE] + fallthrough + case '\u1E53': // ṓ [LATIN SMALL LETTER O WITH MACRON AND ACUTE] + fallthrough + case '\u1ECD': // ọ [LATIN SMALL LETTER O WITH DOT BELOW] + fallthrough + case '\u1ECF': // ỏ [LATIN SMALL LETTER O WITH HOOK ABOVE] + fallthrough + case '\u1ED1': // ố [LATIN SMALL LETTER O WITH CIRCUMFLEX AND ACUTE] + fallthrough + case '\u1ED3': // ồ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND GRAVE] + fallthrough + case '\u1ED5': // ổ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND HOOK ABOVE] + fallthrough + case '\u1ED7': // ỗ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND TILDE] + fallthrough + case '\u1ED9': // ộ [LATIN SMALL LETTER O WITH CIRCUMFLEX AND DOT BELOW] + fallthrough + case '\u1EDB': // ớ [LATIN SMALL LETTER O WITH HORN AND ACUTE] + fallthrough + 
case '\u1EDD': // ờ [LATIN SMALL LETTER O WITH HORN AND GRAVE] + fallthrough + case '\u1EDF': // ở [LATIN SMALL LETTER O WITH HORN AND HOOK ABOVE] + fallthrough + case '\u1EE1': // ỡ [LATIN SMALL LETTER O WITH HORN AND TILDE] + fallthrough + case '\u1EE3': // ợ [LATIN SMALL LETTER O WITH HORN AND DOT BELOW] + fallthrough + case '\u2092': // ₒ [LATIN SUBSCRIPT SMALL LETTER O] + fallthrough + case '\u24DE': // ⓞ [CIRCLED LATIN SMALL LETTER O] + fallthrough + case '\u2C7A': // ⱺ [LATIN SMALL LETTER O WITH LOW RING INSIDE] + fallthrough + case '\uA74B': // ꝋ [LATIN SMALL LETTER O WITH LONG STROKE OVERLAY] + fallthrough + case '\uA74D': // ꝍ [LATIN SMALL LETTER O WITH LOOP] + fallthrough + case '\uFF4F': // o [FULLWIDTH LATIN SMALL LETTER O] + output[outputPos] = 'o' + outputPos++ + + case '\u0152': // Œ [LATIN CAPITAL LIGATURE OE] + fallthrough + case '\u0276': // ɶ [LATIN LETTER SMALL CAPITAL OE] + output = output[:(len(output) + 1)] + output[outputPos] = 'O' + outputPos++ + output[outputPos] = 'E' + outputPos++ + + case '\uA74E': // Ꝏ [LATIN CAPITAL LETTER OO] + output = output[:(len(output) + 1)] + output[outputPos] = 'O' + outputPos++ + output[outputPos] = 'O' + outputPos++ + + case '\u0222': // Ȣ http://en.wikipedia.org/wiki/OU [LATIN CAPITAL LETTER OU] + fallthrough + case '\u1D15': // ᴕ [LATIN LETTER SMALL CAPITAL OU] + output = output[:(len(output) + 1)] + output[outputPos] = 'O' + outputPos++ + output[outputPos] = 'U' + outputPos++ + + case '\u24AA': // ⒪ [PARENTHESIZED LATIN SMALL LETTER O] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'o' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0153': // œ [LATIN SMALL LIGATURE OE] + fallthrough + case '\u1D14': // ᴔ [LATIN SMALL LETTER TURNED OE] + output = output[:(len(output) + 1)] + output[outputPos] = 'o' + outputPos++ + output[outputPos] = 'e' + outputPos++ + + case '\uA74F': // ꝏ [LATIN SMALL LETTER OO] + output = output[:(len(output) + 
1)] + output[outputPos] = 'o' + outputPos++ + output[outputPos] = 'o' + outputPos++ + + case '\u0223': // ȣ http://en.wikipedia.org/wiki/OU [LATIN SMALL LETTER OU] + output = output[:(len(output) + 1)] + output[outputPos] = 'o' + outputPos++ + output[outputPos] = 'u' + outputPos++ + + case '\u01A4': // Ƥ [LATIN CAPITAL LETTER P WITH HOOK] + fallthrough + case '\u1D18': // ᴘ [LATIN LETTER SMALL CAPITAL P] + fallthrough + case '\u1E54': // Ṕ [LATIN CAPITAL LETTER P WITH ACUTE] + fallthrough + case '\u1E56': // Ṗ [LATIN CAPITAL LETTER P WITH DOT ABOVE] + fallthrough + case '\u24C5': // Ⓟ [CIRCLED LATIN CAPITAL LETTER P] + fallthrough + case '\u2C63': // Ᵽ [LATIN CAPITAL LETTER P WITH STROKE] + fallthrough + case '\uA750': // Ꝑ [LATIN CAPITAL LETTER P WITH STROKE THROUGH DESCENDER] + fallthrough + case '\uA752': // Ꝓ [LATIN CAPITAL LETTER P WITH FLOURISH] + fallthrough + case '\uA754': // Ꝕ [LATIN CAPITAL LETTER P WITH SQUIRREL TAIL] + fallthrough + case '\uFF30': // P [FULLWIDTH LATIN CAPITAL LETTER P] + output[outputPos] = 'P' + outputPos++ + + case '\u01A5': // ƥ [LATIN SMALL LETTER P WITH HOOK] + fallthrough + case '\u1D71': // ᵱ [LATIN SMALL LETTER P WITH MIDDLE TILDE] + fallthrough + case '\u1D7D': // ᵽ [LATIN SMALL LETTER P WITH STROKE] + fallthrough + case '\u1D88': // ᶈ [LATIN SMALL LETTER P WITH PALATAL HOOK] + fallthrough + case '\u1E55': // ṕ [LATIN SMALL LETTER P WITH ACUTE] + fallthrough + case '\u1E57': // ṗ [LATIN SMALL LETTER P WITH DOT ABOVE] + fallthrough + case '\u24DF': // ⓟ [CIRCLED LATIN SMALL LETTER P] + fallthrough + case '\uA751': // ꝑ [LATIN SMALL LETTER P WITH STROKE THROUGH DESCENDER] + fallthrough + case '\uA753': // ꝓ [LATIN SMALL LETTER P WITH FLOURISH] + fallthrough + case '\uA755': // ꝕ [LATIN SMALL LETTER P WITH SQUIRREL TAIL] + fallthrough + case '\uA7FC': // ꟼ [LATIN EPIGRAPHIC LETTER REVERSED P] + fallthrough + case '\uFF50': // p [FULLWIDTH LATIN SMALL LETTER P] + output[outputPos] = 'p' + outputPos++ + + case '\u24AB': // ⒫ 
[PARENTHESIZED LATIN SMALL LETTER P] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'p' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u024A': // Ɋ [LATIN CAPITAL LETTER SMALL Q WITH HOOK TAIL] + fallthrough + case '\u24C6': // Ⓠ [CIRCLED LATIN CAPITAL LETTER Q] + fallthrough + case '\uA756': // Ꝗ [LATIN CAPITAL LETTER Q WITH STROKE THROUGH DESCENDER] + fallthrough + case '\uA758': // Ꝙ [LATIN CAPITAL LETTER Q WITH DIAGONAL STROKE] + fallthrough + case '\uFF31': // Q [FULLWIDTH LATIN CAPITAL LETTER Q] + output[outputPos] = 'Q' + outputPos++ + + case '\u0138': // ĸ http://en.wikipedia.org/wiki/Kra_(letter) [LATIN SMALL LETTER KRA] + fallthrough + case '\u024B': // ɋ [LATIN SMALL LETTER Q WITH HOOK TAIL] + fallthrough + case '\u02A0': // ʠ [LATIN SMALL LETTER Q WITH HOOK] + fallthrough + case '\u24E0': // ⓠ [CIRCLED LATIN SMALL LETTER Q] + fallthrough + case '\uA757': // ꝗ [LATIN SMALL LETTER Q WITH STROKE THROUGH DESCENDER] + fallthrough + case '\uA759': // ꝙ [LATIN SMALL LETTER Q WITH DIAGONAL STROKE] + fallthrough + case '\uFF51': // q [FULLWIDTH LATIN SMALL LETTER Q] + output[outputPos] = 'q' + outputPos++ + + case '\u24AC': // ⒬ [PARENTHESIZED LATIN SMALL LETTER Q] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'q' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0239': // ȹ [LATIN SMALL LETTER QP DIGRAPH] + output = output[:(len(output) + 1)] + output[outputPos] = 'q' + outputPos++ + output[outputPos] = 'p' + outputPos++ + + case '\u0154': // Ŕ [LATIN CAPITAL LETTER R WITH ACUTE] + fallthrough + case '\u0156': // Ŗ [LATIN CAPITAL LETTER R WITH CEDILLA] + fallthrough + case '\u0158': // Ř [LATIN CAPITAL LETTER R WITH CARON] + fallthrough + case '\u0210': // Ȓ [LATIN CAPITAL LETTER R WITH DOUBLE GRAVE] + fallthrough + case '\u0212': // Ȓ [LATIN CAPITAL LETTER R WITH INVERTED BREVE] + fallthrough + case '\u024C': // Ɍ [LATIN 
CAPITAL LETTER R WITH STROKE] + fallthrough + case '\u0280': // ʀ [LATIN LETTER SMALL CAPITAL R] + fallthrough + case '\u0281': // ʁ [LATIN LETTER SMALL CAPITAL INVERTED R] + fallthrough + case '\u1D19': // ᴙ [LATIN LETTER SMALL CAPITAL REVERSED R] + fallthrough + case '\u1D1A': // ᴚ [LATIN LETTER SMALL CAPITAL TURNED R] + fallthrough + case '\u1E58': // Ṙ [LATIN CAPITAL LETTER R WITH DOT ABOVE] + fallthrough + case '\u1E5A': // Ṛ [LATIN CAPITAL LETTER R WITH DOT BELOW] + fallthrough + case '\u1E5C': // Ṝ [LATIN CAPITAL LETTER R WITH DOT BELOW AND MACRON] + fallthrough + case '\u1E5E': // Ṟ [LATIN CAPITAL LETTER R WITH LINE BELOW] + fallthrough + case '\u24C7': // Ⓡ [CIRCLED LATIN CAPITAL LETTER R] + fallthrough + case '\u2C64': // Ɽ [LATIN CAPITAL LETTER R WITH TAIL] + fallthrough + case '\uA75A': // Ꝛ [LATIN CAPITAL LETTER R ROTUNDA] + fallthrough + case '\uA782': // Ꞃ [LATIN CAPITAL LETTER INSULAR R] + fallthrough + case '\uFF32': // R [FULLWIDTH LATIN CAPITAL LETTER R] + output[outputPos] = 'R' + outputPos++ + + case '\u0155': // ŕ [LATIN SMALL LETTER R WITH ACUTE] + fallthrough + case '\u0157': // ŗ [LATIN SMALL LETTER R WITH CEDILLA] + fallthrough + case '\u0159': // ř [LATIN SMALL LETTER R WITH CARON] + fallthrough + case '\u0211': // ȑ [LATIN SMALL LETTER R WITH DOUBLE GRAVE] + fallthrough + case '\u0213': // ȓ [LATIN SMALL LETTER R WITH INVERTED BREVE] + fallthrough + case '\u024D': // ɍ [LATIN SMALL LETTER R WITH STROKE] + fallthrough + case '\u027C': // ɼ [LATIN SMALL LETTER R WITH LONG LEG] + fallthrough + case '\u027D': // ɽ [LATIN SMALL LETTER R WITH TAIL] + fallthrough + case '\u027E': // ɾ [LATIN SMALL LETTER R WITH FISHHOOK] + fallthrough + case '\u027F': // ɿ [LATIN SMALL LETTER REVERSED R WITH FISHHOOK] + fallthrough + case '\u1D63': // ᵣ [LATIN SUBSCRIPT SMALL LETTER R] + fallthrough + case '\u1D72': // ᵲ [LATIN SMALL LETTER R WITH MIDDLE TILDE] + fallthrough + case '\u1D73': // ᵳ [LATIN SMALL LETTER R WITH FISHHOOK AND MIDDLE TILDE] + 
fallthrough + case '\u1D89': // ᶉ [LATIN SMALL LETTER R WITH PALATAL HOOK] + fallthrough + case '\u1E59': // ṙ [LATIN SMALL LETTER R WITH DOT ABOVE] + fallthrough + case '\u1E5B': // ṛ [LATIN SMALL LETTER R WITH DOT BELOW] + fallthrough + case '\u1E5D': // ṝ [LATIN SMALL LETTER R WITH DOT BELOW AND MACRON] + fallthrough + case '\u1E5F': // ṟ [LATIN SMALL LETTER R WITH LINE BELOW] + fallthrough + case '\u24E1': // ⓡ [CIRCLED LATIN SMALL LETTER R] + fallthrough + case '\uA75B': // ꝛ [LATIN SMALL LETTER R ROTUNDA] + fallthrough + case '\uA783': // ꞃ [LATIN SMALL LETTER INSULAR R] + fallthrough + case '\uFF52': // r [FULLWIDTH LATIN SMALL LETTER R] + output[outputPos] = 'r' + outputPos++ + + case '\u24AD': // ⒭ [PARENTHESIZED LATIN SMALL LETTER R] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'r' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u015A': // Ś [LATIN CAPITAL LETTER S WITH ACUTE] + fallthrough + case '\u015C': // Ŝ [LATIN CAPITAL LETTER S WITH CIRCUMFLEX] + fallthrough + case '\u015E': // Ş [LATIN CAPITAL LETTER S WITH CEDILLA] + fallthrough + case '\u0160': // Š [LATIN CAPITAL LETTER S WITH CARON] + fallthrough + case '\u0218': // Ș [LATIN CAPITAL LETTER S WITH COMMA BELOW] + fallthrough + case '\u1E60': // Ṡ [LATIN CAPITAL LETTER S WITH DOT ABOVE] + fallthrough + case '\u1E62': // Ṣ [LATIN CAPITAL LETTER S WITH DOT BELOW] + fallthrough + case '\u1E64': // Ṥ [LATIN CAPITAL LETTER S WITH ACUTE AND DOT ABOVE] + fallthrough + case '\u1E66': // Ṧ [LATIN CAPITAL LETTER S WITH CARON AND DOT ABOVE] + fallthrough + case '\u1E68': // Ṩ [LATIN CAPITAL LETTER S WITH DOT BELOW AND DOT ABOVE] + fallthrough + case '\u24C8': // Ⓢ [CIRCLED LATIN CAPITAL LETTER S] + fallthrough + case '\uA731': // ꜱ [LATIN LETTER SMALL CAPITAL S] + fallthrough + case '\uA785': // ꞅ [LATIN SMALL LETTER INSULAR S] + fallthrough + case '\uFF33': // S [FULLWIDTH LATIN CAPITAL LETTER S] + output[outputPos] = 'S' + 
outputPos++ + + case '\u015B': // ś [LATIN SMALL LETTER S WITH ACUTE] + fallthrough + case '\u015D': // ŝ [LATIN SMALL LETTER S WITH CIRCUMFLEX] + fallthrough + case '\u015F': // ş [LATIN SMALL LETTER S WITH CEDILLA] + fallthrough + case '\u0161': // š [LATIN SMALL LETTER S WITH CARON] + fallthrough + case '\u017F': // ſ http://en.wikipedia.org/wiki/Long_S [LATIN SMALL LETTER LONG S] + fallthrough + case '\u0219': // ș [LATIN SMALL LETTER S WITH COMMA BELOW] + fallthrough + case '\u023F': // ȿ [LATIN SMALL LETTER S WITH SWASH TAIL] + fallthrough + case '\u0282': // ʂ [LATIN SMALL LETTER S WITH HOOK] + fallthrough + case '\u1D74': // ᵴ [LATIN SMALL LETTER S WITH MIDDLE TILDE] + fallthrough + case '\u1D8A': // ᶊ [LATIN SMALL LETTER S WITH PALATAL HOOK] + fallthrough + case '\u1E61': // ṡ [LATIN SMALL LETTER S WITH DOT ABOVE] + fallthrough + case '\u1E63': // ṣ [LATIN SMALL LETTER S WITH DOT BELOW] + fallthrough + case '\u1E65': // ṥ [LATIN SMALL LETTER S WITH ACUTE AND DOT ABOVE] + fallthrough + case '\u1E67': // ṧ [LATIN SMALL LETTER S WITH CARON AND DOT ABOVE] + fallthrough + case '\u1E69': // ṩ [LATIN SMALL LETTER S WITH DOT BELOW AND DOT ABOVE] + fallthrough + case '\u1E9C': // ẜ [LATIN SMALL LETTER LONG S WITH DIAGONAL STROKE] + fallthrough + case '\u1E9D': // ẝ [LATIN SMALL LETTER LONG S WITH HIGH STROKE] + fallthrough + case '\u24E2': // ⓢ [CIRCLED LATIN SMALL LETTER S] + fallthrough + case '\uA784': // Ꞅ [LATIN CAPITAL LETTER INSULAR S] + fallthrough + case '\uFF53': // s [FULLWIDTH LATIN SMALL LETTER S] + output[outputPos] = 's' + outputPos++ + + case '\u1E9E': // ẞ [LATIN CAPITAL LETTER SHARP S] + output = output[:(len(output) + 1)] + output[outputPos] = 'S' + outputPos++ + output[outputPos] = 'S' + outputPos++ + + case '\u24AE': // ⒮ [PARENTHESIZED LATIN SMALL LETTER S] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 's' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00DF': // ß [LATIN 
SMALL LETTER SHARP S] + output = output[:(len(output) + 1)] + output[outputPos] = 's' + outputPos++ + output[outputPos] = 's' + outputPos++ + + case '\uFB06': // st [LATIN SMALL LIGATURE ST] + output = output[:(len(output) + 1)] + output[outputPos] = 's' + outputPos++ + output[outputPos] = 't' + outputPos++ + + case '\u0162': // Ţ [LATIN CAPITAL LETTER T WITH CEDILLA] + fallthrough + case '\u0164': // Ť [LATIN CAPITAL LETTER T WITH CARON] + fallthrough + case '\u0166': // Ŧ [LATIN CAPITAL LETTER T WITH STROKE] + fallthrough + case '\u01AC': // Ƭ [LATIN CAPITAL LETTER T WITH HOOK] + fallthrough + case '\u01AE': // Ʈ [LATIN CAPITAL LETTER T WITH RETROFLEX HOOK] + fallthrough + case '\u021A': // Ț [LATIN CAPITAL LETTER T WITH COMMA BELOW] + fallthrough + case '\u023E': // Ⱦ [LATIN CAPITAL LETTER T WITH DIAGONAL STROKE] + fallthrough + case '\u1D1B': // ᴛ [LATIN LETTER SMALL CAPITAL T] + fallthrough + case '\u1E6A': // Ṫ [LATIN CAPITAL LETTER T WITH DOT ABOVE] + fallthrough + case '\u1E6C': // Ṭ [LATIN CAPITAL LETTER T WITH DOT BELOW] + fallthrough + case '\u1E6E': // Ṯ [LATIN CAPITAL LETTER T WITH LINE BELOW] + fallthrough + case '\u1E70': // Ṱ [LATIN CAPITAL LETTER T WITH CIRCUMFLEX BELOW] + fallthrough + case '\u24C9': // Ⓣ [CIRCLED LATIN CAPITAL LETTER T] + fallthrough + case '\uA786': // Ꞇ [LATIN CAPITAL LETTER INSULAR T] + fallthrough + case '\uFF34': // T [FULLWIDTH LATIN CAPITAL LETTER T] + output[outputPos] = 'T' + outputPos++ + + case '\u0163': // ţ [LATIN SMALL LETTER T WITH CEDILLA] + fallthrough + case '\u0165': // ť [LATIN SMALL LETTER T WITH CARON] + fallthrough + case '\u0167': // ŧ [LATIN SMALL LETTER T WITH STROKE] + fallthrough + case '\u01AB': // ƫ [LATIN SMALL LETTER T WITH PALATAL HOOK] + fallthrough + case '\u01AD': // ƭ [LATIN SMALL LETTER T WITH HOOK] + fallthrough + case '\u021B': // ț [LATIN SMALL LETTER T WITH COMMA BELOW] + fallthrough + case '\u0236': // ȶ [LATIN SMALL LETTER T WITH CURL] + fallthrough + case '\u0287': // ʇ [LATIN SMALL 
LETTER TURNED T] + fallthrough + case '\u0288': // ʈ [LATIN SMALL LETTER T WITH RETROFLEX HOOK] + fallthrough + case '\u1D75': // ᵵ [LATIN SMALL LETTER T WITH MIDDLE TILDE] + fallthrough + case '\u1E6B': // ṫ [LATIN SMALL LETTER T WITH DOT ABOVE] + fallthrough + case '\u1E6D': // ṭ [LATIN SMALL LETTER T WITH DOT BELOW] + fallthrough + case '\u1E6F': // ṯ [LATIN SMALL LETTER T WITH LINE BELOW] + fallthrough + case '\u1E71': // ṱ [LATIN SMALL LETTER T WITH CIRCUMFLEX BELOW] + fallthrough + case '\u1E97': // ẗ [LATIN SMALL LETTER T WITH DIAERESIS] + fallthrough + case '\u24E3': // ⓣ [CIRCLED LATIN SMALL LETTER T] + fallthrough + case '\u2C66': // ⱦ [LATIN SMALL LETTER T WITH DIAGONAL STROKE] + fallthrough + case '\uFF54': // t [FULLWIDTH LATIN SMALL LETTER T] + output[outputPos] = 't' + outputPos++ + + case '\u00DE': // Þ [LATIN CAPITAL LETTER THORN] + fallthrough + case '\uA766': // Ꝧ [LATIN CAPITAL LETTER THORN WITH STROKE THROUGH DESCENDER] + output = output[:(len(output) + 1)] + output[outputPos] = 'T' + outputPos++ + output[outputPos] = 'H' + outputPos++ + + case '\uA728': // Ꜩ [LATIN CAPITAL LETTER TZ] + output = output[:(len(output) + 1)] + output[outputPos] = 'T' + outputPos++ + output[outputPos] = 'Z' + outputPos++ + + case '\u24AF': // ⒯ [PARENTHESIZED LATIN SMALL LETTER T] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 't' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u02A8': // ʨ [LATIN SMALL LETTER TC DIGRAPH WITH CURL] + output = output[:(len(output) + 1)] + output[outputPos] = 't' + outputPos++ + output[outputPos] = 'c' + outputPos++ + + case '\u00FE': // þ [LATIN SMALL LETTER THORN] + fallthrough + case '\u1D7A': // ᵺ [LATIN SMALL LETTER TH WITH STRIKETHROUGH] + fallthrough + case '\uA767': // ꝧ [LATIN SMALL LETTER THORN WITH STROKE THROUGH DESCENDER] + output = output[:(len(output) + 1)] + output[outputPos] = 't' + outputPos++ + output[outputPos] = 'h' + outputPos++ + + case 
'\u02A6': // ʦ [LATIN SMALL LETTER TS DIGRAPH] + output = output[:(len(output) + 1)] + output[outputPos] = 't' + outputPos++ + output[outputPos] = 's' + outputPos++ + + case '\uA729': // ꜩ [LATIN SMALL LETTER TZ] + output = output[:(len(output) + 1)] + output[outputPos] = 't' + outputPos++ + output[outputPos] = 'z' + outputPos++ + + case '\u00D9': // Ù [LATIN CAPITAL LETTER U WITH GRAVE] + fallthrough + case '\u00DA': // Ú [LATIN CAPITAL LETTER U WITH ACUTE] + fallthrough + case '\u00DB': // Û [LATIN CAPITAL LETTER U WITH CIRCUMFLEX] + fallthrough + case '\u00DC': // Ü [LATIN CAPITAL LETTER U WITH DIAERESIS] + fallthrough + case '\u0168': // Ũ [LATIN CAPITAL LETTER U WITH TILDE] + fallthrough + case '\u016A': // Ū [LATIN CAPITAL LETTER U WITH MACRON] + fallthrough + case '\u016C': // Ŭ [LATIN CAPITAL LETTER U WITH BREVE] + fallthrough + case '\u016E': // Ů [LATIN CAPITAL LETTER U WITH RING ABOVE] + fallthrough + case '\u0170': // Ű [LATIN CAPITAL LETTER U WITH DOUBLE ACUTE] + fallthrough + case '\u0172': // Ų [LATIN CAPITAL LETTER U WITH OGONEK] + fallthrough + case '\u01AF': // Ư [LATIN CAPITAL LETTER U WITH HORN] + fallthrough + case '\u01D3': // Ǔ [LATIN CAPITAL LETTER U WITH CARON] + fallthrough + case '\u01D5': // Ǖ [LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON] + fallthrough + case '\u01D7': // Ǘ [LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE] + fallthrough + case '\u01D9': // Ǚ [LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON] + fallthrough + case '\u01DB': // Ǜ [LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE] + fallthrough + case '\u0214': // Ȕ [LATIN CAPITAL LETTER U WITH DOUBLE GRAVE] + fallthrough + case '\u0216': // Ȗ [LATIN CAPITAL LETTER U WITH INVERTED BREVE] + fallthrough + case '\u0244': // Ʉ [LATIN CAPITAL LETTER U BAR] + fallthrough + case '\u1D1C': // ᴜ [LATIN LETTER SMALL CAPITAL U] + fallthrough + case '\u1D7E': // ᵾ [LATIN SMALL CAPITAL LETTER U WITH STROKE] + fallthrough + case '\u1E72': // Ṳ [LATIN CAPITAL LETTER U WITH DIAERESIS 
BELOW] + fallthrough + case '\u1E74': // Ṵ [LATIN CAPITAL LETTER U WITH TILDE BELOW] + fallthrough + case '\u1E76': // Ṷ [LATIN CAPITAL LETTER U WITH CIRCUMFLEX BELOW] + fallthrough + case '\u1E78': // Ṹ [LATIN CAPITAL LETTER U WITH TILDE AND ACUTE] + fallthrough + case '\u1E7A': // Ṻ [LATIN CAPITAL LETTER U WITH MACRON AND DIAERESIS] + fallthrough + case '\u1EE4': // Ụ [LATIN CAPITAL LETTER U WITH DOT BELOW] + fallthrough + case '\u1EE6': // Ủ [LATIN CAPITAL LETTER U WITH HOOK ABOVE] + fallthrough + case '\u1EE8': // Ứ [LATIN CAPITAL LETTER U WITH HORN AND ACUTE] + fallthrough + case '\u1EEA': // Ừ [LATIN CAPITAL LETTER U WITH HORN AND GRAVE] + fallthrough + case '\u1EEC': // Ử [LATIN CAPITAL LETTER U WITH HORN AND HOOK ABOVE] + fallthrough + case '\u1EEE': // Ữ [LATIN CAPITAL LETTER U WITH HORN AND TILDE] + fallthrough + case '\u1EF0': // Ự [LATIN CAPITAL LETTER U WITH HORN AND DOT BELOW] + fallthrough + case '\u24CA': // Ⓤ [CIRCLED LATIN CAPITAL LETTER U] + fallthrough + case '\uFF35': // U [FULLWIDTH LATIN CAPITAL LETTER U] + output[outputPos] = 'U' + outputPos++ + + case '\u00F9': // ù [LATIN SMALL LETTER U WITH GRAVE] + fallthrough + case '\u00FA': // ú [LATIN SMALL LETTER U WITH ACUTE] + fallthrough + case '\u00FB': // û [LATIN SMALL LETTER U WITH CIRCUMFLEX] + fallthrough + case '\u00FC': // ü [LATIN SMALL LETTER U WITH DIAERESIS] + fallthrough + case '\u0169': // ũ [LATIN SMALL LETTER U WITH TILDE] + fallthrough + case '\u016B': // ū [LATIN SMALL LETTER U WITH MACRON] + fallthrough + case '\u016D': // ŭ [LATIN SMALL LETTER U WITH BREVE] + fallthrough + case '\u016F': // ů [LATIN SMALL LETTER U WITH RING ABOVE] + fallthrough + case '\u0171': // ű [LATIN SMALL LETTER U WITH DOUBLE ACUTE] + fallthrough + case '\u0173': // ų [LATIN SMALL LETTER U WITH OGONEK] + fallthrough + case '\u01B0': // ư [LATIN SMALL LETTER U WITH HORN] + fallthrough + case '\u01D4': // ǔ [LATIN SMALL LETTER U WITH CARON] + fallthrough + case '\u01D6': // ǖ [LATIN SMALL LETTER U WITH 
DIAERESIS AND MACRON] + fallthrough + case '\u01D8': // ǘ [LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE] + fallthrough + case '\u01DA': // ǚ [LATIN SMALL LETTER U WITH DIAERESIS AND CARON] + fallthrough + case '\u01DC': // ǜ [LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE] + fallthrough + case '\u0215': // ȕ [LATIN SMALL LETTER U WITH DOUBLE GRAVE] + fallthrough + case '\u0217': // ȗ [LATIN SMALL LETTER U WITH INVERTED BREVE] + fallthrough + case '\u0289': // ʉ [LATIN SMALL LETTER U BAR] + fallthrough + case '\u1D64': // ᵤ [LATIN SUBSCRIPT SMALL LETTER U] + fallthrough + case '\u1D99': // ᶙ [LATIN SMALL LETTER U WITH RETROFLEX HOOK] + fallthrough + case '\u1E73': // ṳ [LATIN SMALL LETTER U WITH DIAERESIS BELOW] + fallthrough + case '\u1E75': // ṵ [LATIN SMALL LETTER U WITH TILDE BELOW] + fallthrough + case '\u1E77': // ṷ [LATIN SMALL LETTER U WITH CIRCUMFLEX BELOW] + fallthrough + case '\u1E79': // ṹ [LATIN SMALL LETTER U WITH TILDE AND ACUTE] + fallthrough + case '\u1E7B': // ṻ [LATIN SMALL LETTER U WITH MACRON AND DIAERESIS] + fallthrough + case '\u1EE5': // ụ [LATIN SMALL LETTER U WITH DOT BELOW] + fallthrough + case '\u1EE7': // ủ [LATIN SMALL LETTER U WITH HOOK ABOVE] + fallthrough + case '\u1EE9': // ứ [LATIN SMALL LETTER U WITH HORN AND ACUTE] + fallthrough + case '\u1EEB': // ừ [LATIN SMALL LETTER U WITH HORN AND GRAVE] + fallthrough + case '\u1EED': // ử [LATIN SMALL LETTER U WITH HORN AND HOOK ABOVE] + fallthrough + case '\u1EEF': // ữ [LATIN SMALL LETTER U WITH HORN AND TILDE] + fallthrough + case '\u1EF1': // ự [LATIN SMALL LETTER U WITH HORN AND DOT BELOW] + fallthrough + case '\u24E4': // ⓤ [CIRCLED LATIN SMALL LETTER U] + fallthrough + case '\uFF55': // u [FULLWIDTH LATIN SMALL LETTER U] + output[outputPos] = 'u' + outputPos++ + + case '\u24B0': // ⒰ [PARENTHESIZED LATIN SMALL LETTER U] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'u' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case 
'\u1D6B': // ᵫ [LATIN SMALL LETTER UE] + output = output[:(len(output) + 1)] + output[outputPos] = 'u' + outputPos++ + output[outputPos] = 'e' + outputPos++ + + case '\u01B2': // Ʋ [LATIN CAPITAL LETTER V WITH HOOK] + fallthrough + case '\u0245': // Ʌ [LATIN CAPITAL LETTER TURNED V] + fallthrough + case '\u1D20': // ᴠ [LATIN LETTER SMALL CAPITAL V] + fallthrough + case '\u1E7C': // Ṽ [LATIN CAPITAL LETTER V WITH TILDE] + fallthrough + case '\u1E7E': // Ṿ [LATIN CAPITAL LETTER V WITH DOT BELOW] + fallthrough + case '\u1EFC': // Ỽ [LATIN CAPITAL LETTER MIDDLE-WELSH V] + fallthrough + case '\u24CB': // Ⓥ [CIRCLED LATIN CAPITAL LETTER V] + fallthrough + case '\uA75E': // Ꝟ [LATIN CAPITAL LETTER V WITH DIAGONAL STROKE] + fallthrough + case '\uA768': // Ꝩ [LATIN CAPITAL LETTER VEND] + fallthrough + case '\uFF36': // V [FULLWIDTH LATIN CAPITAL LETTER V] + output[outputPos] = 'V' + outputPos++ + + case '\u028B': // ʋ [LATIN SMALL LETTER V WITH HOOK] + fallthrough + case '\u028C': // ʌ [LATIN SMALL LETTER TURNED V] + fallthrough + case '\u1D65': // ᵥ [LATIN SUBSCRIPT SMALL LETTER V] + fallthrough + case '\u1D8C': // ᶌ [LATIN SMALL LETTER V WITH PALATAL HOOK] + fallthrough + case '\u1E7D': // ṽ [LATIN SMALL LETTER V WITH TILDE] + fallthrough + case '\u1E7F': // ṿ [LATIN SMALL LETTER V WITH DOT BELOW] + fallthrough + case '\u24E5': // ⓥ [CIRCLED LATIN SMALL LETTER V] + fallthrough + case '\u2C71': // ⱱ [LATIN SMALL LETTER V WITH RIGHT HOOK] + fallthrough + case '\u2C74': // ⱴ [LATIN SMALL LETTER V WITH CURL] + fallthrough + case '\uA75F': // ꝟ [LATIN SMALL LETTER V WITH DIAGONAL STROKE] + fallthrough + case '\uFF56': // v [FULLWIDTH LATIN SMALL LETTER V] + output[outputPos] = 'v' + outputPos++ + + case '\uA760': // Ꝡ [LATIN CAPITAL LETTER VY] + output = output[:(len(output) + 1)] + output[outputPos] = 'V' + outputPos++ + output[outputPos] = 'Y' + outputPos++ + + case '\u24B1': // ⒱ [PARENTHESIZED LATIN SMALL LETTER V] + output = output[:(len(output) + 2)] + output[outputPos] 
= '(' + outputPos++ + output[outputPos] = 'v' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\uA761': // ꝡ [LATIN SMALL LETTER VY] + output = output[:(len(output) + 1)] + output[outputPos] = 'v' + outputPos++ + output[outputPos] = 'y' + outputPos++ + + case '\u0174': // Ŵ [LATIN CAPITAL LETTER W WITH CIRCUMFLEX] + fallthrough + case '\u01F7': // Ƿ http://en.wikipedia.org/wiki/Wynn [LATIN CAPITAL LETTER WYNN] + fallthrough + case '\u1D21': // ᴡ [LATIN LETTER SMALL CAPITAL W] + fallthrough + case '\u1E80': // Ẁ [LATIN CAPITAL LETTER W WITH GRAVE] + fallthrough + case '\u1E82': // Ẃ [LATIN CAPITAL LETTER W WITH ACUTE] + fallthrough + case '\u1E84': // Ẅ [LATIN CAPITAL LETTER W WITH DIAERESIS] + fallthrough + case '\u1E86': // Ẇ [LATIN CAPITAL LETTER W WITH DOT ABOVE] + fallthrough + case '\u1E88': // Ẉ [LATIN CAPITAL LETTER W WITH DOT BELOW] + fallthrough + case '\u24CC': // Ⓦ [CIRCLED LATIN CAPITAL LETTER W] + fallthrough + case '\u2C72': // Ⱳ [LATIN CAPITAL LETTER W WITH HOOK] + fallthrough + case '\uFF37': // W [FULLWIDTH LATIN CAPITAL LETTER W] + output[outputPos] = 'W' + outputPos++ + + case '\u0175': // ŵ [LATIN SMALL LETTER W WITH CIRCUMFLEX] + fallthrough + case '\u01BF': // ƿ http://en.wikipedia.org/wiki/Wynn [LATIN LETTER WYNN] + fallthrough + case '\u028D': // ʍ [LATIN SMALL LETTER TURNED W] + fallthrough + case '\u1E81': // ẁ [LATIN SMALL LETTER W WITH GRAVE] + fallthrough + case '\u1E83': // ẃ [LATIN SMALL LETTER W WITH ACUTE] + fallthrough + case '\u1E85': // ẅ [LATIN SMALL LETTER W WITH DIAERESIS] + fallthrough + case '\u1E87': // ẇ [LATIN SMALL LETTER W WITH DOT ABOVE] + fallthrough + case '\u1E89': // ẉ [LATIN SMALL LETTER W WITH DOT BELOW] + fallthrough + case '\u1E98': // ẘ [LATIN SMALL LETTER W WITH RING ABOVE] + fallthrough + case '\u24E6': // ⓦ [CIRCLED LATIN SMALL LETTER W] + fallthrough + case '\u2C73': // ⱳ [LATIN SMALL LETTER W WITH HOOK] + fallthrough + case '\uFF57': // w [FULLWIDTH LATIN SMALL LETTER W] + output[outputPos] 
= 'w' + outputPos++ + + case '\u24B2': // ⒲ [PARENTHESIZED LATIN SMALL LETTER W] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'w' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u1E8A': // Ẋ [LATIN CAPITAL LETTER X WITH DOT ABOVE] + fallthrough + case '\u1E8C': // Ẍ [LATIN CAPITAL LETTER X WITH DIAERESIS] + fallthrough + case '\u24CD': // Ⓧ [CIRCLED LATIN CAPITAL LETTER X] + fallthrough + case '\uFF38': // X [FULLWIDTH LATIN CAPITAL LETTER X] + output[outputPos] = 'X' + outputPos++ + + case '\u1D8D': // ᶍ [LATIN SMALL LETTER X WITH PALATAL HOOK] + fallthrough + case '\u1E8B': // ẋ [LATIN SMALL LETTER X WITH DOT ABOVE] + fallthrough + case '\u1E8D': // ẍ [LATIN SMALL LETTER X WITH DIAERESIS] + fallthrough + case '\u2093': // ₓ [LATIN SUBSCRIPT SMALL LETTER X] + fallthrough + case '\u24E7': // ⓧ [CIRCLED LATIN SMALL LETTER X] + fallthrough + case '\uFF58': // x [FULLWIDTH LATIN SMALL LETTER X] + output[outputPos] = 'x' + outputPos++ + + case '\u24B3': // ⒳ [PARENTHESIZED LATIN SMALL LETTER X] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'x' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00DD': // Ý [LATIN CAPITAL LETTER Y WITH ACUTE] + fallthrough + case '\u0176': // Ŷ [LATIN CAPITAL LETTER Y WITH CIRCUMFLEX] + fallthrough + case '\u0178': // Ÿ [LATIN CAPITAL LETTER Y WITH DIAERESIS] + fallthrough + case '\u01B3': // Ƴ [LATIN CAPITAL LETTER Y WITH HOOK] + fallthrough + case '\u0232': // Ȳ [LATIN CAPITAL LETTER Y WITH MACRON] + fallthrough + case '\u024E': // Ɏ [LATIN CAPITAL LETTER Y WITH STROKE] + fallthrough + case '\u028F': // ʏ [LATIN LETTER SMALL CAPITAL Y] + fallthrough + case '\u1E8E': // Ẏ [LATIN CAPITAL LETTER Y WITH DOT ABOVE] + fallthrough + case '\u1EF2': // Ỳ [LATIN CAPITAL LETTER Y WITH GRAVE] + fallthrough + case '\u1EF4': // Ỵ [LATIN CAPITAL LETTER Y WITH DOT BELOW] + fallthrough + case '\u1EF6': // Ỷ [LATIN 
CAPITAL LETTER Y WITH HOOK ABOVE] + fallthrough + case '\u1EF8': // Ỹ [LATIN CAPITAL LETTER Y WITH TILDE] + fallthrough + case '\u1EFE': // Ỿ [LATIN CAPITAL LETTER Y WITH LOOP] + fallthrough + case '\u24CE': // Ⓨ [CIRCLED LATIN CAPITAL LETTER Y] + fallthrough + case '\uFF39': // Y [FULLWIDTH LATIN CAPITAL LETTER Y] + output[outputPos] = 'Y' + outputPos++ + + case '\u00FD': // ý [LATIN SMALL LETTER Y WITH ACUTE] + fallthrough + case '\u00FF': // ÿ [LATIN SMALL LETTER Y WITH DIAERESIS] + fallthrough + case '\u0177': // ŷ [LATIN SMALL LETTER Y WITH CIRCUMFLEX] + fallthrough + case '\u01B4': // ƴ [LATIN SMALL LETTER Y WITH HOOK] + fallthrough + case '\u0233': // ȳ [LATIN SMALL LETTER Y WITH MACRON] + fallthrough + case '\u024F': // ɏ [LATIN SMALL LETTER Y WITH STROKE] + fallthrough + case '\u028E': // ʎ [LATIN SMALL LETTER TURNED Y] + fallthrough + case '\u1E8F': // ẏ [LATIN SMALL LETTER Y WITH DOT ABOVE] + fallthrough + case '\u1E99': // ẙ [LATIN SMALL LETTER Y WITH RING ABOVE] + fallthrough + case '\u1EF3': // ỳ [LATIN SMALL LETTER Y WITH GRAVE] + fallthrough + case '\u1EF5': // ỵ [LATIN SMALL LETTER Y WITH DOT BELOW] + fallthrough + case '\u1EF7': // ỷ [LATIN SMALL LETTER Y WITH HOOK ABOVE] + fallthrough + case '\u1EF9': // ỹ [LATIN SMALL LETTER Y WITH TILDE] + fallthrough + case '\u1EFF': // ỿ [LATIN SMALL LETTER Y WITH LOOP] + fallthrough + case '\u24E8': // ⓨ [CIRCLED LATIN SMALL LETTER Y] + fallthrough + case '\uFF59': // y [FULLWIDTH LATIN SMALL LETTER Y] + output[outputPos] = 'y' + outputPos++ + + case '\u24B4': // ⒴ [PARENTHESIZED LATIN SMALL LETTER Y] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'y' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u0179': // Ź [LATIN CAPITAL LETTER Z WITH ACUTE] + fallthrough + case '\u017B': // Ż [LATIN CAPITAL LETTER Z WITH DOT ABOVE] + fallthrough + case '\u017D': // Ž [LATIN CAPITAL LETTER Z WITH CARON] + fallthrough + case '\u01B5': // Ƶ [LATIN 
CAPITAL LETTER Z WITH STROKE] + fallthrough + case '\u021C': // Ȝ http://en.wikipedia.org/wiki/Yogh [LATIN CAPITAL LETTER YOGH] + fallthrough + case '\u0224': // Ȥ [LATIN CAPITAL LETTER Z WITH HOOK] + fallthrough + case '\u1D22': // ᴢ [LATIN LETTER SMALL CAPITAL Z] + fallthrough + case '\u1E90': // Ẑ [LATIN CAPITAL LETTER Z WITH CIRCUMFLEX] + fallthrough + case '\u1E92': // Ẓ [LATIN CAPITAL LETTER Z WITH DOT BELOW] + fallthrough + case '\u1E94': // Ẕ [LATIN CAPITAL LETTER Z WITH LINE BELOW] + fallthrough + case '\u24CF': // Ⓩ [CIRCLED LATIN CAPITAL LETTER Z] + fallthrough + case '\u2C6B': // Ⱬ [LATIN CAPITAL LETTER Z WITH DESCENDER] + fallthrough + case '\uA762': // Ꝣ [LATIN CAPITAL LETTER VISIGOTHIC Z] + fallthrough + case '\uFF3A': // Z [FULLWIDTH LATIN CAPITAL LETTER Z] + output[outputPos] = 'Z' + outputPos++ + + case '\u017A': // ź [LATIN SMALL LETTER Z WITH ACUTE] + fallthrough + case '\u017C': // ż [LATIN SMALL LETTER Z WITH DOT ABOVE] + fallthrough + case '\u017E': // ž [LATIN SMALL LETTER Z WITH CARON] + fallthrough + case '\u01B6': // ƶ [LATIN SMALL LETTER Z WITH STROKE] + fallthrough + case '\u021D': // ȝ http://en.wikipedia.org/wiki/Yogh [LATIN SMALL LETTER YOGH] + fallthrough + case '\u0225': // ȥ [LATIN SMALL LETTER Z WITH HOOK] + fallthrough + case '\u0240': // ɀ [LATIN SMALL LETTER Z WITH SWASH TAIL] + fallthrough + case '\u0290': // ʐ [LATIN SMALL LETTER Z WITH RETROFLEX HOOK] + fallthrough + case '\u0291': // ʑ [LATIN SMALL LETTER Z WITH CURL] + fallthrough + case '\u1D76': // ᵶ [LATIN SMALL LETTER Z WITH MIDDLE TILDE] + fallthrough + case '\u1D8E': // ᶎ [LATIN SMALL LETTER Z WITH PALATAL HOOK] + fallthrough + case '\u1E91': // ẑ [LATIN SMALL LETTER Z WITH CIRCUMFLEX] + fallthrough + case '\u1E93': // ẓ [LATIN SMALL LETTER Z WITH DOT BELOW] + fallthrough + case '\u1E95': // ẕ [LATIN SMALL LETTER Z WITH LINE BELOW] + fallthrough + case '\u24E9': // ⓩ [CIRCLED LATIN SMALL LETTER Z] + fallthrough + case '\u2C6C': // ⱬ [LATIN SMALL LETTER Z WITH 
DESCENDER] + fallthrough + case '\uA763': // ꝣ [LATIN SMALL LETTER VISIGOTHIC Z] + fallthrough + case '\uFF5A': // z [FULLWIDTH LATIN SMALL LETTER Z] + output[outputPos] = 'z' + outputPos++ + + case '\u24B5': // ⒵ [PARENTHESIZED LATIN SMALL LETTER Z] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = 'z' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2070': // ⁰ [SUPERSCRIPT ZERO] + fallthrough + case '\u2080': // ₀ [SUBSCRIPT ZERO] + fallthrough + case '\u24EA': // ⓪ [CIRCLED DIGIT ZERO] + fallthrough + case '\u24FF': // ⓿ [NEGATIVE CIRCLED DIGIT ZERO] + fallthrough + case '\uFF10': // 0 [FULLWIDTH DIGIT ZERO] + output[outputPos] = '0' + outputPos++ + + case '\u00B9': // ¹ [SUPERSCRIPT ONE] + fallthrough + case '\u2081': // ₁ [SUBSCRIPT ONE] + fallthrough + case '\u2460': // ① [CIRCLED DIGIT ONE] + fallthrough + case '\u24F5': // ⓵ [DOUBLE CIRCLED DIGIT ONE] + fallthrough + case '\u2776': // ❶ [DINGBAT NEGATIVE CIRCLED DIGIT ONE] + fallthrough + case '\u2780': // ➀ [DINGBAT CIRCLED SANS-SERIF DIGIT ONE] + fallthrough + case '\u278A': // ➊ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT ONE] + fallthrough + case '\uFF11': // 1 [FULLWIDTH DIGIT ONE] + output[outputPos] = '1' + outputPos++ + + case '\u2488': // ⒈ [DIGIT ONE FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2474': // ⑴ [PARENTHESIZED DIGIT ONE] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00B2': // ² [SUPERSCRIPT TWO] + fallthrough + case '\u2082': // ₂ [SUBSCRIPT TWO] + fallthrough + case '\u2461': // ② [CIRCLED DIGIT TWO] + fallthrough + case '\u24F6': // ⓶ [DOUBLE CIRCLED DIGIT TWO] + fallthrough + case '\u2777': // ❷ [DINGBAT NEGATIVE CIRCLED DIGIT TWO] + fallthrough + case '\u2781': // ➁ [DINGBAT CIRCLED SANS-SERIF DIGIT TWO] + fallthrough + case '\u278B': // ➋ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT TWO] + fallthrough + case '\uFF12': // 2 [FULLWIDTH DIGIT TWO] + output[outputPos] = '2' + outputPos++ + + case '\u2489': // ⒉ [DIGIT TWO FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '2' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2475': // ⑵ [PARENTHESIZED DIGIT TWO] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '2' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00B3': // ³ [SUPERSCRIPT THREE] + fallthrough + case '\u2083': // ₃ [SUBSCRIPT THREE] + fallthrough + case '\u2462': // ③ [CIRCLED DIGIT THREE] + fallthrough + case '\u24F7': // ⓷ [DOUBLE CIRCLED DIGIT THREE] + fallthrough + case '\u2778': // ❸ [DINGBAT NEGATIVE CIRCLED DIGIT THREE] + fallthrough + case '\u2782': // ➂ [DINGBAT CIRCLED SANS-SERIF DIGIT THREE] + fallthrough + case '\u278C': // ➌ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT THREE] + fallthrough + case '\uFF13': // 3 [FULLWIDTH DIGIT THREE] + output[outputPos] = '3' + outputPos++ + + case '\u248A': // ⒊ [DIGIT THREE FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '3' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2476': // ⑶ [PARENTHESIZED DIGIT THREE] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '3' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2074': // ⁴ [SUPERSCRIPT FOUR] + fallthrough + case '\u2084': // ₄ [SUBSCRIPT FOUR] + fallthrough + case '\u2463': // ④ [CIRCLED DIGIT FOUR] + fallthrough + case '\u24F8': // ⓸ [DOUBLE CIRCLED DIGIT FOUR] + fallthrough + case '\u2779': // ❹ [DINGBAT NEGATIVE CIRCLED DIGIT FOUR] + fallthrough + case '\u2783': // ➃ [DINGBAT CIRCLED SANS-SERIF DIGIT FOUR] + fallthrough + case '\u278D': // ➍ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FOUR] + fallthrough + case '\uFF14': // 4 [FULLWIDTH DIGIT FOUR] + output[outputPos] = '4' + outputPos++ + + case '\u248B': // ⒋ [DIGIT FOUR FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '4' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2477': // ⑷ [PARENTHESIZED DIGIT FOUR] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '4' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2075': // ⁵ [SUPERSCRIPT FIVE] + fallthrough + case '\u2085': // ₅ [SUBSCRIPT FIVE] + fallthrough + case '\u2464': // ⑤ [CIRCLED DIGIT FIVE] + fallthrough + case '\u24F9': // ⓹ [DOUBLE CIRCLED DIGIT FIVE] + fallthrough + case '\u277A': // ❺ [DINGBAT NEGATIVE CIRCLED DIGIT FIVE] + fallthrough + case '\u2784': // ➄ [DINGBAT CIRCLED SANS-SERIF DIGIT FIVE] + fallthrough + case '\u278E': // ➎ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT FIVE] + fallthrough + case '\uFF15': // 5 [FULLWIDTH DIGIT FIVE] + output[outputPos] = '5' + outputPos++ + + case '\u248C': // ⒌ [DIGIT FIVE FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '5' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2478': // ⑸ [PARENTHESIZED DIGIT FIVE] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '5' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2076': // ⁶ [SUPERSCRIPT SIX] + fallthrough + case '\u2086': // ₆ [SUBSCRIPT SIX] + fallthrough + case '\u2465': // ⑥ [CIRCLED DIGIT SIX] + fallthrough + case '\u24FA': // ⓺ [DOUBLE CIRCLED DIGIT SIX] + fallthrough + case '\u277B': // ❻ [DINGBAT NEGATIVE CIRCLED DIGIT SIX] + fallthrough + case '\u2785': // ➅ [DINGBAT CIRCLED SANS-SERIF DIGIT SIX] + fallthrough + case '\u278F': // ➏ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SIX] + fallthrough + case '\uFF16': // 6 [FULLWIDTH DIGIT SIX] + output[outputPos] = '6' + outputPos++ + + case '\u248D': // ⒍ [DIGIT SIX FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '6' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2479': // ⑹ [PARENTHESIZED DIGIT SIX] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '6' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2077': // ⁷ [SUPERSCRIPT SEVEN] + fallthrough + case '\u2087': // ₇ [SUBSCRIPT SEVEN] + fallthrough + case '\u2466': // ⑦ [CIRCLED DIGIT SEVEN] + fallthrough + case '\u24FB': // ⓻ [DOUBLE CIRCLED DIGIT SEVEN] + fallthrough + case '\u277C': // ❼ [DINGBAT NEGATIVE CIRCLED DIGIT SEVEN] + fallthrough + case '\u2786': // ➆ [DINGBAT CIRCLED SANS-SERIF DIGIT SEVEN] + fallthrough + case '\u2790': // ➐ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT SEVEN] + fallthrough + case '\uFF17': // 7 [FULLWIDTH DIGIT SEVEN] + output[outputPos] = '7' + outputPos++ + + case '\u248E': // ⒎ [DIGIT SEVEN FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '7' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u247A': // ⑺ [PARENTHESIZED DIGIT SEVEN] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '7' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2078': // ⁸ [SUPERSCRIPT EIGHT] + fallthrough + case '\u2088': // ₈ [SUBSCRIPT EIGHT] + fallthrough + case '\u2467': // ⑧ [CIRCLED DIGIT EIGHT] + fallthrough + case '\u24FC': // ⓼ [DOUBLE CIRCLED DIGIT EIGHT] + fallthrough + case '\u277D': // ❽ [DINGBAT NEGATIVE CIRCLED DIGIT EIGHT] + fallthrough + case '\u2787': // ➇ [DINGBAT CIRCLED SANS-SERIF DIGIT EIGHT] + fallthrough + case '\u2791': // ➑ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT EIGHT] + fallthrough + case '\uFF18': // 8 [FULLWIDTH DIGIT EIGHT] + output[outputPos] = '8' + outputPos++ + + case '\u248F': // ⒏ [DIGIT EIGHT FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '8' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u247B': // ⑻ [PARENTHESIZED DIGIT EIGHT] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '8' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2079': // ⁹ [SUPERSCRIPT NINE] + fallthrough + case '\u2089': // ₉ [SUBSCRIPT NINE] + fallthrough + case '\u2468': // ⑨ [CIRCLED DIGIT NINE] + fallthrough + case '\u24FD': // ⓽ [DOUBLE CIRCLED DIGIT NINE] + fallthrough + case '\u277E': // ❾ [DINGBAT NEGATIVE CIRCLED DIGIT NINE] + fallthrough + case '\u2788': // ➈ [DINGBAT CIRCLED SANS-SERIF DIGIT NINE] + fallthrough + case '\u2792': // ➒ [DINGBAT NEGATIVE CIRCLED SANS-SERIF DIGIT NINE] + fallthrough + case '\uFF19': // 9 [FULLWIDTH DIGIT NINE] + output[outputPos] = '9' + outputPos++ + + case '\u2490': // ⒐ [DIGIT NINE FULL STOP] + output = output[:(len(output) + 1)] + output[outputPos] = '9' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u247C': // ⑼ [PARENTHESIZED DIGIT NINE] + output = output[:(len(output) + 2)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '9' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2469': // ⑩ [CIRCLED NUMBER TEN] + fallthrough + case '\u24FE': // ⓾ [DOUBLE CIRCLED NUMBER TEN] + fallthrough + case '\u277F': // ❿ [DINGBAT NEGATIVE CIRCLED NUMBER TEN] + fallthrough + case '\u2789': // ➉ [DINGBAT CIRCLED SANS-SERIF NUMBER TEN] + fallthrough + case '\u2793': // ➓ [DINGBAT NEGATIVE CIRCLED SANS-SERIF NUMBER TEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '0' + outputPos++ + + case '\u2491': // ⒑ [NUMBER TEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '0' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u247D': // ⑽ [PARENTHESIZED NUMBER TEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '0' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246A': // ⑪ [CIRCLED NUMBER ELEVEN] + fallthrough + case '\u24EB': // ⓫ [NEGATIVE CIRCLED NUMBER ELEVEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '1' + outputPos++ + + case '\u2492': // ⒒ [NUMBER ELEVEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u247E': // ⑾ [PARENTHESIZED NUMBER ELEVEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246B': // ⑫ [CIRCLED NUMBER TWELVE] + fallthrough + case '\u24EC': // ⓬ [NEGATIVE CIRCLED NUMBER TWELVE] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '2' + outputPos++ + + case '\u2493': // ⒓ [NUMBER TWELVE FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '2' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u247F': // ⑿ [PARENTHESIZED NUMBER TWELVE] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '2' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246C': // ⑬ [CIRCLED NUMBER THIRTEEN] + fallthrough + case '\u24ED': // ⓭ [NEGATIVE CIRCLED NUMBER THIRTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '3' + outputPos++ + + case '\u2494': // ⒔ [NUMBER THIRTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '3' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2480': // ⒀ [PARENTHESIZED NUMBER THIRTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '3' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246D': // ⑭ [CIRCLED NUMBER FOURTEEN] + fallthrough + case '\u24EE': // ⓮ [NEGATIVE CIRCLED NUMBER FOURTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '4' + outputPos++ + + case '\u2495': // ⒕ [NUMBER FOURTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '4' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2481': // ⒁ [PARENTHESIZED NUMBER FOURTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '4' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246E': // ⑮ [CIRCLED NUMBER FIFTEEN] + fallthrough + case '\u24EF': // ⓯ [NEGATIVE CIRCLED NUMBER FIFTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '5' + outputPos++ + + case '\u2496': // ⒖ [NUMBER FIFTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '5' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2482': // ⒂ [PARENTHESIZED NUMBER FIFTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '5' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u246F': // ⑯ [CIRCLED NUMBER SIXTEEN] + fallthrough + case '\u24F0': // ⓰ [NEGATIVE CIRCLED NUMBER SIXTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '6' + outputPos++ + + case '\u2497': // ⒗ [NUMBER SIXTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '6' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2483': // ⒃ [PARENTHESIZED NUMBER SIXTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '6' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2470': // ⑰ [CIRCLED NUMBER SEVENTEEN] + fallthrough + case '\u24F1': // ⓱ [NEGATIVE CIRCLED NUMBER SEVENTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '7' + outputPos++ + + case '\u2498': // ⒘ [NUMBER SEVENTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '7' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2484': // ⒄ [PARENTHESIZED NUMBER SEVENTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '7' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2471': // ⑱ [CIRCLED NUMBER EIGHTEEN] + fallthrough + case '\u24F2': // ⓲ [NEGATIVE CIRCLED NUMBER EIGHTEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '8' + outputPos++ + + case '\u2499': // ⒙ [NUMBER EIGHTEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '8' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2485': // ⒅ [PARENTHESIZED NUMBER EIGHTEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '8' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2472': // ⑲ [CIRCLED NUMBER NINETEEN] + fallthrough + case '\u24F3': // ⓳ [NEGATIVE CIRCLED NUMBER NINETEEN] + output = output[:(len(output) + 1)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '9' + outputPos++ + + case '\u249A': // ⒚ [NUMBER NINETEEN FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '1' + outputPos++ + output[outputPos] = '9' + outputPos++ + output[outputPos] = '.' 
+ outputPos++ + + case '\u2486': // ⒆ [PARENTHESIZED NUMBER NINETEEN] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '1' + outputPos++ + output[outputPos] = '9' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u2473': // ⑳ [CIRCLED NUMBER TWENTY] + fallthrough + case '\u24F4': // ⓴ [NEGATIVE CIRCLED NUMBER TWENTY] + output = output[:(len(output) + 1)] + output[outputPos] = '2' + outputPos++ + output[outputPos] = '0' + outputPos++ + + case '\u249B': // ⒛ [NUMBER TWENTY FULL STOP] + output = output[:(len(output) + 2)] + output[outputPos] = '2' + outputPos++ + output[outputPos] = '0' + outputPos++ + output[outputPos] = '.' + outputPos++ + + case '\u2487': // ⒇ [PARENTHESIZED NUMBER TWENTY] + output = output[:(len(output) + 3)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '2' + outputPos++ + output[outputPos] = '0' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u00AB': // « [LEFT-POINTING DOUBLE ANGLE QUOTATION MARK] + fallthrough + case '\u00BB': // » [RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK] + fallthrough + case '\u201C': // “ [LEFT DOUBLE QUOTATION MARK] + fallthrough + case '\u201D': // ” [RIGHT DOUBLE QUOTATION MARK] + fallthrough + case '\u201E': // „ [DOUBLE LOW-9 QUOTATION MARK] + fallthrough + case '\u2033': // ″ [DOUBLE PRIME] + fallthrough + case '\u2036': // ‶ [REVERSED DOUBLE PRIME] + fallthrough + case '\u275D': // ❝ [HEAVY DOUBLE TURNED COMMA QUOTATION MARK ORNAMENT] + fallthrough + case '\u275E': // ❞ [HEAVY DOUBLE COMMA QUOTATION MARK ORNAMENT] + fallthrough + case '\u276E': // ❮ [HEAVY LEFT-POINTING ANGLE QUOTATION MARK ORNAMENT] + fallthrough + case '\u276F': // ❯ [HEAVY RIGHT-POINTING ANGLE QUOTATION MARK ORNAMENT] + fallthrough + case '\uFF02': // " [FULLWIDTH QUOTATION MARK] + output[outputPos] = '"' + outputPos++ + + case '\u2018': // ‘ [LEFT SINGLE QUOTATION MARK] + fallthrough + case '\u2019': // ’ [RIGHT SINGLE QUOTATION MARK] + 
fallthrough + case '\u201A': // ‚ [SINGLE LOW-9 QUOTATION MARK] + fallthrough + case '\u201B': // ‛ [SINGLE HIGH-REVERSED-9 QUOTATION MARK] + fallthrough + case '\u2032': // ′ [PRIME] + fallthrough + case '\u2035': // ‵ [REVERSED PRIME] + fallthrough + case '\u2039': // ‹ [SINGLE LEFT-POINTING ANGLE QUOTATION MARK] + fallthrough + case '\u203A': // › [SINGLE RIGHT-POINTING ANGLE QUOTATION MARK] + fallthrough + case '\u275B': // ❛ [HEAVY SINGLE TURNED COMMA QUOTATION MARK ORNAMENT] + fallthrough + case '\u275C': // ❜ [HEAVY SINGLE COMMA QUOTATION MARK ORNAMENT] + fallthrough + case '\uFF07': // ' [FULLWIDTH APOSTROPHE] + output[outputPos] = '\'' + outputPos++ + + case '\u2010': // ‐ [HYPHEN] + fallthrough + case '\u2011': // ‑ [NON-BREAKING HYPHEN] + fallthrough + case '\u2012': // ‒ [FIGURE DASH] + fallthrough + case '\u2013': // – [EN DASH] + fallthrough + case '\u2014': // — [EM DASH] + fallthrough + case '\u207B': // ⁻ [SUPERSCRIPT MINUS] + fallthrough + case '\u208B': // ₋ [SUBSCRIPT MINUS] + fallthrough + case '\uFF0D': // - [FULLWIDTH HYPHEN-MINUS] + output[outputPos] = '-' + outputPos++ + + case '\u2045': // ⁅ [LEFT SQUARE BRACKET WITH QUILL] + fallthrough + case '\u2772': // ❲ [LIGHT LEFT TORTOISE SHELL BRACKET ORNAMENT] + fallthrough + case '\uFF3B': // [ [FULLWIDTH LEFT SQUARE BRACKET] + output[outputPos] = '[' + outputPos++ + + case '\u2046': // ⁆ [RIGHT SQUARE BRACKET WITH QUILL] + fallthrough + case '\u2773': // ❳ [LIGHT RIGHT TORTOISE SHELL BRACKET ORNAMENT] + fallthrough + case '\uFF3D': // ] [FULLWIDTH RIGHT SQUARE BRACKET] + output[outputPos] = ']' + outputPos++ + + case '\u207D': // ⁽ [SUPERSCRIPT LEFT PARENTHESIS] + fallthrough + case '\u208D': // ₍ [SUBSCRIPT LEFT PARENTHESIS] + fallthrough + case '\u2768': // ❨ [MEDIUM LEFT PARENTHESIS ORNAMENT] + fallthrough + case '\u276A': // ❪ [MEDIUM FLATTENED LEFT PARENTHESIS ORNAMENT] + fallthrough + case '\uFF08': // ( [FULLWIDTH LEFT PARENTHESIS] + output[outputPos] = '(' + outputPos++ + + case 
'\u2E28': // ⸨ [LEFT DOUBLE PARENTHESIS] + output = output[:(len(output) + 1)] + output[outputPos] = '(' + outputPos++ + output[outputPos] = '(' + outputPos++ + + case '\u207E': // ⁾ [SUPERSCRIPT RIGHT PARENTHESIS] + fallthrough + case '\u208E': // ₎ [SUBSCRIPT RIGHT PARENTHESIS] + fallthrough + case '\u2769': // ❩ [MEDIUM RIGHT PARENTHESIS ORNAMENT] + fallthrough + case '\u276B': // ❫ [MEDIUM FLATTENED RIGHT PARENTHESIS ORNAMENT] + fallthrough + case '\uFF09': // ) [FULLWIDTH RIGHT PARENTHESIS] + output[outputPos] = ')' + outputPos++ + + case '\u2E29': // ⸩ [RIGHT DOUBLE PARENTHESIS] + output = output[:(len(output) + 1)] + output[outputPos] = ')' + outputPos++ + output[outputPos] = ')' + outputPos++ + + case '\u276C': // ❬ [MEDIUM LEFT-POINTING ANGLE BRACKET ORNAMENT] + fallthrough + case '\u2770': // ❰ [HEAVY LEFT-POINTING ANGLE BRACKET ORNAMENT] + fallthrough + case '\uFF1C': // < [FULLWIDTH LESS-THAN SIGN] + output[outputPos] = '<' + outputPos++ + + case '\u276D': // ❭ [MEDIUM RIGHT-POINTING ANGLE BRACKET ORNAMENT] + fallthrough + case '\u2771': // ❱ [HEAVY RIGHT-POINTING ANGLE BRACKET ORNAMENT] + fallthrough + case '\uFF1E': // > [FULLWIDTH GREATER-THAN SIGN] + output[outputPos] = '>' + outputPos++ + + case '\u2774': // ❴ [MEDIUM LEFT CURLY BRACKET ORNAMENT] + fallthrough + case '\uFF5B': // { [FULLWIDTH LEFT CURLY BRACKET] + output[outputPos] = '{' + outputPos++ + + case '\u2775': // ❵ [MEDIUM RIGHT CURLY BRACKET ORNAMENT] + fallthrough + case '\uFF5D': // } [FULLWIDTH RIGHT CURLY BRACKET] + output[outputPos] = '}' + outputPos++ + + case '\u207A': // ⁺ [SUPERSCRIPT PLUS SIGN] + fallthrough + case '\u208A': // ₊ [SUBSCRIPT PLUS SIGN] + fallthrough + case '\uFF0B': // + [FULLWIDTH PLUS SIGN] + output[outputPos] = '+' + outputPos++ + + case '\u207C': // ⁼ [SUPERSCRIPT EQUALS SIGN] + fallthrough + case '\u208C': // ₌ [SUBSCRIPT EQUALS SIGN] + fallthrough + case '\uFF1D': // = [FULLWIDTH EQUALS SIGN] + output[outputPos] = '=' + outputPos++ + + case '\uFF01': // ! 
[FULLWIDTH EXCLAMATION MARK] + output[outputPos] = '!' + outputPos++ + + case '\u203C': // ‼ [DOUBLE EXCLAMATION MARK] + output = output[:(len(output) + 1)] + output[outputPos] = '!' + outputPos++ + output[outputPos] = '!' + outputPos++ + + case '\u2049': // ⁉ [EXCLAMATION QUESTION MARK] + output = output[:(len(output) + 1)] + output[outputPos] = '!' + outputPos++ + output[outputPos] = '?' + outputPos++ + + case '\uFF03': // # [FULLWIDTH NUMBER SIGN] + output[outputPos] = '#' + outputPos++ + + case '\uFF04': // $ [FULLWIDTH DOLLAR SIGN] + output[outputPos] = '$' + outputPos++ + + case '\u2052': // ⁒ [COMMERCIAL MINUS SIGN] + fallthrough + case '\uFF05': // % [FULLWIDTH PERCENT SIGN] + output[outputPos] = '%' + outputPos++ + + case '\uFF06': // & [FULLWIDTH AMPERSAND] + output[outputPos] = '&' + outputPos++ + + case '\u204E': // ⁎ [LOW ASTERISK] + fallthrough + case '\uFF0A': // * [FULLWIDTH ASTERISK] + output[outputPos] = '*' + outputPos++ + + case '\uFF0C': // , [FULLWIDTH COMMA] + output[outputPos] = ',' + outputPos++ + + case '\uFF0E': // . [FULLWIDTH FULL STOP] + output[outputPos] = '.' + outputPos++ + + case '\u2044': // ⁄ [FRACTION SLASH] + fallthrough + case '\uFF0F': // / [FULLWIDTH SOLIDUS] + output[outputPos] = '/' + outputPos++ + + case '\uFF1A': // : [FULLWIDTH COLON] + output[outputPos] = ':' + outputPos++ + + case '\u204F': // ⁏ [REVERSED SEMICOLON] + fallthrough + case '\uFF1B': // ; [FULLWIDTH SEMICOLON] + output[outputPos] = ';' + outputPos++ + + case '\uFF1F': // ? [FULLWIDTH QUESTION MARK] + output[outputPos] = '?' + outputPos++ + + case '\u2047': // ⁇ [DOUBLE QUESTION MARK] + output = output[:(len(output) + 1)] + output[outputPos] = '?' + outputPos++ + output[outputPos] = '?' + outputPos++ + + case '\u2048': // ⁈ [QUESTION EXCLAMATION MARK] + output = output[:(len(output) + 1)] + output[outputPos] = '?' + outputPos++ + output[outputPos] = '!' 
+ outputPos++ + + case '\uFF20': // @ [FULLWIDTH COMMERCIAL AT] + output[outputPos] = '@' + outputPos++ + + case '\uFF3C': // \ [FULLWIDTH REVERSE SOLIDUS] + output[outputPos] = '\\' + outputPos++ + + case '\u2038': // ‸ [CARET] + fallthrough + case '\uFF3E': // ^ [FULLWIDTH CIRCUMFLEX ACCENT] + output[outputPos] = '^' + outputPos++ + + case '\uFF3F': // _ [FULLWIDTH LOW LINE] + output[outputPos] = '_' + outputPos++ + + case '\u2053': // ⁓ [SWUNG DASH] + fallthrough + case '\uFF5E': // ~ [FULLWIDTH TILDE] + output[outputPos] = '~' + outputPos++ + break + + default: + output[outputPos] = c + outputPos++ + } + } + } + return output +} diff --git a/analysis/char/asciifolding/asciifolding_test.go b/analysis/char/asciifolding/asciifolding_test.go new file mode 100644 index 000000000..82dc67868 --- /dev/null +++ b/analysis/char/asciifolding/asciifolding_test.go @@ -0,0 +1,56 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package asciifolding + +import ( + "reflect" + "testing" +) + +func TestAsciiFoldingFilter(t *testing.T) { + + tests := []struct { + input []byte + output []byte + }{ + { + // empty input passes + input: []byte(``), + output: []byte(``), + }, + { + // no modification for plain ASCII + input: []byte(`The quick brown fox jumps over the lazy dog`), + output: []byte(`The quick brown fox jumps over the lazy dog`), + }, + { + // Umlauts are folded to plain ASCII + input: []byte(`The quick bröwn fox jümps over the läzy dog`), + output: []byte(`The quick brown fox jumps over the lazy dog`), + }, { + // composite unicode runes are folded to more than one ASCII rune + input: []byte(`ÆꜴ`), + output: []byte(`AEAO`), + }, + } + + for _, test := range tests { + filter := New() + output := filter.Filter(test.input) + if !reflect.DeepEqual(output, test.output) { + t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input)) + } + } +} From 748984d4d21ea9edbfa89b75262e0857f5c03507 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 5 Dec 2018 11:48:31 -0800 Subject: [PATCH 506/728] Fix Boolean Searcher's MustNotSearcher from advancing incorrectly + Noted incorrect advancing of the boolean's currMustNot position: [0xc4200b82c0] BOOL Advance to ID: 13965, currID: 13943, currMust: 13943, currMustNot: 13966 [0xc4200b82c0] BOOL Advanced, now currID: 13965, currMust: 13965, currMustNot: 13967 + Fix the incorrect comparison for Boolean searcher's MustNot cached entry - If the current position is already where it should be do not advance - If the position is only less than (and not just inequal) to the requested ID, advance + Unit tests (thanks @mschoch) + Addresses https://github.com/blevesearch/bleve/issues/1059 --- search/searcher/search_boolean.go | 8 +-- search_test.go | 104 ++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+), 4 deletions(-) diff --git a/search/searcher/search_boolean.go 
b/search/searcher/search_boolean.go index 0a223bb0b..c8509ef09 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -356,10 +356,10 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } if s.mustNotSearcher != nil { - // Additional check for mustNotSearcher whose cursor isn't tracked by - // currentID to prevent it from moving when the searcher's already - // where it should be. - if s.currMustNot == nil || !s.currMustNot.IndexInternalID.Equals(ID) { + // Additional check for mustNotSearcher, whose cursor isn't tracked by + // currentID to prevent it from moving when the searcher's tracked + // position is already ahead of or at the requested ID. + if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 { if s.currMustNot != nil { ctx.DocumentMatchPool.Put(s.currMustNot) } diff --git a/search_test.go b/search_test.go index 59ac13402..113b9dfc1 100644 --- a/search_test.go +++ b/search_test.go @@ -33,6 +33,7 @@ import ( "github.com/blevesearch/bleve/analysis/tokenizer/whitespace" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" "github.com/blevesearch/bleve/search/query" @@ -888,3 +889,106 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { t.Fatalf("Unexpected result, %v != 1", res.Total) } } + +func testBooleanMustNotSearcher(t *testing.T, indexName string) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + im := NewIndexMapping() + idx, err := NewUsing("testidx", im, indexName, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + + docs := []struct { + Name string + HasRole bool + }{ + { + Name: "13900", + }, + { + Name: "13901", + }, + { + Name: "13965", + }, + { + Name: "13966", + HasRole: true, + }, + { + Name: "13967", + HasRole: 
true, + }, + } + + for _, doc := range docs { + err := idx.Index(doc.Name, doc) + if err != nil { + t.Fatal(err) + } + } + + lhs := NewDocIDQuery([]string{"13965", "13966", "13967"}) + hasRole := NewBoolFieldQuery(true) + hasRole.SetField("HasRole") + rhs := NewBooleanQuery() + rhs.AddMustNot(hasRole) + + var compareLeftRightAndConjunction = func(idx Index, left, right query.Query) error { + // left + lr := NewSearchRequestOptions(left, 100, 0, false) + lres, err := idx.Search(lr) + if err != nil { + return fmt.Errorf("error left: %v", err) + } + lresIds := map[string]struct{}{} + for i := range lres.Hits { + lresIds[lres.Hits[i].ID] = struct{}{} + } + // right + rr := NewSearchRequestOptions(right, 100, 0, false) + rres, err := idx.Search(rr) + if err != nil { + return fmt.Errorf("error right: %v", err) + } + rresIds := map[string]struct{}{} + for i := range rres.Hits { + rresIds[rres.Hits[i].ID] = struct{}{} + } + // conjunction + cr := NewSearchRequestOptions(NewConjunctionQuery(left, right), 100, 0, false) + cres, err := idx.Search(cr) + if err != nil { + return fmt.Errorf("error conjunction: %v", err) + } + for i := range cres.Hits { + if _, ok := lresIds[cres.Hits[i].ID]; ok { + if _, ok := rresIds[cres.Hits[i].ID]; !ok { + return fmt.Errorf("error id %s missing from right", cres.Hits[i].ID) + } + } else { + return fmt.Errorf("error id %s missing from left", cres.Hits[i].ID) + } + } + return nil + } + + err = compareLeftRightAndConjunction(idx, lhs, rhs) + if err != nil { + t.Fatal(err) + } +} + +func TestBooleanMustNotSearcherUpsidedown(t *testing.T) { + testBooleanMustNotSearcher(t, upsidedown.Name) +} + +func TestBooleanMustNotSearcherScorch(t *testing.T) { + testBooleanMustNotSearcher(t, scorch.Name) +} From dd49ef0aa0a006a46e90b20dcba578265c043c07 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 5 Dec 2018 17:13:59 -0800 Subject: [PATCH 507/728] Advancing boolean searcher's currentID w.r.t currMustNot + Advance the boolean searcher's nested 
searchers only if the cursor is trailing the lookup ID, additionally if the mustNotSearcher has been initialized, ensure that the cursor used to track the mustNotSearcher (which isn't tracked by the other cursor) is trailing the lookup ID as well, to avoid any searcher from advancing incorrectly. + Unit test that catches the cornercase: TestNestedBooleanMustNotSearcherUpsidedown --- search/searcher/search_boolean.go | 27 ++++++++++++++------------- search_test.go | 19 ++++++++++++------- 2 files changed, 26 insertions(+), 20 deletions(-) diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index c8509ef09..bbbced479 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -332,8 +332,14 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } } - // Advance the searcher only if the cursor is trailing the lookup ID - if s.currentID == nil || s.currentID.Compare(ID) < 0 { + // Advance the searchers only if the currentID cursor is trailing the lookup ID, + // additionally if the mustNotSearcher has been initialized, ensure that the + // cursor used to track the mustNotSearcher (currMustNot, which isn't tracked by + // currentID) is trailing the lookup ID as well - for in the case where currentID + // is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT + // advance the currentID or the currMustNot cursors. + if (s.currentID == nil || s.currentID.Compare(ID) < 0) && + (s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) { var err error if s.mustSearcher != nil { if s.currMust != nil { @@ -356,17 +362,12 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } if s.mustNotSearcher != nil { - // Additional check for mustNotSearcher, whose cursor isn't tracked by - // currentID to prevent it from moving when the searcher's tracked - // position is already ahead of or at the requested ID. 
- if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 { - if s.currMustNot != nil { - ctx.DocumentMatchPool.Put(s.currMustNot) - } - s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) - if err != nil { - return nil, err - } + if s.currMustNot != nil { + ctx.DocumentMatchPool.Put(s.currMustNot) + } + s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) + if err != nil { + return nil, err } } diff --git a/search_test.go b/search_test.go index 113b9dfc1..d078ecfdc 100644 --- a/search_test.go +++ b/search_test.go @@ -553,7 +553,7 @@ func TestNestedBooleanSearchers(t *testing.T) { } } -func TestNestedBooleanMustNotSearcher(t *testing.T) { +func TestNestedBooleanMustNotSearcherUpsidedown(t *testing.T) { // create an index with default settings idxMapping := NewIndexMapping() idx, err := New("testidx", idxMapping) @@ -891,19 +891,24 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { } func testBooleanMustNotSearcher(t *testing.T, indexName string) { + im := NewIndexMapping() + idx, err := NewUsing("testidx", im, indexName, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + err := os.RemoveAll("testidx") if err != nil { t.Fatal(err) } }() - im := NewIndexMapping() - idx, err := NewUsing("testidx", im, indexName, Config.DefaultKVStore, nil) - if err != nil { - t.Fatal(err) - } - docs := []struct { Name string HasRole bool From be3692849832dd9d505186d5b01ddad203439065 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 10 Dec 2018 12:00:13 +0530 Subject: [PATCH 508/728] MB-32287 - zap docvalue cmd is broken Fixing the docvalue cmd to adopt the newer disk format And minor improvements --- cmd/bleve/cmd/zap/docvalue.go | 207 +++++++++++++++++----------------- 1 file changed, 106 insertions(+), 101 deletions(-) diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go index 065f078e4..b8563be93 100644 --- 
a/cmd/bleve/cmd/zap/docvalue.go +++ b/cmd/bleve/cmd/zap/docvalue.go @@ -48,10 +48,10 @@ var docvalueCmd = &cobra.Command{ // iterate through fields index var fieldInv []string - var id, read, fieldLoc uint64 - var nread int + var id uint64 for fieldsIndexOffset+(8*id) < fieldsIndexEnd { - addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*id) : fieldsIndexOffset+(8*id)+8]) + addr := binary.BigEndian.Uint64( + data[fieldsIndexOffset+(8*id) : fieldsIndexOffset+(8*id)+8]) var n uint64 _, read := binary.Uvarint(data[addr+n : fieldsIndexEnd]) n += uint64(read) @@ -67,164 +67,160 @@ var docvalueCmd = &cobra.Command{ } dvLoc := segment.DocValueOffset() - fieldDvLoc, total, fdvread := uint64(0), uint64(0), int(0) - + var n int var fieldName string var fieldID uint16 + var fieldDvSize float64 + var read, fieldStartLoc, fieldEndLoc uint64 + var fieldChunkCount, fieldDvStart, fieldDvEnd, totalDvSize uint64 + var fieldChunkLens []uint64 // if no fields are specified then print the docValue offsets for all fields set for id, field := range fieldInv { - fieldLoc, fdvread = binary.Uvarint(data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) - if fdvread <= 0 { - return fmt.Errorf("loadDvIterators: failed to read the docvalue offsets for field %d", fieldID) + fieldStartLoc, n = binary.Uvarint( + data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("loadDvIterators: failed to read the "+ + " docvalue offsets for field %d", fieldID) + } + + read += uint64(n) + fieldEndLoc, n = binary.Uvarint( + data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) + if n <= 0 { + return fmt.Errorf("Failed to read the docvalue offset "+ + "end for field %d", fieldID) } - read += uint64(fdvread) - if fieldLoc == math.MaxUint64 { - fmt.Printf("fieldID: %d '%s' docvalue at %d (%x) not persisted \n", id, field, fieldLoc, fieldLoc) + + read += uint64(n) + if fieldStartLoc == math.MaxUint64 && len(args) == 1 { + fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) not "+ 
+ " persisted \n", id, field, fieldStartLoc, fieldStartLoc) continue } - var offset, clen, numChunks uint64 - numChunks, nread = binary.Uvarint(data[fieldLoc : fieldLoc+binary.MaxVarintLen64]) - if nread <= 0 { - return fmt.Errorf("failed to read the field "+ - "doc values for field %s", fieldName) + var chunkOffsetsPosition, offset, numChunks uint64 + if fieldEndLoc-fieldStartLoc > 16 { + numChunks = binary.BigEndian.Uint64(data[fieldEndLoc-8 : fieldEndLoc]) + // read the length of chunk offsets + chunkOffsetsLen := binary.BigEndian.Uint64(data[fieldEndLoc-16 : fieldEndLoc-8]) + // acquire position of chunk offsets + chunkOffsetsPosition = (fieldEndLoc - 16) - chunkOffsetsLen } - offset += uint64(nread) - // read the length of chunks - totalSize := uint64(0) + // read the chunk offsets chunkLens := make([]uint64, numChunks) + dvSize := uint64(0) for i := 0; i < int(numChunks); i++ { - clen, nread = binary.Uvarint(data[fieldLoc+offset : fieldLoc+offset+binary.MaxVarintLen64]) - if nread <= 0 { - return fmt.Errorf("corrupted chunk length for chunk number: %d", i) + length, read := binary.Uvarint( + data[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+ + binary.MaxVarintLen64]) + if read <= 0 { + return fmt.Errorf("Corrupted chunk offset during segment load") } - chunkLens[i] = clen - totalSize += clen - offset += uint64(nread) + offset += uint64(read) + chunkLens[i] = length + dvSize += length } - total += totalSize + totalDvSize += dvSize + // if no field args are given, then print out the dv locations for all fields if len(args) == 1 { - // if no field args are given, then print out the dv locations for all fields - mbsize := float64(totalSize) / (1024 * 1024) - fmt.Printf("fieldID: %d '%s' docvalue at %d (%x) numChunks %d diskSize %.3f MB\n", id, field, fieldLoc, fieldLoc, numChunks, mbsize) + mbsize := float64(dvSize) / (1024 * 1024) + fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) numChunks "+ + "%d diskSize %.6f MB\n", id, field, fieldStartLoc, + 
fieldStartLoc, numChunks, mbsize) continue } - if field != args[1] { - continue - } else { - fieldDvLoc = fieldLoc + // if the field is the requested one for more details, + // then remember the details + if field == args[1] { + fieldDvStart = fieldStartLoc + fieldDvEnd = fieldEndLoc fieldName = field fieldID = uint16(id) + fieldDvSize = float64(dvSize) / (1024 * 1024) + fieldChunkLens = append(fieldChunkLens, chunkLens...) + fieldChunkCount = numChunks } - } - mbsize := float64(total) / (1024 * 1024) - fmt.Printf("Total Doc Values Size on Disk: %.3f MB\n", mbsize) + mbsize := float64(totalDvSize) / (1024 * 1024) + fmt.Printf("Total Doc Values Size on Disk: %.6f MB\n", mbsize) // done with the fields dv locs printing for the given zap file if len(args) == 1 { return nil } - if fieldName == "" || fieldDvLoc == 0 { - return fmt.Errorf("no field found for given field arg: %s", args[1]) - } - - // read the number of chunks - var offset, clen, numChunks uint64 - numChunks, nread = binary.Uvarint(data[fieldDvLoc : fieldDvLoc+binary.MaxVarintLen64]) - if nread <= 0 { - return fmt.Errorf("failed to read the field "+ - "doc values for field %s", fieldName) - } - offset += uint64(nread) - - if len(args) == 2 { - fmt.Printf("number of chunks: %d\n", numChunks) - } - - // read the length of chunks - chunkLens := make([]uint64, numChunks) - for i := 0; i < int(numChunks); i++ { - clen, nread = binary.Uvarint(data[fieldDvLoc+offset : fieldDvLoc+offset+binary.MaxVarintLen64]) - if nread <= 0 { - return fmt.Errorf("corrupted chunk length for chunk number: %d", i) - } - - chunkLens[i] = clen - offset += uint64(nread) - if len(args) == 2 { - fmt.Printf("chunk: %d size: %d \n", i, clen) - } - /* - TODO => dump all chunk headers?? 
- if len(args) == 3 && args[2] == ">" { - dumpChunkDocNums(data, ) - - }*/ + if fieldName == "" || fieldDvEnd == 0 { + return fmt.Errorf("No docvalue persisted for given field arg: %s", + args[1]) } if len(args) == 2 { + fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) numChunks "+ + "%d diskSize %.6f MB\n", fieldID, fieldName, fieldDvStart, + fieldDvStart, fieldChunkCount, fieldDvSize) + fmt.Printf("Number of docvalue chunks: %d\n", fieldChunkCount) return nil } localDocNum, err := strconv.Atoi(args[2]) if err != nil { - return fmt.Errorf("unable to parse doc number: %v", err) + return fmt.Errorf("Unable to parse doc number: %v", err) } if localDocNum >= int(segment.NumDocs()) { - return fmt.Errorf("invalid doc number %d (valid 0 - %d)", localDocNum, segment.NumDocs()-1) + return fmt.Errorf("Invalid doc number %d (valid 0 - %d)", + localDocNum, segment.NumDocs()-1) } // find the chunkNumber where the docValues are stored docInChunk := uint64(localDocNum) / uint64(segment.ChunkFactor()) - if numChunks < docInChunk { - return fmt.Errorf("no chunk exists for chunk number: %d for localDocNum: %d", docInChunk, localDocNum) - } - - destChunkDataLoc := fieldDvLoc + offset - for i := 0; i < int(docInChunk); i++ { - destChunkDataLoc += chunkLens[i] + if fieldChunkCount < docInChunk { + return fmt.Errorf("No chunk exists for chunk number: %d for "+ + "localDocNum: %d", docInChunk, localDocNum) } - curChunkSize := chunkLens[docInChunk] - if curChunkSize == 0 { - return nil - } + start, end := readChunkBoundary(int(docInChunk), fieldChunkLens) + destChunkDataLoc := fieldDvStart + start + curChunkEnd := fieldDvStart + end // read the number of docs reside in the chunk - numDocs := uint64(0) - numDocs, nread = binary.Uvarint(data[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) - if nread <= 0 { - return fmt.Errorf("failed to read the target chunk: %d", docInChunk) + var numDocs uint64 + var nr int + numDocs, nr = binary.Uvarint( + data[destChunkDataLoc : 
destChunkDataLoc+binary.MaxVarintLen64]) + if nr <= 0 { + return fmt.Errorf("Failed to read the chunk") } - chunkMetaLoc := destChunkDataLoc + uint64(nread) - offset = uint64(0) + chunkMetaLoc := destChunkDataLoc + uint64(nr) curChunkHeader := make([]zap.MetaData, int(numDocs)) + offset := uint64(0) for i := 0; i < int(numDocs); i++ { - curChunkHeader[i].DocNum, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(nread) - curChunkHeader[i].DocDvOffset, nread = binary.Uvarint(data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(nread) + curChunkHeader[i].DocNum, nr = binary.Uvarint( + data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(nr) + curChunkHeader[i].DocDvOffset, nr = binary.Uvarint( + data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) + offset += uint64(nr) } compressedDataLoc := chunkMetaLoc + offset - dataLength := destChunkDataLoc + curChunkSize - compressedDataLoc + dataLength := curChunkEnd - compressedDataLoc curChunkData := data[compressedDataLoc : compressedDataLoc+dataLength] - start, length := getDocValueLocs(uint64(localDocNum), curChunkHeader) - if start == math.MaxUint64 || length == math.MaxUint64 { - fmt.Printf("no field values found for localDocNum: %d\n", localDocNum) - fmt.Printf("Try docNums present in chunk: %s\n", metaDataDocNums(curChunkHeader)) + start, end = getDocValueLocs(uint64(localDocNum), curChunkHeader) + if start == math.MaxUint64 || end == math.MaxUint64 { + fmt.Printf("No field values found for localDocNum: %d\n", + localDocNum) + fmt.Printf("Try docNums present in chunk: %s\n", + metaDataDocNums(curChunkHeader)) return nil } // uncompress the already loaded data @@ -236,8 +232,9 @@ var docvalueCmd = &cobra.Command{ var termSeparator byte = 0xff var termSeparatorSplitSlice = []byte{termSeparator} + // pick the terms for the given docNum - uncompressed = 
uncompressed[start : start+length] + uncompressed = uncompressed[start:end] for { i := bytes.Index(uncompressed, termSeparatorSplitSlice) if i < 0 { @@ -270,6 +267,14 @@ func metaDataDocNums(metaHeader []zap.MetaData) string { return docNums } +func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { + var start uint64 + if chunk > 0 { + start = offsets[chunk-1] + } + return start, offsets[chunk] +} + func init() { RootCmd.AddCommand(docvalueCmd) } From 685ce244479fa0933d68c1f8ccb7a465898ec4d1 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 10 Dec 2018 16:39:37 -0800 Subject: [PATCH 509/728] Protect conjunction searcher with no searchers Fixes https://github.com/blevesearch/bleve/issues/1074 --- search/searcher/search_conjunction.go | 2 +- search_test.go | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index a48052679..62966c13f 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -158,7 +158,7 @@ func (s *ConjunctionSearcher) Next(ctx *search.SearchContext) (*search.DocumentM var rv *search.DocumentMatch var err error OUTER: - for s.currs[s.maxIDIdx] != nil { + for s.maxIDIdx < len(s.currs) && s.currs[s.maxIDIdx] != nil { maxID := s.currs[s.maxIDIdx].IndexInternalID i := 0 diff --git a/search_test.go b/search_test.go index d078ecfdc..52f406d62 100644 --- a/search_test.go +++ b/search_test.go @@ -997,3 +997,20 @@ func TestBooleanMustNotSearcherUpsidedown(t *testing.T) { func TestBooleanMustNotSearcherScorch(t *testing.T) { testBooleanMustNotSearcher(t, scorch.Name) } + +func TestQueryStringEmptyConjunctionSearcher(t *testing.T) { + mapping := NewIndexMapping() + mapping.DefaultAnalyzer = keyword.Name + index, err := NewMemOnly(mapping) + if err != nil { + t.Fatal(err) + } + defer func() { + _ = index.Close() + }() + + query := NewQueryStringQuery("foo:bar +baz:\"\"") + searchReq := 
NewSearchRequest(query) + + _, _ = index.Search(searchReq) +} From 3120a9e67c6a39458dead3f7084fe4481ec1727c Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 11 Dec 2018 17:06:35 -0500 Subject: [PATCH 510/728] renamed Callbacks to PersistedCallback changed to single callback per-batch removed callbacks from snapshotReversion (unused) removed unnecessary comment from upside impl --- index.go | 8 ++++---- index/index.go | 17 ++++++++--------- index/scorch/introducer.go | 11 ++++++----- index/scorch/persister.go | 10 +++++----- index/scorch/scorch.go | 20 ++++++++++---------- index/scorch/scorch_test.go | 8 ++++---- index/upsidedown/upsidedown.go | 6 +++--- 7 files changed, 40 insertions(+), 40 deletions(-) diff --git a/index.go b/index.go index 0747dd65f..99357eee0 100644 --- a/index.go +++ b/index.go @@ -129,12 +129,12 @@ func (b *Batch) Merge(o *Batch) { } } -func (b *Batch) AddCallback(f index.BatchCallback) { - b.internal.AddCallback(f) +func (b *Batch) SetPersistedCallback(f index.BatchCallback) { + b.internal.SetPersistedCallback(f) } -func (b *Batch) Callbacks() []index.BatchCallback { - return b.internal.Callbacks() +func (b *Batch) PersistedCallback() index.BatchCallback { + return b.internal.PersistedCallback() } // An Index implements all the indexing and searching diff --git a/index/index.go b/index/index.go index 398c9fc87..4ae79056e 100644 --- a/index/index.go +++ b/index/index.go @@ -251,16 +251,15 @@ type DocIDReader interface { type BatchCallback func(error) type Batch struct { - IndexOps map[string]*document.Document - InternalOps map[string][]byte - callbacks []BatchCallback + IndexOps map[string]*document.Document + InternalOps map[string][]byte + persistedCallback BatchCallback } func NewBatch() *Batch { return &Batch{ IndexOps: make(map[string]*document.Document), InternalOps: make(map[string][]byte), - callbacks: nil, } } @@ -280,12 +279,12 @@ func (b *Batch) DeleteInternal(key []byte) { b.InternalOps[string(key)] = nil } -func (b *Batch) 
AddCallback(f BatchCallback) { - b.callbacks = append(b.callbacks, f) +func (b *Batch) SetPersistedCallback(f BatchCallback) { + b.persistedCallback = f } -func (b *Batch) Callbacks() []BatchCallback { - return b.callbacks +func (b *Batch) PersistedCallback() BatchCallback { + return b.persistedCallback } func (b *Batch) String() string { @@ -310,7 +309,7 @@ func (b *Batch) String() string { func (b *Batch) Reset() { b.IndexOps = make(map[string]*document.Document) b.InternalOps = make(map[string][]byte) - b.callbacks = nil + b.persistedCallback = nil } func (b *Batch) Merge(o *Batch) { diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 836e716c4..ac627796f 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -31,9 +31,9 @@ type segmentIntroduction struct { ids []string internal map[string][]byte - applied chan error - persisted chan error - callbacks []index.BatchCallback + applied chan error + persisted chan error + persistedCallback index.BatchCallback } type persistIntroduction struct { @@ -50,7 +50,6 @@ type snapshotReversion struct { snapshot *IndexSnapshot applied chan error persisted chan error - callbacks []index.BatchCallback } func (s *Scorch) mainLoop() { @@ -216,7 +215,9 @@ func (s *Scorch) introduceSegment(next *segmentIntroduction) error { if next.persisted != nil { s.rootPersisted = append(s.rootPersisted, next.persisted) } - s.callbacks = append(s.callbacks, next.callbacks...) 
+ if next.persistedCallback != nil { + s.persistedCallbacks = append(s.persistedCallbacks, next.persistedCallback) + } // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch s.nextSnapshotEpoch++ diff --git a/index/scorch/persister.go b/index/scorch/persister.go index ebbc240e7..2ba50867d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -116,7 +116,7 @@ OUTER: var ourSnapshot *IndexSnapshot var ourPersisted []chan error - var ourCallbacks []index.BatchCallback + var ourPersistedCallbacks []index.BatchCallback // check to see if there is a new snapshot to persist s.rootLock.Lock() @@ -125,8 +125,8 @@ OUTER: ourSnapshot.AddRef() ourPersisted = s.rootPersisted s.rootPersisted = nil - ourCallbacks = s.callbacks - s.callbacks = nil + ourPersistedCallbacks = s.persistedCallbacks + s.persistedCallbacks = nil atomic.StoreUint64(&s.iStats.persistSnapshotSize, uint64(ourSnapshot.Size())) atomic.StoreUint64(&s.iStats.persistEpoch, ourSnapshot.epoch) } @@ -154,8 +154,8 @@ OUTER: atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) continue OUTER } - for i := range ourCallbacks { - ourCallbacks[i](err) + for i := range ourPersistedCallbacks { + ourPersistedCallbacks[i](err) } atomic.StoreUint64(&s.stats.LastPersistedEpoch, ourSnapshot.epoch) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 89b3ba408..3430bd368 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -54,7 +54,7 @@ type Scorch struct { rootLock sync.RWMutex root *IndexSnapshot // holds 1 ref-count on the root rootPersisted []chan error // closed when root is persisted - callbacks []index.BatchCallback + persistedCallbacks []index.BatchCallback nextSnapshotEpoch uint64 eligibleForRemoval []uint64 // Index snapshot epochs that are safe to GC. ineligibleForRemoval map[string]bool // Filenames that should not be GC'ed yet. 
@@ -356,7 +356,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&s.stats.TotBatchesEmpty, 1) } - err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.Callbacks()) + err = s.prepareSegment(newSegment, ids, batch.InternalOps, batch.PersistedCallback()) if err != nil { if newSegment != nil { _ = newSegment.Close() @@ -376,17 +376,17 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { } func (s *Scorch) prepareSegment(newSegment segment.Segment, ids []string, - internalOps map[string][]byte, callbacks []index.BatchCallback) error { + internalOps map[string][]byte, persistedCallback index.BatchCallback) error { // new introduction introduction := &segmentIntroduction{ - id: atomic.AddUint64(&s.nextSegmentID, 1), - data: newSegment, - ids: ids, - obsoletes: make(map[uint64]*roaring.Bitmap), - internal: internalOps, - applied: make(chan error), - callbacks: callbacks, + id: atomic.AddUint64(&s.nextSegmentID, 1), + data: newSegment, + ids: ids, + obsoletes: make(map[uint64]*roaring.Bitmap), + internal: internalOps, + applied: make(chan error), + persistedCallback: persistedCallback, } if !s.unsafeBatch { diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 0e78a45d4..a1691fd09 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -932,16 +932,16 @@ func TestIndexBatch(t *testing.T) { } func TestIndexBatchWithCallbacks(t *testing.T) { - + cfg := CreateConfig("TestIndexBatchWithCallbacks") defer func() { - err := DestroyTest() + err := DestroyTest(cfg) if err != nil { t.Fatal(err) } }() analysisQueue := index.NewAnalysisQueue(1) - idx, err := NewScorch(Name, testConfig, analysisQueue) + idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { t.Fatal(err) } @@ -964,7 +964,7 @@ func TestIndexBatchWithCallbacks(t *testing.T) { doc := document.NewDocument("3") doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) batch.Update(doc) - 
batch.AddCallback(func(e error) { + batch.SetPersistedCallback(func(e error) { updated = true cbErr = e diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index d0a27c000..8edbb5b3d 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -959,9 +959,9 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&udc.stats.errors, 1) } - // For sake of completeness - for i := range batch.Callbacks() { - batch.Callbacks()[i](err) + persistedCallback := batch.PersistedCallback() + if persistedCallback != nil { + persistedCallback(err) } return } From 9ec8f83514c46f72b2fc97e2dc7a5eabef435615 Mon Sep 17 00:00:00 2001 From: Kevin Gillieron Date: Wed, 12 Dec 2018 16:51:42 +0100 Subject: [PATCH 511/728] add test to reproduce bug --- search/query/query_string_parser_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/search/query/query_string_parser_test.go b/search/query/query_string_parser_test.go index 356127fdc..f10170edd 100644 --- a/search/query/query_string_parser_test.go +++ b/search/query/query_string_parser_test.go @@ -49,6 +49,16 @@ func TestQuerySyntaxParserValid(t *testing.T) { }, nil), }, + { + input: "127.0.0.1", + mapping: mapping.NewIndexMapping(), + result: NewBooleanQueryForQueryString( + nil, + []Query{ + NewMatchQuery("127.0.0.1"), + }, + nil), + }, { input: `"test phrase 1"`, mapping: mapping.NewIndexMapping(), From 85453e97b2a394cc7d13e800e48c7c778946574d Mon Sep 17 00:00:00 2001 From: Kevin Gillieron Date: Wed, 12 Dec 2018 16:52:29 +0100 Subject: [PATCH 512/728] fix number parsing issue Numbers separated by a dot, like IP addresses, are parsed as floating point number instead of strings. 
--- search/query/query_string_lex.go | 1 + 1 file changed, 1 insertion(+) diff --git a/search/query/query_string_lex.go b/search/query/query_string_lex.go index 9c59cedde..3a9cf2398 100644 --- a/search/query/query_string_lex.go +++ b/search/query/query_string_lex.go @@ -273,6 +273,7 @@ func inNumOrStrState(l *queryStringLex, next rune, eof bool) (lexState, bool) { // see where to go if !l.seenDot && next == '.' { // stay in this state + l.seenDot = true l.buf += string(next) return inNumOrStrState, true } else if unicode.IsDigit(next) { From 1d14bcc36f1b4b9d42d14ade4866b7c9a20e6786 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 17 Dec 2018 15:42:09 +0530 Subject: [PATCH 513/728] 1079 - Incorrect scorch score The composite field norm/scoring is broken with scorch, as it incorrectly considers the "_id" field during the composite field analysis. --- index/scorch/scorch.go | 2 +- test/versus_score_test.go | 135 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 136 insertions(+), 1 deletion(-) create mode 100644 test/versus_score_test.go diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 3430bd368..07568e6e5 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -526,7 +526,7 @@ func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { rv.Analyzed[i] = tokenFreqs rv.Length[i] = fieldLength - if len(d.CompositeFields) > 0 { + if len(d.CompositeFields) > 0 && field.Name() != "_id" { // see if any of the composite fields need this for _, compositeField := range d.CompositeFields { compositeField.Compose(field.Name(), fieldLength, tokenFreqs) diff --git a/test/versus_score_test.go b/test/versus_score_test.go new file mode 100644 index 000000000..dcaf8d650 --- /dev/null +++ b/test/versus_score_test.go @@ -0,0 +1,135 @@ +// Copyright (c) 2018 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package test + +import ( + "fmt" + "os" + "strconv" + "testing" + + "github.com/blevesearch/bleve" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/index/upsidedown" + "github.com/blevesearch/bleve/mapping" + "github.com/blevesearch/bleve/search" +) + +func TestDisjunctionSearchScoreIndexWithCompositeFields(t *testing.T) { + upHits := disjunctionQueryiOnIndexWithCompositeFields(upsidedown.Name, t) + scHits := disjunctionQueryiOnIndexWithCompositeFields(scorch.Name, t) + + if upHits[0].ID != scHits[0].ID || upHits[1].ID != scHits[1].ID { + t.Errorf("upsidedown, scorch returned different docs;\n"+ + "upsidedown: (%s, %s), scorch: (%s, %s)\n", + upHits[0].ID, upHits[1].ID, scHits[0].ID, scHits[1].ID) + } + + if scHits[0].Score != upHits[0].Score || scHits[1].Score != upHits[1].Score { + t.Errorf("upsidedown, scorch showing different scores;\n"+ + "upsidedown: (%+v, %+v), scorch: (%+v, %+v)\n", + *upHits[0].Expl, *upHits[1].Expl, *scHits[0].Expl, *scHits[1].Expl) + } + +} + +func disjunctionQueryiOnIndexWithCompositeFields(indexName string, + t *testing.T) []*search.DocumentMatch { + // create an index + idxMapping := mapping.NewIndexMapping() + idx, err := bleve.NewUsing("testidx", idxMapping, indexName, + bleve.Config.DefaultKVStore, nil) + if err != nil { + t.Error(err) + } + + defer func() { + err = idx.Close() + if err != nil { + t.Error(err) + } + err = os.RemoveAll("testidx") + if err != nil { + t.Error(err) + } + }() + + // create and insert documents as a batch + batch 
:= idx.NewBatch() + docs := []struct { + field1 string + field2 int + }{ + { + field1: "one", + field2: 1, + }, + { + field1: "two", + field2: 2, + }, + } + + for i := 0; i < len(docs); i++ { + doc := document.NewDocument(strconv.Itoa(docs[i].field2)) + doc.Fields = []document.Field{ + document.NewTextField("field1", []uint64{}, []byte(docs[i].field1)), + document.NewNumericField("field2", []uint64{}, float64(docs[i].field2)), + } + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"field1"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + if err = batch.IndexAdvanced(doc); err != nil { + t.Error(err) + } + } + if err = idx.Batch(batch); err != nil { + t.Error(err) + } + + /* + Query: + DISJ + / \ + CONJ TERM(two) + / + TERM(one) + */ + + tq1 := bleve.NewTermQuery("one") + tq1.SetBoost(2) + tq2 := bleve.NewTermQuery("two") + tq2.SetBoost(3) + + cq := bleve.NewConjunctionQuery(tq1) + cq.SetBoost(4) + + q := bleve.NewDisjunctionQuery(tq1, tq2) + sr := bleve.NewSearchRequestOptions(q, 2, 0, true) + res, err := idx.Search(sr) + if err != nil { + t.Error(err) + } + + if len(res.Hits) != 2 { + t.Errorf(fmt.Sprintf("indexType: %s Expected 2 hits, "+ + "but got: %v", indexName, len(res.Hits))) + } + + return res.Hits +} From a67bab66242517e6802a57d8f7ceaeac0797f188 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 10 Dec 2018 14:19:32 -0800 Subject: [PATCH 514/728] MB-32295: Minimal optimization for conjunction/disjunction searchers If the number of nested searchers within a conjunction/disjunction searcher is one, just return the nested searcher unwrapped as is, to avoid an unnecessary level of indirection. 
--- search/query/conjunction.go | 5 +++++ search/query/disjunction.go | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/search/query/conjunction.go b/search/query/conjunction.go index 39cc312de..8e2d056ed 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -70,9 +70,14 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, } ss = append(ss, sr) } + if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) + } else if len(ss) == 1 { + // return single nested searcher as is + return ss[0], nil } + return searcher.NewConjunctionSearcher(i, ss, options) } diff --git a/search/query/disjunction.go b/search/query/disjunction.go index dacc3a75b..6b0e383bc 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -76,9 +76,14 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, } ss = append(ss, sr) } + if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) + } else if len(ss) == 1 { + // return the single nested searcher as is + return ss[0], nil } + return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) } From d1d06dcc9bfa63cc396c2fcf69e6eaf3c22422f0 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 17 Dec 2018 10:36:27 -0800 Subject: [PATCH 515/728] MB-32295: "min" propagation for disjunction queries with single clause + Optimization of returning the nested searcher within a disjunction searcher should be done only when min <= 1. 
--- search/query/conjunction.go | 2 +- search/query/disjunction.go | 4 +-- search_test.go | 71 ++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 4 deletions(-) diff --git a/search/query/conjunction.go b/search/query/conjunction.go index 8e2d056ed..f14cbdd88 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -74,7 +74,7 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) } else if len(ss) == 1 { - // return single nested searcher as is + // return single nested searcher as is return ss[0], nil } diff --git a/search/query/disjunction.go b/search/query/disjunction.go index 6b0e383bc..b884cb659 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -79,8 +79,8 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) - } else if len(ss) == 1 { - // return the single nested searcher as is + } else if len(ss) == 1 && q.Min <= 1 { + // return the single nested searcher as is; only if min clauses is not greater than 1; return ss[0], nil } diff --git a/search_test.go b/search_test.go index 52f406d62..71fae4236 100644 --- a/search_test.go +++ b/search_test.go @@ -1012,5 +1012,74 @@ func TestQueryStringEmptyConjunctionSearcher(t *testing.T) { query := NewQueryStringQuery("foo:bar +baz:\"\"") searchReq := NewSearchRequest(query) - _, _ = index.Search(searchReq) + _, _ = index.Search(searchReq) +} + +func TestDisjunctionQueryIncorrectMin(t *testing.T) { + // create an index with default settings + idxMapping := NewIndexMapping() + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + // create and insert documents as a batch + batch := idx.NewBatch() + docs := 
[]struct { + field1 string + field2 int + }{ + { + field1: "one", + field2: 1, + }, + { + field1: "two", + field2: 2, + }, + } + + for i := 0; i < len(docs); i++ { + doc := document.NewDocument(strconv.Itoa(docs[i].field2)) + doc.Fields = []document.Field{ + document.NewTextField("field1", []uint64{}, []byte(docs[i].field1)), + document.NewNumericField("field2", []uint64{}, float64(docs[i].field2)), + } + doc.CompositeFields = []*document.CompositeField{ + document.NewCompositeFieldWithIndexingOptions( + "_all", true, []string{"text"}, []string{}, + document.IndexField|document.IncludeTermVectors), + } + if err = batch.IndexAdvanced(doc); err != nil { + t.Fatal(err) + } + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + tq := NewTermQuery("one") + dq := NewDisjunctionQuery(tq) + dq.SetMin(2) + sr := NewSearchRequestOptions(dq, 1, 0, false) + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if res.Total > 0 { + t.Fatalf("Expected 0 matches as disjunction query contains a single clause"+ + " but got: %v", res.Total) + } } From cc0219a5f4ebff25fdd709cf5ed14536b0a0d08d Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 4 Jan 2019 14:05:07 -0800 Subject: [PATCH 516/728] Revert "Refactoring." This reverts commit 934a36ea597bf6c20b8b05bda553bbf3f1d2d073. 
--- index/scorch/segment/zap/segment.go | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 4ee6c290b..606ea0cfa 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,18 +418,19 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - filteredIds := make([]string, 0, len(ids)) + filteredIds := ids[:0] sMax := "" sMaxB, err := idDict.fst.GetMaxKey() - if err != nil { - return nil, err - } - sMax = string(sMaxB) - for _, id := range ids { - if id <= sMax { - filteredIds = append(filteredIds, id) + if err == nil { + sMax = string(sMaxB) + for _, id := range ids { + if id <= sMax { + filteredIds = append(filteredIds, id) + } } + } else { + filteredIds = ids } for _, id := range filteredIds { From 843bbf265ed77cc3b122da4e856eaf865fe3f012 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 4 Jan 2019 14:05:20 -0800 Subject: [PATCH 517/728] Revert "Simplification - Using filteredIds instead of ids." This reverts commit 43e8e27f8e2cf5d076973dbad26f436aaa8a12c6. 
--- index/scorch/segment/zap/segment.go | 39 ++++++++++++++++++----------- 1 file changed, 24 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 606ea0cfa..9342dd273 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,29 +418,38 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - filteredIds := ids[:0] + + skipCheck := false sMax := "" + iMin := "" - sMaxB, err := idDict.fst.GetMaxKey() - if err == nil { - sMax = string(sMaxB) - for _, id := range ids { - if id <= sMax { - filteredIds = append(filteredIds, id) + if len(ids) > 0 { + sMaxB, err := idDict.fst.GetMaxKey() + if err != nil { + skipCheck = true + } else { + sMax = string(sMaxB) + iMin = ids[0] + for i := 1; i < len(ids); i++ { + if ids[i] < iMin { + iMin = ids[i] + } } } } else { - filteredIds = ids + skipCheck = true } - - for _, id := range filteredIds { - postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) - if err != nil { - return nil, err + if skipCheck || (iMin <= sMax) { + for _, id := range ids { + if skipCheck || (id <= sMax) { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err + } + postingsList.OrInto(rv) + } } - postingsList.OrInto(rv) } - } return rv, nil From ecc0efcc4ffa22707a1d9bee9cfcab1ebd270783 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 4 Jan 2019 14:05:36 -0800 Subject: [PATCH 518/728] Revert "Fix: DocNumbers must work with empty ids array." This reverts commit 8395e90b544ddfa13cbe9b112b330a8093bdec2c. 
--- index/scorch/segment/zap/segment.go | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 9342dd273..1bab522e2 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -423,21 +423,17 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { sMax := "" iMin := "" - if len(ids) > 0 { - sMaxB, err := idDict.fst.GetMaxKey() - if err != nil { - skipCheck = true - } else { - sMax = string(sMaxB) - iMin = ids[0] - for i := 1; i < len(ids); i++ { - if ids[i] < iMin { - iMin = ids[i] - } + sMaxB, err := idDict.fst.GetMaxKey() + if err != nil { + skipCheck = true + } else { + sMax = string(sMaxB) + iMin = ids[0] + for i := 1; i < len(ids); i++ { + if ids[i] < iMin { + iMin = ids[i] } } - } else { - skipCheck = true } if skipCheck || (iMin <= sMax) { for _, id := range ids { From 0d76d2adf9a8e9f5804b62cd4ca83c86144464eb Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 4 Jan 2019 14:06:59 -0800 Subject: [PATCH 519/728] Revert "DocNumbers - skip segments, that don't include requested ids." This reverts commit 9bb1687420e429f94ffcd4a159ad1c7181772452. 
--- index/scorch/segment/zap/segment.go | 31 +++++------------------------ 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 1bab522e2..8c6de211a 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,33 +418,12 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList - - skipCheck := false - sMax := "" - iMin := "" - - sMaxB, err := idDict.fst.GetMaxKey() - if err != nil { - skipCheck = true - } else { - sMax = string(sMaxB) - iMin = ids[0] - for i := 1; i < len(ids); i++ { - if ids[i] < iMin { - iMin = ids[i] - } - } - } - if skipCheck || (iMin <= sMax) { - for _, id := range ids { - if skipCheck || (id <= sMax) { - postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) - if err != nil { - return nil, err - } - postingsList.OrInto(rv) - } + for _, id := range ids { + postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) + if err != nil { + return nil, err } + postingsList.OrInto(rv) } } From 5784878872b9fc934e1720e6e8f5a28ba3195f4f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 5 Jan 2019 18:37:52 -0500 Subject: [PATCH 520/728] add unit test for #1096 --- index_test.go | 76 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 76 insertions(+) diff --git a/index_test.go b/index_test.go index 0d81eba40..55b38909d 100644 --- a/index_test.go +++ b/index_test.go @@ -2073,3 +2073,79 @@ func TestBatchMerge(t *testing.T) { } } + +func TestBug1096(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + // use default mapping + mapping := NewIndexMapping() + + // create a scorch index with default SAFE batches + var idx Index + idx, err = NewUsing("testidx", mapping, "scorch", "scorch", nil) + if err != nil { + log.Fatal(err) + } + defer func() { + err := idx.Close() + if 
err != nil { + t.Fatal(err) + } + }() + + // create a single batch instance that we will reuse + // this should be safe because we have single goroutine + // and we always wait for batch execution to finish + batch := idx.NewBatch() + + // number of batches to execute + for i := 0; i < 10; i++ { + + // number of documents to put into the batch + for j := 0; j < 91; j++ { + + // create a doc id 0-90 (important so that we get id's 9 and 90) + // this could duplicate something already in the index + // this too should be OK and update the item in the index + id := fmt.Sprintf("%d", j) + + err = batch.Index(id, map[string]interface{}{ + "name": id, + "batch": fmt.Sprintf("%d", i), + }) + if err != nil { + log.Fatal(err) + } + } + + // execute the batch + err = idx.Batch(batch) + if err != nil { + log.Fatal(err) + } + + // reset the batch before reusing it + batch.Reset() + } + + // search for docs having name starting with the number 9 + q := NewWildcardQuery("9*") + q.SetField("name") + req := NewSearchRequestOptions(q, 1000, 0, false) + req.Fields = []string{"*"} + var res *SearchResult + res, err = idx.Search(req) + if err != nil { + log.Fatal(err) + } + + // we expect only 2 hits, for docs 9 and 90 + if res.Total > 2 { + t.Fatalf("expected only 2 hits '9' and '90', got %v", res) + } +} From af8b2ab37f5b30915e2cdfdc4e2b8b4c6d2b13b1 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 7 Jan 2019 16:23:42 -0800 Subject: [PATCH 521/728] Update SHA of vellum in vendor/manifest --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 9471f14e2..5bdb6a076 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "f377ee3282b954c46915d89482bf93288ee7dd12", + "revision": "28880ab96d9361ab5a74f0e12000f8fe0cd20712", "branch": "master", "notests": true } From 
fa29ecd3f1875b37f0fd25de9cdd2f7fb2eeaadd Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 8 Jan 2019 16:21:14 -0800 Subject: [PATCH 522/728] MB-31406: Accounting for FileMergeWrittenBytes incrementally --- index/scorch/merge.go | 11 +++++++---- index/scorch/segment/segment.go | 4 ++++ index/scorch/segment/zap/count.go | 10 ++++++++++ index/scorch/segment/zap/merge.go | 10 ++++++---- index/scorch/segment/zap/merge_test.go | 10 +++++----- index/scorch/segment/zap/segment_test.go | 2 +- 6 files changed, 33 insertions(+), 14 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 61abe6951..bcbf5b710 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -196,10 +196,9 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, fileMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, nBytes, err := zap.Merge(segmentsToMerge, docsToDrop, path, - DefaultChunkFactor, s.closeCh) + newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path, + DefaultChunkFactor, s.closeCh, s) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) - atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, nBytes) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapTime, fileMergeZapTime) @@ -292,7 +291,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, path := s.path + string(os.PathSeparator) + filename newDocNums, _, err := - zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh) + zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s) atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) @@ -347,3 +346,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, return newSnapshot, newSegmentID, nil } } + +func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { + atomic.AddUint64(&s.stats.TotFileMergeWrittenBytes, bytesWritten) +} diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go 
index be9142c40..b94d6f979 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -129,3 +129,7 @@ type DocumentFieldTermVisitable interface { type DocVisitState interface { } + +type StatsReporter interface { + ReportBytesWritten(bytesWritten uint64) +} diff --git a/index/scorch/segment/zap/count.go b/index/scorch/segment/zap/count.go index d75e83c03..50290f888 100644 --- a/index/scorch/segment/zap/count.go +++ b/index/scorch/segment/zap/count.go @@ -17,6 +17,8 @@ package zap import ( "hash/crc32" "io" + + "github.com/blevesearch/bleve/index/scorch/segment" ) // CountHashWriter is a wrapper around a Writer which counts the number of @@ -25,6 +27,7 @@ type CountHashWriter struct { w io.Writer crc uint32 n int + s segment.StatsReporter } // NewCountHashWriter returns a CountHashWriter which wraps the provided Writer @@ -32,11 +35,18 @@ func NewCountHashWriter(w io.Writer) *CountHashWriter { return &CountHashWriter{w: w} } +func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { + return &CountHashWriter{w: w, s: s} +} + // Write writes the provided bytes to the wrapped writer and counts the bytes func (c *CountHashWriter) Write(b []byte) (int, error) { n, err := c.w.Write(b) c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) c.n += n + if c.s != nil { + c.s.ReportBytesWritten(uint64(n)) + } return n, err } diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 13e9bf97c..4ef222c1a 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -38,17 +38,19 @@ const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // remaining data. This new segment is built at the specified path, // with the provided chunkFactor. 
func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32, closeCh chan struct{}) ([][]uint64, uint64, error) { + chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { segmentBases := make([]*SegmentBase, len(segments)) for segmenti, segment := range segments { segmentBases[segmenti] = &segment.SegmentBase } - return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh) + return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s) } func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, - chunkFactor uint32, closeCh chan struct{}) ([][]uint64, uint64, error) { + chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( + [][]uint64, uint64, error) { flag := os.O_RDWR | os.O_CREATE f, err := os.OpenFile(path, flag, 0600) @@ -65,7 +67,7 @@ func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, pat br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) // wrap it for counting (tracking offsets) - cr := NewCountHashWriter(br) + cr := NewCountHashWriterWithStatsReporter(br, s) newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index db1cfff15..450ecba91 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -71,7 +71,7 @@ func TestMerge(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) + _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil, nil) if err != nil { t.Fatal(err) } @@ -175,7 +175,7 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) drops := 
make([]*roaring.Bitmap, len(segsToMerge)) - _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil) + _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil, nil) if err != nil { t.Fatal(err) } @@ -217,7 +217,7 @@ func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { segsToMerge := make([]*Segment, 1) segsToMerge[0] = segCur - _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil) + _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil, nil) if err != nil { t.Fatal(err) } @@ -589,7 +589,7 @@ func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []* func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { _ = os.RemoveAll("/tmp/scorch-merged.zap") - _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil) + _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil, nil) if err != nil { t.Fatal(err) } @@ -823,7 +823,7 @@ func TestMergeBytesWritten(t *testing.T) { segsToMerge[0] = segment.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil, nil) if err != nil { t.Fatal(err) } diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go index 623198c63..ffe4c7c80 100644 --- a/index/scorch/segment/zap/segment_test.go +++ b/index/scorch/segment/zap/segment_test.go @@ -691,7 +691,7 @@ func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { segsToMerge[0] = segment1.(*Segment) segsToMerge[1] = segment2.(*Segment) - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil) + _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, 
"/tmp/scorch3.zap", 1024, nil, nil) if err != nil { t.Fatal(err) } From ec80de1199ba6815873df0a05fe0d2e0136f2941 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 11 Jan 2019 21:51:00 -0800 Subject: [PATCH 523/728] optimize scorch postings list that has no obsoleted docs A postings list that has no obsoleted docs will have a postings iterator with an actual roaring-bitmap that's logically the same as the all roaring-bitmap (e.g., actual = all - obsoletions). This optimization handles this edge case by using the same exact roaring-bitmap iterator for both the actual and all needs. This means that Next() calls will be only needed on a single roaring iterator rather than the general case of Next()'ing through two separate iterators. A helper method, PostingsIterator.currChunkNext(), is refactored out as part of this change so that maintaining the freq/norm/location readers correctly is more straightforward. On a scorch index of 200,000 en-wiki docs, high-freq term searches went from ~278 q/s before this change to ~307 q/s after. Low-freq term searches went from ~84K q/s before to ~92K q/s after. --- index/scorch/segment/zap/posting.go | 106 ++++++++++++++++++++++------ 1 file changed, 83 insertions(+), 23 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 0ac7938e1..1bada4983 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -254,7 +254,7 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv.Actual = rv.ActualBM.Iterator() } else { rv.ActualBM = p.postings - rv.Actual = p.postings.Iterator() + rv.Actual = rv.all // Optimize to use same iterator for all & Actual. 
} return rv @@ -651,6 +651,10 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return 0, false, nil } + if i.postings.postings == i.ActualBM { + return i.nextDocNumAtOrAfterClean(atOrAfter) + } + n := i.Actual.Next() for uint64(n) < atOrAfter && i.Actual.HasNext() { n = i.Actual.Next() @@ -670,31 +674,10 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, for allN != n { // in the same chunk, so move the freq/norm/loc decoders forward if i.includeFreqNorm && allNChunk == nChunk { - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return 0, false, fmt.Errorf("error loading chunk: %v", err) - } - } - - // read off freq/offsets even though we don't care about them - _, _, hasLocs, err := i.readFreqNormHasLocs() + err := i.currChunkNext(nChunk) if err != nil { return 0, false, err } - - if i.includeLocs && hasLocs { - numLocsBytes, err := binary.ReadUvarint(i.locReader) - if err != nil { - return 0, false, fmt.Errorf("error reading location numLocsBytes: %v", err) - } - - // skip over all the location bytes - _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) - if err != nil { - return 0, false, err - } - } } allN = i.all.Next() @@ -711,6 +694,83 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return uint64(n), true, nil } +// optimization when the postings list is "clean" (e.g., no updates & +// no deletions) where the all bitmap is the same as the actual bitmap +func (i *PostingsIterator) nextDocNumAtOrAfterClean( + atOrAfter uint64) (uint64, bool, error) { + sameChunkNexts := 0 // # of times we called Next() in the same chunk + + n := i.Actual.Next() + + nChunk := n / i.postings.sb.chunkFactor + + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + + nChunkPrev := nChunk + nChunk = n / i.postings.sb.chunkFactor + + if nChunk != nChunkPrev { + sameChunkNexts = 0 + } else { + 
sameChunkNexts += 1 + } + } + + if uint64(n) < atOrAfter { + // couldn't find anything + return 0, false, nil + } + + if i.includeFreqNorm { + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) + } + } + + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) + } + } + } + + return uint64(n), true, nil +} + +func (i *PostingsIterator) currChunkNext(nChunk uint32) error { + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return fmt.Errorf("error loading chunk: %v", err) + } + } + + // read off freq/offsets even though we don't care about them + _, _, hasLocs, err := i.readFreqNormHasLocs() + if err != nil { + return err + } + + if i.includeLocs && hasLocs { + numLocsBytes, err := binary.ReadUvarint(i.locReader) + if err != nil { + return fmt.Errorf("error reading location numLocsBytes: %v", err) + } + + // skip over all the location bytes + _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) + if err != nil { + return err + } + } + + return nil +} + // Posting is a single entry in a postings list type Posting struct { docNum uint64 From b57a98d6dff94226dd200ec6477fe2e98b2eb457 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 15 Jan 2019 16:17:59 -0800 Subject: [PATCH 524/728] optimize scorch postingsIter chunk maintenance on skip ahead Consider a scenario of a conjunction search with a high-frequency term search and a low-frequency term search. The scorch PostingsIterator for the high-frequency term will have its nextDocNumAtOrAfter(atOrAfter) method invoked with a non-zero atOrAfter parameter -- driven by docNum's from the low frequency term search. 
In this case, before this change, the inner loop that moves the high-frequency "all iterator" forwards performs a division by chunkFactor on every single "all" docNum, in order to compute the all iterator's current chunk number, so that it can be checked against the "acutal iterator"'s current chunk number (in order to maintain the PostingIterator's freqNorm & loc reader positions). This integer division was appearing in pprof, so this change is to replace the division op with a faster integer >= comparison. On a 200K en-wiki docs perf test, a query-string search for "+http +schoch" went from ~7650 q/sec throughput before the change to ~8000 q/sec afterwards. --- index/scorch/segment/zap/posting.go | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 1bada4983..d0bdd6852 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -666,14 +666,16 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, allN := i.all.Next() nChunk := n / i.postings.sb.chunkFactor - allNChunk := allN / i.postings.sb.chunkFactor + + // when allN becomes >= to here, then allN is in the same chunk as nChunk. 
+ allNReachesNChunk := nChunk * i.postings.sb.chunkFactor // n is the next actual hit (excluding some postings), and // allN is the next hit in the full postings, and // if they don't match, move 'all' forwards until they do for allN != n { - // in the same chunk, so move the freq/norm/loc decoders forward - if i.includeFreqNorm && allNChunk == nChunk { + // we've reached same chunk, so move the freq/norm/loc decoders forward + if i.includeFreqNorm && allN >= allNReachesNChunk { err := i.currChunkNext(nChunk) if err != nil { return 0, false, err @@ -681,7 +683,6 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, } allN = i.all.Next() - allNChunk = allN / i.postings.sb.chunkFactor } if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) { From 5554236b024202656ea23aa49a81261b10cef7d6 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 16 Jan 2019 09:17:12 -0800 Subject: [PATCH 525/728] [Scorch] Skipping segments that don't include requested ids Skip the segment if the current segment's maximum docId is smaller than the smallest requested id, while fetching the DocNumbers. 
--- index/scorch/segment/zap/segment.go | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index 8c6de211a..7ba28c236 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -418,7 +418,20 @@ func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { } postingsList := emptyPostingsList + + sMax, err := idDict.fst.GetMaxKey() + if err != nil { + return nil, err + } + sMaxStr := string(sMax) + filteredIds := make([]string, 0, len(ids)) for _, id := range ids { + if id <= sMaxStr { + filteredIds = append(filteredIds, id) + } + } + + for _, id := range filteredIds { postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) if err != nil { return nil, err From c15bb65ae6df459768345ee9a68f3122ef3d4fbd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 19 Jan 2019 13:52:43 -0800 Subject: [PATCH 526/728] optimization Finish() may return optimized resources This is a plumbing level API change, where an optimization is now allowed to return alternative resources in its Finish() implementation. For example, this might be useful in future commits, where scorch codepaths might return an alternate index.TermFieldReader instance for optimizing non-scoring disjunctions. --- index/index.go | 10 +++++++++- index/scorch/optimize.go | 6 +++--- search/searcher/search_conjunction.go | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/index/index.go b/index/index.go index 4ae79056e..6aa444cfd 100644 --- a/index/index.go +++ b/index/index.go @@ -341,11 +341,19 @@ type Optimizable interface { Optimize(kind string, octx OptimizableContext) (OptimizableContext, error) } +// Represents a result of optimization -- see the Finish() method. 
+type Optimized interface{} + type OptimizableContext interface { // Once all the optimzable resources have been provided the same // OptimizableContext instance, the optimization preparations are // finished or completed via the Finish() method. - Finish() error + // + // Depending on the optimization being performed, the Finish() + // method might return a non-nil Optimized instance. For example, + // the Optimized instance might represent an optimized + // TermFieldReader instance. + Finish() (Optimized, error) } type DocValueReader interface { diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index b45fc8b0d..9ba7247c9 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -53,9 +53,9 @@ type OptimizeTFRConjunction struct { tfrs []*IndexSnapshotTermFieldReader } -func (o *OptimizeTFRConjunction) Finish() error { +func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { if len(o.tfrs) <= 1 { - return nil + return nil, nil } for i := range o.snapshot.segment { @@ -89,5 +89,5 @@ func (o *OptimizeTFRConjunction) Finish() error { } } - return nil + return nil, nil } diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index 62966c13f..e89848b79 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -77,7 +77,7 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S } if octx != nil { - err := octx.Finish() + _, err := octx.Finish() if err != nil { return nil, err } From eef432c1349f0df8859e8f28acb63bc90345e7c1 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sat, 19 Jan 2019 15:12:59 -0800 Subject: [PATCH 527/728] API: added SearchRequest.NoScore flag This is a backwards compatible API addition. 
--- index_impl.go | 1 + search.go | 1 + search/search.go | 1 + search/searcher/search_term.go | 6 ++++-- 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/index_impl.go b/index_impl.go index c969f3758..ad9b3505e 100644 --- a/index_impl.go +++ b/index_impl.go @@ -458,6 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ Explain: req.Explain, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, + NoScore: req.NoScore, }) if err != nil { return nil, err diff --git a/search.go b/search.go index 86ea4193a..019d6fd04 100644 --- a/search.go +++ b/search.go @@ -273,6 +273,7 @@ type SearchRequest struct { Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` + NoScore bool `json:"noScore"` } func (r *SearchRequest) Validate() error { diff --git a/search/search.go b/search/search.go index 440c09571..b241de668 100644 --- a/search/search.go +++ b/search/search.go @@ -280,6 +280,7 @@ type Searcher interface { type SearcherOptions struct { Explain bool IncludeTermVectors bool + NoScore bool } // SearchContext represents the context around a single search diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 97b7dbb90..e805bf121 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -39,7 +39,8 @@ type TermSearcher struct { func NewTermSearcher(indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { termBytes := []byte(term) - reader, err := indexReader.TermFieldReader(termBytes, field, true, true, options.IncludeTermVectors) + needFreqNorm := !options.NoScore + reader, err := indexReader.TermFieldReader(termBytes, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err } @@ -57,7 +58,8 @@ func 
NewTermSearcher(indexReader index.IndexReader, term string, field string, b } func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - reader, err := indexReader.TermFieldReader(term, field, true, true, options.IncludeTermVectors) + needFreqNorm := !options.NoScore + reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err } From 58e6641d13844e212687bd35823914c5e26b04dd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 20 Jan 2019 12:05:56 -0800 Subject: [PATCH 528/728] optimize scorch 'unadorned' disjunctions via roaring OR's Normally, the processing of disjunctions would need postings iterators that provide the matching internal-id's along with associated freq-norm and term-vector information. A so-called "unadorned" disjunction is a more restricted edge case, where the application does not need the extra freq-norm information (a non-scoring disjunction) and does not need term-vector information. The scorch indexer can optimize these unadorned disjunctions by OR'ing together the constituent, actual bitmaps, since we do not need to maintain readers for the freq-norm and term-vector data. The resulting OR'ed bitmap is then wrapped by a single term-field-reader as an alternative to the normal disjunction searcher machinery. Regarding perf microbenchmarks (bleve-query) on a 200K en-wiki docs scorch index... for a high number of high-frequency terms... - wildcard search on "th*" (~31K hits)... before the change, with normal scoring - ~4.7 q/sec w/ NoScore:true - ~5.1 q/sec w/ NoScore:true & unadorned disj. optimization - ~11.6 q/sec for a low number of high-frequency terms... - query-string search on "http www com" (~25K hits)... before the change, with normal scoring - ~190 q/sec w/ NoScore:true - ~260 q/sec w/ NoScore:true & unadorned disj.
optimization - ~415 q/sec for a low number of low-frequency terms... - query-string search on "marty shoch" (207 hits)... before the change, with normal scoring - ~15.0K q/sec w/ NoScore:true - ~21.3K q/sec w/ NoScore:true & unadorned disj. optimization - ~16.1K q/sec Searching with NoScore:true is faster than normal scoring in all cases. However, for low numbers of low-frequency terms, the unadorned disjunction optimization is slower (~16.1K q/sec) than the classic disjunction searcher implementation (~21.3K q/sec). This is likely due to more memory/garbage creation for the computed OR'ed bitmaps. This might be addressed in future commits by using heuristics. --- index/scorch/optimize.go | 129 +++++++++++++++++++++++++- index/scorch/segment/zap/posting.go | 60 +++++++++--- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_index_tfr.go | 2 +- search/searcher/search_disjunction.go | 51 ++++++++++ search/searcher/search_term.go | 23 ++--- 6 files changed, 229 insertions(+), 38 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index 9ba7247c9..a5846121a 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -20,15 +20,27 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/zap" ) -func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.OptimizableContext) ( - index.OptimizableContext, error) { - if kind != "conjunction" { - return octx, nil +func (s *IndexSnapshotTermFieldReader) Optimize(kind string, + octx index.OptimizableContext) (index.OptimizableContext, error) { + if kind == "conjunction" { + return s.optimizeConjunction(octx) + } + + if kind == "disjunction:unadorned" { + return s.optimizeDisjunctionUnadorned(octx) } + return octx, nil +} + +// ---------------------------------------------------------------- + +func (s *IndexSnapshotTermFieldReader) 
optimizeConjunction( + octx index.OptimizableContext) (index.OptimizableContext, error) { if octx == nil { octx = &OptimizeTFRConjunction{snapshot: s.snapshot} } @@ -39,7 +51,7 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, octx index.Optimiza } if o.snapshot != s.snapshot { - return nil, fmt.Errorf("tried to optimize across different snapshots") + return nil, fmt.Errorf("tried to optimize conjunction across different snapshots") } o.tfrs = append(o.tfrs, s) @@ -80,6 +92,8 @@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { bm.And(itr.ActualBM) } + // in this conjunction optimization, the postings iterators + // will all share the same AND'ed together actual bitmap for _, tfr := range o.tfrs { itr, ok := tfr.iterators[i].(*zap.PostingsIterator) if ok && itr.ActualBM != nil { @@ -91,3 +105,108 @@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { return nil, nil } + +// ---------------------------------------------------------------- + +// An "unadorned" disjunction optimization is appropriate when +// additional or subsidiary information like freq-norm's and +// term-vectors are not required, and instead only the internal-id's +// are needed. 
+func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( + octx index.OptimizableContext) (index.OptimizableContext, error) { + if octx == nil { + octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) + if !ok { + return nil, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize unadorned disjunction across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRDisjunctionUnadorned struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +var OptimizeTFRDisjunctionUnadornedTerm = []byte("") +var OptimizeTFRDisjunctionUnadornedField = "*" + +// Finish of an unadorned disjunction optimization will compute a +// termFieldReader with an "actual" bitmap that represents the +// constituent bitmaps OR'ed together. This termFieldReader cannot +// provide any freq-norm or termVector associated information. +func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err error) { + if len(o.tfrs) <= 1 { + return nil, nil + } + + // We use an artificial term and field because the optimized + // termFieldReader can represent multiple terms and fields. + oTFR := &IndexSnapshotTermFieldReader{ + term: OptimizeTFRDisjunctionUnadornedTerm, + field: OptimizeTFRDisjunctionUnadornedField, + snapshot: o.snapshot, + iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + } + + var docNums []uint32 // Collected docNum's from 1-hit posting lists. + var actualBMs []*roaring.Bitmap // Collected from regular posting lists. 
+ + for i := range o.snapshot.segment { + docNums = docNums[:0] + actualBMs = actualBMs[:0] + + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + return nil, nil + } + + docNum, ok := itr.DocNum1Hit() + if ok { + docNums = append(docNums, uint32(docNum)) + continue + } + + if itr.ActualBM != nil { + actualBMs = append(actualBMs, itr.ActualBM) + } + } + + var bm *roaring.Bitmap + if len(actualBMs) > 2 { + bm = roaring.HeapOr(actualBMs...) + } else if len(actualBMs) == 2 { + bm = roaring.Or(actualBMs[0], actualBMs[1]) + } else if len(actualBMs) == 1 { + bm = actualBMs[0].Clone() + } + + if bm == nil { + bm = roaring.New() + } + + bm.AddMany(docNums) + + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false) + if err != nil { + return nil, nil + } + } + + return oTFR, nil +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index d0bdd6852..0155e8222 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -651,7 +651,7 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return 0, false, nil } - if i.postings.postings == i.ActualBM { + if i.postings == nil || i.postings.postings == i.ActualBM { return i.nextDocNumAtOrAfterClean(atOrAfter) } @@ -699,10 +699,23 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, // no deletions) where the all bitmap is the same as the actual bitmap func (i *PostingsIterator) nextDocNumAtOrAfterClean( atOrAfter uint64) (uint64, bool, error) { - sameChunkNexts := 0 // # of times we called Next() in the same chunk - n := i.Actual.Next() + if !i.includeFreqNorm { + for uint64(n) < atOrAfter && i.Actual.HasNext() { + n = i.Actual.Next() + } + + if uint64(n) < atOrAfter { + return 0, false, nil // couldn't find anything + } + + return uint64(n), true, nil + } + + // freq-norm's needed, so maintain freq-norm chunk reader + sameChunkNexts := 0 // 
# of times we called Next() in the same chunk + nChunk := n / i.postings.sb.chunkFactor for uint64(n) < atOrAfter && i.Actual.HasNext() { @@ -723,19 +736,17 @@ func (i *PostingsIterator) nextDocNumAtOrAfterClean( return 0, false, nil } - if i.includeFreqNorm { - for j := 0; j < sameChunkNexts; j++ { - err := i.currChunkNext(nChunk) - if err != nil { - return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) - } + for j := 0; j < sameChunkNexts; j++ { + err := i.currChunkNext(nChunk) + if err != nil { + return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) } + } - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return 0, false, fmt.Errorf("error loading chunk: %v", err) - } + if i.currChunk != nChunk || i.currChunkFreqNorm == nil { + err := i.loadChunk(int(nChunk)) + if err != nil { + return 0, false, fmt.Errorf("error loading chunk: %v", err) } } @@ -772,6 +783,27 @@ func (i *PostingsIterator) currChunkNext(nChunk uint32) error { return nil } +// DocNum1Hit returns the docNum and true if this is "1-hit" optimized +// and the docNum is available. +func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { + if p.normBits1Hit != 0 && p.docNum1Hit != docNum1HitFinished { + return p.docNum1Hit, true + } + return 0, false +} + +// PostingsIteratorFromBitmap constructs a PostingsIterator given an +// "actual" bitmap. 
+func PostingsIteratorFromBitmap(bm *roaring.Bitmap, + includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { + return &PostingsIterator{ + ActualBM: bm, + Actual: bm.Iterator(), + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + // Posting is a single entry in a postings list type Posting struct { docNum uint64 diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 4c4d92144..8babb31fa 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -435,7 +435,7 @@ func (i *IndexSnapshot) InternalID(id string) (rv index.IndexInternalID, err err } func (i *IndexSnapshot) TermFieldReader(term []byte, field string, includeFreq, - includeNorm, includeTermVectors bool) (tfr index.TermFieldReader, err error) { + includeNorm, includeTermVectors bool) (index.TermFieldReader, error) { rv := i.allocTermFieldReaderDicts(field) rv.term = term diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 89af3be4c..5d56f1944 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -74,7 +74,7 @@ func (i *IndexSnapshotTermFieldReader) Next(preAlloced *index.TermFieldDoc) (*in rv = &index.TermFieldDoc{} } // find the next hit - for i.segmentOffset < len(i.postings) { + for i.segmentOffset < len(i.iterators) { next, err := i.iterators[i.segmentOffset].Next() if err != nil { return nil, err diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 882b02ccb..d4db7d9bf 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -40,6 +40,11 @@ func NewDisjunctionSearcher(indexReader index.IndexReader, func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) (search.Searcher, error) { + rv, err := optimizeDisjunctionSearcher(indexReader, qsearchers, min, options) + if err != 
nil || rv != nil { + return rv, err + } + if len(qsearchers) > DisjunctionHeapTakeover { return newDisjunctionHeapSearcher(indexReader, qsearchers, min, options, limit) @@ -48,6 +53,52 @@ func newDisjunctionSearcher(indexReader index.IndexReader, limit) } +// optimizeDisjunctionSearcher might return an optimized searcher that +// represents the disjunction, especially in the edge case of a +// non-scoring, no-term-vectors disjunction +func optimizeDisjunctionSearcher(indexReader index.IndexReader, + qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( + search.Searcher, error) { + // we cannot use the "unadorned" disjunction optimization if the + // caller wants extra information like freq-norm's for scoring or + // term vectors + if len(qsearchers) <= 1 || !options.NoScore || options.IncludeTermVectors { + return nil, nil + } + + var octx index.OptimizableContext + + for _, searcher := range qsearchers { + o, ok := searcher.(index.Optimizable) + if !ok { + return nil, nil + } + + var err error + octx, err = o.Optimize("disjunction:unadorned", octx) + if err != nil { + return nil, err + } + + if octx == nil { + return nil, nil + } + } + + optimized, err := octx.Finish() + if err != nil || optimized == nil { + return nil, err + } + + tfr, ok := optimized.(index.TermFieldReader) + if !ok { + return nil, nil + } + + return newTermSearcherFromReader(indexReader, tfr, + []byte(""), "*", 1.0, options) +} + func tooManyClauses(count int) bool { if DisjunctionMaxClauseCount != 0 && count > DisjunctionMaxClauseCount { return true diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index e805bf121..c90008eef 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -38,23 +38,7 @@ type TermSearcher struct { } func NewTermSearcher(indexReader index.IndexReader, term string, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - termBytes := []byte(term) - needFreqNorm 
:= !options.NoScore - reader, err := indexReader.TermFieldReader(termBytes, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) - if err != nil { - return nil, err - } - count, err := indexReader.DocCount() - if err != nil { - _ = reader.Close() - return nil, err - } - scorer := scorer.NewTermQueryScorer(termBytes, field, boost, count, reader.Count(), options) - return &TermSearcher{ - indexReader: indexReader, - reader: reader, - scorer: scorer, - }, nil + return NewTermSearcherBytes(indexReader, []byte(term), field, boost, options) } func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { @@ -63,6 +47,11 @@ func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field stri if err != nil { return nil, err } + return newTermSearcherFromReader(indexReader, reader, term, field, boost, options) +} + +func newTermSearcherFromReader(indexReader index.IndexReader, reader index.TermFieldReader, + term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { count, err := indexReader.DocCount() if err != nil { _ = reader.Close() From 68561b5c8e9888983989afb66711f0fc96ac5e48 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Sun, 20 Jan 2019 21:58:44 -0800 Subject: [PATCH 529/728] low-frequency terms heuristic for unadorned disj. optimization The heuristic check in this commit skips the unadorned disjunction optimization in the case of a disjunction of low-frequency terms. 
--- index/scorch/optimize.go | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index a5846121a..37b61af20 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -37,6 +37,8 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, return octx, nil } +var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) + // ---------------------------------------------------------------- func (s *IndexSnapshotTermFieldReader) optimizeConjunction( @@ -150,6 +152,31 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro return nil, nil } + for i := range o.snapshot.segment { + var cMax uint64 + + for _, tfr := range o.tfrs { + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + return nil, nil + } + + if itr.ActualBM != nil { + c := itr.ActualBM.GetCardinality() + if cMax < c { + cMax = c + } + } + } + + // Heuristic to skip the optimization if all the constituent + // bitmaps are too small, where the processing & resource + // overhead to create the OR'ed bitmap outweighs the benefit. + if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { + return nil, nil + } + } + // We use an artificial term and field because the optimized // termFieldReader can represent multiple terms and fields. 
oTFR := &IndexSnapshotTermFieldReader{ From b1b8cecef9045b23f21d02025fecc78b347ece1d Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 21 Jan 2019 07:56:54 -0800 Subject: [PATCH 530/728] unadorned disjunction optimization checks min param --- search/searcher/search_disjunction.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index d4db7d9bf..520be0fb1 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -61,8 +61,8 @@ func optimizeDisjunctionSearcher(indexReader index.IndexReader, search.Searcher, error) { // we cannot use the "unadorned" disjunction optimization if the // caller wants extra information like freq-norm's for scoring or - // term vectors - if len(qsearchers) <= 1 || !options.NoScore || options.IncludeTermVectors { + // term vectors, or leverages the min feature + if len(qsearchers) <= 1 || min > 1 || !options.NoScore || options.IncludeTermVectors { return nil, nil } From 4ac43411b730fe85a9ad2533e73627871766188f Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 21 Jan 2019 08:06:42 -0800 Subject: [PATCH 531/728] optimize unadorned conjunctions (API change) NOTE: API change -- NewConjunctionSearcher() now returns a search.Searcher instead of the previous *ConjunctionSearcher This commit provides an optimization for "unadorned" conjunctions similar to the previous optimization for unadorned disjunctions. As part of this, the optimizeCompositeSearcher() func was renamed/refactored for reuse for both disjunctions and conjunctions. Regarding perf microbenchmarks (bleve-query) on a 200K document en-wiki scorch index... On a query-string search for "+www +com +http" (231 hits)... before the change, with normal scoring - ~689 q/sec w/ NoScore:true - ~2184 q/sec w/ NoScore:true & unadorned conj. 
optimization - ~10148 q/sec --- index/scorch/optimize.go | 179 +++++++++++++++++++++++++- index/scorch/segment/empty.go | 2 + index/scorch/segment/zap/posting.go | 26 +++- search/searcher/search_conjunction.go | 40 +++--- search/searcher/search_disjunction.go | 33 +++-- search/searcher/search_phrase.go | 2 +- 6 files changed, 235 insertions(+), 47 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index 37b61af20..3e71e3696 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -30,6 +30,10 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, return s.optimizeConjunction(octx) } + if kind == "conjunction:unadorned" { + return s.optimizeConjunctionUnadorned(octx) + } + if kind == "disjunction:unadorned" { return s.optimizeDisjunctionUnadorned(octx) } @@ -95,7 +99,9 @@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { } // in this conjunction optimization, the postings iterators - // will all share the same AND'ed together actual bitmap + // will all share the same AND'ed together actual bitmap. The + // regular conjunction searcher machinery will still be used, + // but the underlying bitmap will be smaller. for _, tfr := range o.tfrs { itr, ok := tfr.iterators[i].(*zap.PostingsIterator) if ok && itr.ActualBM != nil { @@ -110,6 +116,177 @@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { // ---------------------------------------------------------------- +// An "unadorned" conjunction optimization is appropriate when +// additional or subsidiary information like freq-norm's and +// term-vectors are not required, and instead only the internal-id's +// are needed. 
+func (s *IndexSnapshotTermFieldReader) optimizeConjunctionUnadorned( + octx index.OptimizableContext) (index.OptimizableContext, error) { + if octx == nil { + octx = &OptimizeTFRConjunctionUnadorned{snapshot: s.snapshot} + } + + o, ok := octx.(*OptimizeTFRConjunctionUnadorned) + if !ok { + return nil, nil + } + + if o.snapshot != s.snapshot { + return nil, fmt.Errorf("tried to optimize unadorned conjunction across different snapshots") + } + + o.tfrs = append(o.tfrs, s) + + return o, nil +} + +type OptimizeTFRConjunctionUnadorned struct { + snapshot *IndexSnapshot + + tfrs []*IndexSnapshotTermFieldReader +} + +var OptimizeTFRConjunctionUnadornedTerm = []byte("") +var OptimizeTFRConjunctionUnadornedField = "*" + +// Finish of an unadorned conjunction optimization will compute a +// termFieldReader with an "actual" bitmap that represents the +// constituent bitmaps AND'ed together. This termFieldReader cannot +// provide any freq-norm or termVector associated information. +func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err error) { + if len(o.tfrs) <= 1 { + return nil, nil + } + + // We use an artificial term and field because the optimized + // termFieldReader can represent multiple terms and fields. + oTFR := &IndexSnapshotTermFieldReader{ + term: OptimizeTFRConjunctionUnadornedTerm, + field: OptimizeTFRConjunctionUnadornedField, + snapshot: o.snapshot, + iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + } + + var actualBMs []*roaring.Bitmap // Collected from regular posting lists. + +OUTER: + for i := range o.snapshot.segment { + actualBMs = actualBMs[:0] + + var docNum1HitLast uint64 + var docNum1HitLastOk bool + + for _, tfr := range o.tfrs { + if _, ok := tfr.iterators[i].(*segment.EmptyPostingsIterator); ok { + // An empty postings iterator means the entire AND is empty. 
+ oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + if !ok { + // We optimize zap postings iterators only. + return nil, nil + } + + // If the postings iterator is "1-hit" optimized, then we + // can perform several optimizations up-front here. + docNum1Hit, ok := itr.DocNum1Hit() + if ok { + if docNum1Hit == zap.DocNum1HitFinished { + // An empty docNum here means the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + if docNum1HitLastOk && docNum1HitLast != docNum1Hit { + // The docNum1Hit doesn't match the previous + // docNum1HitLast, so the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + docNum1HitLast = docNum1Hit + docNum1HitLastOk = true + + continue + } + + if itr.ActualBM == nil { + // An empty actual bitmap means the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + // Collect the actual bitmap for more processing later. + actualBMs = append(actualBMs, itr.ActualBM) + } + + if docNum1HitLastOk { + // We reach here if all the 1-hit optimized posting + // iterators had the same 1-hit docNum, so we can check if + // our collected actual bitmaps also have that docNum. + for _, bm := range actualBMs { + if !bm.Contains(uint32(docNum1HitLast)) { + // The docNum1Hit isn't in one of our actual + // bitmaps, so the entire AND is empty. + oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + } + + // The actual bitmaps and docNum1Hits all contain or have + // the same 1-hit docNum, so that's our AND'ed result. + oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit( + docNum1HitLast, zap.NormBits1Hit, false, false) + if err != nil { + return nil, nil + } + + continue OUTER + } + + if len(actualBMs) == 0 { + // If we've collected no actual bitmaps at this point, + // then the entire AND is empty. 
+ oTFR.iterators[i] = segment.AnEmptyPostingsIterator + continue OUTER + } + + if len(actualBMs) == 1 { + // If we've only 1 actual bitmap, then that's our result. + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( + actualBMs[0], false, false) + if err != nil { + return nil, nil + } + + continue OUTER + } + + // Else, AND together our collected bitmaps as our result. + bm := roaring.And(actualBMs[0], actualBMs[1]) + + for _, actualBM := range actualBMs[2:] { + bm.And(actualBM) + } + + oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( + bm, false, false) + if err != nil { + return nil, nil + } + } + + return oTFR, nil +} + +// ---------------------------------------------------------------- + // An "unadorned" disjunction optimization is appropriate when // additional or subsidiary information like freq-norm's and // term-vectors are not required, and instead only the internal-id's diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index af50d0aaf..165a01bc1 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -125,3 +125,5 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { func (e *EmptyPostingsIterator) Size() int { return 0 } + +var AnEmptyPostingsIterator = &EmptyPostingsIterator{} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 0155e8222..26378c27e 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -92,7 +92,9 @@ func under32Bits(x uint64) bool { return x <= mask31Bits } -const docNum1HitFinished = math.MaxUint64 +const DocNum1HitFinished = math.MaxUint64 + +var NormBits1Hit = uint64(math.Float32bits(float32(1))) // PostingsList is an in-memory representation of a postings list type PostingsList struct { @@ -199,7 +201,7 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, rv.normBits1Hit = p.normBits1Hit if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { - 
rv.docNum1Hit = docNum1HitFinished + rv.docNum1Hit = DocNum1HitFinished } return rv @@ -634,16 +636,16 @@ func (i *PostingsIterator) nextBytes() ( // sets up the currChunk / loc related fields of the iterator. func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { if i.normBits1Hit != 0 { - if i.docNum1Hit == docNum1HitFinished { + if i.docNum1Hit == DocNum1HitFinished { return 0, false, nil } if i.docNum1Hit < atOrAfter { // advanced past our 1-hit - i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum return 0, false, nil } docNum := i.docNum1Hit - i.docNum1Hit = docNum1HitFinished // consume our 1-hit docNum + i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum return docNum, true, nil } @@ -786,7 +788,7 @@ func (i *PostingsIterator) currChunkNext(nChunk uint32) error { // DocNum1Hit returns the docNum and true if this is "1-hit" optimized // and the docNum is available. func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { - if p.normBits1Hit != 0 && p.docNum1Hit != docNum1HitFinished { + if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { return p.docNum1Hit, true } return 0, false @@ -804,6 +806,18 @@ func PostingsIteratorFromBitmap(bm *roaring.Bitmap, }, nil } +// PostingsIteratorFrom1Hit constructs a PostingsIterator given a +// 1-hit docNum. 
+func PostingsIteratorFrom1Hit(docNum1Hit, normBits1Hit uint64, + includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { + return &PostingsIterator{ + docNum1Hit: docNum1Hit, + normBits1Hit: normBits1Hit, + includeFreqNorm: includeFreqNorm, + includeLocs: includeLocs, + }, nil +} + // Posting is a single entry in a postings list type Posting struct { docNum uint64 diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index e89848b79..0b21ec53a 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -43,14 +43,26 @@ type ConjunctionSearcher struct { options search.SearcherOptions } -func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, options search.SearcherOptions) (*ConjunctionSearcher, error) { - // build the downstream searchers +func NewConjunctionSearcher(indexReader index.IndexReader, + qsearchers []search.Searcher, options search.SearcherOptions) ( + search.Searcher, error) { + // build the sorted downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) for i, searcher := range qsearchers { searchers[i] = searcher } - // sort the searchers sort.Sort(searchers) + + // attempt the "unadorned" conjunction optimization only when we + // do not need extra information like freq-norm's or term vectors + if len(searchers) > 1 && options.NoScore && !options.IncludeTermVectors { + rv, err := optimizeCompositeSearcher("conjunction:unadorned", + indexReader, searchers, options) + if err != nil || rv != nil { + return rv, err + } + } + // build our searcher rv := ConjunctionSearcher{ indexReader: indexReader, @@ -63,24 +75,10 @@ func NewConjunctionSearcher(indexReader index.IndexReader, qsearchers []search.S // attempt push-down conjunction optimization when there's >1 searchers if len(searchers) > 1 { - var octx index.OptimizableContext - - for _, searcher := range searchers { - o, ok := searcher.(index.Optimizable) - if ok { - 
var err error - octx, err = o.Optimize("conjunction", octx) - if err != nil { - return nil, err - } - } - } - - if octx != nil { - _, err := octx.Finish() - if err != nil { - return nil, err - } + rv, err := optimizeCompositeSearcher("conjunction", + indexReader, searchers, options) + if err != nil || rv != nil { + return rv, err } } diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 520be0fb1..8b9a0dade 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -40,9 +40,16 @@ func NewDisjunctionSearcher(indexReader index.IndexReader, func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) (search.Searcher, error) { - rv, err := optimizeDisjunctionSearcher(indexReader, qsearchers, min, options) - if err != nil || rv != nil { - return rv, err + // attempt the "unadorned" disjunction optimization only when we + // do not need extra information like freq-norm's or term vectors + // and the requested min is simple + if len(qsearchers) > 1 && min <= 1 && + options.NoScore && !options.IncludeTermVectors { + rv, err := optimizeCompositeSearcher("disjunction:unadorned", + indexReader, qsearchers, options) + if err != nil || rv != nil { + return rv, err + } } if len(qsearchers) > DisjunctionHeapTakeover { @@ -53,19 +60,9 @@ func newDisjunctionSearcher(indexReader index.IndexReader, limit) } -// optimizeDisjunctionSearcher might return an optimized searcher that -// represents the disjunction, especially in the edge case of a -// non-scoring, no-term-vectors disjunction -func optimizeDisjunctionSearcher(indexReader index.IndexReader, - qsearchers []search.Searcher, min float64, options search.SearcherOptions) ( - search.Searcher, error) { - // we cannot use the "unadorned" disjunction optimization if the - // caller wants extra information like freq-norm's for scoring or - // term vectors, or leverages 
the min feature - if len(qsearchers) <= 1 || min > 1 || !options.NoScore || options.IncludeTermVectors { - return nil, nil - } - +func optimizeCompositeSearcher(optimizationKind string, + indexReader index.IndexReader, qsearchers []search.Searcher, + options search.SearcherOptions) (search.Searcher, error) { var octx index.OptimizableContext for _, searcher := range qsearchers { @@ -75,7 +72,7 @@ func optimizeDisjunctionSearcher(indexReader index.IndexReader, } var err error - octx, err = o.Optimize("disjunction:unadorned", octx) + octx, err = o.Optimize(optimizationKind, octx) if err != nil { return nil, err } @@ -96,7 +93,7 @@ func optimizeDisjunctionSearcher(indexReader index.IndexReader, } return newTermSearcherFromReader(indexReader, tfr, - []byte(""), "*", 1.0, options) + []byte(optimizationKind), "*", 1.0, options) } func tooManyClauses(count int) bool { diff --git a/search/searcher/search_phrase.go b/search/searcher/search_phrase.go index 08eb13338..51b7e5bd8 100644 --- a/search/searcher/search_phrase.go +++ b/search/searcher/search_phrase.go @@ -32,7 +32,7 @@ func init() { } type PhraseSearcher struct { - mustSearcher *ConjunctionSearcher + mustSearcher search.Searcher queryNorm float64 currMust *search.DocumentMatch terms [][]string From a1c886716cd508e12dce4d7b2dc1fcf7f170e888 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 21 Jan 2019 20:00:23 -0800 Subject: [PATCH 532/728] API: renamed SearchRequest.NoScore (bool) to Score (string) Values for SearchRequest.Score include... "" - basic TF/IDF scoring, and the default behavior. "none" - non-scoring search request. 
--- index_impl.go | 2 +- search.go | 5 ++++- search/search.go | 2 +- search/searcher/search_conjunction.go | 3 ++- search/searcher/search_disjunction.go | 2 +- search/searcher/search_term.go | 2 +- 6 files changed, 10 insertions(+), 6 deletions(-) diff --git a/index_impl.go b/index_impl.go index ad9b3505e..fe61b8064 100644 --- a/index_impl.go +++ b/index_impl.go @@ -458,7 +458,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr searcher, err := req.Query.Searcher(indexReader, i.m, search.SearcherOptions{ Explain: req.Explain, IncludeTermVectors: req.IncludeLocations || req.Highlight != nil, - NoScore: req.NoScore, + Score: req.Score, }) if err != nil { return nil, err diff --git a/search.go b/search.go index 019d6fd04..ebd69971e 100644 --- a/search.go +++ b/search.go @@ -261,6 +261,7 @@ func (h *HighlightRequest) AddField(field string) { // Explain triggers inclusion of additional search // result score explanations. // Sort describes the desired order for the results to be returned. +// Score controls the kind of scoring performed // // A special field named "*" can be used to return all fields. 
type SearchRequest struct { @@ -273,7 +274,7 @@ type SearchRequest struct { Explain bool `json:"explain"` Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` - NoScore bool `json:"noScore"` + Score string `json:"score,omitempty"` } func (r *SearchRequest) Validate() error { @@ -323,6 +324,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { Explain bool `json:"explain"` Sort []json.RawMessage `json:"sort"` IncludeLocations bool `json:"includeLocations"` + Score string `json:"score"` } err := json.Unmarshal(input, &temp) @@ -349,6 +351,7 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { r.Fields = temp.Fields r.Facets = temp.Facets r.IncludeLocations = temp.IncludeLocations + r.Score = temp.Score r.Query, err = query.ParseQuery(temp.Q) if err != nil { return err diff --git a/search/search.go b/search/search.go index b241de668..7f6777df6 100644 --- a/search/search.go +++ b/search/search.go @@ -280,7 +280,7 @@ type Searcher interface { type SearcherOptions struct { Explain bool IncludeTermVectors bool - NoScore bool + Score string } // SearchContext represents the context around a single search diff --git a/search/searcher/search_conjunction.go b/search/searcher/search_conjunction.go index 0b21ec53a..ac737bccd 100644 --- a/search/searcher/search_conjunction.go +++ b/search/searcher/search_conjunction.go @@ -55,7 +55,8 @@ func NewConjunctionSearcher(indexReader index.IndexReader, // attempt the "unadorned" conjunction optimization only when we // do not need extra information like freq-norm's or term vectors - if len(searchers) > 1 && options.NoScore && !options.IncludeTermVectors { + if len(searchers) > 1 && + options.Score == "none" && !options.IncludeTermVectors { rv, err := optimizeCompositeSearcher("conjunction:unadorned", indexReader, searchers, options) if err != nil || rv != nil { diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 8b9a0dade..6a296b68f 100644 
--- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -44,7 +44,7 @@ func newDisjunctionSearcher(indexReader index.IndexReader, // do not need extra information like freq-norm's or term vectors // and the requested min is simple if len(qsearchers) > 1 && min <= 1 && - options.NoScore && !options.IncludeTermVectors { + options.Score == "none" && !options.IncludeTermVectors { rv, err := optimizeCompositeSearcher("disjunction:unadorned", indexReader, qsearchers, options) if err != nil || rv != nil { diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index c90008eef..c1af74c76 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -42,7 +42,7 @@ func NewTermSearcher(indexReader index.IndexReader, term string, field string, b } func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - needFreqNorm := !options.NoScore + needFreqNorm := options.Score != "none" reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err From e9779534840ac72a78b383fd904edbabe95f712a Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 22 Jan 2019 21:26:44 -0800 Subject: [PATCH 533/728] TestScorchCompositeSearchOptimizations --- index/scorch/optimize.go | 10 +- search/searcher/base_test.go | 2 +- search/searcher/search_conjunction_test.go | 191 ++++++++++++++++++++- 3 files changed, 198 insertions(+), 5 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index 3e71e3696..b33e3be3d 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -24,17 +24,21 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment/zap" ) +var OptimizeConjunction = true +var OptimizeConjunctionUnadorned = true +var OptimizeDisjunctionUnadorned = true + func (s *IndexSnapshotTermFieldReader) 
Optimize(kind string, octx index.OptimizableContext) (index.OptimizableContext, error) { - if kind == "conjunction" { + if OptimizeConjunction && kind == "conjunction" { return s.optimizeConjunction(octx) } - if kind == "conjunction:unadorned" { + if OptimizeConjunctionUnadorned && kind == "conjunction:unadorned" { return s.optimizeConjunctionUnadorned(octx) } - if kind == "disjunction:unadorned" { + if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { return s.optimizeDisjunctionUnadorned(octx) } diff --git a/search/searcher/base_test.go b/search/searcher/base_test.go index feb00a6aa..a4644b6b3 100644 --- a/search/searcher/base_test.go +++ b/search/searcher/base_test.go @@ -27,7 +27,7 @@ import ( "github.com/blevesearch/bleve/index/upsidedown" ) -var twoDocIndex index.Index //= upside_down.NewUpsideDownCouch(inmem.MustOpen()) +var twoDocIndex index.Index func init() { twoDocIndex = initTwoDocUpsideDown() diff --git a/search/searcher/search_conjunction_test.go b/search/searcher/search_conjunction_test.go index 2107743bd..4a71381be 100644 --- a/search/searcher/search_conjunction_test.go +++ b/search/searcher/search_conjunction_test.go @@ -15,14 +15,17 @@ package searcher import ( + "io/ioutil" + "os" + "strings" "testing" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch" "github.com/blevesearch/bleve/search" ) func TestConjunctionSearch(t *testing.T) { - twoDocIndexReader, err := twoDocIndex.Reader() if err != nil { t.Error(err) @@ -221,3 +224,189 @@ func TestConjunctionSearch(t *testing.T) { } } } + +type compositeSearchOptimizationTest struct { + fieldTerms []string + expectEmpty string +} + +func TestScorchCompositeSearchOptimizations(t *testing.T) { + dir, _ := ioutil.TempDir("", "scorchTwoDoc") + defer func() { + _ = os.RemoveAll(dir) + }() + + twoDocIndex := initTwoDocScorch(dir) + + twoDocIndexReader, err := twoDocIndex.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := 
twoDocIndexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + tests := []compositeSearchOptimizationTest{ + {fieldTerms: []string{}, + expectEmpty: "conjunction,disjunction"}, + {fieldTerms: []string{"name:marty"}, + expectEmpty: ""}, + {fieldTerms: []string{"name:marty", "desc:beer"}, + expectEmpty: ""}, + {fieldTerms: []string{"name:marty", "name:marty"}, + expectEmpty: ""}, + {fieldTerms: []string{"name:marty", "desc:beer", "title:mister", "street:couchbase"}, + expectEmpty: "conjunction"}, + {fieldTerms: []string{"name:steve", "desc:beer", "title:mister", "street:couchbase"}, + expectEmpty: ""}, + + {fieldTerms: []string{"name:NotARealName"}, + expectEmpty: "conjunction,disjunction"}, + {fieldTerms: []string{"name:NotARealName", "name:marty"}, + expectEmpty: "conjunction"}, + {fieldTerms: []string{"name:NotARealName", "name:marty", "desc:beer"}, + expectEmpty: "conjunction"}, + {fieldTerms: []string{"name:NotARealName", "name:marty", "name:marty"}, + expectEmpty: "conjunction"}, + {fieldTerms: []string{"name:NotARealName", "name:marty", "desc:beer", "title:mister", "street:couchbase"}, + expectEmpty: "conjunction"}, + } + + // The theme of this unit test is that given one of the above + // search test cases -- no matter what searcher options we + // provide, across either conjunctions or disjunctions, whether we + // have optimizations that are enabled or disabled, the set of doc + // ID's from the search results from any of those combinations + // should be the same. 
+ searcherOptionsToCompare := []search.SearcherOptions{ + search.SearcherOptions{}, + search.SearcherOptions{Explain: true}, + search.SearcherOptions{IncludeTermVectors: true}, + search.SearcherOptions{IncludeTermVectors: true, Explain: true}, + search.SearcherOptions{Score: "none"}, + search.SearcherOptions{Score: "none", IncludeTermVectors: true}, + search.SearcherOptions{Score: "none", IncludeTermVectors: true, Explain: true}, + search.SearcherOptions{Score: "none", Explain: true}, + } + + testScorchCompositeSearchOptimizations(t, twoDocIndexReader, tests, + searcherOptionsToCompare, "conjunction") + + testScorchCompositeSearchOptimizations(t, twoDocIndexReader, tests, + searcherOptionsToCompare, "disjunction") +} + +func testScorchCompositeSearchOptimizations(t *testing.T, indexReader index.IndexReader, + tests []compositeSearchOptimizationTest, + searcherOptionsToCompare []search.SearcherOptions, + compositeKind string) { + for testi := range tests { + resultsToCompare := map[string]bool{} + + testScorchCompositeSearchOptimizationsHelper(t, indexReader, tests, testi, + searcherOptionsToCompare, compositeKind, false, resultsToCompare) + + testScorchCompositeSearchOptimizationsHelper(t, indexReader, tests, testi, + searcherOptionsToCompare, compositeKind, true, resultsToCompare) + } +} + +func testScorchCompositeSearchOptimizationsHelper( + t *testing.T, indexReader index.IndexReader, + tests []compositeSearchOptimizationTest, testi int, + searcherOptionsToCompare []search.SearcherOptions, + compositeKind string, allowOptimizations bool, resultsToCompare map[string]bool) { + // Save the global allowed optimization settings to restore later. 
+ optimizeConjunction := scorch.OptimizeConjunction + optimizeConjunctionUnadorned := scorch.OptimizeConjunctionUnadorned + optimizeDisjunctionUnadorned := scorch.OptimizeDisjunctionUnadorned + optimizeDisjunctionUnadornedMinChildCardinality := + scorch.OptimizeDisjunctionUnadornedMinChildCardinality + + scorch.OptimizeConjunction = allowOptimizations + scorch.OptimizeConjunctionUnadorned = allowOptimizations + scorch.OptimizeDisjunctionUnadorned = allowOptimizations + + if allowOptimizations { + scorch.OptimizeDisjunctionUnadornedMinChildCardinality = uint64(0) + } + + defer func() { + scorch.OptimizeConjunction = optimizeConjunction + scorch.OptimizeConjunctionUnadorned = optimizeConjunctionUnadorned + scorch.OptimizeDisjunctionUnadorned = optimizeDisjunctionUnadorned + scorch.OptimizeDisjunctionUnadornedMinChildCardinality = + optimizeDisjunctionUnadornedMinChildCardinality + }() + + test := tests[testi] + + for searcherOptionsI, searcherOptions := range searcherOptionsToCompare { + // Construct the leaf term searchers. + var searchers []search.Searcher + + for _, fieldTerm := range test.fieldTerms { + ft := strings.Split(fieldTerm, ":") + field := ft[0] + term := ft[1] + + searcher, err := NewTermSearcher(indexReader, term, field, 1.0, searcherOptions) + if err != nil { + t.Fatal(err) + } + + searchers = append(searchers, searcher) + } + + // Construct the composite searcher. 
+ var cs search.Searcher + var err error + if compositeKind == "conjunction" { + cs, err = NewConjunctionSearcher(indexReader, searchers, searcherOptions) + } else { + cs, err = NewDisjunctionSearcher(indexReader, searchers, 0, searcherOptions) + } + if err != nil { + t.Fatal(err) + } + + ctx := &search.SearchContext{ + DocumentMatchPool: search.NewDocumentMatchPool(10, 0), + } + + next, err := cs.Next(ctx) + i := 0 + for err == nil && next != nil { + docID, err := indexReader.ExternalID(next.IndexInternalID) + if err != nil { + t.Fatal(err) + } + + if searcherOptionsI == 0 && allowOptimizations == false { + resultsToCompare[string(docID)] = true + } else { + if !resultsToCompare[string(docID)] { + t.Errorf("missing %s", string(docID)) + } + } + + next, err = cs.Next(ctx) + i++ + } + + if i != len(resultsToCompare) { + t.Errorf("mismatched count, %d vs %d", i, len(resultsToCompare)) + } + + if i == 0 && !strings.Contains(test.expectEmpty, compositeKind) { + t.Errorf("testi: %d, compositeKind: %s, allowOptimizations: %t,"+ + " searcherOptionsI: %d, searcherOptions: %#v,"+ + " expected some results but got no results on test: %#v", + testi, compositeKind, allowOptimizations, + searcherOptionsI, searcherOptions, test) + } + } +} From c8e737a945472f5f266c0799005216080443fbb8 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 29 Jan 2019 09:36:53 -0800 Subject: [PATCH 534/728] MB-32846 - more aggressively removeOldData() in scorch persister The scorch persister loop has an optimization to not block/wait if it sees that the latest root epoch has changed during the current persistence round, so that the persister can continue immediately to the top of the persister loop. But, this "continue OUTER" optimization would skip a removeOldData() invocation. This meant that removeOldData() wouldn't be invoked for potentially a long time in this kind of indexing-heavy scenario. 
While indexing 200K wiki docs (indexing only using bleve-blast), before this change, # of files could grow up to >100 zap segment files. After this change, the # of files stayed nearer the neighborhood of 10~20 files. See also: https://issues.couchbase.com/browse/MB-32846 Also, cleaned up some whitespace lines seen while trying to diagnose this issue. --- index/scorch/introducer.go | 4 +++- index/scorch/persister.go | 5 ++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ac627796f..2d04bd38e 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -376,7 +376,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { fileSegments++ } } - } // before the newMerge introduction, need to clean the newly @@ -393,6 +392,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } + // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. 
if nextMerge.new != nil && @@ -424,6 +424,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSnapshot.AddRef() // 1 ref for the nextMerge.notify response newSnapshot.updateSize() + s.rootLock.Lock() // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch @@ -501,6 +502,7 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { } newSnapshot.updateSize() + // swap in new snapshot rootPrev := s.root s.root = newSnapshot diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 2ba50867d..f75945d4d 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -111,6 +111,7 @@ OUTER: if ew != nil && ew.epoch > lastMergedEpoch { lastMergedEpoch = ew.epoch } + lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, lastMergedEpoch, persistWatchers, po) @@ -178,6 +179,7 @@ OUTER: s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) if changed { + s.removeOldData() atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) continue OUTER } @@ -659,13 +661,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { } func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { - rv := &IndexSnapshot{ parent: s, internal: make(map[string][]byte), refs: 1, creator: "loadSnapshot", } + var running uint64 c := snapshot.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { @@ -701,6 +703,7 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { running += segmentSnapshot.segment.Count() } } + return rv, nil } From 0e572047cb5f7839f2d903e71378c7e8d094f551 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 9 Jan 2019 14:39:42 +0530 Subject: [PATCH 535/728] MB-31974 - Streaming search results Adding optional builder/maker callbacks for the document match handlers from applications. 
These builder method gives back a threadsafe document match handler callback for bleve which it will invoke for every document hit during a search. The application should make its own copy of the hit/documentMatch in the callback if it wish to hold on to data. --- search/collector.go | 20 ++++++++ search/collector/topn.go | 98 +++++++++++++++++++++++++++++----------- search/search.go | 1 + 3 files changed, 92 insertions(+), 27 deletions(-) diff --git a/search/collector.go b/search/collector.go index 0d163a9d9..f7a490d65 100644 --- a/search/collector.go +++ b/search/collector.go @@ -30,3 +30,23 @@ type Collector interface { SetFacetsBuilder(facetsBuilder *FacetsBuilder) FacetResults() FacetResults } + +// DocumentMatchHandler is the type of document match callback +// bleve will invoke during the search. +// Eventually, bleve will indicate the completion of an ongoing search, +// by passing a nil value for the document match callback. +// The application should take a copy of the hit/documentMatch +// if it wish to own it or need prolonged access to it. +type DocumentMatchHandler func(hit *DocumentMatch) error + +type MakeDocumentMatchHandlerKeyType string + +var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( + "MakeDocumentMatchHandlerKey") + +// MakeDocumentMatchHandler is an optional DocumentMatchHandler +// builder function which the applications can pass to bleve. +// These builder methods gives a DocumentMatchHandler function +// to bleve, which it will invoke on every document matches. 
+type MakeDocumentMatchHandler func(ctx *SearchContext) ( + callback DocumentMatchHandler, loadID bool, err error) diff --git a/search/collector/topn.go b/search/collector/topn.go index 4b2682da0..3682952fd 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -140,6 +140,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } searchContext := &search.SearchContext{ DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), + Collector: hc, } hc.dvReader, err = reader.DocValueReader(hc.neededFields) @@ -154,6 +155,19 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, hc.sort.UpdateVisitor(field, term) } + dmHandlerMaker := MakeTopNDocumentMatchHandler + if cv := ctx.Value(search.MakeDocumentMatchHandlerKey); cv != nil { + dmHandlerMaker = cv.(search.MakeDocumentMatchHandler) + } + // use the application given builder for making the custom document match + // handler and perform callbacks/invocations on the newly made handler. + dmHandler, loadID, err := dmHandlerMaker(searchContext) + if err != nil { + return err + } + + hc.needDocIds = hc.needDocIds||loadID + select { case <-ctx.Done(): return ctx.Err() @@ -169,13 +183,26 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } } - err = hc.collectSingle(searchContext, reader, next) + err = hc.prepareDocumentMatch(searchContext, reader, next) + if err != nil { + break + } + + err = dmHandler(next) if err != nil { break } next, err = searcher.Next(searchContext) } + + // help finalize/flush the results in case + // of custom document match handlers. 
+ err = dmHandler(nil) + if err != nil { + return err + } + // compute search duration hc.took = time.Since(startTime) if err != nil { @@ -191,8 +218,8 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, var sortByScoreOpt = []string{"_score"} -func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.IndexReader, d *search.DocumentMatch) error { - var err error +func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, + reader index.IndexReader, d *search.DocumentMatch ) (err error) { // visit field terms for features that require it (sort, facets) if len(hc.neededFields) > 0 { @@ -226,35 +253,52 @@ func (hc *TopNCollector) collectSingle(ctx *search.SearchContext, reader index.I hc.sort.Value(d) } - // optimization, we track lowest sorting hit already removed from heap - // with this one comparison, we can avoid all heap operations if - // this hit would have been added and then immediately removed - if hc.lowestMatchOutsideResults != nil { - cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.lowestMatchOutsideResults) - if cmp >= 0 { - // this hit can't possibly be in the result set, so avoid heap ops - ctx.DocumentMatchPool.Put(d) - return nil - } - } + return nil +} - removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) - if removed != nil { - if hc.lowestMatchOutsideResults == nil { - hc.lowestMatchOutsideResults = removed - } else { - cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, removed, hc.lowestMatchOutsideResults) - if cmp < 0 { - tmp := hc.lowestMatchOutsideResults - hc.lowestMatchOutsideResults = removed - ctx.DocumentMatchPool.Put(tmp) +func MakeTopNDocumentMatchHandler( + ctx *search.SearchContext) (search.DocumentMatchHandler, bool, error) { + var hc *TopNCollector + var ok bool + if hc, ok = ctx.Collector.(*TopNCollector); ok { + return func(d *search.DocumentMatch) error { + if d == nil { + return nil + } + // optimization, we track lowest sorting hit 
already removed from heap + // with this one comparison, we can avoid all heap operations if + // this hit would have been added and then immediately removed + if hc.lowestMatchOutsideResults != nil { + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, + hc.lowestMatchOutsideResults) + if cmp >= 0 { + // this hit can't possibly be in the result set, so avoid heap ops + ctx.DocumentMatchPool.Put(d) + return nil + } } - } - } - return nil + removed := hc.store.AddNotExceedingSize(d, hc.size+hc.skip) + if removed != nil { + if hc.lowestMatchOutsideResults == nil { + hc.lowestMatchOutsideResults = removed + } else { + cmp := hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, + removed, hc.lowestMatchOutsideResults) + if cmp < 0 { + tmp := hc.lowestMatchOutsideResults + hc.lowestMatchOutsideResults = removed + ctx.DocumentMatchPool.Put(tmp) + } + } + } + return nil + }, false, nil + } + return nil, false, nil } + // visitFieldTerms is responsible for visiting the field terms of the // search hit, and passing visited terms to the sort and facet builder func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error { diff --git a/search/search.go b/search/search.go index 440c09571..1d26d0de0 100644 --- a/search/search.go +++ b/search/search.go @@ -285,6 +285,7 @@ type SearcherOptions struct { // SearchContext represents the context around a single search type SearchContext struct { DocumentMatchPool *DocumentMatchPool + Collector Collector } func (sc *SearchContext) Size() int { From f2126efa73aa93e6907aa1b8629c1d85bb8c1566 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 30 Jan 2019 13:23:44 -0800 Subject: [PATCH 536/728] Formatting: go fmt ./.. 
--- index/store/moss/lower_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/store/moss/lower_test.go b/index/store/moss/lower_test.go index 72b8a7194..afc0a0959 100644 --- a/index/store/moss/lower_test.go +++ b/index/store/moss/lower_test.go @@ -27,7 +27,7 @@ func openWithLower(t *testing.T, mo store.MergeOperator) (string, store.KVStore) tmpDir, _ := ioutil.TempDir("", "mossStore") config := map[string]interface{}{ - "path": tmpDir, + "path": tmpDir, "mossLowerLevelStoreName": "mossStore", } From 8b976597d8346285afeba177a968750696aebe9f Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 30 Jan 2019 13:19:24 -0800 Subject: [PATCH 537/728] MB-32855: Propagate min setting when optimizing a 1-claused disjunction --- search/query/disjunction.go | 8 ++- search/search.go | 5 ++ search/searcher/search_disjunction_heap.go | 4 ++ search/searcher/search_disjunction_slice.go | 4 ++ search/searcher/search_filter.go | 6 ++ search_test.go | 63 +++++++++++++++++++++ 6 files changed, 89 insertions(+), 1 deletion(-) diff --git a/search/query/disjunction.go b/search/query/disjunction.go index b884cb659..1effc29ac 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -58,7 +58,8 @@ func (q *DisjunctionQuery) SetMin(m float64) { q.Min = m } -func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) { +func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, + options search.SearcherOptions) (search.Searcher, error) { ss := make([]search.Searcher, 0, len(q.Disjuncts)) for _, disjunct := range q.Disjuncts { sr, err := disjunct.Searcher(i, m, options) @@ -80,6 +81,11 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) } else if len(ss) == 1 && q.Min <= 1 { + // update min setting of child searcher if supported + if searcher, ok := 
ss[0].(search.MinApplicableSearcher); ok { + searcher.SetMin(int(q.Min)) + } + // return the single nested searcher as is; only if min clauses is not greater than 1; return ss[0], nil } diff --git a/search/search.go b/search/search.go index 440c09571..ef7683332 100644 --- a/search/search.go +++ b/search/search.go @@ -277,6 +277,11 @@ type Searcher interface { DocumentMatchPoolSize() int } +type MinApplicableSearcher interface { + Searcher + SetMin(to int) +} + type SearcherOptions struct { Explain bool IncludeTermVectors bool diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index ec133f1f8..92b28dcc2 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -290,6 +290,10 @@ func (s *DisjunctionHeapSearcher) Min() int { return s.min } +func (s *DisjunctionHeapSearcher) SetMin(to int) { + s.min = to +} + func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { rv := len(s.searchers) for _, s := range s.searchers { diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go index e47f39ad0..00a631355 100644 --- a/search/searcher/search_disjunction_slice.go +++ b/search/searcher/search_disjunction_slice.go @@ -274,6 +274,10 @@ func (s *DisjunctionSliceSearcher) Min() int { return s.min } +func (s *DisjunctionSliceSearcher) SetMin(to int) { + s.min = to +} + func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { rv := len(s.currs) for _, s := range s.searchers { diff --git a/search/searcher/search_filter.go b/search/searcher/search_filter.go index 7c95fb41c..ff777623a 100644 --- a/search/searcher/search_filter.go +++ b/search/searcher/search_filter.go @@ -98,6 +98,12 @@ func (f *FilteringSearcher) Min() int { return f.child.Min() } +func (f *FilteringSearcher) SetMin(to int) { + if searcher, ok := f.child.(search.MinApplicableSearcher); ok { + searcher.SetMin(to) + } +} + func (f *FilteringSearcher) 
DocumentMatchPoolSize() int { return f.child.DocumentMatchPoolSize() } diff --git a/search_test.go b/search_test.go index 71fae4236..ef1cf1814 100644 --- a/search_test.go +++ b/search_test.go @@ -1083,3 +1083,66 @@ func TestDisjunctionQueryIncorrectMin(t *testing.T) { " but got: %v", res.Total) } } + +func TestBooleanShouldMinPropagation(t *testing.T) { + idx, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc1 := map[string]interface{}{ + "dept": "finance", + "name": "alvita", + "email": "alvita@domain.com", + } + + doc2 := map[string]interface{}{ + "dept": "dev-ops", + "name": "keelia", + "email": "keelia@domain.com", + } + + batch := idx.NewBatch() + + if err = batch.Index("doc1", doc1); err != nil { + t.Fatal(err) + } + + if err = batch.Index("doc2", doc2); err != nil { + t.Fatal(err) + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + mq1 := NewMatchQuery("dev-ops") + mq1.SetField("dept") + mq2 := NewMatchQuery("keelia@domain.com") + mq2.SetField("email") + bq := NewBooleanQuery() + bq.AddShould(mq1) + bq.AddMust(mq2) + sr := NewSearchRequest(bq) + + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if res.Total != 2 { + t.Errorf("Expected 2 results, but got: %v", res.Total) + } +} From 82b1018c17577775a0c2cdb30b5b93bbf1fc67f7 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 30 Jan 2019 15:39:55 -0800 Subject: [PATCH 538/728] Update example to match multiple terms --- search/query/disjunction.go | 4 ++-- search_test.go | 27 +++++++++++++++++---------- 2 files changed, 19 insertions(+), 12 deletions(-) diff --git a/search/query/disjunction.go b/search/query/disjunction.go index 1effc29ac..87ae9db7a 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -81,12 +81,12 @@ func (q *DisjunctionQuery) Searcher(i 
index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) } else if len(ss) == 1 && q.Min <= 1 { - // update min setting of child searcher if supported + // update min setting of child searcher if supported; and return the + // single child searcher as is when min is not greater than 1. if searcher, ok := ss[0].(search.MinApplicableSearcher); ok { searcher.SetMin(int(q.Min)) } - // return the single nested searcher as is; only if min clauses is not greater than 1; return ss[0], nil } diff --git a/search_test.go b/search_test.go index ef1cf1814..f2f9fdc56 100644 --- a/search_test.go +++ b/search_test.go @@ -1103,15 +1103,13 @@ func TestBooleanShouldMinPropagation(t *testing.T) { }() doc1 := map[string]interface{}{ - "dept": "finance", - "name": "alvita", - "email": "alvita@domain.com", + "dept": "queen", + "name": "cersei lannister", } doc2 := map[string]interface{}{ - "dept": "dev-ops", - "name": "keelia", - "email": "keelia@domain.com", + "dept": "kings guard", + "name": "jaime lannister", } batch := idx.NewBatch() @@ -1128,15 +1126,24 @@ func TestBooleanShouldMinPropagation(t *testing.T) { t.Fatal(err) } - mq1 := NewMatchQuery("dev-ops") + // term dictionaries in the index for field.. 
+ // dept: queen kings guard + // name: cersei jaime lannister + + // the following match query would match doc2 + mq1 := NewMatchQuery("kings guard") mq1.SetField("dept") - mq2 := NewMatchQuery("keelia@domain.com") - mq2.SetField("email") + + // the following match query would match both doc1 and doc2, + // as both docs share common lastname + mq2 := NewMatchQuery("jaime lannister") + mq2.SetField("name") + bq := NewBooleanQuery() bq.AddShould(mq1) bq.AddMust(mq2) - sr := NewSearchRequest(bq) + sr := NewSearchRequest(bq) res, err := idx.Search(sr) if err != nil { t.Fatal(err) From 20e27ce55e32cfc79b6cc776bc40648b24361291 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 1 Feb 2019 13:50:14 +0530 Subject: [PATCH 539/728] 1118 - Fix for cachedFieldDocs concurrent access --- index/scorch/scorch_test.go | 85 ++++++++++++++++++++++++++++++++ index/scorch/snapshot_segment.go | 21 ++++++-- 2 files changed, 102 insertions(+), 4 deletions(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index a1691fd09..d8a7c3c54 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -24,6 +24,7 @@ import ( "sync" "testing" "time" + "math/rand" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/analyzer/keyword" @@ -1509,6 +1510,90 @@ func TestIndexDocumentVisitFieldTerms(t *testing.T) { } } +func TestFieldTermsConcurrent(t *testing.T) { + cfg := CreateConfig("TestFieldTermsConcurrent") + + // setting path to empty string disables persistence/merging + // which ensures we have in-memory segments + // which is important for this test, to trigger the right code + // path, where fields exist, but have NOT been uninverted by + // the Segment impl (in memory segments are still SegmentBase) + cfg["path"] = "" + + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Fatal(err) + } + }() + + mp := mapping.NewIndexMapping() + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, 
cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + cerr := idx.Close() + if cerr != nil { + t.Fatal(cerr) + } + }() + + // create a single bath (leading to 1 in-memory segment) + // have one field named "name" and 100 others named f0-f99 + batch := index.NewBatch() + for i := 0; i < 1000; i++ { + data := map[string]string{ + "name": fmt.Sprintf("doc-%d", i), + } + for j := 0; j < 100; j++ { + data[fmt.Sprintf("f%d", j)] = fmt.Sprintf("v%d", i) + } + doc := document.NewDocument(fmt.Sprintf("%d", i)) + err = mp.MapDocument(doc, data) + if err != nil { + t.Errorf("error mapping doc: %v", err) + } + batch.Update(doc) + } + + err = idx.Batch(batch) + if err != nil { + t.Fatal(err) + } + + // now have 10 goroutines try to visit field values for doc 1 + // in a random field + var wg sync.WaitGroup + for j := 0; j < 10; j++ { + wg.Add(1) + go func() { + r, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docNumber, err := r.InternalID("1") + if err != nil { + t.Fatal(err) + } + err = r.DocumentVisitFieldTerms(docNumber, + []string{fmt.Sprintf("f%d", rand.Intn(100))}, + func(field string, term []byte) {}) + if err != nil { + t.Fatal(err) + } + wg.Done() + }() + } + + wg.Wait() +} + func TestConcurrentUpdate(t *testing.T) { cfg := CreateConfig("TestConcurrentUpdate") err := InitTest(cfg) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 0e0c59e9f..7d3de0811 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -113,14 +113,29 @@ func (s *SegmentSnapshot) Size() (rv int) { } type cachedFieldDocs struct { + m sync.Mutex readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. err error // Non-nil if there was an error when preparing this cachedFieldDocs. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. 
size uint64 } +func (cfd *cachedFieldDocs) Size() int { + var rv int + cfd.m.Lock() + for _, entry := range cfd.docs { + rv += 8 /* size of uint64 */ + len(entry) + } + cfd.m.Unlock() + return rv +} + func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { - defer close(cfd.readyCh) + cfd.m.Lock() + defer func() { + close(cfd.readyCh) + cfd.m.Unlock() + }() cfd.size += uint64(size.SizeOfUint64) /* size field */ dict, err := ss.segment.Dictionary(field) @@ -231,9 +246,7 @@ func (c *cachedDocs) updateSizeLOCKED() { for k, v := range c.cache { // cachedFieldDocs sizeInBytes += len(k) if v != nil { - for _, entry := range v.docs { // docs - sizeInBytes += 8 /* size of uint64 */ + len(entry) - } + sizeInBytes += v.Size() } } atomic.StoreUint64(&c.size, uint64(sizeInBytes)) From be1fb367f2e36d4b8bf0bc26cee6e3c82525e2c0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Feb 2019 07:56:06 -0500 Subject: [PATCH 540/728] fix race in test for scorch batch callbacks previous version access/modifies values from multiple goroutines. this version uses a waitgroup to know that the callback fired, if the callback does not fire the test will timeout. 
--- index/scorch/scorch_test.go | 30 ++++++++---------------------- 1 file changed, 8 insertions(+), 22 deletions(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index a1691fd09..2c4dc602a 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -15,7 +15,6 @@ package scorch import ( - "fmt" "log" "os" "reflect" @@ -950,24 +949,22 @@ func TestIndexBatchWithCallbacks(t *testing.T) { t.Fatalf("error opening index: %v", err) } defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) + cerr := idx.Close() + if cerr != nil { + t.Fatal(cerr) } }() // Check that callback function works - updated := false - cbErr := fmt.Errorf("") + var wg sync.WaitGroup + wg.Add(1) batch := index.NewBatch() doc := document.NewDocument("3") doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) batch.Update(doc) batch.SetPersistedCallback(func(e error) { - updated = true - cbErr = e - + wg.Done() }) err = idx.Batch(batch) @@ -975,19 +972,8 @@ func TestIndexBatchWithCallbacks(t *testing.T) { t.Error(err) } - for i := 0; i < 30; i++ { - if updated { - break - } - time.Sleep(500 * time.Millisecond) - } - if !updated { - t.Fatal("Callback function wasn't called") - } - if cbErr != nil { - t.Fatal("Error wasn't updated properly on callback function") - } - + wg.Wait() + // test has no assertion but will timeout if callback doesn't fire } func TestIndexInsertUpdateDeleteWithMultipleTypesStored(t *testing.T) { From 3c0b0ae477f8d880fc1ebde4680bda19495de9cb Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Feb 2019 10:25:16 -0500 Subject: [PATCH 541/728] fix data race when indexing empty batches and using reset When the batches passed to bleve are empty, this was a special case that inadvertently avoided proper synchronization, and led to the race detector to correctly point out that resetting the batch was unsafe. By simply avoiding an unnecessary step when the batch is empty, we can avoid the problem. 
--- index/scorch/scorch.go | 18 +++++++++-------- index/upsidedown/upsidedown.go | 18 +++++++++-------- index_test.go | 35 ++++++++++++++++++++++++++++++++++ 3 files changed, 55 insertions(+), 16 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 07568e6e5..23608fc3b 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -312,15 +312,17 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { // FIXME could sort ids list concurrent with analysis? - go func() { - for _, doc := range batch.IndexOps { - if doc != nil { - aw := index.NewAnalysisWork(s, doc, resultChan) - // put the work on the queue - s.analysisQueue.Queue(aw) + if len(batch.IndexOps) > 0 { + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(s, doc, resultChan) + // put the work on the queue + s.analysisQueue.Queue(aw) + } } - } - }() + }() + } // wait for analysis result analysisResults := make([]*index.AnalysisResult, int(numUpdates)) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 8edbb5b3d..e4bc3d8f0 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -810,15 +810,17 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } } - go func() { - for _, doc := range batch.IndexOps { - if doc != nil { - aw := index.NewAnalysisWork(udc, doc, resultChan) - // put the work on the queue - udc.analysisQueue.Queue(aw) + if len(batch.IndexOps) > 0 { + go func() { + for _, doc := range batch.IndexOps { + if doc != nil { + aw := index.NewAnalysisWork(udc, doc, resultChan) + // put the work on the queue + udc.analysisQueue.Queue(aw) + } } - } - }() + }() + } // retrieve back index rows concurrent with analysis docBackIndexRowErr := error(nil) diff --git a/index_test.go b/index_test.go index 55b38909d..d1351dc29 100644 --- a/index_test.go +++ b/index_test.go @@ -32,6 +32,7 @@ import ( "github.com/blevesearch/bleve/analysis/analyzer/keyword" 
"github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store/boltdb" "github.com/blevesearch/bleve/index/store/null" "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" @@ -2149,3 +2150,37 @@ func TestBug1096(t *testing.T) { t.Fatalf("expected only 2 hits '9' and '90', got %v", res) } } + +func TestDataRaceBug1092(t *testing.T) { + defer func() { + rerr := os.RemoveAll("testidx") + if rerr != nil { + t.Fatal(rerr) + } + }() + + // use default mapping + mapping := NewIndexMapping() + + var idx Index + idx, err = NewUsing("testidx", mapping, upsidedown.Name, boltdb.Name, nil) + if err != nil { + log.Fatal(err) + } + defer func() { + cerr := idx.Close() + if cerr != nil { + t.Fatal(cerr) + } + }() + + batch := idx.NewBatch() + for i := 0; i < 10; i++ { + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + batch.Reset() + } +} From 2aa43c5988384f2e8ebfaf44d595d04e94f7ff8e Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 1 Feb 2019 10:16:10 -0800 Subject: [PATCH 542/728] Un-export MinApplicableSearcher --> minApplicableSearcher Also, since the FilteringSearcher encapsulates a child Searcher, a "min" update need not be propagated for it. --- search/query/disjunction.go | 2 +- search/query/query.go | 7 +++++++ search/search.go | 5 ----- search/searcher/search_filter.go | 6 ------ 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/search/query/disjunction.go b/search/query/disjunction.go index 87ae9db7a..792259320 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -83,7 +83,7 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, } else if len(ss) == 1 && q.Min <= 1 { // update min setting of child searcher if supported; and return the // single child searcher as is when min is not greater than 1. 
- if searcher, ok := ss[0].(search.MinApplicableSearcher); ok { + if searcher, ok := ss[0].(minApplicableSearcher); ok { searcher.SetMin(int(q.Min)) } diff --git a/search/query/query.go b/search/query/query.go index c7c1eefb8..71b515bf4 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -63,6 +63,13 @@ type ValidatableQuery interface { Validate() error } +// A Searcher to which the "min" setting is applicable, and supports +// updates to the setting. +type minApplicableSearcher interface { + search.Searcher + SetMin(int) +} + // ParseQuery deserializes a JSON representation of // a Query object. func ParseQuery(input []byte) (Query, error) { diff --git a/search/search.go b/search/search.go index ef7683332..440c09571 100644 --- a/search/search.go +++ b/search/search.go @@ -277,11 +277,6 @@ type Searcher interface { DocumentMatchPoolSize() int } -type MinApplicableSearcher interface { - Searcher - SetMin(to int) -} - type SearcherOptions struct { Explain bool IncludeTermVectors bool diff --git a/search/searcher/search_filter.go b/search/searcher/search_filter.go index ff777623a..7c95fb41c 100644 --- a/search/searcher/search_filter.go +++ b/search/searcher/search_filter.go @@ -98,12 +98,6 @@ func (f *FilteringSearcher) Min() int { return f.child.Min() } -func (f *FilteringSearcher) SetMin(to int) { - if searcher, ok := f.child.(search.MinApplicableSearcher); ok { - searcher.SetMin(to) - } -} - func (f *FilteringSearcher) DocumentMatchPoolSize() int { return f.child.DocumentMatchPoolSize() } From 84070ba7a39fe96b5ca0216a5e9b89b2e329bd17 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Feb 2019 15:22:07 -0500 Subject: [PATCH 543/728] fix build somehow latest merges broke --- index/scorch/scorch_test.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 9ac65183b..550a584ae 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -15,7 +15,9 @@ 
package scorch import ( + "fmt" "log" + "math/rand" "os" "reflect" "regexp" @@ -23,7 +25,6 @@ import ( "sync" "testing" "time" - "math/rand" "github.com/blevesearch/bleve/analysis" "github.com/blevesearch/bleve/analysis/analyzer/keyword" From d59ba7f56723df4f1f9ddf8b4180e9bbb317c078 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 1 Feb 2019 15:47:19 -0500 Subject: [PATCH 544/728] enable race detector on travis --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fd6e1ba38..35f7b60f2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -13,7 +13,7 @@ script: - go get github.com/kisielk/errcheck - go get -u github.com/FiloSottile/gvt - gvt restore - - go test -v $(go list ./... | grep -v vendor/) + - go test -race -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/) - docs/project-code-coverage.sh From 8e1c5b2bf4d3ea439fdefd0b0ba6e2d0f851ff03 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 4 Feb 2019 15:35:04 +0530 Subject: [PATCH 545/728] Unit tests for -Collector Chaining -DocumentMatchHandler -MakeDocumentMatchHandler --- search/collector/topn_test.go | 257 ++++++++++++++++++++++++++++++++++ 1 file changed, 257 insertions(+) diff --git a/search/collector/topn_test.go b/search/collector/topn_test.go index d50e38a0c..2e668bd9e 100644 --- a/search/collector/topn_test.go +++ b/search/collector/topn_test.go @@ -15,6 +15,7 @@ package collector import ( + "bytes" "context" "testing" @@ -467,6 +468,262 @@ func TestPaginationSameScores(t *testing.T) { } } +// TestStreamResults verifies the search.DocumentMatchHandler +func TestStreamResults(t *testing.T) { + matches := []*search.DocumentMatch{ + { + IndexInternalID: index.IndexInternalID("a"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("b"), + Score: 1, + }, + { + IndexInternalID: index.IndexInternalID("c"), + Score: 11, + }, + { + 
IndexInternalID: index.IndexInternalID("d"), + Score: 999, + }, + { + IndexInternalID: index.IndexInternalID("e"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("f"), + Score: 9, + }, + { + IndexInternalID: index.IndexInternalID("g"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("h"), + Score: 89, + }, + { + IndexInternalID: index.IndexInternalID("i"), + Score: 101, + }, + { + IndexInternalID: index.IndexInternalID("j"), + Score: 112, + }, + { + IndexInternalID: index.IndexInternalID("k"), + Score: 10, + }, + { + IndexInternalID: index.IndexInternalID("l"), + Score: 99, + }, + { + IndexInternalID: index.IndexInternalID("m"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("n"), + Score: 111, + }, + } + + searcher := &stubSearcher{ + matches: matches, + } + ind := 0 + docMatchHandler := func(hit *search.DocumentMatch) error { + if hit == nil { + return nil // search completed + } + if !bytes.Equal(hit.IndexInternalID, matches[ind].IndexInternalID) { + t.Errorf("%d hit IndexInternalID actual: %s, expected: %s", + ind, hit.IndexInternalID, matches[ind].IndexInternalID) + } + if hit.Score != matches[ind].Score { + t.Errorf("%d hit Score actual: %s, expected: %s", + ind, hit.IndexInternalID, matches[ind].IndexInternalID) + } + ind++ + return nil + } + + var handlerMaker search.MakeDocumentMatchHandler + handlerMaker = func(ctx *search.SearchContext) (search.DocumentMatchHandler, + bool, error) { + return docMatchHandler, false, nil + } + + ctx := context.WithValue(context.Background(), search.MakeDocumentMatchHandlerKey, + handlerMaker) + + collector := NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + err := collector.Collect(ctx, searcher, &stubReader{}) + if err != nil { + t.Fatal(err) + } + + maxScore := collector.MaxScore() + if maxScore != 999.0 { + t.Errorf("expected max score 99.0, got %f", maxScore) + } + + total := collector.Total() + if int(total) != ind { + t.Errorf("expected 14 total 
results, got %d", total) + } + + results := collector.Results() + + if len(results) != 0 { + t.Fatalf("expected 0 results, got %d", len(results)) + } +} + +// TestCollectorChaining verifies the chaining of collectors. +// The custom DocumentMatchHandler can process every hit for +// the search query and then pass the hit to the topn collector +// to eventually have the sorted top `N` results. +func TestCollectorChaining(t *testing.T) { + matches := []*search.DocumentMatch{ + { + IndexInternalID: index.IndexInternalID("a"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("b"), + Score: 1, + }, + { + IndexInternalID: index.IndexInternalID("c"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("d"), + Score: 999, + }, + { + IndexInternalID: index.IndexInternalID("e"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("f"), + Score: 9, + }, + { + IndexInternalID: index.IndexInternalID("g"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("h"), + Score: 89, + }, + { + IndexInternalID: index.IndexInternalID("i"), + Score: 101, + }, + { + IndexInternalID: index.IndexInternalID("j"), + Score: 112, + }, + { + IndexInternalID: index.IndexInternalID("k"), + Score: 10, + }, + { + IndexInternalID: index.IndexInternalID("l"), + Score: 99, + }, + { + IndexInternalID: index.IndexInternalID("m"), + Score: 11, + }, + { + IndexInternalID: index.IndexInternalID("n"), + Score: 111, + }, + } + + searcher := &stubSearcher{ + matches: matches, + } + + var topNHandler search.DocumentMatchHandler + ind := 0 + docMatchHandler := func(hit *search.DocumentMatch) error { + if hit == nil { + return nil // search completed + } + if !bytes.Equal(hit.IndexInternalID, matches[ind].IndexInternalID) { + t.Errorf("%d hit IndexInternalID actual: %s, expected: %s", + ind, hit.IndexInternalID, matches[ind].IndexInternalID) + } + if hit.Score != matches[ind].Score { + t.Errorf("%d hit Score actual: %s, expected: %s", + ind, hit.IndexInternalID, 
matches[ind].IndexInternalID) + } + ind++ + // give the hit back to the topN collector + err := topNHandler(hit) + if err != nil { + t.Errorf("unexpected err: %v", err) + } + return nil + } + + var handlerMaker search.MakeDocumentMatchHandler + handlerMaker = func(ctx *search.SearchContext) (search.DocumentMatchHandler, + bool, error) { + topNHandler, _, _ = MakeTopNDocumentMatchHandler(ctx) + return docMatchHandler, false, nil + } + + ctx := context.WithValue(context.Background(), search.MakeDocumentMatchHandlerKey, + handlerMaker) + + collector := NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) + err := collector.Collect(ctx, searcher, &stubReader{}) + if err != nil { + t.Fatal(err) + } + + maxScore := collector.MaxScore() + if maxScore != 999.0 { + t.Errorf("expected max score 99.0, got %f", maxScore) + } + + total := collector.Total() + if int(total) != ind { + t.Errorf("expected 14 total results, got %d", total) + } + + results := collector.Results() + + if len(results) != 10 { // as it is paged + t.Fatalf("expected 0 results, got %d", len(results)) + } + + if results[0].ID != "d" { + t.Errorf("expected first result to have ID 'l', got %s", results[0].ID) + } + + if results[0].Score != 999.0 { + t.Errorf("expected highest score to be 999.0, got %f", results[0].Score) + } + + minScore := 1000.0 + for _, result := range results { + if result.Score < minScore { + minScore = result.Score + } + } + + if minScore < 10 { + t.Errorf("expected minimum score to be higher than 10, got %f", minScore) + } +} + func BenchmarkTop10of0Scores(b *testing.B) { benchHelper(0, func() search.Collector { return NewTopNCollector(10, 0, search.SortOrder{&search.SortScore{Desc: true}}) From c73ceb8bd4355f02aa46e1d9102727dbd9f2fbfe Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 4 Feb 2019 15:45:37 -0800 Subject: [PATCH 546/728] allocate walkContext only when docMapping is enabled --- mapping/index.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/mapping/index.go b/mapping/index.go index fc5d12a73..602764cbb 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -320,8 +320,8 @@ func (im *IndexMappingImpl) determineType(data interface{}) string { func (im *IndexMappingImpl) MapDocument(doc *document.Document, data interface{}) error { docType := im.determineType(data) docMapping := im.mappingForType(docType) - walkContext := im.newWalkContext(doc, docMapping) if docMapping.Enabled { + walkContext := im.newWalkContext(doc, docMapping) docMapping.walkDocument(data, []string{}, []uint64{}, walkContext) // see if the _all field was disabled From 46f577ca226f9b392b07110c84d9613e38074474 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 5 Feb 2019 17:35:34 -0800 Subject: [PATCH 547/728] MB-32855: Applying the disjunction 1-clause optimization + This optimization can be applied only when .. - The parent disjunction searcher has only 1 child searcher. - The 1 child searcher's min is the same as the parent's min. + This means that the "min" setting doesn't need to be propagated. --- search/query/disjunction.go | 10 ++- search/query/query.go | 7 --- search/searcher/search_disjunction_heap.go | 4 -- search/searcher/search_disjunction_slice.go | 4 -- search_test.go | 70 +++++++++++++++++++++ 5 files changed, 74 insertions(+), 21 deletions(-) diff --git a/search/query/disjunction.go b/search/query/disjunction.go index 792259320..2bc1d7044 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -80,12 +80,10 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) - } else if len(ss) == 1 && q.Min <= 1 { - // update min setting of child searcher if supported; and return the - // single child searcher as is when min is not greater than 1. 
- if searcher, ok := ss[0].(minApplicableSearcher); ok { - searcher.SetMin(int(q.Min)) - } + } else if len(ss) == 1 && int(q.Min) == ss[0].Min() { + // apply optimization only if both conditions below are satisfied: + // - disjunction searcher has only 1 child searcher + // - parent searcher's min setting is equal to child searcher's min return ss[0], nil } diff --git a/search/query/query.go b/search/query/query.go index 71b515bf4..c7c1eefb8 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -63,13 +63,6 @@ type ValidatableQuery interface { Validate() error } -// A Searcher to which the "min" setting is applicable, and supports -// updates to the setting. -type minApplicableSearcher interface { - search.Searcher - SetMin(int) -} - // ParseQuery deserializes a JSON representation of // a Query object. func ParseQuery(input []byte) (Query, error) { diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index 92b28dcc2..ec133f1f8 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -290,10 +290,6 @@ func (s *DisjunctionHeapSearcher) Min() int { return s.min } -func (s *DisjunctionHeapSearcher) SetMin(to int) { - s.min = to -} - func (s *DisjunctionHeapSearcher) DocumentMatchPoolSize() int { rv := len(s.searchers) for _, s := range s.searchers { diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go index 00a631355..e47f39ad0 100644 --- a/search/searcher/search_disjunction_slice.go +++ b/search/searcher/search_disjunction_slice.go @@ -274,10 +274,6 @@ func (s *DisjunctionSliceSearcher) Min() int { return s.min } -func (s *DisjunctionSliceSearcher) SetMin(to int) { - s.min = to -} - func (s *DisjunctionSliceSearcher) DocumentMatchPoolSize() int { rv := len(s.currs) for _, s := range s.searchers { diff --git a/search_test.go b/search_test.go index f2f9fdc56..a37dea703 100644 --- a/search_test.go +++ 
b/search_test.go @@ -1153,3 +1153,73 @@ func TestBooleanShouldMinPropagation(t *testing.T) { t.Errorf("Expected 2 results, but got: %v", res.Total) } } + +func TestDisjunctionMinPropagation(t *testing.T) { + idx, err := New("testidx", NewIndexMapping()) + if err != nil { + t.Fatal(err) + } + + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc1 := map[string]interface{}{ + "dept": "finance", + "name": "xyz", + } + + doc2 := map[string]interface{}{ + "dept": "marketing", + "name": "xyz", + } + + doc3 := map[string]interface{}{ + "dept": "engineering", + "name": "abc", + } + + batch := idx.NewBatch() + + if err = batch.Index("doc1", doc1); err != nil { + t.Fatal(err) + } + + if err = batch.Index("doc2", doc2); err != nil { + t.Fatal(err) + } + + if err = batch.Index("doc3", doc3); err != nil { + t.Fatal(err) + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + mq1 := NewMatchQuery("finance") + mq2 := NewMatchQuery("marketing") + dq := NewDisjunctionQuery(mq1, mq2) + dq.SetMin(3) + + dq2 := NewDisjunctionQuery(dq) + dq2.SetMin(1) + + sr := NewSearchRequest(dq2) + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if res.Total != 0 { + t.Fatalf("Expect 0 results, but got: %v", res.Total) + } +} From 1415017996a1771ba6df5cf1dc7f805d2a86b348 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 5 Feb 2019 17:39:06 -0800 Subject: [PATCH 548/728] Fix formatting: go fmt ./... 
--- index/scorch/snapshot_segment.go | 2 +- search/collector.go | 2 +- search/collector/topn.go | 11 +++++------ search/search.go | 2 +- 4 files changed, 8 insertions(+), 9 deletions(-) diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index 7d3de0811..f3a2c56a9 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -113,7 +113,7 @@ func (s *SegmentSnapshot) Size() (rv int) { } type cachedFieldDocs struct { - m sync.Mutex + m sync.Mutex readyCh chan struct{} // closed when the cachedFieldDocs.docs is ready to be used. err error // Non-nil if there was an error when preparing this cachedFieldDocs. docs map[uint64][]byte // Keyed by localDocNum, value is a list of terms delimited by 0xFF. diff --git a/search/collector.go b/search/collector.go index f7a490d65..df3ff9c5a 100644 --- a/search/collector.go +++ b/search/collector.go @@ -48,5 +48,5 @@ var MakeDocumentMatchHandlerKey = MakeDocumentMatchHandlerKeyType( // builder function which the applications can pass to bleve. // These builder methods gives a DocumentMatchHandler function // to bleve, which it will invoke on every document matches. 
-type MakeDocumentMatchHandler func(ctx *SearchContext) ( +type MakeDocumentMatchHandler func(ctx *SearchContext) ( callback DocumentMatchHandler, loadID bool, err error) diff --git a/search/collector/topn.go b/search/collector/topn.go index 3682952fd..378a7b114 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -140,7 +140,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, } searchContext := &search.SearchContext{ DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), - Collector: hc, + Collector: hc, } hc.dvReader, err = reader.DocValueReader(hc.neededFields) @@ -166,7 +166,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, return err } - hc.needDocIds = hc.needDocIds||loadID + hc.needDocIds = hc.needDocIds || loadID select { case <-ctx.Done(): @@ -188,7 +188,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, break } - err = dmHandler(next) + err = dmHandler(next) if err != nil { break } @@ -196,7 +196,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, next, err = searcher.Next(searchContext) } - // help finalize/flush the results in case + // help finalize/flush the results in case // of custom document match handlers. 
err = dmHandler(nil) if err != nil { @@ -219,7 +219,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, var sortByScoreOpt = []string{"_score"} func (hc *TopNCollector) prepareDocumentMatch(ctx *search.SearchContext, - reader index.IndexReader, d *search.DocumentMatch ) (err error) { + reader index.IndexReader, d *search.DocumentMatch) (err error) { // visit field terms for features that require it (sort, facets) if len(hc.neededFields) > 0 { @@ -298,7 +298,6 @@ func MakeTopNDocumentMatchHandler( return nil, false, nil } - // visitFieldTerms is responsible for visiting the field terms of the // search hit, and passing visited terms to the sort and facet builder func (hc *TopNCollector) visitFieldTerms(reader index.IndexReader, d *search.DocumentMatch) error { diff --git a/search/search.go b/search/search.go index 1d26d0de0..44451fa1e 100644 --- a/search/search.go +++ b/search/search.go @@ -285,7 +285,7 @@ type SearcherOptions struct { // SearchContext represents the context around a single search type SearchContext struct { DocumentMatchPool *DocumentMatchPool - Collector Collector + Collector Collector } func (sc *SearchContext) Size() int { From 098b7f138b0fef882f978cdebc4236dcb316272e Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 11 Feb 2019 14:01:34 +0530 Subject: [PATCH 549/728] cleaning up DocumentMatch - removing the un-used Document field --- search/search.go | 8 -------- 1 file changed, 8 deletions(-) diff --git a/search/search.go b/search/search.go index 440c09571..f0e24e8a2 100644 --- a/search/search.go +++ b/search/search.go @@ -18,7 +18,6 @@ import ( "fmt" "reflect" - "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/size" ) @@ -100,9 +99,6 @@ type DocumentMatch struct { // fields as float64s and date fields as time.RFC3339 formatted strings. 
Fields map[string]interface{} `json:"fields,omitempty"` - // if we load the document for this hit, remember it so we dont load again - Document *document.Document `json:"-"` - // used to maintain natural index order HitNumber uint64 `json:"-"` @@ -195,10 +191,6 @@ func (dm *DocumentMatch) Size() int { size.SizeOfPtr } - if dm.Document != nil { - sizeInBytes += dm.Document.Size() - } - return sizeInBytes } From 16c228ad9b0a41adf04cc2da39e56366efa34dc5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 11 Feb 2019 17:12:34 -0800 Subject: [PATCH 550/728] Propagate Score setting to child search request during MultiSearch(..) --- index_alias_impl.go | 1 + 1 file changed, 1 insertion(+) diff --git a/index_alias_impl.go b/index_alias_impl.go index f678a059b..335fcade2 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -433,6 +433,7 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { Explain: req.Explain, Sort: req.Sort.Copy(), IncludeLocations: req.IncludeLocations, + Score: req.Score, } return &rv } From da6534acf84faf5eca44a20e015e745f9f6c45b4 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 13 Feb 2019 17:07:56 -0800 Subject: [PATCH 551/728] MB-32855: Revert conjunction-single clause optimization Reverting the single-clause conjunction optimization for now, as we're noticing unexpected results while executing a boolean query in certain scenarios. 
--- search/query/conjunction.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/search/query/conjunction.go b/search/query/conjunction.go index f14cbdd88..1a7ed1bc0 100644 --- a/search/query/conjunction.go +++ b/search/query/conjunction.go @@ -73,9 +73,6 @@ func (q *ConjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) - } else if len(ss) == 1 { - // return single nested searcher as is - return ss[0], nil } return searcher.NewConjunctionSearcher(i, ss, options) From 326f90d860033a97e50da3919dc38fc3da610d6b Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Feb 2019 17:04:49 -0500 Subject: [PATCH 552/728] switch to bbolt (#1134) switch to use github.com/etcd-io/bbolt in upsidedown test, close reader before closing index --- index/scorch/persister.go | 2 +- index/scorch/scorch.go | 2 +- index/scorch/snapshot_rollback.go | 2 +- index/store/boltdb/iterator.go | 2 +- index/store/boltdb/reader.go | 2 +- index/store/boltdb/store.go | 2 +- index/store/boltdb/store_test.go | 2 +- index/upsidedown/upsidedown_test.go | 5 +++++ vendor/manifest | 6 +++--- 9 files changed, 15 insertions(+), 10 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index f75945d4d..349ccdc0e 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -33,7 +33,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) var DefaultChunkFactor uint32 = 1024 diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 23608fc3b..3f3d8bffc 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -31,7 +31,7 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - "github.com/boltdb/bolt" + bolt 
"github.com/etcd-io/bbolt" ) const Name = "scorch" diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index 247003311..470868d0e 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -19,7 +19,7 @@ import ( "log" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type RollbackPoint struct { diff --git a/index/store/boltdb/iterator.go b/index/store/boltdb/iterator.go index 82ab946fd..4b5019f1f 100644 --- a/index/store/boltdb/iterator.go +++ b/index/store/boltdb/iterator.go @@ -17,7 +17,7 @@ package boltdb import ( "bytes" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type Iterator struct { diff --git a/index/store/boltdb/reader.go b/index/store/boltdb/reader.go index 1d701c982..4cd94183c 100644 --- a/index/store/boltdb/reader.go +++ b/index/store/boltdb/reader.go @@ -16,7 +16,7 @@ package boltdb import ( "github.com/blevesearch/bleve/index/store" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) type Reader struct { diff --git a/index/store/boltdb/store.go b/index/store/boltdb/store.go index d8de0768f..7021cd91c 100644 --- a/index/store/boltdb/store.go +++ b/index/store/boltdb/store.go @@ -30,7 +30,7 @@ import ( "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) const ( diff --git a/index/store/boltdb/store_test.go b/index/store/boltdb/store_test.go index 693c7b9f3..6411c239c 100644 --- a/index/store/boltdb/store_test.go +++ b/index/store/boltdb/store_test.go @@ -20,7 +20,7 @@ import ( "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store/test" - "github.com/boltdb/bolt" + bolt "github.com/etcd-io/bbolt" ) func open(t *testing.T, mo store.MergeOperator) store.KVStore { diff --git a/index/upsidedown/upsidedown_test.go b/index/upsidedown/upsidedown_test.go index 288f49003..2135a22c9 
100644 --- a/index/upsidedown/upsidedown_test.go +++ b/index/upsidedown/upsidedown_test.go @@ -1401,6 +1401,11 @@ func TestConcurrentUpdate(t *testing.T) { if len(doc.Fields) > 1 { t.Errorf("expected single field, found %d", len(doc.Fields)) } + + err = r.Close() + if err != nil { + t.Fatal(err) + } } func TestLargeField(t *testing.T) { diff --git a/vendor/manifest b/vendor/manifest index 5bdb6a076..729d8a64f 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -26,10 +26,10 @@ "notests": true }, { - "importpath": "github.com/boltdb/bolt", - "repository": "https://github.com/boltdb/bolt", + "importpath": "github.com/etcd-io/bbolt", + "repository": "https://github.com/etcd-io/bbolt", "vcs": "", - "revision": "9da31745363232bc1e27dbab3569e77383a51585", + "revision": "7ee3ded59d4835e10f3e7d0f7603c42aa5e83820", "branch": "master", "notests": true }, From 05d86ea8f6e30456949f612cf68cf4a27ce8c9c5 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 14 Feb 2019 17:05:07 -0500 Subject: [PATCH 553/728] add support for configuring the boltdb initialMmapSize option (#1135) Without this option, sometimes committing a write Tx will block until all ongoing read Txs finish. If your application supports running long searches, this may result in unacceptable delays to indexing the data. By setting this value to a very large value (say 1TB), you can ensure that bolt will never need to remap to complete the write Tx. On 64-bit systems this may be an acceptable trade-off. 
See also: https://github.com/boltdb/bolt/issues/240 https://github.com/boltdb/bolt/issues/489 --- index/store/boltdb/store.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/index/store/boltdb/store.go b/index/store/boltdb/store.go index 7021cd91c..56613d531 100644 --- a/index/store/boltdb/store.go +++ b/index/store/boltdb/store.go @@ -74,6 +74,12 @@ func New(mo store.MergeOperator, config map[string]interface{}) (store.KVStore, bo.ReadOnly = ro } + if initialMmapSize, ok := config["initialMmapSize"].(int); ok { + bo.InitialMmapSize = initialMmapSize + } else if initialMmapSize, ok := config["initialMmapSize"].(float64); ok { + bo.InitialMmapSize = int(initialMmapSize) + } + db, err := bolt.Open(path, 0600, bo) if err != nil { return nil, err From 8bbe51a161c462ca9baa952cb5bf27604016eddd Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 19 Feb 2019 17:12:08 +0530 Subject: [PATCH 554/728] dict cmd additions -exposing 1hit count and total term info -exposing postings size and cardinality --- cmd/bleve/cmd/zap/dict.go | 41 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/cmd/bleve/cmd/zap/dict.go b/cmd/bleve/cmd/zap/dict.go index 2c60d31da..3cd256fa5 100644 --- a/cmd/bleve/cmd/zap/dict.go +++ b/cmd/bleve/cmd/zap/dict.go @@ -19,6 +19,7 @@ import ( "fmt" "math" + "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/couchbase/vellum" "github.com/spf13/cobra" @@ -47,6 +48,7 @@ var dictCmd = &cobra.Command{ fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] fmt.Printf("raw vellum data:\n % x\n", fstBytes) fmt.Printf("dictionary:\n") + var termsCount, hit1Count int64 if fstBytes != nil { fst, err := vellum.Load(fstBytes) if err != nil { @@ -61,15 +63,21 @@ var dictCmd = &cobra.Command{ docNum, normBits := zap.FSTValDecode1Hit(currVal) norm := math.Float32frombits(uint32(normBits)) extra = fmt.Sprintf("-- docNum: %d, norm: %f", docNum, 
norm) + fmt.Printf(" %s - %d (%x) %s\n", currTerm, currVal, currVal, extra) + hit1Count++ + } else { + // fetch the postings size, cardinality in case of non 1 hits + l, c := readPostingCardinality(currVal, data) + fmt.Printf(" %s - %d (%x) posting byteSize: %d cardinality: %d\n", + currTerm, currVal, currVal, l, c) } - - fmt.Printf(" %s - %d (%x) %s\n", currTerm, currVal, currVal, extra) + termsCount++ err = itr.Next() } if err != nil && err != vellum.ErrIteratorDone { return fmt.Errorf("error iterating dictionary: %v", err) } - + fmt.Printf("Total terms in dictionary : %d 1hit count: %d\n", termsCount, hit1Count) } return nil @@ -79,3 +87,30 @@ var dictCmd = &cobra.Command{ func init() { RootCmd.AddCommand(dictCmd) } + +func readPostingCardinality(postingsOffset uint64, data []byte) (int, uint64) { + // read the location of the freq/norm details + var n uint64 + var read int + + _, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) + n += uint64(read) + + _, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + var postingsLen uint64 + postingsLen, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) + n += uint64(read) + + roaringBytes := data[postingsOffset+n : postingsOffset+n+postingsLen] + + r := roaring.NewBitmap() + + _, err := r.FromBuffer(roaringBytes) + if err != nil { + fmt.Printf("error loading roaring bitmap: %v", err) + } + + return len(roaringBytes), r.GetCardinality() +} From eb62de6a81c63347e4c2a2312e145ee518991fc3 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 22 Feb 2019 09:27:26 +0530 Subject: [PATCH 555/728] LoadAndHighLightFields API Refactoring the code a bit to let applications to perform field loading and any highlighting. Mainly helpful with the streamed result hits. 
--- index_impl.go | 135 +++++++++++++++++++++++++---------------------- search/search.go | 1 + 2 files changed, 74 insertions(+), 62 deletions(-) diff --git a/index_impl.go b/index_impl.go index fe61b8064..63fe39ccb 100644 --- a/index_impl.go +++ b/index_impl.go @@ -542,71 +542,13 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } for _, hit := range hits { - if len(req.Fields) > 0 || highlighter != nil { - doc, err := indexReader.Document(hit.ID) - if err == nil && doc != nil { - if len(req.Fields) > 0 { - fieldsToLoad := deDuplicate(req.Fields) - for _, f := range fieldsToLoad { - for _, docF := range doc.Fields { - if f == "*" || docF.Name() == f { - var value interface{} - switch docF := docF.(type) { - case *document.TextField: - value = string(docF.Value()) - case *document.NumericField: - num, err := docF.Number() - if err == nil { - value = num - } - case *document.DateTimeField: - datetime, err := docF.DateTime() - if err == nil { - value = datetime.Format(time.RFC3339) - } - case *document.BooleanField: - boolean, err := docF.Boolean() - if err == nil { - value = boolean - } - case *document.GeoPointField: - lon, err := docF.Lon() - if err == nil { - lat, err := docF.Lat() - if err == nil { - value = []float64{lon, lat} - } - } - } - if value != nil { - hit.AddFieldValue(docF.Name(), value) - } - } - } - } - } - if highlighter != nil { - highlightFields := req.Highlight.Fields - if highlightFields == nil { - // add all fields with matches - highlightFields = make([]string, 0, len(hit.Locations)) - for k := range hit.Locations { - highlightFields = append(highlightFields, k) - } - } - for _, hf := range highlightFields { - highlighter.BestFragmentsInField(hit, doc, hf, 1) - } - } - } else if doc == nil { - // unexpected case, a doc ID that was found as a search hit - // was unable to be found during document lookup - return nil, ErrorIndexReadInconsistency - } - } if i.name != "" { hit.Index = i.name } + err = 
LoadAndHighlightFields(hit, req, i.name, indexReader, highlighter) + if err != nil { + return nil, err + } } atomic.AddUint64(&i.stats.searches, 1) @@ -632,6 +574,75 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr }, nil } +func LoadAndHighlightFields(hit *search.DocumentMatch, req *SearchRequest, + indexName string, r index.IndexReader, + highlighter highlight.Highlighter) error { + if len(req.Fields) > 0 || highlighter != nil { + doc, err := r.Document(hit.ID) + if err == nil && doc != nil { + if len(req.Fields) > 0 { + fieldsToLoad := deDuplicate(req.Fields) + for _, f := range fieldsToLoad { + for _, docF := range doc.Fields { + if f == "*" || docF.Name() == f { + var value interface{} + switch docF := docF.(type) { + case *document.TextField: + value = string(docF.Value()) + case *document.NumericField: + num, err := docF.Number() + if err == nil { + value = num + } + case *document.DateTimeField: + datetime, err := docF.DateTime() + if err == nil { + value = datetime.Format(time.RFC3339) + } + case *document.BooleanField: + boolean, err := docF.Boolean() + if err == nil { + value = boolean + } + case *document.GeoPointField: + lon, err := docF.Lon() + if err == nil { + lat, err := docF.Lat() + if err == nil { + value = []float64{lon, lat} + } + } + } + if value != nil { + hit.AddFieldValue(docF.Name(), value) + } + } + } + } + } + if highlighter != nil { + highlightFields := req.Highlight.Fields + if highlightFields == nil { + // add all fields with matches + highlightFields = make([]string, 0, len(hit.Locations)) + for k := range hit.Locations { + highlightFields = append(highlightFields, k) + } + } + for _, hf := range highlightFields { + highlighter.BestFragmentsInField(hit, doc, hf, 1) + } + } + } else if doc == nil { + // unexpected case, a doc ID that was found as a search hit + // was unable to be found during document lookup + return ErrorIndexReadInconsistency + } + } + + return nil +} + // Fields returns the name 
of all the fields this // Index has operated on. func (i *indexImpl) Fields() (fields []string, err error) { diff --git a/search/search.go b/search/search.go index f8a282d16..72bb0ea29 100644 --- a/search/search.go +++ b/search/search.go @@ -279,6 +279,7 @@ type SearcherOptions struct { type SearchContext struct { DocumentMatchPool *DocumentMatchPool Collector Collector + IndexReader index.IndexReader } func (sc *SearchContext) Size() int { From 4c8905c96c2995005ac87120ecb4a35d6aeb3923 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 26 Feb 2019 10:43:21 -0800 Subject: [PATCH 556/728] Add API for index/store/moss to retrieve associated collection --- index/store/moss/store.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/store/moss/store.go b/index/store/moss/store.go index 89ea553cf..47aab076f 100644 --- a/index/store/moss/store.go +++ b/index/store/moss/store.go @@ -218,6 +218,10 @@ func (s *Store) LowerLevelStore() store.KVStore { return s.llstore } +func (s *Store) Collection() moss.Collection { + return s.ms +} + func init() { registry.RegisterKVStore(Name, New) } From a91b427b59b893f112021841ba7370d285f8426f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 3 Mar 2019 10:18:44 -0500 Subject: [PATCH 557/728] fix data race with batch reset (#1150) this is another variation of the race found/fixed in #1092 in that case the batch was empty, which meant we would skip the code that properly synchronized access. our fix only handled this exact case (no data operations), however there is another variation, if the batch contains only deletes (which are data ops) we still spawned the goroutine, although since there were no real updates, again the synchronization code would be skipped, and thus the data race could happen. 
the fix is to check the number of updates (computed earlier on the caller's goroutine, so it's safe) instead of the length of the IndexOps (which includes updates and deletes) the key is that we should only spawn the goroutine that will range over the batch, in cases where we will synchronize on waiting for the analysis to complete (at least one update). fixes #1149 --- index/scorch/scorch.go | 2 +- index/upsidedown/upsidedown.go | 2 +- index_test.go | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 34 insertions(+), 2 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 3f3d8bffc..2641b3b19 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -312,7 +312,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { // FIXME could sort ids list concurrent with analysis? - if len(batch.IndexOps) > 0 { + if numUpdates > 0 { go func() { for _, doc := range batch.IndexOps { if doc != nil { diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index e4bc3d8f0..7992ac1d9 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -810,7 +810,7 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { } } - if len(batch.IndexOps) > 0 { + if numUpdates > 0 { go func() { for _, doc := range batch.IndexOps { if doc != nil { diff --git a/index_test.go b/index_test.go index d1351dc29..65ddedaa8 100644 --- a/index_test.go +++ b/index_test.go @@ -2184,3 +2184,35 @@ func TestDataRaceBug1092(t *testing.T) { batch.Reset() } } + +func TestBatchRaceBug1149(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + i, err := New("testidx", NewIndexMapping()) + //i, err := NewUsing("testidx", NewIndexMapping(), "scorch", "scorch", nil) + if err != nil { + t.Fatal(err) + } + defer func() { + err := i.Close() + if err != nil { + t.Fatal(err) + } + }() + b := i.NewBatch() + b.Delete("1") + err = i.Batch(b) + if err != nil { + 
t.Fatal(err) + } + b.Reset() + err = i.Batch(b) + if err != nil { + t.Fatal(err) + } + b.Reset() +} From 119f9f47953812f1569f6b630188428346332dc0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 3 Mar 2019 19:36:36 -0500 Subject: [PATCH 558/728] fix postings list count implementation (#1143) previously the calculation subtracted the total number of deleted documents, without regard for how the deleted documents set might overlap with the postings list, this was incorrect. fixes #1140 --- index/scorch/segment/zap/posting.go | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 26378c27e..93b51ae73 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -264,18 +264,17 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, // Count returns the number of items on this postings list func (p *PostingsList) Count() uint64 { - var n uint64 + var n, e uint64 if p.normBits1Hit != 0 { n = 1 + if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { + e = 1 + } } else if p.postings != nil { n = p.postings.GetCardinality() - } - var e uint64 - if p.except != nil { - e = p.except.GetCardinality() - } - if n <= e { - return 0 + if p.except != nil { + e = p.postings.AndCardinality(p.except) + } } return n - e } From 4b5cfbc8d5656cf05bf74d1ae270d2ca505560fc Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 7 Mar 2019 08:51:01 -0500 Subject: [PATCH 559/728] fix issue mapping documents containing *time.Time (#1153) * fix issue mapping documents containing *time.Time If documents contained a field of type *time.Time AND they also had an explicit mapping for the field of a type OTHER THAN text, the field would not be mapped. The issue was introduced when we added special support for handling encoding.TextMarshaler because *time.Time satisfies this interface. 
This change fixes this issue, but there are some more complicated cases exposed by this, specifically related to mapping the same field multiple times, and handling cases where you try to mix multiple types. Instead, our solution is to further limit the way we support using ecoding.TextMarshaler. The new limitation is that we ONLY use the output of MarshalText() if you have an explicit mapping AND ALL the index fields are of type text. All other cases will ignore TextMarshaler and follow regular indexing rules. * address code review comments --- mapping/document.go | 26 +++++++++++++++++--------- mapping/mapping_test.go | 32 ++++++++++++++++++++++++++++++++ 2 files changed, 49 insertions(+), 9 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index f950b59be..15cb6b5fa 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -525,19 +525,27 @@ func (dm *DocumentMapping) processProperty(property interface{}, path []string, if !propertyValue.IsNil() { switch property := property.(type) { case encoding.TextMarshaler: - - txt, err := property.MarshalText() - if err == nil && subDocMapping != nil { - // index by explicit mapping + // ONLY process TextMarshaler if there is an explicit mapping + // AND all of the fiels are of type text + // OTHERWISE process field without TextMarshaler + if subDocMapping != nil { + allFieldsText := true for _, fieldMapping := range subDocMapping.Fields { - if fieldMapping.Type == "text" { - fieldMapping.processString(string(txt), pathString, path, indexes, context) + if fieldMapping.Type != "text" { + allFieldsText = false + break } } - } else { - dm.walkDocument(property, path, indexes, context) + txt, err := property.MarshalText() + if err == nil && allFieldsText { + txtStr := string(txt) + for _, fieldMapping := range subDocMapping.Fields { + fieldMapping.processString(txtStr, pathString, path, indexes, context) + } + return + } } - + dm.walkDocument(property, path, indexes, context) default: 
dm.walkDocument(property, path, indexes, context) } diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 498fdb4ec..2349da707 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -1109,3 +1109,35 @@ func TestClosestDocDynamicMapping(t *testing.T) { t.Fatalf("expected 1 field, got: %d", len(doc.Fields)) } } + +func TestMappingPointerToTimeBug1152(t *testing.T) { + + when, err := time.Parse(time.RFC3339, "2019-03-06T15:04:05Z") + if err != nil { + t.Fatal(err) + } + + thing := struct { + When *time.Time + }{ + When: &when, + } + + // this case tests when there WAS an explicit mapping, but it was NOT type text + // as this was the specific case that was problematic + m := NewIndexMapping() + dtf := NewDateTimeFieldMapping() + m.DefaultMapping.AddFieldMappingsAt("When", dtf) + doc := document.NewDocument("x") + err = m.MapDocument(doc, thing) + if err != nil { + t.Fatal(err) + } + + if len(doc.Fields) != 1 { + t.Fatalf("expected 1 field, got: %d", len(doc.Fields)) + } + if _, ok := doc.Fields[0].(*document.DateTimeField); !ok { + t.Fatalf("expected field to be type *document.DateTimeField, got %T", doc.Fields[0]) + } +} \ No newline at end of file From baab665d1b509f1247dd97c2e9eaa59d2a8ac6e3 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 12 Mar 2019 16:01:16 +0530 Subject: [PATCH 560/728] IndexReader initialisation in SearchContext Initialising the IndexReader field of the SearchContext so that the DocumentMatchHandler implementation will have enough details for invoking the LoadAndHighlightFields API. 
--- search/collector/topn.go | 1 + 1 file changed, 1 insertion(+) diff --git a/search/collector/topn.go b/search/collector/topn.go index 378a7b114..ff08d0a9c 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -141,6 +141,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, searchContext := &search.SearchContext{ DocumentMatchPool: search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), Collector: hc, + IndexReader: reader, } hc.dvReader, err = reader.DocValueReader(hc.neededFields) From d12e143a661a8d1b2f067b72ea5e8f4620f62093 Mon Sep 17 00:00:00 2001 From: Mervin Date: Thu, 14 Mar 2019 07:31:44 +0800 Subject: [PATCH 561/728] Fix panic on range query with unlimited upper in scorch (issue#1156). (#1157) * fix panic on range query with unlimited upper in scorch. * add unit test for the bug also remove unnecessary `= nil` since that is the zero-value --- index/scorch/segment/zap/dict.go | 13 +++++--- index/scorch/segment/zap/dict_test.go | 44 +++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 2c0e1bf2a..190265d6e 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -143,11 +143,14 @@ func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator } // need to increment the end position to be inclusive - endBytes := []byte(end) - if endBytes[len(endBytes)-1] < 0xff { - endBytes[len(endBytes)-1]++ - } else { - endBytes = append(endBytes, 0xff) + var endBytes []byte + if len(end) > 0 { + endBytes = []byte(end) + if endBytes[len(endBytes)-1] < 0xff { + endBytes[len(endBytes)-1]++ + } else { + endBytes = append(endBytes, 0xff) + } } if d.fst != nil { diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index b654bf45f..378abd854 100644 --- a/index/scorch/segment/zap/dict_test.go +++ 
b/index/scorch/segment/zap/dict_test.go @@ -279,3 +279,47 @@ func TestDictionaryError(t *testing.T) { t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) } } + +func TestDictionaryBug1156(t *testing.T) { + + _ = os.RemoveAll("/tmp/scorch.zap") + + testSeg, _, _ := buildTestSegmentForDict() + err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") + if err != nil { + t.Fatalf("error persisting segment: %v", err) + } + + segment, err := Open("/tmp/scorch.zap") + if err != nil { + t.Fatalf("error opening segment: %v", err) + } + defer func() { + cerr := segment.Close() + if cerr != nil { + t.Fatalf("error closing segment: %v", err) + } + }() + + dict, err := segment.Dictionary("desc") + if err != nil { + t.Fatal(err) + } + + // test range iterator + expected := []string{"cat", "dog", "egg", "fish"} + var got []string + itr := dict.RangeIterator("cat", "") + next, err := itr.Next() + for next != nil && err == nil { + got = append(got, next.Term) + next, err = itr.Next() + } + if err != nil { + t.Fatalf("dict itr error: %v", err) + } + + if !reflect.DeepEqual(expected, got) { + t.Errorf("expected: %v, got: %v", expected, got) + } +} From 57a60f1bbc4ee907082cd6eb3e365795156b6862 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 13 Mar 2019 19:38:17 -0400 Subject: [PATCH 562/728] run integration tests with scorch (#1159) some minor changes have been made to a few of the queries to ensure that the search results have a deterministic order one test was removed as it was specifically checking the order when no sort was specified, which is defined in a way that does not lend itself to easy testing we did not change all tests without sort order, as many of them are correctly ordered by score already --- .travis.yml | 1 + test/tests/basic/searches.json | 5 +++++ test/tests/fosdem/searches.json | 1 + test/tests/sort/searches.json | 33 --------------------------------- 4 files changed, 7 insertions(+), 33 deletions(-) diff --git a/.travis.yml b/.travis.yml index 
35f7b60f2..fcc516db5 100644 --- a/.travis.yml +++ b/.travis.yml @@ -15,6 +15,7 @@ script: - gvt restore - go test -race -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) + - go test ./test -v -indexType scorch - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/) - docs/project-code-coverage.sh - docs/build_children.sh diff --git a/test/tests/basic/searches.json b/test/tests/basic/searches.json index 42bcee299..0f9606ede 100644 --- a/test/tests/basic/searches.json +++ b/test/tests/basic/searches.json @@ -189,6 +189,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "field": "birthday", "start": "2010-01-01" @@ -408,6 +409,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "query": "-title:mista" } @@ -516,6 +518,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "match_all": {} } @@ -543,6 +546,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "ids": ["b", "c"] } @@ -564,6 +568,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "query": "+age:>20 missess" } diff --git a/test/tests/fosdem/searches.json b/test/tests/fosdem/searches.json index 99f95aaad..444f4dc30 100644 --- a/test/tests/fosdem/searches.json +++ b/test/tests/fosdem/searches.json @@ -3,6 +3,7 @@ "search": { "from": 0, "size": 10, + "sort": ["_id"], "query": { "field": "category", "match_phrase": "Perl" diff --git a/test/tests/sort/searches.json b/test/tests/sort/searches.json index a679f0b8d..bb51720a8 100644 --- a/test/tests/sort/searches.json +++ b/test/tests/sort/searches.json @@ -1,37 +1,4 @@ [ - { - "comment": "default order, all have same score, then by natural index order", - "search": { - "from": 0, - "size": 10, - "query": { - "match_all":{} - } - }, - "result": { - "total_hits": 6, - "hits": [ - { - "id": "a" - }, - { - "id": "b" - }, - { - "id": "c" - }, - { - "id": "d" - }, - { - "id": "e" - }, - { - "id": "f" - } - ] - } - }, { "comment": "sort by name, 
ascending", "search": { From 308ce95fe6f6e2c976462c5db0743e9a010b3d19 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 15 Mar 2019 09:22:15 -0400 Subject: [PATCH 563/728] change sort strategy for integration tests (#1161) any test which is not explicitly setting a sort order should instead specify: "sort": ["-_score", "_id"] this starts with the natural order of score descending, but also imposes an additional tiebreaking rule to sort by the documents identifier. this comes with some additional cost, but makes results more stable for testing purposes. --- test/tests/basic/searches.json | 60 ++++++++++++++++++++++++++++--- test/tests/employee/searches.json | 2 ++ test/tests/fosdem/searches.json | 6 +++- test/tests/phrase/searches.json | 22 ++++++++++++ 4 files changed, 84 insertions(+), 6 deletions(-) diff --git a/test/tests/basic/searches.json b/test/tests/basic/searches.json index 0f9606ede..8b6206c32 100644 --- a/test/tests/basic/searches.json +++ b/test/tests/basic/searches.json @@ -1,8 +1,10 @@ [ { + "comment": "test term search, exact match", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "term": "marti" @@ -18,9 +20,11 @@ } }, { + "comment": "test term search, no match", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "term": "noone" @@ -32,9 +36,11 @@ } }, { + "comment": "test match phrase search", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "match_phrase": "long name" } @@ -49,9 +55,11 @@ } }, { + "comment": "test term search, no match", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "term": "walking" @@ -63,9 +71,11 @@ } }, { + "comment": "test match search, matching due to analysis", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "fuzziness": 0, "prefix_length": 0, @@ -83,9 +93,11 @@ } }, { + "comment": "test term prefix search", "search": { "from": 0, 
"size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "prefix": "bobble" @@ -101,9 +113,11 @@ } }, { + "comment": "test simple query string", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "query": "+name:phone" } @@ -118,9 +132,11 @@ } }, { + "comment": "test numeric range, no lower bound", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "age", "max": 30 @@ -139,9 +155,11 @@ } }, { + "comment": "test numeric range, upper and lower bounds", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "age", "max": 30, @@ -158,9 +176,11 @@ } }, { + "comment": "test conjunction of numeric range, upper and lower bounds", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "conjuncts": [ { @@ -186,10 +206,11 @@ } }, { + "comment": "test date range, no upper bound", "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "field": "birthday", "start": "2010-01-01" @@ -208,9 +229,11 @@ } }, { + "comment": "test numeric range, no lower bound", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "birthday", "end": "2010-01-01" @@ -226,9 +249,11 @@ } }, { + "comment": "test term search, matching inside an array", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "tags", "term": "gopher" @@ -244,9 +269,11 @@ } }, { + "comment": "test term search, matching another element inside array", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "tags", "term": "belieber" @@ -262,9 +289,11 @@ } }, { + "comment": "test term search, not present in array", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "tags", "term": "notintagsarray" @@ -280,6 +309,7 @@ "search": { "from": 0, "size": 0, + "sort": ["-_score", "_id"], "query": { "field": "name", "term": "marti" @@ -295,6 +325,7 @@ 
"search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "fields": ["tags"], "query": { "field": "name", @@ -314,9 +345,11 @@ } }, { + "comment": "test fuzzy search, fuzziness 1 with match", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "term": "msrti", @@ -337,6 +370,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "match": "long" @@ -362,6 +396,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "match": "long" @@ -385,6 +420,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "fields": ["age","birthday"], "query": { "field": "name", @@ -409,7 +445,7 @@ "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "query": "-title:mista" } @@ -434,6 +470,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "match": "long" @@ -460,6 +497,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "tags", "match": "gopher" @@ -485,6 +523,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "title", "prefix": "miss" @@ -504,6 +543,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "match_none": {} } @@ -518,7 +558,7 @@ "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "match_all": {} } @@ -546,7 +586,7 @@ "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "ids": ["b", "c"] } @@ -568,7 +608,7 @@ "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "query": "+age:>20 missess" } @@ -593,6 +633,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "regexp": "mar.*" @@ -612,6 +653,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { 
"field": "name", "regexp": "mar." @@ -627,6 +669,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "name", "wildcard": "mar*" @@ -646,6 +689,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "disjuncts": [ { @@ -678,6 +722,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "disjuncts": [ { @@ -711,6 +756,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "disjuncts": [ { @@ -744,6 +790,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "disjuncts": [ { @@ -776,6 +823,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "query": "name:mar*" } @@ -794,6 +842,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "query": "name:/mar.*/" } @@ -812,6 +861,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "title", "max": "miz", diff --git a/test/tests/employee/searches.json b/test/tests/employee/searches.json index abba2e9fa..d4db280b5 100644 --- a/test/tests/employee/searches.json +++ b/test/tests/employee/searches.json @@ -1,8 +1,10 @@ [ { + "comment": "test array position output", "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "manages.reports", "term": "julián" diff --git a/test/tests/fosdem/searches.json b/test/tests/fosdem/searches.json index 444f4dc30..4909fea27 100644 --- a/test/tests/fosdem/searches.json +++ b/test/tests/fosdem/searches.json @@ -3,7 +3,7 @@ "search": { "from": 0, "size": 10, - "sort": ["_id"], + "sort": ["-_score", "_id"], "query": { "field": "category", "match_phrase": "Perl" @@ -31,6 +31,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "match": "lisp" } @@ -51,6 +52,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": {"boost":1,"query":"+lisp +category:Perl"} }, "result": { @@ -66,6 +68,7 @@ 
"search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": {"boost":1,"query":"+lisp +category:\"Perl\""} }, "result": { @@ -81,6 +84,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "must": { "conjuncts":[ diff --git a/test/tests/phrase/searches.json b/test/tests/phrase/searches.json index 02d23a244..70f9c8346 100644 --- a/test/tests/phrase/searches.json +++ b/test/tests/phrase/searches.json @@ -3,6 +3,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty" @@ -21,6 +22,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty Thousand" @@ -39,6 +41,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty Thousand Leagues" @@ -57,6 +60,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty Thousand Leagues Under" @@ -75,6 +79,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty Thousand Leagues Under the" @@ -93,6 +98,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Twenty Thousand Leagues Under the Sea" @@ -111,6 +117,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Thousand" @@ -129,6 +136,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Thousand Leagues" @@ -147,6 +155,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Thousand Leagues Under" @@ -165,6 +174,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Thousand Leagues Under the" @@ -183,6 +193,7 @@ "search": { 
"from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Thousand Leagues Under the Sea" @@ -201,6 +212,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Leagues" @@ -219,6 +231,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Leagues Under" @@ -237,6 +250,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Leagues Under the" @@ -255,6 +269,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Leagues Under the Sea" @@ -273,6 +288,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Under the Sea" @@ -291,6 +307,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "the Sea" @@ -309,6 +326,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "Sea" @@ -327,6 +345,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "bad call" @@ -345,6 +364,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "defenseless receiver" @@ -363,6 +383,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "match_phrase": "bad receiver" @@ -378,6 +399,7 @@ "search": { "from": 0, "size": 10, + "sort": ["-_score", "_id"], "query": { "field": "body", "terms": [["twenti","thirti"],["thousand"]] From c6b5eca7b2acff74224a35a03822a806af47b277 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 18 Mar 2019 18:44:44 -0700 Subject: [PATCH 564/728] Import analysis/char/asciifolding in config to register the filter --- config/config.go | 1 + 1 file changed, 1 
insertion(+) diff --git a/config/config.go b/config/config.go index ad0bdcb9a..cf2827a5c 100644 --- a/config/config.go +++ b/config/config.go @@ -31,6 +31,7 @@ import ( _ "github.com/blevesearch/bleve/search/highlight/highlighter/simple" // char filters + _ "github.com/blevesearch/bleve/analysis/char/asciifolding" _ "github.com/blevesearch/bleve/analysis/char/html" _ "github.com/blevesearch/bleve/analysis/char/regexp" _ "github.com/blevesearch/bleve/analysis/char/zerowidthnonjoiner" From 950ca8b43520816995ed92c9d95a86dedd0c4964 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 18 Mar 2019 17:35:17 -0700 Subject: [PATCH 565/728] go fmt --- index/store/moss/lower_test.go | 2 +- mapping/mapping_test.go | 2 +- search/collector/topn.go | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/index/store/moss/lower_test.go b/index/store/moss/lower_test.go index afc0a0959..72b8a7194 100644 --- a/index/store/moss/lower_test.go +++ b/index/store/moss/lower_test.go @@ -27,7 +27,7 @@ func openWithLower(t *testing.T, mo store.MergeOperator) (string, store.KVStore) tmpDir, _ := ioutil.TempDir("", "mossStore") config := map[string]interface{}{ - "path": tmpDir, + "path": tmpDir, "mossLowerLevelStoreName": "mossStore", } diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 2349da707..b57283f3a 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -1140,4 +1140,4 @@ func TestMappingPointerToTimeBug1152(t *testing.T) { if _, ok := doc.Fields[0].(*document.DateTimeField); !ok { t.Fatalf("expected field to be type *document.DateTimeField, got %T", doc.Fields[0]) } -} \ No newline at end of file +} diff --git a/search/collector/topn.go b/search/collector/topn.go index ff08d0a9c..b30bd6ecf 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -141,7 +141,7 @@ func (hc *TopNCollector) Collect(ctx context.Context, searcher search.Searcher, searchContext := &search.SearchContext{ DocumentMatchPool: 
search.NewDocumentMatchPool(backingSize+searcher.DocumentMatchPoolSize(), len(hc.sort)), Collector: hc, - IndexReader: reader, + IndexReader: reader, } hc.dvReader, err = reader.DocValueReader(hc.neededFields) From af45e364cfbda2039fc5047fc1b392e6abe478e2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 18 Mar 2019 14:41:50 -0700 Subject: [PATCH 566/728] upsidedown.UpdateWithAnalysis() advanced helper method This commit refactors the UpsideDownCouch.Update(doc) method with a helper method, UpdateWithAnalysis(), in order to allow advanced apps to skip the analysis queue and back-index lookup steps of upsidedown's single-doc Update(). The use-case motivation details... couchbase server is integrating the N1QL query engine with its bleve-powered full-text search engine, and may need to check that a document matches a search during N1QL's predicate filtering phase. For example... SELECT * FROM `beer-sample` as b WHERE SEARCH(b, "desc:good"); SELECT * FROM [ {"desc": "the beer is good"} ] AS b WHERE SEARCH(b, "desc:good"); The above SEARCH()'es default to using a default, dynamic mapping. If an appropriate FTS index exists, it can be used to speed up the query. In the case of a non-existent or non-covering FTS index, there might be many JSON docs that need to be predicate-filtered by the N1QL engine, where predicate-filtering is processed 1 doc at a time. This is supported by creating a reused, upsidedown index using moss configured in memory-only mode. Each doc to be examined is a) indexed (so the index's doc-count will only ever reach a max of 1) and b) the upsidedown index is searched to see if the doc is a hit. In upsidedown's Update(doc), the usual handoff to the analysis goroutines was dominating the pprof graphs. So, this commit enables a performance optimization where couchbase can now avoid goroutine switching via UpdateWithAnalysis(). Some microbenchmark results from github.com/couchbase/n1fty... 
BenchmarkMossWithoutOptimizations-8 20000 61246 ns/op 26021 B/op 386 allocs/op BenchmarkMossWithOptimizeReset-8 30000 55692 ns/op 20863 B/op 323 allocs/op BenchmarkMossWithOptimizeResetAndUpdate-8 50000 32241 ns/op 16924 B/op 230 allocs/op The "Reset" is another, recent moss-specific optimization (ea630c), for a fuller before-vs-after picture. --- index/upsidedown/upsidedown.go | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 7992ac1d9..0699dbf97 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -415,7 +415,6 @@ func (udc *UpsideDownCouch) Close() error { func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) { // do analysis before acquiring write lock analysisStart := time.Now() - numPlainTextBytes := doc.NumPlainTextBytes() resultChan := make(chan *index.AnalysisResult) aw := index.NewAnalysisWork(udc, doc, resultChan) @@ -452,6 +451,11 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) { return } + return udc.UpdateWithAnalysis(doc, result, backIndexRow) +} + +func (udc *UpsideDownCouch) UpdateWithAnalysis(doc *document.Document, + result *index.AnalysisResult, backIndexRow *BackIndexRow) (err error) { // start a writer for this update indexStart := time.Now() var kvwriter store.KVWriter @@ -490,7 +494,7 @@ func (udc *UpsideDownCouch) Update(doc *document.Document) (err error) { atomic.AddUint64(&udc.stats.indexTime, uint64(time.Since(indexStart))) if err == nil { atomic.AddUint64(&udc.stats.updates, 1) - atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, numPlainTextBytes) + atomic.AddUint64(&udc.stats.numPlainTextBytesIndexed, doc.NumPlainTextBytes()) } else { atomic.AddUint64(&udc.stats.errors, 1) } From 59adf0b315be3cb05b3ae26e916a7bf44d6ca2f2 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 22 Mar 2019 12:15:59 -0700 Subject: [PATCH 567/728] fixes #1168 - filter duplicate locations in 
search results This fix filters out duplicate locations using a destructive in-place sort on the locations slice, and then a pass to remove duplicates. This hopefully does not have undue impact on normal case performance, as the deduplication effort happens only in the "last step" of finalizeResults() and its DocumentMatch.Complete() invocations. e.g., the deduplication will be only invoked on the final result hits (e.g., "size": 10) instead of on all the (potentially many) candidate document-matches which were examined beforehand. Also, we only perform the deduplication effort only if we detect that the locations that are being examined look like they're not in increasing order. --- search/search.go | 82 +++++++++++++++++++++++++++++++++++++++- search/search_test.go | 76 +++++++++++++++++++++++++++++++++++++ search_test.go | 41 ++++++++++++++++++++ test/integration_test.go | 2 +- 4 files changed, 199 insertions(+), 2 deletions(-) create mode 100644 search/search_test.go diff --git a/search/search.go b/search/search.go index 72bb0ea29..8ed23de45 100644 --- a/search/search.go +++ b/search/search.go @@ -17,6 +17,7 @@ package search import ( "fmt" "reflect" + "sort" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/size" @@ -49,6 +50,24 @@ func (ap ArrayPositions) Equals(other ArrayPositions) bool { return true } +func (ap ArrayPositions) Compare(other ArrayPositions) int { + for i, p := range ap { + if i >= len(other) { + return 1 + } + if p < other[i] { + return -1 + } + if p > other[i] { + return 1 + } + } + if len(ap) < len(other) { + return -1 + } + return 0 +} + type Location struct { // Pos is the position of the term within the field, starting at 1 Pos uint64 `json:"pos"` @@ -68,6 +87,46 @@ func (l *Location) Size() int { type Locations []*Location +func (p Locations) Len() int { return len(p) } +func (p Locations) Swap(i, j int) { p[i], p[j] = p[j], p[i] } + +func (p Locations) Less(i, j int) bool { + c := 
p[i].ArrayPositions.Compare(p[j].ArrayPositions) + if c < 0 { + return true + } + if c > 0 { + return false + } + return p[i].Pos < p[j].Pos +} + +func (p Locations) Dedupe() Locations { // destructive! + if len(p) <= 1 { + return p + } + + sort.Sort(p) + + slow := 0 + + for _, pfast := range p { + pslow := p[slow] + if pslow.Pos == pfast.Pos && + pslow.Start == pfast.Start && + pslow.End == pfast.End && + pslow.ArrayPositions.Equals(pfast.ArrayPositions) { + continue // duplicate, so only move fast ahead + } + + slow++ + + p[slow] = pfast + } + + return p[:slow+1] +} + type TermLocationMap map[string]Locations func (t TermLocationMap) AddLocation(term string, location *Location) { @@ -208,6 +267,7 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location { var lastField string var tlm TermLocationMap + var needsDedupe bool for i, ftl := range dm.FieldTermLocations { if lastField != ftl.Field { @@ -231,7 +291,19 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location { loc.ArrayPositions = append(ArrayPositions(nil), loc.ArrayPositions...) 
} - tlm[ftl.Term] = append(tlm[ftl.Term], loc) + locs := tlm[ftl.Term] + + // if the loc is before or at the last location, then there + // might be duplicates that need to be deduplicated + if !needsDedupe && len(locs) > 0 { + last := locs[len(locs)-1] + cmp := loc.ArrayPositions.Compare(last.ArrayPositions) + if cmp < 0 || (cmp == 0 && loc.Pos <= last.Pos) { + needsDedupe = true + } + } + + tlm[ftl.Term] = append(locs, loc) dm.FieldTermLocations[i] = FieldTermLocation{ // recycle Location: Location{ @@ -239,6 +311,14 @@ func (dm *DocumentMatch) Complete(prealloc []Location) []Location { }, } } + + if needsDedupe { + for _, tlm := range dm.Locations { + for term, locs := range tlm { + tlm[term] = locs.Dedupe() + } + } + } } dm.FieldTermLocations = dm.FieldTermLocations[:0] // recycle diff --git a/search/search_test.go b/search/search_test.go new file mode 100644 index 000000000..f15dc061c --- /dev/null +++ b/search/search_test.go @@ -0,0 +1,76 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package search + +import ( + "reflect" + "testing" +) + +func TestArrayPositionsCompare(t *testing.T) { + tests := []struct { + a []uint64 + b []uint64 + expect int + }{ + {nil, nil, 0}, + {[]uint64{}, []uint64{}, 0}, + {[]uint64{1}, []uint64{}, 1}, + {[]uint64{1}, []uint64{1}, 0}, + {[]uint64{}, []uint64{1}, -1}, + {[]uint64{0}, []uint64{1}, -1}, + {[]uint64{1}, []uint64{0}, 1}, + {[]uint64{1}, []uint64{1, 2}, -1}, + {[]uint64{1, 2}, []uint64{1}, 1}, + {[]uint64{1, 2}, []uint64{1, 2}, 0}, + {[]uint64{1, 2}, []uint64{1, 200}, -1}, + {[]uint64{1, 2}, []uint64{100, 2}, -1}, + {[]uint64{1, 2}, []uint64{1, 2, 3}, -1}, + } + + for _, test := range tests { + res := ArrayPositions(test.a).Compare(test.b) + if res != test.expect { + t.Errorf("test: %+v, res: %v", test, res) + } + } +} + +func TestLocationsDedupe(t *testing.T) { + a := &Location{} + b := &Location{Pos: 1} + c := &Location{Pos: 2} + + tests := []struct { + input Locations + expect Locations + }{ + {Locations{}, Locations{}}, + {Locations{a}, Locations{a}}, + {Locations{a, b, c}, Locations{a, b, c}}, + {Locations{a, a}, Locations{a}}, + {Locations{a, a, a}, Locations{a}}, + {Locations{a, b}, Locations{a, b}}, + {Locations{b, a}, Locations{a, b}}, + {Locations{c, b, a, c, b, a, c, b, a}, Locations{a, b, c}}, + } + + for testi, test := range tests { + res := test.input.Dedupe() + if !reflect.DeepEqual(res, test.expect) { + t.Errorf("testi: %d, test: %+v, res: %+v", testi, test, res) + } + } +} diff --git a/search_test.go b/search_test.go index a37dea703..917b24ad2 100644 --- a/search_test.go +++ b/search_test.go @@ -36,6 +36,7 @@ import ( "github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/highlight/highlighter/html" "github.com/blevesearch/bleve/search/query" ) @@ -1223,3 +1224,43 @@ func TestDisjunctionMinPropagation(t *testing.T) { t.Fatalf("Expect 0 results, but got: %v", res.Total) } } + +func 
TestDuplicateLocationsIssue1168(t *testing.T) { + fm1 := NewTextFieldMapping() + fm1.Analyzer = keyword.Name + fm1.Name = "name1" + + dm := NewDocumentStaticMapping() + dm.AddFieldMappingsAt("name", fm1) + + m := NewIndexMapping() + m.DefaultMapping = dm + + idx, err := NewMemOnly(m) + if err != nil { + t.Fatalf("bleve new err: %v", err) + } + + err = idx.Index("x", map[string]interface{}{ + "name": "marty", + }) + if err != nil { + t.Fatalf("bleve index err: %v", err) + } + + q1 := NewTermQuery("marty") + q2 := NewTermQuery("marty") + dq := NewDisjunctionQuery(q1, q2) + + sreq := NewSearchRequest(dq) + sreq.Fields = []string{"*"} + sreq.Highlight = NewHighlightWithStyle(html.Name) + + sres, err := idx.Search(sreq) + if err != nil { + t.Fatalf("bleve search err: %v", err) + } + if len(sres.Hits[0].Locations["name1"]["marty"]) != 1 { + t.Fatalf("duplicate marty") + } +} diff --git a/test/integration_test.go b/test/integration_test.go index 627969708..8cbac6f30 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -187,7 +187,7 @@ func runTestDir(t *testing.T, dir, datasetName string) { if hit.Locations != nil { if !reflect.DeepEqual(hit.Locations, res.Hits[hi].Locations) { t.Errorf("test error - %s", search.Comment) - t.Errorf("test %d - expected hit %d to have locations %v got %v", testNum, hi, hit.Locations, res.Hits[hi].Locations) + t.Errorf("test %d - expected hit %d to have locations %#v got %#v", testNum, hi, hit.Locations, res.Hits[hi].Locations) } } // assert that none of the scores were NaN,+Inf,-Inf From 6ba04b0f813c73cae8a4a3d5f527a3291ee05b88 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 22 Mar 2019 12:46:48 -0700 Subject: [PATCH 568/728] asciifolding test apples from MB-33486 --- analysis/char/asciifolding/asciifolding_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/analysis/char/asciifolding/asciifolding_test.go b/analysis/char/asciifolding/asciifolding_test.go index 82dc67868..216583d1e 100644 --- 
a/analysis/char/asciifolding/asciifolding_test.go +++ b/analysis/char/asciifolding/asciifolding_test.go @@ -43,6 +43,10 @@ func TestAsciiFoldingFilter(t *testing.T) { // composite unicode runes are folded to more than one ASCII rune input: []byte(`ÆꜴ`), output: []byte(`AEAO`), + }, { + // apples from https://issues.couchbase.com/browse/MB-33486 + input: []byte(`Ápple Àpple Äpple Âpple Ãpple Åpple`), + output: []byte(`Apple Apple Apple Apple Apple Apple`), }, } From 40c19afe2872462a5102972bf8ad13297a16a2a6 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 22 Mar 2019 13:53:11 -0700 Subject: [PATCH 569/728] MB-33462: Validating the range for a date range query + All datetimes after 2262-04-11T11:59:59Z will cause UnixNano() which generates an Int64 to overflow. + Error out early for these values rather than letting the query fail quietly. --- search/query/date_range.go | 14 +++++++++-- search/query/date_range_test.go | 41 +++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/search/query/date_range.go b/search/query/date_range.go index ff67a7bb7..ae26f8dec 100644 --- a/search/query/date_range.go +++ b/search/query/date_range.go @@ -143,10 +143,20 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { min := math.Inf(-1) max := math.Inf(1) if !q.Start.IsZero() { - min = numeric.Int64ToFloat64(q.Start.UnixNano()) + startInt64 := q.Start.UnixNano() + if startInt64 < 0 { + // overflow + return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start) + } + min = numeric.Int64ToFloat64(startInt64) } if !q.End.IsZero() { - max = numeric.Int64ToFloat64(q.End.UnixNano()) + endInt64 := q.End.UnixNano() + if endInt64 < 0 { + // overflow + return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End) + } + max = numeric.Int64ToFloat64(endInt64) } return &min, &max, nil diff --git a/search/query/date_range_test.go b/search/query/date_range_test.go index 9dd691319..dd14d3238 
100644 --- a/search/query/date_range_test.go +++ b/search/query/date_range_test.go @@ -49,3 +49,44 @@ func TestBleveQueryTime(t *testing.T) { } } } + +func TestValidateDatetimeRanges(t *testing.T) { + tests := []struct { + start string + end string + expect bool + }{ + { + start: "2019-03-22T13:25:00Z", + end: "2019-03-22T18:25:00Z", + expect: true, + }, + { + start: "2019-03-22T13:25:00Z", + end: "9999-03-22T13:25:00Z", + expect: false, + }, + { + start: "2019-03-22T13:25:00Z", + end: "2262-04-11T11:59:59Z", + expect: true, + }, + { + start: "2019-03-22T13:25:00Z", + end: "2262-04-12T00:00:00Z", + expect: false, + }, + } + + for _, test := range tests { + startTime, _ := time.Parse(time.RFC3339, test.start) + endTime, _ := time.Parse(time.RFC3339, test.end) + + dateRangeQuery := NewDateRangeQuery(startTime, endTime) + if (dateRangeQuery.Validate() == nil) != test.expect { + t.Errorf("unexpected results while validating date range query with"+ + " {start: %v, end: %v}, expected: %v", + test.start, test.end, test.expect) + } + } +} From 010c117a6c9b985178789987f470e4579b530bba Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 25 Mar 2019 15:33:56 +0530 Subject: [PATCH 570/728] Revert "MB-32846 - more aggressively removeOldData() in scorch persister" This reverts commit c8e737a945472f5f266c0799005216080443fbb8. This recent introduction of removeOldData in the persister main loop has slowed the persister work flow to result in higher number of memory based segments. This high number of in-memory segments had resulted in perf regression across a few query types like fuzzy, wildcard etc. 
--- index/scorch/introducer.go | 4 +--- index/scorch/persister.go | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 2d04bd38e..ac627796f 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -376,6 +376,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { fileSegments++ } } + } // before the newMerge introduction, need to clean the newly @@ -392,7 +393,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } - // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. if nextMerge.new != nil && @@ -424,7 +424,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { newSnapshot.AddRef() // 1 ref for the nextMerge.notify response newSnapshot.updateSize() - s.rootLock.Lock() // swap in new index snapshot newSnapshot.epoch = s.nextSnapshotEpoch @@ -502,7 +501,6 @@ func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { } newSnapshot.updateSize() - // swap in new snapshot rootPrev := s.root s.root = newSnapshot diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 349ccdc0e..e15fa2ab6 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -111,7 +111,6 @@ OUTER: if ew != nil && ew.epoch > lastMergedEpoch { lastMergedEpoch = ew.epoch } - lastMergedEpoch, persistWatchers = s.pausePersisterForMergerCatchUp(lastPersistedEpoch, lastMergedEpoch, persistWatchers, po) @@ -179,7 +178,6 @@ OUTER: s.fireEvent(EventKindPersisterProgress, time.Since(startTime)) if changed { - s.removeOldData() atomic.AddUint64(&s.stats.TotPersistLoopProgress, 1) continue OUTER } @@ -661,13 +659,13 @@ func (s *Scorch) LoadSnapshot(epoch uint64) (rv *IndexSnapshot, err error) { } func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { + rv := &IndexSnapshot{ parent: s, internal: make(map[string][]byte), refs: 1, creator: 
"loadSnapshot", } - var running uint64 c := snapshot.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { @@ -703,7 +701,6 @@ func (s *Scorch) loadSnapshot(snapshot *bolt.Bucket) (*IndexSnapshot, error) { running += segmentSnapshot.segment.Count() } } - return rv, nil } From 7740f389de43cc26265b6175d63f9882b0cd54d4 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Fri, 22 Mar 2019 21:38:55 -0700 Subject: [PATCH 571/728] MB-33455: improve ComputeGeoRange() performance This change improves ComputeGeoRange() performance by avoiding trips to the allocator by using pre-allocated []byte slices. It also avoids creation of interim [][]byte slices by append()'ing directly to returned output slices. Parts of this change... - before this change, ComputeGeoRange() would invoke relateAndRecurse(), which would recursively invoke ComputeGeoRange(). Five of the recursive call parameters... sminLon, sminLat, smaxLon, smaxLat, and checkBoundaries... would be passed down on each invocation. - in this change, ComputeGeoRange() was refactored to have two, internal helper closures: relateAndRecurse() and computeGeoRange(), so that those five parameters would be part of the closure and would no longer need to be passed on each invocation. - NewPrefixCodedInt64Prealloc() API was added to the numeric package to allow the caller to optionally provide a pre-allocated []byte slice. - relatedAndRecurse() now uses this NewPrefixCodedInt64Prealloc() API, via an internal helper closure function, makePrefixCoded(). - makePrefixCoded() manages a preallocated []byte slice, where it allocates another []byte slice, as needed, which will be 2x the size of its last preallocated slice. - the previous relateAndRecurse() would return [][]byte slices of the terms that were on-the-boundary and not-on-the-boundary. The caller, ComputeGeoRange(), would then append() those slices to its own on-boundary/not-on-boundary slices, and then return those onwards to its caller. 
All these interim slices, then, became garbage. - In this commit, since relateAndRecurse() and computeGeoRange() are now closures, they append() directly onto the final output slices of the top-level ComputeGeoRange(), reducing garbage and trips to the allocator. Before the change... BenchmarkComputeGeoRangePt01-8 100000 18516 ns/op 2005 B/op 63 allocs/op BenchmarkComputeGeoRangePt1-8 10000 108882 ns/op 96267 B/op 736 allocs/op BenchmarkComputeGeoRange10-8 50 35883331 ns/op 35812513 B/op 184543 allocs/op BenchmarkComputeGeoRange100-8 10 192568538 ns/op 187524856 B/op 926510 allocs/op After the change... BenchmarkComputeGeoRangePt01-8 100000 12447 ns/op 280 B/op 7 allocs/op BenchmarkComputeGeoRangePt1-8 30000 47053 ns/op 16416 B/op 28 allocs/op BenchmarkComputeGeoRange10-8 100 11836988 ns/op 8503406 B/op 76 allocs/op BenchmarkComputeGeoRange100-8 20 72555603 ns/op 42778777 B/op 89 allocs/op See also: https://issues.couchbase.com/browse/MB-33455 --- numeric/prefix_coded.go | 21 +++- search/searcher/search_geoboundingbox.go | 114 ++++++++++-------- search/searcher/search_geoboundingbox_test.go | 43 +++++++ 3 files changed, 126 insertions(+), 52 deletions(-) diff --git a/numeric/prefix_coded.go b/numeric/prefix_coded.go index 76ea001ba..29bd0fc5c 100644 --- a/numeric/prefix_coded.go +++ b/numeric/prefix_coded.go @@ -23,12 +23,26 @@ const ShiftStartInt64 byte = 0x20 type PrefixCoded []byte func NewPrefixCodedInt64(in int64, shift uint) (PrefixCoded, error) { + rv, _, err := NewPrefixCodedInt64Prealloc(in, shift, nil) + return rv, err +} + +func NewPrefixCodedInt64Prealloc(in int64, shift uint, prealloc []byte) ( + rv PrefixCoded, preallocRest []byte, err error) { if shift > 63 { - return nil, fmt.Errorf("cannot shift %d, must be between 0 and 63", shift) + return nil, prealloc, fmt.Errorf("cannot shift %d, must be between 0 and 63", shift) } nChars := ((63 - shift) / 7) + 1 - rv := make(PrefixCoded, nChars+1) + + size := int(nChars + 1) + if len(prealloc) >= size { + rv 
= PrefixCoded(prealloc[0:size]) + preallocRest = prealloc[size:] + } else { + rv = make(PrefixCoded, size) + } + rv[0] = ShiftStartInt64 + byte(shift) sortableBits := int64(uint64(in) ^ 0x8000000000000000) @@ -40,7 +54,8 @@ func NewPrefixCodedInt64(in int64, shift uint) (PrefixCoded, error) { nChars-- sortableBits = int64(uint64(sortableBits) >> 7) } - return rv, nil + + return rv, preallocRest, nil } func MustNewPrefixCodedInt64(in int64, shift uint) PrefixCoded { diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index 289e41678..29809397c 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -22,6 +22,9 @@ import ( "github.com/blevesearch/bleve/search" ) +var GeoBitsShift1 = (geo.GeoBits << 1) +var GeoBitsShift1Minus1 = GeoBitsShift1 - 1 + func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, maxLon, maxLat float64, field string, boost float64, options search.SearcherOptions, checkBoundaries bool) ( @@ -36,7 +39,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, } // do math to produce list of terms needed for this search - onBoundaryTerms, notOnBoundaryTerms := ComputeGeoRange(0, (geo.GeoBits<<1)-1, + onBoundaryTerms, notOnBoundaryTerms := ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries) var onBoundarySearcher search.Searcher @@ -94,59 +97,72 @@ var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 func ComputeGeoRange(term uint64, shift uint, - sminLon, sminLat, smaxLon, smaxLat float64, - checkBoundaries bool) ( + sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool) ( onBoundary [][]byte, notOnBoundary [][]byte) { - split := term | uint64(0x1)<> 1 - - within := res%document.GeoPrecisionStep == 0 && - geo.RectWithin(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) - if within || (level 
== geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat)) { - if !within && checkBoundaries { - return [][]byte{ - numeric.MustNewPrefixCodedInt64(int64(start), res), - }, nil + makePrefixCoded := func(in int64, shift uint) (rv numeric.PrefixCoded) { + if len(preallocBytes) <= 0 { + preallocBytesLen = preallocBytesLen * 2 + preallocBytes = make([]byte, preallocBytesLen) + } + + var err error + rv, preallocBytes, err = + numeric.NewPrefixCodedInt64Prealloc(in, shift, preallocBytes) + if err != nil { + panic(err) } - return nil, - [][]byte{ - numeric.MustNewPrefixCodedInt64(int64(start), res), + return rv + } + + var computeGeoRange func(term uint64, shift uint) // declare for recursion + + relateAndRecurse := func(start, end uint64, res, level uint) { + minLon := geo.MortonUnhashLon(start) + minLat := geo.MortonUnhashLat(start) + maxLon := geo.MortonUnhashLon(end) + maxLat := geo.MortonUnhashLat(end) + + within := res%document.GeoPrecisionStep == 0 && + geo.RectWithin(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat) + if within || (level == geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat)) { + if !within && checkBoundaries { + onBoundary = append(onBoundary, makePrefixCoded(int64(start), res)) + } else { + notOnBoundary = append(notOnBoundary, makePrefixCoded(int64(start), res)) } - } else if level < geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) { - return ComputeGeoRange(start, res-1, sminLon, sminLat, smaxLon, smaxLat, - checkBoundaries) + } else if level < geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat) { + computeGeoRange(start, res-1) + } } - return nil, nil + + computeGeoRange = func(term uint64, shift uint) { + split := term | uint64(0x1)<> 1 + + relateAndRecurse(term, lowerMax, shift, level) + 
relateAndRecurse(split, upperMax, shift, level) + } + + computeGeoRange(term, shift) + + return onBoundary, notOnBoundary } func buildRectFilter(dvReader index.DocValueReader, field string, diff --git a/search/searcher/search_geoboundingbox_test.go b/search/searcher/search_geoboundingbox_test.go index 1de938db5..0113604fc 100644 --- a/search/searcher/search_geoboundingbox_test.go +++ b/search/searcher/search_geoboundingbox_test.go @@ -198,3 +198,46 @@ func setupGeo(t *testing.T) index.Index { return i } + +// -------------------------------------------------------------------- + +func BenchmarkComputeGeoRangePt01(b *testing.B) { + onBoundary := 4 + offBoundary := 0 + benchmarkComputeGeoRange(b, -0.01, -0.01, 0.01, 0.01, onBoundary, offBoundary) +} + +func BenchmarkComputeGeoRangePt1(b *testing.B) { + onBoundary := 56 + offBoundary := 144 + benchmarkComputeGeoRange(b, -0.1, -0.1, 0.1, 0.1, onBoundary, offBoundary) +} + +func BenchmarkComputeGeoRange10(b *testing.B) { + onBoundary := 5464 + offBoundary := 53704 + benchmarkComputeGeoRange(b, -10.0, -10.0, 10.0, 10.0, onBoundary, offBoundary) +} + +func BenchmarkComputeGeoRange100(b *testing.B) { + onBoundary := 32768 + offBoundary := 258560 + benchmarkComputeGeoRange(b, -100.0, -100.0, 100.0, 100.0, onBoundary, offBoundary) +} + +// -------------------------------------------------------------------- + +func benchmarkComputeGeoRange(b *testing.B, + minLon, minLat, maxLon, maxLat float64, onBoundary, offBoundary int) { + checkBoundaries := true + + b.ResetTimer() + + for i := 0; i < b.N; i++ { + onBoundaryRes, offBoundaryRes := + ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries) + if len(onBoundaryRes) != onBoundary || len(offBoundaryRes) != offBoundary { + b.Fatalf("boundaries not matching") + } + } +} From 3d774b9eb3804d64436a5fdaf3fa47efcfdbfec6 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Mon, 25 Mar 2019 09:53:03 -0700 Subject: [PATCH 572/728] MB-33455 - ComputeGeoRange() 
checks DisjunctionMaxClauseCount This optimization passes DisjunctionMaxClauseCount to ComputeGeoRange() so that it can error early if it sees that it's over the disjunction clause limit. --- search/searcher/search_geoboundingbox.go | 33 ++++++++++++----- search/searcher/search_geoboundingbox_test.go | 36 +++++++++++++++++-- 2 files changed, 58 insertions(+), 11 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index 29809397c..a80855ac5 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -39,8 +39,11 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, } // do math to produce list of terms needed for this search - onBoundaryTerms, notOnBoundaryTerms := ComputeGeoRange(0, GeoBitsShift1Minus1, - minLon, minLat, maxLon, maxLat, checkBoundaries) + onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(0, GeoBitsShift1Minus1, + minLon, minLat, maxLon, maxLat, checkBoundaries, DisjunctionMaxClauseCount) + if err != nil { + return nil, err + } var onBoundarySearcher search.Searcher dvReader, err := indexReader.DocValueReader([]string{field}) @@ -97,8 +100,8 @@ var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 func ComputeGeoRange(term uint64, shift uint, - sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool) ( - onBoundary [][]byte, notOnBoundary [][]byte) { + sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, maxTerms int) ( + onBoundary [][]byte, notOnBoundary [][]byte, err error) { preallocBytesLen := 32 preallocBytes := make([]byte, preallocBytesLen) @@ -108,12 +111,9 @@ func ComputeGeoRange(term uint64, shift uint, preallocBytes = make([]byte, preallocBytesLen) } - var err error rv, preallocBytes, err = numeric.NewPrefixCodedInt64Prealloc(in, shift, preallocBytes) - if err != nil { - panic(err) - } + return rv } @@ -144,6 +144,17 @@ func 
ComputeGeoRange(term uint64, shift uint, } computeGeoRange = func(term uint64, shift uint) { + if maxTerms > 0 { + if len(onBoundary) > maxTerms { + err = tooManyClausesErr(len(onBoundary)) + } else if len(notOnBoundary) > maxTerms { + err = tooManyClausesErr(len(notOnBoundary)) + } + } + if err != nil { + return + } + split := term | uint64(0x1)< Date: Tue, 26 Mar 2019 15:03:13 -0700 Subject: [PATCH 573/728] Deploy errcheck only for go1.11 --- .travis.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index fcc516db5..c79b3fa3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,7 +16,11 @@ script: - go test -race -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) - go test ./test -v -indexType scorch - - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/) + - if [[ ${TRAVIS_GO_VERSION} =~ ^1\.11 ]]; then + errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/); + else + echo "errcheck skipped for go version" $TRAVIS_GO_VERSION; + fi - docs/project-code-coverage.sh - docs/build_children.sh From 79c5a6840900fc453605270f189fa721c8b19759 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 26 Mar 2019 14:20:59 -0700 Subject: [PATCH 574/728] fixes #1166 - perf optimizations for ReadUvarint() This commit introduces a specialized MemUvarintReader which is focused on the "hot inner loop" of scorch/zap's reading of uvarint's -- such as the postingsList reading of freqNormHasLocs and location data. The various changes include... - the separate bytes.Reader vs binary.ReadUvarint() codepaths are unified, and ReadByte() is now inlined, along with more local vars. - SkipUvarint() allows entries to be skipped more efficiently (by postingsList.currChunkNext()). - prevRune maintenance is removed. - postingsList.readLocation() is always provided a non-nil output location. In microbenchmarks... $ go test -benchmem -bench=. 
./index/scorch/segment/ BenchmarkUvarint-8 100000000 16.4 ns/op 0 B/op 0 allocs/op BenchmarkMemUvarintReader-8 200000000 6.15 ns/op 0 B/op 0 allocs/op In bleve-query microbenchmarks on a scorch index with 200K en-wiki docs, high-frequency term ("http") searches that included locations had throughput of ~21.5 q/sec before this change, and after this change went to ~28.8 q/sec. --- index/scorch/segment/int.go | 86 ++++++++++++++++++++++++++- index/scorch/segment/int_test.go | 71 ++++++++++++++++++++++ index/scorch/segment/zap/posting.go | 91 +++++++++++++++-------------- 3 files changed, 203 insertions(+), 45 deletions(-) diff --git a/index/scorch/segment/int.go b/index/scorch/segment/int.go index a4836ebf8..663418d5b 100644 --- a/index/scorch/segment/int.go +++ b/index/scorch/segment/int.go @@ -19,7 +19,10 @@ package segment -import "fmt" +import ( + "errors" + "fmt" +) const ( MaxVarintSize = 9 @@ -92,3 +95,84 @@ func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) { } return b[length:], v, nil } + +// ------------------------------------------------------------ + +type MemUvarintReader struct { + C int64 // index of next byte to read from S + S []byte +} + +func NewMemUvarintReader(s []byte) *MemUvarintReader { + return &MemUvarintReader{S: s} +} + +// Len returns the number of unread bytes. +func (r *MemUvarintReader) Len() int { + n := int(int64(len(r.S)) - r.C) + if n < 0 { + return 0 + } + return n +} + +var ErrMemUvarintReaderOverflow = errors.New("MemUvarintReader overflow") + +// ReadUvarint reads an encoded uint64. The original code this was +// based on is at encoding/binary/ReadUvarint(). +func (r *MemUvarintReader) ReadUvarint() (uint64, error) { + var x uint64 + var s uint + var C = r.C + var S = r.S + + for true { + b := S[C] + C++ + + if b < 0x80 { + r.C = C + + // why 63? 
The original code had an 'i += 1' loop var and + // checked for i > 9 || i == 9 ...; but, we no longer + // check for the i var, but instead check here for s, + // which is incremented by 7. So, 7*9 == 63. + // + // why the "extra" >= check? The normal case is that s < + // 63, so we check this single >= guard first so that we + // hit the normal, nil-error return pathway sooner. + if s >= 63 && (s > 63 || s == 63 && b > 1) { + return 0, ErrMemUvarintReaderOverflow + } + + return x | uint64(b)<= n { + reader.Reset(buf) + seen = 0 + } + + _, _ = binary.ReadUvarint(reader) + seen = seen + 1 + } +} + +func BenchmarkMemUvarintReader(b *testing.B) { + n, buf := generateCommonUvarints(64, 512) + + reader := &MemUvarintReader{S: buf} + seen := 0 + + b.ResetTimer() + + for i := 0; i < b.N; i = i + 1 { + if seen >= n { + reader.Reset(buf) + seen = 0 + } + + _, _ = reader.ReadUvarint() + seen = seen + 1 + } +} + +// generate some common, encoded uvarint's that we might see as +// freq-norm's or locations. +func generateCommonUvarints(maxFreq, maxFieldLen int) (n int, rv []byte) { + buf := make([]byte, binary.MaxVarintLen64) + + var out bytes.Buffer + + encode := func(val uint64) { + bufLen := binary.PutUvarint(buf, val) + out.Write(buf[:bufLen]) + n = n + 1 + } + + for i := 1; i < maxFreq; i = i * 2 { // Common freqHasLoc's. + freqHasLocs := uint64(i << 1) + encode(freqHasLocs) + encode(freqHasLocs | 0x01) // 0'th LSB encodes whether there are locations. + } + + encodeNorm := func(fieldLen int) { + norm := float32(1.0 / math.Sqrt(float64(fieldLen))) + normUint64 := uint64(math.Float32bits(float32(norm))) + encode(normUint64) + } + + for i := 1; i < maxFieldLen; i = i * 2 { // Common norm's. 
+ encodeNorm(i) + } + + return n, out.Bytes() +} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 93b51ae73..fe9f4d957 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -15,10 +15,8 @@ package zap import ( - "bytes" "encoding/binary" "fmt" - "io" "math" "reflect" @@ -334,8 +332,8 @@ type PostingsIterator struct { currChunkFreqNorm []byte currChunkLoc []byte - freqNormReader *bytes.Reader - locReader *bytes.Reader + freqNormReader *segment.MemUvarintReader + locReader *segment.MemUvarintReader freqChunkOffsets []uint64 freqChunkStart uint64 @@ -386,7 +384,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error { end += e i.currChunkFreqNorm = i.postings.sb.mem[start:end] if i.freqNormReader == nil { - i.freqNormReader = bytes.NewReader(i.currChunkFreqNorm) + i.freqNormReader = segment.NewMemUvarintReader(i.currChunkFreqNorm) } else { i.freqNormReader.Reset(i.currChunkFreqNorm) } @@ -404,7 +402,7 @@ func (i *PostingsIterator) loadChunk(chunk int) error { end += e i.currChunkLoc = i.postings.sb.mem[start:end] if i.locReader == nil { - i.locReader = bytes.NewReader(i.currChunkLoc) + i.locReader = segment.NewMemUvarintReader(i.currChunkLoc) } else { i.locReader.Reset(i.currChunkLoc) } @@ -419,18 +417,34 @@ func (i *PostingsIterator) readFreqNormHasLocs() (uint64, uint64, bool, error) { return 1, i.normBits1Hit, false, nil } - freqHasLocs, err := binary.ReadUvarint(i.freqNormReader) + freqHasLocs, err := i.freqNormReader.ReadUvarint() if err != nil { return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) } + freq, hasLocs := decodeFreqHasLocs(freqHasLocs) - normBits, err := binary.ReadUvarint(i.freqNormReader) + normBits, err := i.freqNormReader.ReadUvarint() if err != nil { return 0, 0, false, fmt.Errorf("error reading norm: %v", err) } - return freq, normBits, hasLocs, err + return freq, normBits, hasLocs, nil +} + +func (i *PostingsIterator) skipFreqNormReadHasLocs() 
(bool, error) { + if i.normBits1Hit != 0 { + return false, nil + } + + freqHasLocs, err := i.freqNormReader.ReadUvarint() + if err != nil { + return false, fmt.Errorf("error reading freqHasLocs: %v", err) + } + + i.freqNormReader.SkipUvarint() // Skip normBits. + + return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs. } func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { @@ -448,58 +462,53 @@ func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { } // readLocation processes all the integers on the stream representing a single -// location. if you care about it, pass in a non-nil location struct, and we -// will fill it. if you don't care about it, pass in nil and we safely consume -// the contents. +// location. func (i *PostingsIterator) readLocation(l *Location) error { // read off field - fieldID, err := binary.ReadUvarint(i.locReader) + fieldID, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location field: %v", err) } // read off pos - pos, err := binary.ReadUvarint(i.locReader) + pos, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location pos: %v", err) } // read off start - start, err := binary.ReadUvarint(i.locReader) + start, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location start: %v", err) } // read off end - end, err := binary.ReadUvarint(i.locReader) + end, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location end: %v", err) } // read off num array pos - numArrayPos, err := binary.ReadUvarint(i.locReader) + numArrayPos, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location num array pos: %v", err) } - // group these together for less branching - if l != nil { - l.field = i.postings.sb.fieldsInv[fieldID] - l.pos = pos - l.start = start - l.end = end - if cap(l.ap) < int(numArrayPos) { - l.ap = make([]uint64, int(numArrayPos)) - } else { - 
l.ap = l.ap[:int(numArrayPos)] - } + l.field = i.postings.sb.fieldsInv[fieldID] + l.pos = pos + l.start = start + l.end = end + + if cap(l.ap) < int(numArrayPos) { + l.ap = make([]uint64, int(numArrayPos)) + } else { + l.ap = l.ap[:int(numArrayPos)] } // read off array positions for k := 0; k < int(numArrayPos); k++ { - ap, err := binary.ReadUvarint(i.locReader) + ap, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading array position: %v", err) } - if l != nil { - l.ap[k] = ap - } + + l.ap[k] = ap } return nil @@ -556,7 +565,7 @@ func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, err } rv.locs = i.nextSegmentLocs[:0] - numLocsBytes, err := binary.ReadUvarint(i.locReader) + numLocsBytes, err := i.locReader.ReadUvarint() if err != nil { return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) } @@ -612,17 +621,14 @@ func (i *PostingsIterator) nextBytes() ( if hasLocs { startLoc := len(i.currChunkLoc) - i.locReader.Len() - numLocsBytes, err := binary.ReadUvarint(i.locReader) + numLocsBytes, err := i.locReader.ReadUvarint() if err != nil { return 0, 0, 0, nil, nil, fmt.Errorf("error reading location nextBytes numLocs: %v", err) } // skip over all the location bytes - _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) - if err != nil { - return 0, 0, 0, nil, nil, err - } + i.locReader.SkipBytes(int64(numLocsBytes)) endLoc := len(i.currChunkLoc) - i.locReader.Len() bytesLoc = i.currChunkLoc[startLoc:endLoc] @@ -763,22 +769,19 @@ func (i *PostingsIterator) currChunkNext(nChunk uint32) error { } // read off freq/offsets even though we don't care about them - _, _, hasLocs, err := i.readFreqNormHasLocs() + hasLocs, err := i.skipFreqNormReadHasLocs() if err != nil { return err } if i.includeLocs && hasLocs { - numLocsBytes, err := binary.ReadUvarint(i.locReader) + numLocsBytes, err := i.locReader.ReadUvarint() if err != nil { return fmt.Errorf("error reading location numLocsBytes: %v", err) } 
// skip over all the location bytes - _, err = i.locReader.Seek(int64(numLocsBytes), io.SeekCurrent) - if err != nil { - return err - } + i.locReader.SkipBytes(int64(numLocsBytes)) } return nil From fb220ce32cb28e4b864156e632f2d3d4fdad3967 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Tue, 26 Mar 2019 21:28:30 -0700 Subject: [PATCH 575/728] fixes #1166 - MemUvarintReader avoids extra int64 casting --- index/scorch/segment/int.go | 6 +++--- index/scorch/segment/zap/posting.go | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/int.go b/index/scorch/segment/int.go index 663418d5b..e89d2c1ee 100644 --- a/index/scorch/segment/int.go +++ b/index/scorch/segment/int.go @@ -99,7 +99,7 @@ func DecodeUvarintAscending(b []byte) ([]byte, uint64, error) { // ------------------------------------------------------------ type MemUvarintReader struct { - C int64 // index of next byte to read from S + C int // index of next byte to read from S S []byte } @@ -109,7 +109,7 @@ func NewMemUvarintReader(s []byte) *MemUvarintReader { // Len returns the number of unread bytes. func (r *MemUvarintReader) Len() int { - n := int(int64(len(r.S)) - r.C) + n := len(r.S) - r.C if n < 0 { return 0 } @@ -168,7 +168,7 @@ func (r *MemUvarintReader) SkipUvarint() { } // SkipBytes skips a count number of bytes. 
-func (r *MemUvarintReader) SkipBytes(count int64) { +func (r *MemUvarintReader) SkipBytes(count int) { r.C = r.C + count } diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index fe9f4d957..0656c8e99 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -628,7 +628,7 @@ func (i *PostingsIterator) nextBytes() ( } // skip over all the location bytes - i.locReader.SkipBytes(int64(numLocsBytes)) + i.locReader.SkipBytes(int(numLocsBytes)) endLoc := len(i.currChunkLoc) - i.locReader.Len() bytesLoc = i.currChunkLoc[startLoc:endLoc] @@ -781,7 +781,7 @@ func (i *PostingsIterator) currChunkNext(nChunk uint32) error { } // skip over all the location bytes - i.locReader.SkipBytes(int64(numLocsBytes)) + i.locReader.SkipBytes(int(numLocsBytes)) } return nil From 4a220e62ffa757eff5a3b7b2e2dbe9985ebde6bd Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 27 Mar 2019 06:53:34 -0700 Subject: [PATCH 576/728] simplify for true loop --- index/scorch/segment/int.go | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/int.go b/index/scorch/segment/int.go index e89d2c1ee..55299d8f7 100644 --- a/index/scorch/segment/int.go +++ b/index/scorch/segment/int.go @@ -126,7 +126,7 @@ func (r *MemUvarintReader) ReadUvarint() (uint64, error) { var C = r.C var S = r.S - for true { + for { b := S[C] C++ @@ -151,13 +151,11 @@ func (r *MemUvarintReader) ReadUvarint() (uint64, error) { x |= uint64(b&0x7f) << s s += 7 } - - return 0, nil // never reached, but compiler wants this } // SkipUvarint skips ahead one encoded uint64. 
func (r *MemUvarintReader) SkipUvarint() { - for true { + for { b := r.S[r.C] r.C++ From 60b3810c76416608327c20aae06d57832f13eaea Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Fri, 29 Mar 2019 15:13:48 -0700 Subject: [PATCH 577/728] MB-32855: Revert disjunction-single clause optimization + Reverting the single-clause disjunction optimization as well, as we're noticing unexpected results while executing a boolean query in certain scenarios. + Note that I've already reverted the single-clause conjunction optimization with: https://github.com/blevesearch/bleve/pull/1133 + Included unit tests that capture failing test cases with the above mentioned optimizations. + With this change, I've completely reverted: - https://github.com/blevesearch/bleve/pull/1073 - https://github.com/blevesearch/bleve/pull/1116 - https://github.com/blevesearch/bleve/pull/1128 + Apologies for the grand mess :) Also, some related discussion on why these optimizations don't work: https://github.com/blevesearch/bleve/pull/1136 --- index/store/moss/lower_test.go | 2 +- search/query/disjunction.go | 6 -- search_test.go | 152 +++++++++++++++++++++++++++++++++ 3 files changed, 153 insertions(+), 7 deletions(-) diff --git a/index/store/moss/lower_test.go b/index/store/moss/lower_test.go index 72b8a7194..afc0a0959 100644 --- a/index/store/moss/lower_test.go +++ b/index/store/moss/lower_test.go @@ -27,7 +27,7 @@ func openWithLower(t *testing.T, mo store.MergeOperator) (string, store.KVStore) tmpDir, _ := ioutil.TempDir("", "mossStore") config := map[string]interface{}{ - "path": tmpDir, + "path": tmpDir, "mossLowerLevelStoreName": "mossStore", } diff --git a/search/query/disjunction.go b/search/query/disjunction.go index 2bc1d7044..a1fc1439a 100644 --- a/search/query/disjunction.go +++ b/search/query/disjunction.go @@ -80,12 +80,6 @@ func (q *DisjunctionQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, if len(ss) < 1 { return searcher.NewMatchNoneSearcher(i) - } else if len(ss) == 1 && 
int(q.Min) == ss[0].Min() { - // apply optimization only if both conditions below are satisfied: - // - disjunction searcher has only 1 child searcher - // - parent searcher's min setting is equal to child searcher's min - - return ss[0], nil } return searcher.NewDisjunctionSearcher(i, ss, q.Min, options) diff --git a/search_test.go b/search_test.go index 917b24ad2..682e8ed0a 100644 --- a/search_test.go +++ b/search_test.go @@ -28,7 +28,9 @@ import ( "github.com/blevesearch/bleve/analysis/analyzer/custom" "github.com/blevesearch/bleve/analysis/analyzer/keyword" "github.com/blevesearch/bleve/analysis/analyzer/standard" + "github.com/blevesearch/bleve/analysis/token/length" "github.com/blevesearch/bleve/analysis/token/lowercase" + "github.com/blevesearch/bleve/analysis/token/shingle" "github.com/blevesearch/bleve/analysis/tokenizer/single" "github.com/blevesearch/bleve/analysis/tokenizer/whitespace" "github.com/blevesearch/bleve/document" @@ -1264,3 +1266,153 @@ func TestDuplicateLocationsIssue1168(t *testing.T) { t.Fatalf("duplicate marty") } } + +func TestBooleanMustSingleMatchNone(t *testing.T) { + idxMapping := NewIndexMapping() + if err := idxMapping.AddCustomTokenFilter(length.Name, map[string]interface{}{ + "min": 3.0, + "max": 5.0, + "type": length.Name, + }); err != nil { + t.Fatal(err) + } + if err := idxMapping.AddCustomAnalyzer("custom1", map[string]interface{}{ + "type": "custom", + "tokenizer": "single", + "token_filters": []interface{}{length.Name}, + }); err != nil { + t.Fatal(err) + } + + idxMapping.DefaultAnalyzer = "custom1" + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc := map[string]interface{}{ + "languages_known": "Dutch", + "dept": "Sales", + } + + batch := idx.NewBatch() + if err = batch.Index("doc", doc); err != nil { + t.Fatal(err) + } + + if err = 
idx.Batch(batch); err != nil { + t.Fatal(err) + } + + // this is a successful match + matchSales := NewMatchQuery("Sales") + matchSales.SetField("dept") + + // this would spin off a MatchNoneSearcher as the + // token filter rules out the word "French" + matchFrench := NewMatchQuery("French") + matchFrench.SetField("languages_known") + + bq := NewBooleanQuery() + bq.AddShould(matchSales) + bq.AddMust(matchFrench) + + sr := NewSearchRequest(bq) + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if res.Total != 0 { + t.Fatalf("Expected 0 results but got: %v", res.Total) + } +} + +func TestBooleanMustNotSingleMatchNone(t *testing.T) { + idxMapping := NewIndexMapping() + if err := idxMapping.AddCustomTokenFilter(shingle.Name, map[string]interface{}{ + "min": 3.0, + "max": 5.0, + "type": shingle.Name, + }); err != nil { + t.Fatal(err) + } + if err := idxMapping.AddCustomAnalyzer("custom1", map[string]interface{}{ + "type": "custom", + "tokenizer": "unicode", + "token_filters": []interface{}{shingle.Name}, + }); err != nil { + t.Fatal(err) + } + + idxMapping.DefaultAnalyzer = "custom1" + idx, err := New("testidx", idxMapping) + if err != nil { + t.Fatal(err) + } + + defer func() { + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc := map[string]interface{}{ + "languages_known": "Dutch", + "dept": "Sales", + } + + batch := idx.NewBatch() + if err = batch.Index("doc", doc); err != nil { + t.Fatal(err) + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + // this is a successful match + matchSales := NewMatchQuery("Sales") + matchSales.SetField("dept") + + // this would spin off a MatchNoneSearcher as the + // token filter rules out the word "Dutch" + matchDutch := NewMatchQuery("Dutch") + matchDutch.SetField("languages_known") + + matchEngineering := NewMatchQuery("Engineering") + matchEngineering.SetField("dept") + + bq := NewBooleanQuery() + 
bq.AddShould(matchSales) + bq.AddMustNot(matchDutch, matchEngineering) + + sr := NewSearchRequest(bq) + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if res.Total != 0 { + t.Fatalf("Expected 0 results but got: %v", res.Total) + } + +} From 0dc98d8d768f86fa4ccf5ea9289de3d33de2058c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 1 Apr 2019 13:45:42 -0700 Subject: [PATCH 578/728] Adding a boolean search testcase Reference: https://github.com/blevesearch/bleve/issues/1185 --- search_test.go | 132 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 132 insertions(+) diff --git a/search_test.go b/search_test.go index 682e8ed0a..c534dd320 100644 --- a/search_test.go +++ b/search_test.go @@ -1414,5 +1414,137 @@ func TestBooleanMustNotSingleMatchNone(t *testing.T) { if res.Total != 0 { t.Fatalf("Expected 0 results but got: %v", res.Total) } +} + +func TestBooleanSearchBug1185(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + of := NewTextFieldMapping() + of.Analyzer = keyword.Name + of.Name = "owner" + + dm := NewDocumentMapping() + dm.AddFieldMappingsAt("owner", of) + + m := NewIndexMapping() + m.DefaultMapping = dm + + idx, err := NewUsing("testidx", m, "scorch", "scorch", nil) + if err != nil { + t.Fatal(err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + err = idx.Index("17112", map[string]interface{}{ + "owner": "marty", + "type": "A Demo Type", + }) + if err != nil { + t.Fatal(err) + } + + err = idx.Index("17139", map[string]interface{}{ + "type": "A Demo Type", + }) + if err != nil { + t.Fatal(err) + } + + err = idx.Index("177777", map[string]interface{}{ + "type": "x", + }) + if err != nil { + t.Fatal(err) + } + err = idx.Index("177778", map[string]interface{}{ + "type": "A Demo Type", + }) + if err != nil { + t.Fatal(err) + } + err = idx.Index("17140", map[string]interface{}{ + "type": "A Demo Type", + }) + if err != 
nil { + t.Fatal(err) + } + + err = idx.Index("17000", map[string]interface{}{ + "owner": "marty", + "type": "x", + }) + if err != nil { + t.Fatal(err) + } + + err = idx.Index("17141", map[string]interface{}{ + "type": "A Demo Type", + }) + if err != nil { + t.Fatal(err) + } + + err = idx.Index("17428", map[string]interface{}{ + "owner": "marty", + "type": "A Demo Type", + }) + if err != nil { + t.Fatal(err) + } + + err = idx.Index("17113", map[string]interface{}{ + "owner": "marty", + "type": "x", + }) + if err != nil { + t.Fatal(err) + } + + matchTypeQ := NewMatchPhraseQuery("A Demo Type") + matchTypeQ.SetField("type") + + matchAnyOwnerRegQ := NewRegexpQuery(".+") + matchAnyOwnerRegQ.SetField("owner") + + matchNoOwner := NewBooleanQuery() + matchNoOwner.AddMustNot(matchAnyOwnerRegQ) + + notNoOwner := NewBooleanQuery() + notNoOwner.AddMustNot(matchNoOwner) + + matchTypeAndNoOwner := NewConjunctionQuery() + matchTypeAndNoOwner.AddQuery(matchTypeQ) + matchTypeAndNoOwner.AddQuery(notNoOwner) + + req := NewSearchRequest(matchTypeAndNoOwner) + res, err := idx.Search(req) + if err != nil { + t.Fatal(err) + } + + // query 2 + matchTypeAndNoOwnerBoolean := NewBooleanQuery() + matchTypeAndNoOwnerBoolean.AddMust(matchTypeQ) + matchTypeAndNoOwnerBoolean.AddMustNot(matchNoOwner) + + req2 := NewSearchRequest(matchTypeAndNoOwnerBoolean) + res2, err := idx.Search(req2) + if err != nil { + t.Fatal(err) + } + + if len(res.Hits) != len(res2.Hits) { + t.Fatalf("expected same number of hits, got: %d and %d", len(res.Hits), len(res2.Hits)) + } } From 70e74bf466d04282cfceba8a7c7864c9da4a2c08 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 2 Apr 2019 08:48:58 -0400 Subject: [PATCH 579/728] fix incorrect BooleanSearcher Advance behavior * Revert "Advancing boolean searcher's currentID w.r.t currMustNot" This reverts commit dd49ef0aa0a006a46e90b20dcba578265c043c07. 
* track boolean searcher done status when boolean searchers are nested, there can occur a situation in which a boolean searcher is reset after reaching the end. this leads to incorrect behavior, so instead we track reaching the end, and prevent advance from resetting that behavior. * add Unit test that catches a cornercase: TestNestedBooleanMustNotSearcherUpsidedown --- search/searcher/search_boolean.go | 40 ++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/search/searcher/search_boolean.go b/search/searcher/search_boolean.go index bbbced479..7f0bfa424 100644 --- a/search/searcher/search_boolean.go +++ b/search/searcher/search_boolean.go @@ -45,6 +45,7 @@ type BooleanSearcher struct { scorer *scorer.ConjunctionQueryScorer matches []*search.DocumentMatch initialized bool + done bool } func NewBooleanSearcher(indexReader index.IndexReader, mustSearcher search.Searcher, shouldSearcher search.Searcher, mustNotSearcher search.Searcher, options search.SearcherOptions) (*BooleanSearcher, error) { @@ -207,6 +208,10 @@ func (s *BooleanSearcher) SetQueryNorm(qnorm float64) { func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch, error) { + if s.done { + return nil, nil + } + if !s.initialized { err := s.initSearchers(ctx) if err != nil { @@ -320,11 +325,19 @@ func (s *BooleanSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch } } + if rv == nil { + s.done = true + } + return rv, nil } func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInternalID) (*search.DocumentMatch, error) { + if s.done { + return nil, nil + } + if !s.initialized { err := s.initSearchers(ctx) if err != nil { @@ -332,14 +345,8 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } } - // Advance the searchers only if the currentID cursor is trailing the lookup ID, - // additionally if the mustNotSearcher has been initialized, ensure that the - // cursor used to track 
the mustNotSearcher (currMustNot, which isn't tracked by - // currentID) is trailing the lookup ID as well - for in the case where currentID - // is nil and currMustNot is already at or ahead of the lookup ID, we MUST NOT - // advance the currentID or the currMustNot cursors. - if (s.currentID == nil || s.currentID.Compare(ID) < 0) && - (s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0) { + // Advance the searcher only if the cursor is trailing the lookup ID + if s.currentID == nil || s.currentID.Compare(ID) < 0 { var err error if s.mustSearcher != nil { if s.currMust != nil { @@ -362,12 +369,17 @@ func (s *BooleanSearcher) Advance(ctx *search.SearchContext, ID index.IndexInter } if s.mustNotSearcher != nil { - if s.currMustNot != nil { - ctx.DocumentMatchPool.Put(s.currMustNot) - } - s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) - if err != nil { - return nil, err + // Additional check for mustNotSearcher, whose cursor isn't tracked by + // currentID to prevent it from moving when the searcher's tracked + // position is already ahead of or at the requested ID. 
+ if s.currMustNot == nil || s.currMustNot.IndexInternalID.Compare(ID) < 0 { + if s.currMustNot != nil { + ctx.DocumentMatchPool.Put(s.currMustNot) + } + s.currMustNot, err = s.mustNotSearcher.Advance(ctx, ID) + if err != nil { + return nil, err + } } } From 3bd9e536b6bc4f99a82eda4a62a50240b0598974 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 1 Apr 2019 20:30:03 -0700 Subject: [PATCH 580/728] MB-33600: Allow date ranges before epoch time --- search/query/date_range.go | 11 +++++++++-- search/query/date_range_test.go | 5 +++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/search/query/date_range.go b/search/query/date_range.go index ae26f8dec..5f6a8db3c 100644 --- a/search/query/date_range.go +++ b/search/query/date_range.go @@ -41,6 +41,13 @@ type BleveQueryTime struct { time.Time } +var epochTime time.Time + +func init() { + unixEpochTime := "1970-01-01T00:00:00Z" + epochTime, _ = time.Parse(time.RFC3339, unixEpochTime) +} + func queryTimeFromString(t string) (time.Time, error) { dateTimeParser, err := cache.DateTimeParserNamed(QueryDateTimeParser) if err != nil { @@ -144,7 +151,7 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { max := math.Inf(1) if !q.Start.IsZero() { startInt64 := q.Start.UnixNano() - if startInt64 < 0 { + if startInt64 < 0 && q.Start.After(epochTime) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start) } @@ -152,7 +159,7 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { } if !q.End.IsZero() { endInt64 := q.End.UnixNano() - if endInt64 < 0 { + if endInt64 < 0 && q.End.After(epochTime) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End) } diff --git a/search/query/date_range_test.go b/search/query/date_range_test.go index dd14d3238..cfec05e35 100644 --- a/search/query/date_range_test.go +++ b/search/query/date_range_test.go @@ -76,6 +76,11 @@ func TestValidateDatetimeRanges(t 
*testing.T) { end: "2262-04-12T00:00:00Z", expect: false, }, + { + start: "1950-03-22T12:23:23Z", + end: "1960-02-21T15:23:34Z", + expect: true, + }, } for _, test := range tests { From 96b0280a5c6458245d2ff398a983c87268965162 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 2 Apr 2019 12:24:39 -0700 Subject: [PATCH 581/728] MB-33600: Further validation for start/end date ranges + MinAllowedTime: 1677-12-01T00:00:00Z + MaxAllowedTime: 2262-04-11T11:59:59Z Any NonZero time.Time values that fall outside this range will cause UnixNano() to overflow (int64 cannot accommodate). --- search/query/date_range.go | 15 +++++++------- search/query/date_range_test.go | 35 +++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/search/query/date_range.go b/search/query/date_range.go index 5f6a8db3c..5d61bd37e 100644 --- a/search/query/date_range.go +++ b/search/query/date_range.go @@ -41,11 +41,12 @@ type BleveQueryTime struct { time.Time } -var epochTime time.Time +var minValidTime time.Time +var maxValidTime time.Time func init() { - unixEpochTime := "1970-01-01T00:00:00Z" - epochTime, _ = time.Parse(time.RFC3339, unixEpochTime) + minValidTime, _ = time.Parse(time.RFC3339, "1677-12-01T00:00:00Z") + maxValidTime, _ = time.Parse(time.RFC3339, "2262-04-11T11:59:59Z") } func queryTimeFromString(t string) (time.Time, error) { @@ -150,19 +151,19 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { min := math.Inf(-1) max := math.Inf(1) if !q.Start.IsZero() { - startInt64 := q.Start.UnixNano() - if startInt64 < 0 && q.Start.After(epochTime) { + if q.Start.Before(minValidTime) || q.Start.After(maxValidTime) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start) } + startInt64 := q.Start.UnixNano() min = numeric.Int64ToFloat64(startInt64) } if !q.End.IsZero() { - endInt64 := q.End.UnixNano() - if endInt64 < 0 && q.End.After(epochTime) { + if q.End.Before(minValidTime) || 
q.End.After(maxValidTime) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End) } + endInt64 := q.End.UnixNano() max = numeric.Int64ToFloat64(endInt64) } diff --git a/search/query/date_range_test.go b/search/query/date_range_test.go index cfec05e35..2990642be 100644 --- a/search/query/date_range_test.go +++ b/search/query/date_range_test.go @@ -81,6 +81,41 @@ func TestValidateDatetimeRanges(t *testing.T) { end: "1960-02-21T15:23:34Z", expect: true, }, + { + start: "0001-01-01T00:00:00Z", + end: "0001-01-01T00:00:00Z", + expect: false, + }, + { + start: "0001-01-01T00:00:00Z", + end: "2000-01-01T00:00:00Z", + expect: true, + }, + { + start: "1677-11-30T11:59:59Z", + end: "2262-04-11T11:59:59Z", + expect: false, + }, + { + start: "2262-04-12T00:00:00Z", + end: "2262-04-11T11:59:59Z", + expect: false, + }, + { + start: "1677-12-01T00:00:00Z", + end: "2262-04-12T00:00:00Z", + expect: false, + }, + { + start: "1677-12-01T00:00:00Z", + end: "1677-11-30T11:59:59Z", + expect: false, + }, + { + start: "1677-12-01T00:00:00Z", + end: "2262-04-11T11:59:59Z", + expect: true, + }, } for _, test := range tests { From 85d7cd0ba3567ba9224618ff2cf1db437ab81b40 Mon Sep 17 00:00:00 2001 From: Steve Yen Date: Wed, 3 Apr 2019 09:52:44 -0700 Subject: [PATCH 582/728] remove maxTerms optimization for ComputeGeoRange() Unlike normal disjunctions, a geoboundingbox searcher does not limit its multi-term-searcher against the DisjunctionMaxClauseCount, so my recent maxTerms optimization for ComputeGeoRange() in commit 3d774b9 was just wrong. This commit is a manual revert instead of an automated "git revert", in that the previous commit 3d774b9 also changed a few things that we'd still like to keep (e.g., error return value instead of panic'ing) and there are now more unit tests in this commit from diagnosing this issue, which compare the outputs of the optimized ComputeGeoRange() with the previous, non-optimized version. 
See also: https://issues.couchbase.com/browse/MB-33614 --- search/searcher/search_geoboundingbox.go | 11 +-- search/searcher/search_geoboundingbox_test.go | 87 +++++++++++++++++-- .../searcher/search_geopointdistance_test.go | 73 ++++++++++++++++ 3 files changed, 154 insertions(+), 17 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index a80855ac5..ed5424f1f 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -40,7 +40,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, // do math to produce list of terms needed for this search onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(0, GeoBitsShift1Minus1, - minLon, minLat, maxLon, maxLat, checkBoundaries, DisjunctionMaxClauseCount) + minLon, minLat, maxLon, maxLat, checkBoundaries) if err != nil { return nil, err } @@ -100,7 +100,7 @@ var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 func ComputeGeoRange(term uint64, shift uint, - sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, maxTerms int) ( + sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool) ( onBoundary [][]byte, notOnBoundary [][]byte, err error) { preallocBytesLen := 32 preallocBytes := make([]byte, preallocBytesLen) @@ -144,13 +144,6 @@ func ComputeGeoRange(term uint64, shift uint, } computeGeoRange = func(term uint64, shift uint) { - if maxTerms > 0 { - if len(onBoundary) > maxTerms { - err = tooManyClausesErr(len(onBoundary)) - } else if len(notOnBoundary) > maxTerms { - err = tooManyClausesErr(len(notOnBoundary)) - } - } if err != nil { return } diff --git a/search/searcher/search_geoboundingbox_test.go b/search/searcher/search_geoboundingbox_test.go index be3d09168..cae803412 100644 --- a/search/searcher/search_geoboundingbox_test.go +++ b/search/searcher/search_geoboundingbox_test.go @@ -19,9 +19,11 @@ import ( 
"testing" "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/store/gtreap" "github.com/blevesearch/bleve/index/upsidedown" + "github.com/blevesearch/bleve/numeric" "github.com/blevesearch/bleve/search" ) @@ -202,20 +204,18 @@ func setupGeo(t *testing.T) index.Index { func TestComputeGeoRange(t *testing.T) { tests := []struct { degs float64 - maxTerms int onBoundary int offBoundary int err string }{ - {0.01, 0, 4, 0, ""}, - {0.1, 0, 56, 144, ""}, - {100.0, 0, 32768, 258560, ""}, - {100.0, 1024, 0, 0, "geo range produces too many terms, so should have error"}, + {0.01, 4, 0, ""}, + {0.1, 56, 144, ""}, + {100.0, 32768, 258560, ""}, } - for _, test := range tests { + for testi, test := range tests { onBoundaryRes, offBoundaryRes, err := ComputeGeoRange(0, GeoBitsShift1Minus1, - -1.0*test.degs, -1.0*test.degs, test.degs, test.degs, true, test.maxTerms) + -1.0*test.degs, -1.0*test.degs, test.degs, test.degs, true) if (err != nil) != (test.err != "") { t.Errorf("test: %+v, err: %v", test, err) } @@ -225,6 +225,17 @@ func TestComputeGeoRange(t *testing.T) { if len(offBoundaryRes) != test.offBoundary { t.Errorf("test: %+v, offBoundaryRes: %v", test, len(offBoundaryRes)) } + + onBROrig, offBROrig := origComputeGeoRange(0, GeoBitsShift1Minus1, + -1.0*test.degs, -1.0*test.degs, test.degs, test.degs, true) + if !reflect.DeepEqual(onBoundaryRes, onBROrig) { + t.Errorf("testi: %d, test: %+v, onBoundaryRes != onBROrig,\n onBoundaryRes:%v,\n onBROrig: %v", + testi, test, onBoundaryRes, onBROrig) + } + if !reflect.DeepEqual(offBoundaryRes, offBROrig) { + t.Errorf("testi: %d, test: %+v, offBoundaryRes, offBROrig,\n offBoundaryRes: %v,\n offBROrig: %v", + testi, test, offBoundaryRes, offBROrig) + } } } @@ -264,7 +275,7 @@ func benchmarkComputeGeoRange(b *testing.B, for i := 0; i < b.N; i++ { onBoundaryRes, offBoundaryRes, err := - ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, 
maxLon, maxLat, checkBoundaries, 0) + ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries) if err != nil { b.Fatalf("expected no err") } @@ -273,3 +284,63 @@ func benchmarkComputeGeoRange(b *testing.B, } } } + +// -------------------------------------------------------------------- + +// original, non-optimized implementation of ComputeGeoRange +func origComputeGeoRange(term uint64, shift uint, + sminLon, sminLat, smaxLon, smaxLat float64, + checkBoundaries bool) ( + onBoundary [][]byte, notOnBoundary [][]byte) { + split := term | uint64(0x1)<> 1 + + within := res%document.GeoPrecisionStep == 0 && + geo.RectWithin(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat) + if within || (level == geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat)) { + if !within && checkBoundaries { + return [][]byte{ + numeric.MustNewPrefixCodedInt64(int64(start), res), + }, nil + } + return nil, + [][]byte{ + numeric.MustNewPrefixCodedInt64(int64(start), res), + } + } else if level < geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + sminLon, sminLat, smaxLon, smaxLat) { + return origComputeGeoRange(start, res-1, sminLon, sminLat, smaxLon, smaxLat, + checkBoundaries) + } + return nil, nil +} diff --git a/search/searcher/search_geopointdistance_test.go b/search/searcher/search_geopointdistance_test.go index 002e4e3e8..eaf8216a3 100644 --- a/search/searcher/search_geopointdistance_test.go +++ b/search/searcher/search_geopointdistance_test.go @@ -18,6 +18,7 @@ import ( "reflect" "testing" + "github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -82,3 +83,75 @@ func testGeoPointDistanceSearch(i index.IndexReader, centerLon, centerLat, dist } return rv, nil } + +func TestGeoPointDistanceCompare(t *testing.T) { + tests := []struct { + docLat, docLon float64 + centerLat, centerLon float64 + distance 
string + }{ + // Data points originally from MB-33454. + { + docLat: 33.718, + docLon: -116.8293, + centerLat: 39.59000587, + centerLon: -119.22998428, + distance: "10000mi", + }, + { + docLat: 41.1305, + docLon: -121.6587, + centerLat: 61.28, + centerLon: -149.34, + distance: "10000mi", + }, + } + + for testi, test := range tests { + // compares the results from ComputeGeoRange with original, non-optimized version + compare := func(desc string, + minLon, minLat, maxLon, maxLat float64, checkBoundaries bool) { + // do math to produce list of terms needed for this search + onBoundaryRes, offBoundaryRes, err := ComputeGeoRange(0, GeoBitsShift1Minus1, + minLon, minLat, maxLon, maxLat, checkBoundaries) + if err != nil { + t.Fatal(err) + } + + onBROrig, offBROrig := origComputeGeoRange(0, GeoBitsShift1Minus1, + minLon, minLat, maxLon, maxLat, checkBoundaries) + if !reflect.DeepEqual(onBoundaryRes, onBROrig) { + t.Fatalf("testi: %d, test: %+v, desc: %s, onBoundaryRes != onBROrig,\n onBoundaryRes:%v,\n onBROrig: %v", + testi, test, desc, onBoundaryRes, onBROrig) + } + if !reflect.DeepEqual(offBoundaryRes, offBROrig) { + t.Fatalf("testi: %d, test: %+v, desc: %s, offBoundaryRes, offBROrig,\n offBoundaryRes: %v,\n offBROrig: %v", + testi, test, desc, offBoundaryRes, offBROrig) + } + } + + // follow the general approach of the GeoPointDistanceSearcher... 
+ dist, err := geo.ParseDistance(test.distance) + if err != nil { + t.Fatal(err) + } + + topLeftLon, topLeftLat, bottomRightLon, bottomRightLat, err := + geo.RectFromPointDistance(test.centerLon, test.centerLat, dist) + if err != nil { + t.Fatal(err) + } + + if bottomRightLon < topLeftLon { + // crosses date line, rewrite as two parts + compare("-180/f", -180, bottomRightLat, bottomRightLon, topLeftLat, false) + compare("-180/t", -180, bottomRightLat, bottomRightLon, topLeftLat, true) + + compare("180/f", topLeftLon, bottomRightLat, 180, topLeftLat, false) + compare("180/t", topLeftLon, bottomRightLat, 180, topLeftLat, true) + } else { + compare("reg/f", topLeftLon, bottomRightLat, bottomRightLon, topLeftLat, false) + compare("reg/t", topLeftLon, bottomRightLat, bottomRightLon, topLeftLat, true) + } + } +} From 73e96b496795d6470e44e67dc5459c224953952a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 3 Apr 2019 11:01:18 -0700 Subject: [PATCH 583/728] MB-33600: Checking if datetimes are RFC3339 compatible Check if datetimes provided within the datetime range query are RFC3339 compatible only of the QueryDateTimeFormat is RFC3339. 
--- search/query/date_range.go | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/search/query/date_range.go b/search/query/date_range.go index 5d61bd37e..3ac0322f5 100644 --- a/search/query/date_range.go +++ b/search/query/date_range.go @@ -41,12 +41,12 @@ type BleveQueryTime struct { time.Time } -var minValidTime time.Time -var maxValidTime time.Time +var MinRFC3339CompatibleTime time.Time +var MaxRFC3339CompatibleTime time.Time func init() { - minValidTime, _ = time.Parse(time.RFC3339, "1677-12-01T00:00:00Z") - maxValidTime, _ = time.Parse(time.RFC3339, "2262-04-11T11:59:59Z") + MinRFC3339CompatibleTime, _ = time.Parse(time.RFC3339, "1677-12-01T00:00:00Z") + MaxRFC3339CompatibleTime, _ = time.Parse(time.RFC3339, "2262-04-11T11:59:59Z") } func queryTimeFromString(t string) (time.Time, error) { @@ -151,7 +151,7 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { min := math.Inf(-1) max := math.Inf(1) if !q.Start.IsZero() { - if q.Start.Before(minValidTime) || q.Start.After(maxValidTime) { + if !isDatetimeCompatible(q.Start) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, start: %v", q.Start) } @@ -159,7 +159,7 @@ func (q *DateRangeQuery) parseEndpoints() (*float64, *float64, error) { min = numeric.Int64ToFloat64(startInt64) } if !q.End.IsZero() { - if q.End.Before(minValidTime) || q.End.After(maxValidTime) { + if !isDatetimeCompatible(q.End) { // overflow return nil, nil, fmt.Errorf("invalid/unsupported date range, end: %v", q.End) } @@ -180,3 +180,12 @@ func (q *DateRangeQuery) Validate() error { } return nil } + +func isDatetimeCompatible(t BleveQueryTime) bool { + if QueryDateTimeFormat == time.RFC3339 && + (t.Before(MinRFC3339CompatibleTime) || t.After(MaxRFC3339CompatibleTime)) { + return false + } + + return true +} From 9dcbc5c71af0d5f5a5ffc392461a6579c15bf686 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 4 Apr 2019 13:38:36 -0700 Subject: [PATCH 584/728] 
Include freq/norm information when IncludeTermVectors:true Fixes: https://github.com/blevesearch/bleve/issues/1194 --- search/scorer/scorer_term.go | 4 ++- search/searcher/search_term.go | 2 +- search_test.go | 46 ++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index 5544f2d01..f45fd5dbc 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -159,7 +159,9 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term rv := ctx.DocumentMatchPool.Get() rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) - rv.Score = score + if s.options.Score != "none" { + rv.Score = score + } if s.options.Explain { rv.Expl = scoreExplanation } diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index c1af74c76..8ff74d60c 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -42,7 +42,7 @@ func NewTermSearcher(indexReader index.IndexReader, term string, field string, b } func NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - needFreqNorm := options.Score != "none" + needFreqNorm := options.IncludeTermVectors || options.Score != "none" reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err diff --git a/search_test.go b/search_test.go index c534dd320..8c78d641f 100644 --- a/search_test.go +++ b/search_test.go @@ -1548,3 +1548,49 @@ func TestBooleanSearchBug1185(t *testing.T) { t.Fatalf("expected same number of hits, got: %d and %d", len(res.Hits), len(res2.Hits)) } } + +func TestSearchScoreNone(t *testing.T) { + idx, err := NewUsing("testidx", NewIndexMapping(), scorch.Name, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + + defer func() { + err := 
os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc := map[string]interface{}{ + "field1": "asd fgh jkl", + "field2": "more content blah blah", + "id": "doc", + } + + if err = idx.Index("doc", doc); err != nil { + t.Fatal(err) + } + + q := NewQueryStringQuery("content") + sr := NewSearchRequest(q) + sr.IncludeLocations = true + sr.Score = "none" + + res, err := idx.Search(sr) + if err != nil { + t.Fatal(err) + } + + if len(res.Hits) != 1 { + t.Fatal("unexpected number of hits") + } + + if len(res.Hits[0].Locations) != 1 { + t.Fatal("unexpected locations for the hit") + } + + if res.Hits[0].Score != 0 { + t.Fatal("unexpected score for the hit") + } +} From 45782f295224663b84e7654939bb127012611102 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 4 Apr 2019 17:58:07 -0700 Subject: [PATCH 585/728] New flag within TermQueryScorer on whether to add score --- search/scorer/scorer_term.go | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index f45fd5dbc..a0e02e438 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -40,6 +40,7 @@ type TermQueryScorer struct { idf float64 options search.SearcherOptions idfExplanation *search.Explanation + includeScore bool queryNorm float64 queryWeight float64 queryWeightExplanation *search.Explanation @@ -62,14 +63,15 @@ func (s *TermQueryScorer) Size() int { func NewTermQueryScorer(queryTerm []byte, queryField string, queryBoost float64, docTotal, docTerm uint64, options search.SearcherOptions) *TermQueryScorer { rv := TermQueryScorer{ - queryTerm: string(queryTerm), - queryField: queryField, - queryBoost: queryBoost, - docTerm: docTerm, - docTotal: docTotal, - idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)), - options: options, - queryWeight: 1.0, + queryTerm: string(queryTerm), + queryField: queryField, + queryBoost: queryBoost, + docTerm: docTerm, + docTotal: docTotal, 
+ idf: 1.0 + math.Log(float64(docTotal)/float64(docTerm+1.0)), + options: options, + queryWeight: 1.0, + includeScore: options.Score != "none", } if options.Explain { @@ -159,7 +161,7 @@ func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.Term rv := ctx.DocumentMatchPool.Get() rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) - if s.options.Score != "none" { + if s.includeScore { rv.Score = score } if s.options.Explain { From eaa53f3be3d7e368753502a38fb04f29ceb3f253 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 8 Apr 2019 07:33:59 +0530 Subject: [PATCH 586/728] =?UTF-8?q?This=20change=20aims=20to=20cache=20the?= =?UTF-8?q?=20vellum=20fst=20instances=20per=20field=20at=20the=20SegmentB?= =?UTF-8?q?ase=20level=20to=20help=20it=E2=80=99s=20reuse=20across=20queri?= =?UTF-8?q?es.=20Its=20been=20noted=20from=20the=20cpu=20profiles=20that?= =?UTF-8?q?=20the=20vellum.Load/new=20could=20be=20optimised.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Improvements observed for a 1M data set with 10K w/sec, Wildcard queries -  147 – 184 [25%] Fuzzy1 – 705 – 790 [12%] Fuzzy2 - 71 - 79 [11%] Prefix queries - 253 -> 270 [7%] --- index/scorch/segment/zap/build.go | 2 ++ index/scorch/segment/zap/segment.go | 28 ++++++++++++++++++++-------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go index 91bfd4e24..c02333cee 100644 --- a/index/scorch/segment/zap/build.go +++ b/index/scorch/segment/zap/build.go @@ -16,6 +16,7 @@ package zap import ( "bufio" + "github.com/couchbase/vellum" "math" "os" ) @@ -137,6 +138,7 @@ func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, docValueOffset: docValueOffset, dictLocs: dictLocs, fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), } sb.updateSize() diff --git a/index/scorch/segment/zap/segment.go 
b/index/scorch/segment/zap/segment.go index 7ba28c236..bbae932e1 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -56,6 +56,7 @@ func Open(path string) (segment.Segment, error) { mem: mm[0 : len(mm)-FooterSize], fieldsMap: make(map[string]uint16), fieldDvReaders: make(map[uint16]*docValueReader), + fieldFSTs: make(map[uint16]*vellum.FST), }, f: f, mm: mm, @@ -101,6 +102,9 @@ type SegmentBase struct { fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field fieldDvNames []string // field names cached in fieldDvReaders size uint64 + + m sync.Mutex + fieldFSTs map[uint16]*vellum.FST } func (sb *SegmentBase) Size() int { @@ -258,19 +262,27 @@ func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { dictStart := sb.dictLocs[rv.fieldID] if dictStart > 0 { - // read the length of the vellum data - vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) - fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] - if fstBytes != nil { + var ok bool + sb.m.Lock() + if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { + // read the length of the vellum data + vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) + fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] rv.fst, err = vellum.Load(fstBytes) if err != nil { + sb.m.Unlock() return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) } - rv.fstReader, err = rv.fst.Reader() - if err != nil { - return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) - } + + sb.fieldFSTs[rv.fieldID] = rv.fst } + + sb.m.Unlock() + rv.fstReader, err = rv.fst.Reader() + if err != nil { + return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) + } + } } From 86614196cb9f5e971975abbd52a34d7e6c93cef3 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 8 Apr 2019 10:57:53 +0530 Subject: 
[PATCH 587/728] moving to unsafe.Sizeof() api to fix the vet errors. Its been analogous to reflect.Type.Size() as per API documentation. --- index/scorch/segment/zap/segment.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go index bbae932e1..03cb070d3 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -20,8 +20,8 @@ import ( "fmt" "io" "os" - "reflect" "sync" + "unsafe" "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/segment" @@ -35,7 +35,7 @@ var reflectStaticSizeSegmentBase int func init() { var sb SegmentBase - reflectStaticSizeSegmentBase = int(reflect.TypeOf(sb).Size()) + reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) } // Open returns a zap impl of a segment From 409d4e4fc3b43c29bc787618044c17cff2e75dad Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 8 Apr 2019 15:06:46 -0700 Subject: [PATCH 588/728] Include freq/norm if location information is asked for --- index/scorch/segment/zap/posting.go | 2 +- search/searcher/search_term.go | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 0656c8e99..417e89b4d 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -190,7 +190,7 @@ func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, } rv.postings = p - rv.includeFreqNorm = includeFreq || includeNorm + rv.includeFreqNorm = includeFreq || includeNorm || includeLocs rv.includeLocs = includeLocs if p.normBits1Hit != 0 { diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index 8ff74d60c..c1af74c76 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -42,7 +42,7 @@ func NewTermSearcher(indexReader index.IndexReader, term string, field string, b } func 
NewTermSearcherBytes(indexReader index.IndexReader, term []byte, field string, boost float64, options search.SearcherOptions) (*TermSearcher, error) { - needFreqNorm := options.IncludeTermVectors || options.Score != "none" + needFreqNorm := options.Score != "none" reader, err := indexReader.TermFieldReader(term, field, needFreqNorm, needFreqNorm, options.IncludeTermVectors) if err != nil { return nil, err From ad32d2f762cf4d50e4332f494ea65b3f2535b273 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 24 Apr 2019 13:23:39 -0700 Subject: [PATCH 589/728] Update .travis.yml to test on the latest 3 versions of golang + 1.12.x, 1.11.x, 1.10.x + https://golang.org/doc/devel/release.html#policy --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index c79b3fa3d..22612c966 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,9 @@ sudo: false language: go go: - - "1.9.x" - "1.10.x" - "1.11.x" + - "1.12.x" script: - go get golang.org/x/tools/cmd/cover From dca226594cf3d2111f5d0740f392e5199c1bc71f Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 24 Apr 2019 18:31:35 -0700 Subject: [PATCH 590/728] Reset stats: lastDocSize, totalSize in batch's Reset() --- index.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/index.go b/index.go index 99357eee0..ef6ede934 100644 --- a/index.go +++ b/index.go @@ -117,6 +117,8 @@ func (b *Batch) String() string { // be re-used in the future. 
func (b *Batch) Reset() { b.internal.Reset() + b.lastDocSize = 0 + b.totalSize = 0 } func (b *Batch) Merge(o *Batch) { From d1bc80056ac11ee36ef247eac3ef682a2690e769 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 25 Apr 2019 12:13:53 -0400 Subject: [PATCH 591/728] fix some minor issues with zap doc values (#1202) - first, we now check the error returned by loadFieldDocValueReader - second, in order for that to no break things we changed the behavior of loadFieldDocValueReader to not return an error in the case where the field isn't uninverted. this isn't really an error, we now just return nil, nil - third, we return a new error when the condition fieldDvLocEnd-fieldDvLocStart > 16 isn't satisfied. this is a good indication that we're pointing at the wrong part of the file, or it has been corrupted. - fourth, the unit test helper method testMergeWithEmptySegments was updated to check for errors returned by AnalysisResultsToSegmentBase - finally, the loadDvReaders metdod was updated to return early when the number of docs is 0. this was necessary because there is a way to build a segment which has an invalid docValueOffset. this case was exposed by the empty segment building test. a proper fix is to would change the file format, so adding this check was preferred for the moment. 
--- index/scorch/segment/zap/docvalues.go | 6 ++++-- index/scorch/segment/zap/merge_test.go | 5 ++++- index/scorch/segment/zap/segment.go | 7 +++++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index bcc0f9472..0ceb91fb5 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -88,8 +88,8 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { // get the docValue offset for the given fields if fieldDvLocStart == fieldNotUninverted { - return nil, fmt.Errorf("loadFieldDocValueReader: "+ - "no docValues found for field: %s", field) + // no docValues found, nothing to do + return nil, nil } // read the number of chunks, and chunk offsets position @@ -101,6 +101,8 @@ func (s *SegmentBase) loadFieldDocValueReader(field string, chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) // acquire position of chunk offsets chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen + } else { + return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) } fdvIter := &docValueReader{ diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go index 450ecba91..175671226 100644 --- a/index/scorch/segment/zap/merge_test.go +++ b/index/scorch/segment/zap/merge_test.go @@ -147,7 +147,10 @@ func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) _ = os.RemoveAll("/tmp/" + fname) - emptySegment, _, _ := AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) + emptySegment, _, err := AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) + if err != nil { + t.Fatal(err) + } err = PersistSegmentBase(emptySegment, "/tmp/"+fname) if err != nil { t.Fatal(err) diff --git a/index/scorch/segment/zap/segment.go 
b/index/scorch/segment/zap/segment.go index 03cb070d3..5aa33a26c 100644 --- a/index/scorch/segment/zap/segment.go +++ b/index/scorch/segment/zap/segment.go @@ -539,7 +539,7 @@ func (s *Segment) DictAddr(field string) (uint64, error) { } func (s *SegmentBase) loadDvReaders() error { - if s.docValueOffset == fieldNotUninverted { + if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 { return nil } @@ -558,7 +558,10 @@ func (s *SegmentBase) loadDvReaders() error { } read += uint64(n) - fieldDvReader, _ := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) + if err != nil { + return err + } if fieldDvReader != nil { s.fieldDvReaders[uint16(fieldID)] = fieldDvReader s.fieldDvNames = append(s.fieldDvNames, field) From 7f3a218ae72960bb4841254833a52a5f088a9928 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 1 May 2019 13:41:33 -0400 Subject: [PATCH 592/728] add optional validation hooks to scorch/zap (#1206) * add optional validation hooks to scorch/zap ValidateDocFields can be used to check each field seen by zap ValidateMerge can be used to check the results of a merge operation These hooks are intended to allow for application specific checks inside the main bleve branch, so as to allow maximum flexibility with minimal disruption. These hooks are not intended to be permanent, but that decision can be made at a later time. 
* fix bug in logging * invoke validate merge for in-memory merges as well * change error message for in-memory merge validation fail --- index/scorch/merge.go | 9 ++++++++- index/scorch/segment/zap/merge.go | 8 ++++++++ index/scorch/segment/zap/new.go | 13 +++++++++++++ 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index bcbf5b710..83f98aab0 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -151,7 +151,6 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergePlanNone, 1) return nil } - atomic.AddUint64(&s.stats.TotFileMergePlanOk, 1) atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) @@ -221,6 +220,10 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return err } + err = zap.ValidateMerge(segmentsToMerge, nil, docsToDrop, seg.(*zap.Segment)) + if err != nil { + return fmt.Errorf("merge validation failed: %v", err) + } oldNewDocNums = make(map[uint64][]uint64) for i, segNewDocNums := range newDocNums { oldNewDocNums[task.Segments[i].Id()] = segNewDocNums @@ -311,6 +314,10 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return nil, 0, err } + err = zap.ValidateMerge(nil, sbs, sbsDrops, seg.(*zap.Segment)) + if err != nil { + return nil, 0, fmt.Errorf("in-memory merge validation failed: %v", err) + } // update persisted stats atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go index 4ef222c1a..50bd7207a 100644 --- a/index/scorch/segment/zap/merge.go +++ b/index/scorch/segment/zap/merge.go @@ -31,6 +31,14 @@ import ( var DefaultFileMergerBufferSize = 1024 * 1024 +// ValidateMerge can be set by applications to perform additional checks +// on a new segment produced by a merge, by default this 
does nothing. +// Caller should provide EITHER segments or memSegments, but not both. +// This API is experimental and may be removed at any time. +var ValidateMerge = func(segments []*Segment, memSegments []*SegmentBase, drops []*roaring.Bitmap, newSegment *Segment) error { + return nil +} + const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc // Merge takes a slice of zap segments and bit masks describing which diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go index 22b69913e..c108ec16d 100644 --- a/index/scorch/segment/zap/new.go +++ b/index/scorch/segment/zap/new.go @@ -33,6 +33,14 @@ var NewSegmentBufferNumResultsBump int = 100 var NewSegmentBufferNumResultsFactor float64 = 1.0 var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 +// ValidateDocFields can be set by applications to perform additional checks +// on fields in a document being added to a new segment, by default it does +// nothing. +// This API is experimental and may be removed at any time. 
+var ValidateDocFields = func(field document.Field) error { + return nil +} + // AnalysisResultsToSegmentBase produces an in-memory zap-encoded // SegmentBase from analysis results func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, @@ -521,6 +529,11 @@ func (s *interim) writeStoredFields() ( if opts.IncludeDocValues() { s.IncludeDocValues[fieldID] = true } + + err := ValidateDocFields(field) + if err != nil { + return 0, err + } } var curr int From 97932ae5b280705390b8b3b6803186f5fc98d822 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 6 May 2019 17:24:25 -0400 Subject: [PATCH 593/728] move the impl of DocumentFieldTermVisitable (#1208) previously these methods were defined on Segment but they only use fields in SegmentBase, and having them defined on SegmentBase means you can use them on in-memory segments --- index/scorch/segment/zap/docvalues.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go index 0ceb91fb5..a819ca239 100644 --- a/index/scorch/segment/zap/docvalues.go +++ b/index/scorch/segment/zap/docvalues.go @@ -39,7 +39,7 @@ type docNumTermsVisitor func(docNum uint64, terms []byte) error type docVisitState struct { dvrs map[uint16]*docValueReader - segment *Segment + segment *SegmentBase } type docValueReader struct { @@ -252,7 +252,7 @@ func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { // VisitDocumentFieldTerms is an implementation of the // DocumentFieldTermVisitable interface -func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, +func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( segment.DocVisitState, error) { dvs, ok := dvsIn.(*docVisitState) @@ -291,7 +291,7 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, if dvr, ok = dvs.dvrs[fieldID]; ok 
&& dvr != nil { // check if the chunk is already loaded if docInChunk != dvr.curChunkNumber() { - err := dvr.loadDvChunk(docInChunk, &s.SegmentBase) + err := dvr.loadDvChunk(docInChunk, s) if err != nil { return dvs, err } @@ -306,6 +306,6 @@ func (s *Segment) VisitDocumentFieldTerms(localDocNum uint64, fields []string, // VisitableDocValueFields returns the list of fields with // persisted doc value terms ready to be visitable using the // VisitDocumentFieldTerms method. -func (s *Segment) VisitableDocValueFields() ([]string, error) { +func (s *SegmentBase) VisitableDocValueFields() ([]string, error) { return s.fieldDvNames, nil } From 5b9d7756e18c7c18b507d148ed4936896bd50370 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 7 May 2019 20:33:25 -0400 Subject: [PATCH 594/728] fix batch persisted callback for upsidedown (#1209) previous implementation would only execute the callback if the control flow reached the end of the function unfortunately there are other explicit returns earlier in the function users setting a callback, would reasonably expect it to always be called (passing either nil or an err) and due to this bug, certain error conditions resulted in the callback not firing this change checks for the callback, and if present defers execution of the callback, ensuring it will always be executed --- index/upsidedown/upsidedown.go | 8 ++--- index/upsidedown/upsidedown_test.go | 46 +++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 0699dbf97..24f5aae94 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -801,6 +801,10 @@ func (udc *UpsideDownCouch) termFieldVectorsFromTermVectors(in []*TermVector) [] } func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { + persistedCallback := batch.PersistedCallback() + if persistedCallback != nil { + defer persistedCallback(err) + } analysisStart := time.Now() 
resultChan := make(chan *index.AnalysisResult, len(batch.IndexOps)) @@ -965,10 +969,6 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { atomic.AddUint64(&udc.stats.errors, 1) } - persistedCallback := batch.PersistedCallback() - if persistedCallback != nil { - persistedCallback(err) - } return } diff --git a/index/upsidedown/upsidedown_test.go b/index/upsidedown/upsidedown_test.go index 2135a22c9..cc7cccfcb 100644 --- a/index/upsidedown/upsidedown_test.go +++ b/index/upsidedown/upsidedown_test.go @@ -19,6 +19,7 @@ import ( "reflect" "regexp" "strconv" + "strings" "sync" "testing" "time" @@ -1447,3 +1448,48 @@ func TestLargeField(t *testing.T) { t.Fatal(err) } } + +func TestIndexBatchPersistedCallbackWithErrorUpsideDown(t *testing.T) { + defer func() { + err := DestroyTest() + if err != nil { + t.Fatal(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewUpsideDownCouch(boltdb.Name, boltTestConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var callbackExecuted bool + batch := index.NewBatch() + batch.SetPersistedCallback(func(e error) { + callbackExecuted = true + }) + // By using a really large ID, we ensure that the batch will fail, + // because the key generated by upside down will be too large for BoltDB + reallyBigId := strings.Repeat("x", 32768+1) + doc := document.NewDocument(reallyBigId) + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + + _ = idx.Batch(batch) + // don't fail on this error, that isn't what we're testing + + if !callbackExecuted { + t.Fatal("expected callback to fire, it did not") + } + +} From 2b973805198b421f707520d449236f19f0236b75 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 8 May 2019 13:24:56 +0530 Subject: [PATCH 595/728] adding UTs for GeoHash Its 
been noted a precision/accuracy issue with respect to the existing geoHash decoder since it performs rounding of lat/lon values. This matters when the given hash is having a smaller precision like 5, eg: "d3hn3". This test is just a confirmation for the results with some sample values. --- geo/geohash_test.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 geo/geohash_test.go diff --git a/geo/geohash_test.go b/geo/geohash_test.go new file mode 100644 index 000000000..9600612f6 --- /dev/null +++ b/geo/geohash_test.go @@ -0,0 +1,44 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package geo + +import ( + "testing" +) + +func TestGeoHash(t *testing.T) { + tests := []struct { + hash string + lon float64 + lat float64 + }{ + {"d3hn3", -73.080000, 6.730000}, // -73.05908203, 6.74560547 as per http://geohash.co/ + {"u4pru", 10.380000, 57.620000}, // 10.39306641, 57.63427734 + {"u4pruy", 10.410000, 57.646000}, // 10.40954590, 57.64801025 + {"u4pruyd", 10.407000, 57.649000}, // 10.40748596, 57.64869690 + {"u4pruydqqvj", 10.40744, 57.64911}, // 10.40743969, 57.64911063 + } + + for _, test := range tests { + lat, lon := GeoHashDecode(test.hash) + + if compareGeo(test.lon, lon) != 0 { + t.Errorf("expected lon %f, got %f, hash %s", test.lon, lon, test.hash) + } + if compareGeo(test.lat, lat) != 0 { + t.Errorf("expected lat %f, got %f, hash %s", test.lat, lat, test.hash) + } + } +} From ee12e763a20db2b3dc6e7c090b9ebf029b9d6713 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 8 May 2019 14:40:36 +0530 Subject: [PATCH 596/728] MB-34021 - geo query not returning few docs The geo location precision lost during geohash decoding was resulting in this issue. Introducing a new geohash decoding method with better precision and performance. go test -benchmem -bench=. 
goos: darwin goarch: amd64 pkg: github.com/blevesearch/bleve/geo BenchmarkGeoHashLen5Decode-8 10000000 119 ns/op 0 B/op 0 allocs/op BenchmarkGeoHashLen5NewDecode-8 20000000 60.7 ns/op 0 B/op 0 allocs/op BenchmarkGeoHashLen6Decode-8 10000000 120 ns/op 0 B/op 0 allocs/op BenchmarkGeoHashLen6NewDecode-8 20000000 70.4 ns/op 0 B/op 0 allocs/op BenchmarkGeoHashLen7Decode-8 10000000 119 ns/op 0 B/op 0 allocs/op BenchmarkGeoHashLen7NewDecode-8 20000000 83.2 ns/op 0 B/op 0 allocs/op --- geo/benchmark_geohash_test.go | 67 ++++++++++++++++++++++++++ geo/geohash.go | 71 ++++++++++++++++++++++++++++ geo/geohash_test.go | 89 +++++++++++++++++++++++++++++++++++ geo/parse.go | 2 +- 4 files changed, 228 insertions(+), 1 deletion(-) create mode 100644 geo/benchmark_geohash_test.go create mode 100644 geo/geohash_test.go diff --git a/geo/benchmark_geohash_test.go b/geo/benchmark_geohash_test.go new file mode 100644 index 000000000..5ed5d6b76 --- /dev/null +++ b/geo/benchmark_geohash_test.go @@ -0,0 +1,67 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package geo + +import ( + "testing" +) + +func BenchmarkGeoHashLen5Decode(b *testing.B) { + b.ResetTimer() + hash := "d3hn3" + for i := 0; i < b.N; i++ { + _, _ = GeoHashDecode(hash) + } +} + +func BenchmarkGeoHashLen5NewDecode(b *testing.B) { + b.ResetTimer() + hash := "d3hn3" + for i := 0; i < b.N; i++ { + _, _ = DecodeGeoHash(hash) + } +} + +func BenchmarkGeoHashLen6Decode(b *testing.B) { + b.ResetTimer() + hash := "u4pruy" + for i := 0; i < b.N; i++ { + _, _ = GeoHashDecode(hash) + } +} + +func BenchmarkGeoHashLen6NewDecode(b *testing.B) { + b.ResetTimer() + hash := "u4pruy" + for i := 0; i < b.N; i++ { + _, _ = DecodeGeoHash(hash) + } +} + +func BenchmarkGeoHashLen7Decode(b *testing.B) { + b.ResetTimer() + hash := "u4pruyd" + for i := 0; i < b.N; i++ { + _, _ = GeoHashDecode(hash) + } +} + +func BenchmarkGeoHashLen7NewDecode(b *testing.B) { + b.ResetTimer() + hash := "u4pruyd" + for i := 0; i < b.N; i++ { + _, _ = DecodeGeoHash(hash) + } +} diff --git a/geo/geohash.go b/geo/geohash.go index 35db720c0..4fac3a877 100644 --- a/geo/geohash.go +++ b/geo/geohash.go @@ -172,3 +172,74 @@ func GeoHashDecode(hash string) (lat, lng float64) { box := geoBoundingBox(hash) return box.round() } + +var masks = []uint64{16, 8, 4, 2, 1} + +// DecodeGeoHash decodes the string geohash faster with +// higher precision. This api is in experimental phase. 
+func DecodeGeoHash(geoHash string) (float64, float64) { + even := true + lat := []float64{-90.0, 90.0} + lon := []float64{-180.0, 180.0} + + for i := 0; i < len(geoHash); i++ { + cd := uint64(base32encoding.dec[geoHash[i]]) + for j := 0; j < 5; j++ { + if even { + if cd&masks[j] > 0 { + lon[0] = (lon[0] + lon[1]) / 2 + } else { + lon[1] = (lon[0] + lon[1]) / 2 + } + } else { + if cd&masks[j] > 0 { + lat[0] = (lat[0] + lat[1]) / 2 + } else { + lat[1] = (lat[0] + lat[1]) / 2 + } + } + even = !even + } + } + + return (lat[0] + lat[1]) / 2, (lon[0] + lon[1]) / 2 +} + +func EncodeGeoHash(lat, lon float64) string { + even := true + lats := []float64{-90.0, 90.0} + lons := []float64{-180.0, 180.0} + precision := 12 + var ch, bit uint64 + var geoHash string + + for len(geoHash) < precision { + if even { + mid := (lons[0] + lons[1]) / 2 + if lon > mid { + ch |= masks[bit] + lons[0] = mid + } else { + lons[1] = mid + } + } else { + mid := (lats[0] + lats[1]) / 2 + if lat > mid { + ch |= masks[bit] + lats[0] = mid + } else { + lats[1] = mid + } + } + even = !even + if bit < 4 { + bit++ + } else { + geoHash += string(base32encoding.enc[ch]) + ch = 0 + bit = 0 + } + } + + return geoHash +} diff --git a/geo/geohash_test.go b/geo/geohash_test.go new file mode 100644 index 000000000..d0bec329d --- /dev/null +++ b/geo/geohash_test.go @@ -0,0 +1,89 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package geo + +import ( + "strings" + "testing" +) + +func TestGeoHashDecode(t *testing.T) { + tests := []struct { + hash string + lon float64 + lat float64 + }{ + {"d3hn3", -73.080000, 6.730000}, // -73.05908203, 6.74560547 as per http://geohash.co/ + {"u4pru", 10.380000, 57.620000}, // 10.39306641, 57.63427734 + {"u4pruy", 10.410000, 57.646000}, // 10.40954590, 57.64801025 + {"u4pruyd", 10.407000, 57.649000}, // 10.40748596, 57.64869690 + {"u4pruydqqvj", 10.40744, 57.64911}, // 10.40743969, 57.64911063 + } + + for _, test := range tests { + lat, lon := GeoHashDecode(test.hash) + + if compareGeo(test.lon, lon) != 0 { + t.Errorf("expected lon %f, got %f, hash %s", test.lon, lon, test.hash) + } + if compareGeo(test.lat, lat) != 0 { + t.Errorf("expected lat %f, got %f, hash %s", test.lat, lat, test.hash) + } + } +} + +func TestDecodeGeoHash(t *testing.T) { + tests := []struct { + hash string + lon float64 + lat float64 + }{ + {"d3hn3", -73.059082, 6.745605}, // -73.05908203, 6.74560547 as per http://geohash.co/ + {"u4pru", 10.393066, 57.634277}, // 10.39306641, 57.63427734 + {"u4pruy", 10.409546, 57.648010}, // 10.40954590, 57.64801025 + {"u4pruyd", 10.407486, 57.648697}, // 10.40748596, 57.64869690 + {"u4pruydqqvj", 10.40744, 57.64911}, // 10.40743969, 57.64911063 + } + + for _, test := range tests { + lat, lon := DecodeGeoHash(test.hash) + + if compareGeo(test.lon, lon) != 0 { + t.Errorf("expected lon %f, got %f, hash %s", test.lon, lon, test.hash) + } + if compareGeo(test.lat, lat) != 0 { + t.Errorf("expected lat %f, got %f, hash %s", test.lat, lat, test.hash) + } + } +} + +func TestEncodeGeoHash(t *testing.T) { + tests := []struct { + lon float64 + lat float64 + hash string + }{ + {2.29449034, 48.85841131, "u09tunquc"}, + {76.491540, 10.060349, "t9y3hx7my0fp"}, + } + + for _, test := range tests { + hash := EncodeGeoHash(test.lat, test.lon) + + if !strings.HasPrefix(hash, test.hash) { + t.Errorf("expected hash %s, got %s", test.hash, hash) + } + } +} diff 
--git a/geo/parse.go b/geo/parse.go index 0511fea7b..5d833d911 100644 --- a/geo/parse.go +++ b/geo/parse.go @@ -85,7 +85,7 @@ func ExtractGeoPoint(thing interface{}) (lon, lat float64, success bool) { } } else { // geohash - lat, lon = GeoHashDecode(geoStr) + lat, lon = DecodeGeoHash(geoStr) foundLat = true foundLon = true } From 0c3d62bdb09c6caaec2f5ea1f1e80153f37b90d8 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 9 May 2019 12:35:47 +0530 Subject: [PATCH 597/728] adding versus test for geohash This test basically confirms the dimensions of the bounded box computed between the DecodeGeoHash method and the existing original implementation from https://github.com/mmcloughlin/geohash. --- geo/versus_test.go | 4160 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 4160 insertions(+) create mode 100644 geo/versus_test.go diff --git a/geo/versus_test.go b/geo/versus_test.go new file mode 100644 index 000000000..0a067dae7 --- /dev/null +++ b/geo/versus_test.go @@ -0,0 +1,4160 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package geo + +import ( + "testing" +) + +// This test basically confirms the dimensions of the +// bounded box computed between the DecodeGeoHash method +// and the existing original implementation from +// https://github.com/mmcloughlin/geohash. +// DecodeGeoHash method returns the centre of the rectangle +// than returning the box dimensions. 
+// This test verifies that the returned rectangle centre matches +// the centre for the box dimensions defined in the original +// implementation tests here: +// https://github.com/mmcloughlin/geohash/blob/master/decodecases_test.go +func TestDecodeGeoHashVersus(t *testing.T) { + tests := []struct { + hash string + box []float64 + }{ + {"91rc", []float64{7.20703125, 7.3828125, -124.1015625, -123.75}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"0fuz", []float64{-73.30078125, -73.125, -139.5703125, -139.21875}}, + {"dwfcndf", []float64{38.1596374512, 38.1610107422, -63.3444213867, -63.3430480957}}, + {"2z7", []float64{-4.21875, -2.8125, -142.03125, -140.625}}, + {"7spw2w", []float64{-21.3684082031, -21.3629150391, -11.9311523438, -11.9201660156}}, + {"eq", []float64{33.75, 39.375, -33.75, -22.5}}, + {"mgff0", []float64{-23.5546875, -23.5107421875, 82.6171875, 82.6611328125}}, + {"dp7k386jtk0", []float64{41.5306591988, 41.5306605399, -85.3607976437, -85.3607963026}}, + {"pjb", []float64{-57.65625, -56.25, 135.0, 136.40625}}, + {"jkc7uh9", []float64{-62.5973510742, -62.5959777832, 58.184967041, 58.186340332}}, + {"1gdp9", []float64{-68.994140625, -68.9501953125, -98.3935546875, -98.349609375}}, + {"z9yj14mmnxte", []float64{55.7359149121, 55.7359150797, 165.988941416, 165.988941751}}, + {"2brk", []float64{-42.890625, -42.71484375, -136.0546875, -135.703125}}, + {"dhv5t2qh59", []float64{27.3360496759, 27.3360550404, -82.7296471596, -82.7296364307}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"3fgd9k15b5k", []float64{-29.0691630542, -29.0691617131, -96.2718147039, -96.2718133628}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"b4", []float64{56.25, 61.875, -180.0, -168.75}}, + {"sb38", []float64{1.40625, 1.58203125, 35.859375, 36.2109375}}, + {"puqeug", []float64{-65.4180908203, -65.4125976562, 178.099365234, 178.110351562}}, + {"45", []float64{-73.125, -67.5, -90.0, -78.75}}, + {"34b", []float64{-29.53125, -28.125, -135.0, -133.59375}}, + 
{"tqb8jzn9dfqn", []float64{38.0074727163, 38.0074728839, 57.2148630023, 57.2148633376}}, + {"9x", []float64{39.375, 45.0, -112.5, -101.25}}, + {"tybf7", []float64{38.3642578125, 38.408203125, 79.9365234375, 79.98046875}}, + {"9nc", []float64{37.96875, 39.375, -133.59375, -132.1875}}, + {"pp21", []float64{-49.04296875, -48.8671875, 135.0, 135.3515625}}, + {"s6wjfu76v", []float64{15.0970602036, 15.0971031189, 19.8130273819, 19.8130702972}}, + {"wxh8ped7", []float64{39.3947410583, 39.3949127197, 119.160804749, 119.161148071}}, + {"8gr", []float64{18.28125, 19.6875, -136.40625, -135.0}}, + {"ug6hf", []float64{64.1162109375, 64.16015625, 36.650390625, 36.6943359375}}, + {"pb", []float64{-90.0, -84.375, 168.75, 180.0}}, + {"nmhvpv", []float64{-60.9686279297, -60.9631347656, 108.270263672, 108.28125}}, + {"rxgthm", []float64{-0.499877929688, -0.494384765625, 162.608642578, 162.619628906}}, + {"mj8t", []float64{-13.18359375, -13.0078125, 45.703125, 46.0546875}}, + {"rkvw", []float64{-17.2265625, -17.05078125, 153.984375, 154.3359375}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"u4ryw22k", []float64{58.8008880615, 58.8010597229, 11.1734390259, 11.1737823486}}, + {"96bf6jfr", []float64{15.8970451355, 15.8972167969, -122.60433197, -122.603988647}}, + {"ubhnbn2n1jvj", []float64{46.2219173647, 46.2219175324, 39.3750496209, 39.3750499561}}, + {"3gmczz", []float64{-26.3726806641, -26.3671875, -92.8234863281, -92.8125}}, + {"yb4jh3u7px0", []float64{45.8890718222, 45.8890731633, 126.75542593, 126.755427271}}, + {"9ex7rkbw1duf", []float64{20.2859266475, 20.2859268151, -101.985326596, -101.98532626}}, + {"xrkyg3mj", []float64{41.9754981995, 41.9756698608, 153.079376221, 153.079719543}}, + {"z4hn2yfzryjq", []float64{57.3869894072, 57.3869895749, 140.662075169, 140.662075505}}, + {"x357", []float64{6.15234375, 6.328125, 150.8203125, 151.171875}}, + {"v3pew", []float64{51.240234375, 51.2841796875, 67.060546875, 67.1044921875}}, + {"j1", []float64{-84.375, -78.75, 45.0, 56.25}}, + 
{"ec1bkph03", []float64{5.70744037628, 5.70748329163, -8.60774517059, -8.60770225525}}, + {"q4t85", []float64{-30.9375, -30.8935546875, 97.8662109375, 97.91015625}}, + {"k26z8hf", []float64{-42.2492980957, -42.2479248047, 15.119934082, 15.121307373}}, + {"p6fyq2u63", []float64{-73.4281110764, -73.428068161, 150.397725105, 150.397768021}}, + {"uqu5w2yu", []float64{83.5887908936, 83.5889625549, 17.1589279175, 17.1592712402}}, + {"terbkv", []float64{18.3526611328, 18.3581542969, 78.6071777344, 78.6181640625}}, + {"8v2nwkp", []float64{30.6958007812, 30.6971740723, -145.96572876, -145.964355469}}, + {"rbktxk2enhr", []float64{-42.6030693948, -42.6030680537, 175.397682041, 175.397683382}}, + {"r1pnfct8", []float64{-38.1802368164, -38.180065155, 144.97215271, 144.972496033}}, + {"rcmxg", []float64{-36.6064453125, -36.5625, 176.616210938, 176.66015625}}, + {"jqw7xjdt8", []float64{-52.7911090851, -52.7910661697, 65.350112915, 65.3501558304}}, + {"7mt737xd", []float64{-13.4716415405, -13.4714698792, -26.3019561768, -26.301612854}}, + {"2dprx5rg57es", []float64{-32.4132534117, -32.413253244, -146.986283138, -146.986282803}}, + {"fpr5d98ws0", []float64{86.40583992, 86.4058452845, -80.0455284119, -80.045517683}}, + {"z4cmm3", []float64{61.3970947266, 61.4025878906, 136.988525391, 136.999511719}}, + {"gz3b8j", []float64{85.8966064453, 85.9020996094, -8.7890625, -8.77807617188}}, + {"svfztv42f2k", []float64{33.6897052824, 33.6897066236, 37.8730648756, 37.8730662167}}, + {"z6f3yb6rum", []float64{60.7790976763, 60.7791030407, 149.713965654, 149.713976383}}, + {"wvqv397", []float64{30.4609680176, 30.4623413086, 133.312225342, 133.313598633}}, + {"r0ce65wshm", []float64{-40.1900213957, -40.1900160313, 137.206374407, 137.206385136}}, + {"crqvbtfbjb09", []float64{86.8235780485, 86.8235782161, -114.23181586, -114.231815524}}, + {"ptz0vc5z87", []float64{-57.5176173449, -57.5176119804, 167.601596117, 167.601606846}}, + {"x806byp", []float64{0.516357421875, 0.517730712891, 157.894134521, 
157.895507812}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"kb23vjn10", []float64{-43.2584953308, -43.2584524155, 34.3295288086, 34.3295717239}}, + {"n82kv", []float64{-87.7587890625, -87.71484375, 113.071289062, 113.115234375}}, + {"dkhzmmqfg", []float64{23.8037252426, 23.803768158, -71.830201149, -71.8301582336}}, + {"y1v", []float64{54.84375, 56.25, 97.03125, 98.4375}}, + {"q977h9", []float64{-37.4359130859, -37.4304199219, 117.268066406, 117.279052734}}, + {"sdffzu5qzu", []float64{15.9753012657, 15.9753066301, 26.7125594616, 26.7125701904}}, + {"n3q", []float64{-82.96875, -81.5625, 109.6875, 111.09375}}, + {"fr99zgs7e", []float64{87.5149440765, 87.5149869919, -76.2940835953, -76.2940406799}}, + {"4d9cv4cbh", []float64{-75.6147766113, -75.614733696, -64.8167610168, -64.8167181015}}, + {"np896wq5", []float64{-47.557926178, -47.5577545166, 90.8212280273, 90.8215713501}}, + {"45", []float64{-73.125, -67.5, -90.0, -78.75}}, + {"c5srsy", []float64{66.0388183594, 66.0443115234, -128.814697266, -128.803710938}}, + {"vcshr", []float64{54.1845703125, 54.228515625, 84.6826171875, 84.7265625}}, + {"3zbbs1wnn5", []float64{-1.30907356739, -1.30906820297, -100.011034012, -100.011023283}}, + {"jtdvxn", []float64{-58.0627441406, -58.0572509766, 71.6748046875, 71.6857910156}}, + {"yz27wu0e", []float64{86.4189720154, 86.4191436768, 124.398880005, 124.399223328}}, + {"utb0ck65h9", []float64{77.4994522333, 77.4994575977, 22.5578713417, 22.5578820705}}, + {"1t1dvq5jj", []float64{-61.3577842712, -61.3577413559, -110.15557766, -110.155534744}}, + {"yzwgy5hvwz", []float64{87.8641408682, 87.8641462326, 133.512672186, 133.512682915}}, + {"8dx38y6vby", []float64{14.3615233898, 14.3615287542, -147.267919779, -147.26790905}}, + {"9bh", []float64{0.0, 1.40625, -95.625, -94.21875}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"x14ve", []float64{6.591796875, 6.6357421875, 138.999023438, 139.04296875}}, + {"3603v", []float64{-33.4423828125, -33.3984375, -123.178710938, 
-123.134765625}}, + {"bsuje4cn6", []float64{72.7017259598, 72.7017688751, -151.741704941, -151.741662025}}, + {"9nnut36epqhn", []float64{34.5484302565, 34.5484304242, -125.273349881, -125.273349546}}, + {"7q27jg", []float64{-9.29992675781, -9.29443359375, -33.1457519531, -33.134765625}}, + {"933zzd6", []float64{8.40591430664, 8.40728759766, -120.956726074, -120.955352783}}, + {"5743y", []float64{-72.8173828125, -72.7734375, -30.322265625, -30.2783203125}}, + {"7m94uc9", []float64{-13.5708618164, -13.5694885254, -32.1336364746, -32.1322631836}}, + {"780", []float64{-45.0, -43.59375, -22.5, -21.09375}}, + {"sqpqx2w0hh", []float64{34.8953461647, 34.8953515291, 21.7723274231, 21.7723381519}}, + {"1ep", []float64{-73.125, -71.71875, -102.65625, -101.25}}, + {"k0j", []float64{-45.0, -43.59375, 7.03125, 8.4375}}, + {"z1zgr0g440", []float64{55.4195022583, 55.4195076227, 146.210260391, 146.21027112}}, + {"681bf", []float64{-44.8681640625, -44.82421875, -64.951171875, -64.9072265625}}, + {"dn6j", []float64{36.03515625, 36.2109375, -87.1875, -86.8359375}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"t1m", []float64{7.03125, 8.4375, 52.03125, 53.4375}}, + {"9qkshh67xe3", []float64{35.8833391964, 35.8833405375, -117.242680639, -117.242679298}}, + {"4507gyj", []float64{-72.4328613281, -72.4314880371, -89.476776123, -89.475402832}}, + {"bb844h", []float64{48.1860351562, 48.1915283203, -146.162109375, -146.151123047}}, + {"trn6yz", []float64{39.8968505859, 39.90234375, 65.3356933594, 65.3466796875}}, + {"vb99", []float64{47.98828125, 48.1640625, 80.859375, 81.2109375}}, + {"dxpbksmed", []float64{39.4428920746, 39.4429349899, -56.3961696625, -56.3961267471}}, + {"zc", []float64{50.625, 56.25, 168.75, 180.0}}, + {"3webp3s6hdeb", []float64{-8.42890352011, -8.42890335247, -106.901924349, -106.901924014}}, + {"8qg", []float64{37.96875, 39.375, -164.53125, -163.125}}, + {"fgwzk", []float64{65.9619140625, 66.005859375, -46.58203125, -46.5380859375}}, + {"1q2mxvd", 
[]float64{-53.8467407227, -53.8453674316, -123.055114746, -123.053741455}}, + {"sd", []float64{11.25, 16.875, 22.5, 33.75}}, + {"8hhwv", []float64{23.6865234375, 23.73046875, -173.452148438, -173.408203125}}, + {"1bw", []float64{-87.1875, -85.78125, -92.8125, -91.40625}}, + {"66zkh0ex724", []float64{-28.824133873, -28.8241325319, -68.3739575744, -68.3739562333}}, + {"w6qpy8", []float64{14.0185546875, 14.0240478516, 109.973144531, 109.984130859}}, + {"nux13fvfr", []float64{-64.4522809982, -64.4522380829, 133.678851128, 133.678894043}}, + {"cj2", []float64{74.53125, 75.9375, -135.0, -133.59375}}, + {"1qrumeqp", []float64{-54.0776252747, -54.0774536133, -112.601623535, -112.601280212}}, + {"hhm0", []float64{-66.09375, -65.91796875, 7.03125, 7.3828125}}, + {"50cp37u6w86", []float64{-84.4858060777, -84.4858047366, -43.5327002406, -43.5326988995}}, + {"0vjjfe5w", []float64{-60.8467483521, -60.8465766907, -139.1040802, -139.103736877}}, + {"xm67", []float64{30.05859375, 30.234375, 149.4140625, 149.765625}}, + {"1jmnqz1", []float64{-59.3316650391, -59.330291748, -127.67074585, -127.669372559}}, + {"jjfh69u78", []float64{-56.8989658356, -56.8989229202, 47.9281997681, 47.9282426834}}, + {"9xvnjtj3ytr", []float64{44.6762318909, 44.676233232, -105.219552666, -105.219551325}}, + {"ebwk44", []float64{3.52661132812, 3.53210449219, -2.373046875, -2.36206054688}}, + {"xej0p", []float64{16.875, 16.9189453125, 164.838867188, 164.8828125}}, + {"hm4m", []float64{-60.99609375, -60.8203125, 14.4140625, 14.765625}}, + {"71d1uhzq", []float64{-36.2277603149, -36.2275886536, -42.0017623901, -42.0014190674}}, + {"u7d8cgc9h4w", []float64{64.8401203752, 64.8401217163, 14.8447689414, 14.8447702825}}, + {"zwud", []float64{83.3203125, 83.49609375, 163.828125, 164.1796875}}, + {"p4nbh7wnwsb", []float64{-78.7296326458, -78.7296313047, 144.687473774, 144.687475115}}, + {"ev", []float64{28.125, 33.75, -11.25, 0.0}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"ytj8jhg3vmke", 
[]float64{73.1514216028, 73.1514217705, 120.458796099, 120.458796434}}, + {"xn", []float64{33.75, 39.375, 135.0, 146.25}}, + {"1t7f", []float64{-60.1171875, -59.94140625, -107.2265625, -106.875}}, + {"wjt", []float64{30.9375, 32.34375, 97.03125, 98.4375}}, + {"f7j3", []float64{62.05078125, 62.2265625, -71.3671875, -71.015625}}, + {"62bd4mvcg", []float64{-40.3978013992, -40.3977584839, -77.9399728775, -77.9399299622}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"577epg1nh0be", []float64{-71.1738922633, -71.1738920957, -28.4860032052, -28.4860028699}}, + {"pvyz2qtjw6kc", []float64{-56.3451739959, -56.3451738283, 178.260314874, 178.26031521}}, + {"2hrf6fbj", []float64{-20.6822776794, -20.6821060181, -168.980712891, -168.980369568}}, + {"z1yz7p6dc3h8", []float64{56.1584669352, 56.1584671028, 144.627516344, 144.627516679}}, + {"24v5r460td50", []float64{-28.9475047588, -28.9475045912, -172.658146173, -172.658145837}}, + {"rdtnyvudc4mu", []float64{-29.7189060599, -29.7189058922, 164.834111296, 164.834111631}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"e7he", []float64{17.40234375, 17.578125, -27.421875, -27.0703125}}, + {"9yrkg", []float64{35.9912109375, 36.03515625, -90.9228515625, -90.87890625}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"yppvw0", []float64{85.341796875, 85.3472900391, 101.162109375, 101.173095703}}, + {"n3876wh", []float64{-80.9582519531, -80.9568786621, 101.716918945, 101.718292236}}, + {"m1vjqf4", []float64{-34.2224121094, -34.2210388184, 52.3306274414, 52.3320007324}}, + {"kev", []float64{-23.90625, -22.5, 29.53125, 30.9375}}, + {"4qc", []float64{-52.03125, -50.625, -77.34375, -75.9375}}, + {"5f4", []float64{-78.75, -77.34375, -8.4375, -7.03125}}, + {"ug", []float64{61.875, 67.5, 33.75, 45.0}}, + {"g5", []float64{61.875, 67.5, -45.0, -33.75}}, + {"7wvx", []float64{-5.80078125, -5.625, -14.765625, -14.4140625}}, + {"rx", []float64{-5.625, 0.0, 157.5, 168.75}}, + {"jdg0t21qzr", []float64{-74.4421631098, -74.4421577454, 71.9514906406, 
71.9515013695}}, + {"606yg", []float64{-42.4072265625, -42.36328125, -86.0009765625, -85.95703125}}, + {"nxt6zwhgqd", []float64{-47.2955739498, -47.2955685854, 120.219204426, 120.219215155}}, + {"e71g7w8dr", []float64{17.482380867, 17.4824237823, -31.1342668533, -31.134223938}}, + {"n6u", []float64{-74.53125, -73.125, 106.875, 108.28125}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"zfcehd0d3", []float64{61.0074663162, 61.0075092316, 171.057858467, 171.057901382}}, + {"hgx5efc24r5e", []float64{-69.68212137, -69.6821212023, 43.760362789, 43.7603631243}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"q", []float64{-45.0, 0.0, 90.0, 135.0}}, + {"zbzg87qu07g", []float64{49.8525439203, 49.8525452614, 179.668708295, 179.668709636}}, + {"02urcn", []float64{-84.3859863281, -84.3804931641, -162.729492188, -162.718505859}}, + {"p4", []float64{-78.75, -73.125, 135.0, 146.25}}, + {"j6xjzcpsk78", []float64{-74.9205163121, -74.920514971, 66.4448082447, 66.4448095858}}, + {"svchwhvd08d", []float64{33.1612041593, 33.1612055004, 35.4274991155, 35.4275004566}}, + {"yughcgqek2", []float64{72.5721216202, 72.5721269846, 128.054763079, 128.054773808}}, + {"18tc1b", []float64{-87.01171875, -87.0062255859, -104.337158203, -104.326171875}}, + {"es43c", []float64{22.8076171875, 22.8515625, -19.2919921875, -19.248046875}}, + {"zdcephk", []float64{61.0194396973, 61.0208129883, 159.922485352, 159.923858643}}, + {"2zqh", []float64{-3.515625, -3.33984375, -137.8125, -137.4609375}}, + {"sh9bfh31", []float64{25.4678535461, 25.4680252075, 2.55020141602, 2.55054473877}}, + {"62vfgv", []float64{-40.2703857422, -40.2648925781, -70.4992675781, -70.48828125}}, + {"u2yn4", []float64{50.2734375, 50.3173828125, 19.775390625, 19.8193359375}}, + {"qx5wuf", []float64{-4.42749023438, -4.42199707031, 117.630615234, 117.641601562}}, + {"9y163", []float64{34.1455078125, 34.189453125, -99.4482421875, -99.404296875}}, + {"7ygu094y", []float64{-6.32160186768, -6.3214302063, -5.95081329346, -5.9504699707}}, 
+ {"nkfj9vxh9e6", []float64{-62.2834508121, -62.283449471, 104.149084389, 104.14908573}}, + {"n220y9m", []float64{-88.4550476074, -88.4536743164, 101.542510986, 101.543884277}}, + {"472p7k4d", []float64{-70.4220199585, -70.4218482971, -78.6037445068, -78.6034011841}}, + {"7p", []float64{-5.625, 0.0, -45.0, -33.75}}, + {"b1r1n2zp", []float64{52.2123527527, 52.2125244141, -169.87197876, -169.871635437}}, + {"neu8fxsk8k4", []float64{-68.7324213982, -68.7324200571, 118.943838179, 118.94383952}}, + {"m33fn", []float64{-37.6171875, -37.5732421875, 58.974609375, 59.0185546875}}, + {"u6z5je", []float64{61.0125732422, 61.0180664062, 21.3354492188, 21.3464355469}}, + {"5e6csnf", []float64{-71.4179992676, -71.4166259766, -18.454284668, -18.452911377}}, + {"7f7k4", []float64{-31.640625, -31.5966796875, -6.591796875, -6.5478515625}}, + {"ykd07ytm4", []float64{70.3930091858, 70.3930521011, 104.23459053, 104.234633446}}, + {"ubyx97", []float64{50.5535888672, 50.5590820312, 42.9455566406, 42.9565429688}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"ungw16", []float64{84.0344238281, 84.0399169922, 4.97680664062, 4.98779296875}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"bhfkepk0n", []float64{72.5495910645, 72.5496339798, -176.698350906, -176.698307991}}, + {"d3", []float64{5.625, 11.25, -78.75, -67.5}}, + {"wsenn", []float64{26.3671875, 26.4111328125, 116.982421875, 117.026367188}}, + {"b4b6qpqk", []float64{60.9047698975, 60.9049415588, -179.376182556, -179.375839233}}, + {"zwjx2wv3", []float64{80.0616645813, 80.0618362427, 165.263557434, 165.263900757}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"3sgq6", []float64{-17.1826171875, -17.138671875, -107.841796875, -107.797851562}}, + {"9chbq", []float64{5.6689453125, 5.712890625, -94.306640625, -94.2626953125}}, + {"37nehnbt", []float64{-27.5597190857, -27.5595474243, -114.432907104, -114.432563782}}, + {"uhr5w71b", []float64{69.5379638672, 69.5381355286, 10.1208114624, 10.1211547852}}, + {"8", []float64{0.0, 
45.0, -180.0, -135.0}}, + {"81", []float64{5.625, 11.25, -180.0, -168.75}}, + {"vyxn", []float64{82.6171875, 82.79296875, 88.59375, 88.9453125}}, + {"73jvv", []float64{-38.3642578125, -38.3203125, -25.4443359375, -25.400390625}}, + {"nmw1ysg2f23", []float64{-58.7286601961, -58.728658855, 109.977705628, 109.977706969}}, + {"5sv6js2nphq", []float64{-62.9052887857, -62.9052874446, -14.8751798272, -14.8751784861}}, + {"289", []float64{-42.1875, -40.78125, -156.09375, -154.6875}}, + {"cq9", []float64{81.5625, 82.96875, -122.34375, -120.9375}}, + {"mm", []float64{-16.875, -11.25, 56.25, 67.5}}, + {"49378fm9v", []float64{-82.3408555984, -82.3408126831, -65.7014608383, -65.701417923}}, + {"ee079be", []float64{17.492980957, 17.494354248, -22.0674133301, -22.0660400391}}, + {"4m5cs3h", []float64{-61.6058349609, -61.6044616699, -73.2843017578, -73.2829284668}}, + {"hpdre", []float64{-46.494140625, -46.4501953125, 3.2958984375, 3.33984375}}, + {"06", []float64{-78.75, -73.125, -168.75, -157.5}}, + {"3x3r4evkpp1q", []float64{-2.9669566825, -2.96695651487, -110.624812357, -110.624812022}}, + {"95074fyh23t", []float64{17.4181875587, 17.4181888998, -134.51933071, -134.519329369}}, + {"5tdj7sf", []float64{-58.1135559082, -58.1121826172, -19.5309448242, -19.5295715332}}, + {"8r9z", []float64{43.41796875, 43.59375, -166.2890625, -165.9375}}, + {"nc", []float64{-84.375, -78.75, 123.75, 135.0}}, + {"t2mb", []float64{1.40625, 1.58203125, 64.3359375, 64.6875}}, + {"hcs1g2vbkq0g", []float64{-81.2506873347, -81.250687167, 39.525902085, 39.5259024203}}, + {"pduzpxt", []float64{-73.2595825195, -73.2582092285, 164.516143799, 164.51751709}}, + {"mw9u700", []float64{-7.6904296875, -7.68905639648, 70.0927734375, 70.0941467285}}, + {"9y0ttsfr", []float64{34.7440910339, 34.7442626953, -100.302085876, -100.301742554}}, + {"ztmu17de4", []float64{75.2541160583, 75.2541589737, 165.644388199, 165.644431114}}, + {"1dqmehx", []float64{-76.3522338867, -76.3508605957, -103.569488525, -103.568115234}}, + 
{"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"6ydvkt6cp", []float64{-7.48563766479, -7.48559474945, -52.180981636, -52.1809387207}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"706tn", []float64{-42.71484375, -42.6708984375, -41.220703125, -41.1767578125}}, + {"6z89j2x0gun", []float64{-2.63382196426, -2.63382062316, -55.3063800931, -55.306378752}}, + {"mt1t", []float64{-15.99609375, -15.8203125, 69.609375, 69.9609375}}, + {"95ypdkpcd", []float64{22.4343395233, 22.4343824387, -126.452894211, -126.452851295}}, + {"nr1wc", []float64{-49.4384765625, -49.39453125, 103.403320312, 103.447265625}}, + {"rb2uxf4", []float64{-42.7917480469, -42.7903747559, 170.148010254, 170.149383545}}, + {"gzz5w20e1wk", []float64{89.2095328867, 89.2095342278, -1.13083541393, -1.13083407283}}, + {"mtuddkh7my", []float64{-12.1942341328, -12.1942287683, 73.9330852032, 73.933095932}}, + {"r15def4", []float64{-38.9245605469, -38.9231872559, 140.089416504, 140.090789795}}, + {"9fwcy0d", []float64{14.3728637695, 14.3742370605, -91.491394043, -91.490020752}}, + {"6vx9z80hkd", []float64{-13.7541425228, -13.7541371584, -45.3733420372, -45.3733313084}}, + {"qqxt55re", []float64{-7.54022598267, -7.54005432129, 111.93901062, 111.939353943}}, + {"4hreqy", []float64{-65.4895019531, -65.4840087891, -79.1564941406, -79.1455078125}}, + {"9nv6", []float64{38.3203125, 38.49609375, -127.6171875, -127.265625}}, + {"k980q", []float64{-36.5185546875, -36.474609375, 22.763671875, 22.8076171875}}, + {"zd6qmdhsrtce", []float64{58.7666300498, 58.7666302174, 160.912265405, 160.91226574}}, + {"ucz57k", []float64{55.4370117188, 55.4425048828, 43.7365722656, 43.7475585938}}, + {"u5pq", []float64{62.9296875, 63.10546875, 10.1953125, 10.546875}}, + {"fey0v9", []float64{66.2310791016, 66.2365722656, -58.8208007812, -58.8098144531}}, + {"mdtggek9fmh5", []float64{-30.2601397969, -30.2601396292, 75.7460278273, 75.7460281625}}, + {"y6nfy", []float64{56.7333984375, 56.77734375, 111.005859375, 111.049804688}}, + {"f", 
[]float64{45.0, 90.0, -90.0, -45.0}}, + {"33hz5xbsw", []float64{-38.1011867523, -38.101143837, -116.915559769, -116.915516853}}, + {"2wnb71890np2", []float64{-11.1976110935, -11.1976109259, -147.875280194, -147.875279859}}, + {"7d", []float64{-33.75, -28.125, -22.5, -11.25}}, + {"71vsdbh", []float64{-34.365234375, -34.363861084, -37.1392822266, -37.1379089355}}, + {"nk", []float64{-67.5, -61.875, 101.25, 112.5}}, + {"pn6uks", []float64{-54.0747070312, -54.0692138672, 139.064941406, 139.075927734}}, + {"t0", []float64{0.0, 5.625, 45.0, 56.25}}, + {"ze7c4z3e", []float64{63.4973716736, 63.497543335, 162.896347046, 162.896690369}}, + {"ujq6txghfjdv", []float64{75.0141208805, 75.0141210482, 9.03497111052, 9.0349714458}}, + {"40muh9sfm7", []float64{-87.8819829226, -87.8819775581, -81.7095601559, -81.709549427}}, + {"0y4kk1", []float64{-55.4974365234, -55.4919433594, -142.91015625, -142.899169922}}, + {"q8btfscp1g", []float64{-39.7431975603, -39.7431921959, 113.314436674, 113.314447403}}, + {"yg2mxt", []float64{64.2755126953, 64.2810058594, 124.431152344, 124.442138672}}, + {"7r5kh", []float64{-4.921875, -4.8779296875, -29.00390625, -28.9599609375}}, + {"cvg6t", []float64{77.783203125, 77.8271484375, -96.4599609375, -96.416015625}}, + {"msj4q7e3db", []float64{-22.0850086212, -22.0850032568, 74.8104894161, 74.810500145}}, + {"1452n4", []float64{-78.7390136719, -78.7335205078, -130.166015625, -130.155029297}}, + {"xj3kvwr2d", []float64{30.4006290436, 30.4006719589, 137.009553909, 137.009596825}}, + {"2wtvb", []float64{-7.4267578125, -7.3828125, -149.4140625, -149.370117188}}, + {"c5x5dhk", []float64{65.3260803223, 65.3274536133, -125.062866211, -125.06149292}}, + {"eph56u6", []float64{39.9696350098, 39.9710083008, -39.2514038086, -39.2500305176}}, + {"5dhyg0hckf", []float64{-77.5632512569, -77.5632458925, -15.6817495823, -15.6817388535}}, + {"9vrb", []float64{29.53125, 29.70703125, -90.3515625, -90.0}}, + {"b7nsys", []float64{62.7319335938, 62.7374267578, -159.323730469, 
-159.312744141}}, + {"kmenbsk0", []float64{-12.8526306152, -12.8524589539, 15.4962158203, 15.4965591431}}, + {"j35", []float64{-84.375, -82.96875, 60.46875, 61.875}}, + {"e7sx", []float64{20.91796875, 21.09375, -27.421875, -27.0703125}}, + {"h87mpg", []float64{-87.6983642578, -87.6928710938, 27.4108886719, 27.421875}}, + {"qjbtp", []float64{-11.77734375, -11.7333984375, 91.0107421875, 91.0546875}}, + {"4zqs2pndx", []float64{-48.4327983856, -48.4327554703, -47.100148201, -47.1001052856}}, + {"1fsc5v3", []float64{-75.7328796387, -75.7315063477, -94.4041442871, -94.4027709961}}, + {"kp6hptx", []float64{-3.48541259766, -3.48403930664, 3.15170288086, 3.15307617188}}, + {"3n77", []float64{-9.31640625, -9.140625, -130.4296875, -130.078125}}, + {"q347uc", []float64{-38.7103271484, -38.7048339844, 104.622802734, 104.633789062}}, + {"n8gckvg3", []float64{-85.5297660828, -85.5295944214, 117.98664093, 117.986984253}}, + {"p7szbr6ceq", []float64{-68.9100801945, -68.9100748301, 152.944589853, 152.944600582}}, + {"8w7n", []float64{36.2109375, 36.38671875, -153.28125, -152.9296875}}, + {"k4s3ndj8", []float64{-30.7507324219, -30.7505607605, 6.26976013184, 6.27010345459}}, + {"fh38ev", []float64{69.0216064453, 69.0270996094, -87.7258300781, -87.71484375}}, + {"rzebrsw", []float64{-2.74383544922, -2.7424621582, 174.36126709, 174.362640381}}, + {"un", []float64{78.75, 84.375, 0.0, 11.25}}, + {"27u3d4ybbkt", []float64{-23.6273190379, -23.6273176968, -162.676259726, -162.676258385}}, + {"5hk2", []float64{-66.09375, -65.91796875, -39.0234375, -38.671875}}, + {"62f6wqsbfc6n", []float64{-40.3059548512, -40.3059546836, -75.3046354651, -75.3046351299}}, + {"r6jvqxczv", []float64{-32.7832460403, -32.783203125, 154.624199867, 154.624242783}}, + {"wyg1es5yx", []float64{38.2555103302, 38.2555532455, 128.128008842, 128.128051758}}, + {"smp9qbcpnt", []float64{28.3500748873, 28.3500802517, 22.0951581001, 22.095168829}}, + {"4q46h", []float64{-55.8984375, -55.8544921875, -75.41015625, 
-75.3662109375}}, + {"9u6v9g2ebcm5", []float64{24.8915505968, 24.8915507644, -97.3051826656, -97.3051823303}}, + {"25mwbs3", []float64{-25.5088806152, -25.5075073242, -172.242279053, -172.240905762}}, + {"kwf", []float64{-7.03125, -5.625, 25.3125, 26.71875}}, + {"ekqgkknev", []float64{24.5001554489, 24.5001983643, -24.0619039536, -24.0618610382}}, + {"y9974617cb12", []float64{53.9764738083, 53.9764739759, 114.358482845, 114.35848318}}, + {"htuu1sxcpd", []float64{-56.9282233715, -56.9282180071, 29.2565703392, 29.256581068}}, + {"150sv8q", []float64{-72.2886657715, -72.2872924805, -134.046936035, -134.045562744}}, + {"j36p6wnmqm", []float64{-81.6604489088, -81.6604435444, 59.181214571, 59.1812252998}}, + {"py", []float64{-56.25, -50.625, 168.75, 180.0}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"y87", []float64{46.40625, 47.8125, 116.71875, 118.125}}, + {"6v90bwn999", []float64{-13.8974422216, -13.8974368572, -54.8127865791, -54.8127758503}}, + {"8kyq", []float64{27.7734375, 27.94921875, -159.9609375, -159.609375}}, + {"8cht765b92zg", []float64{6.55892824754, 6.55892841518, -139.773838855, -139.77383852}}, + {"nu5jwk23v7f", []float64{-66.5095366538, -66.5095353127, 128.243979514, 128.243980855}}, + {"0m10969", []float64{-61.7733764648, -61.7720031738, -167.287445068, -167.286071777}}, + {"s29h49frdp", []float64{3.52656304836, 3.52656841278, 12.7692890167, 12.7692997456}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"brmmeg2z8r", []float64{86.7672246695, 86.7672300339, -161.201351881, -161.201341152}}, + {"r3bngg619d", []float64{-33.9516055584, -33.951600194, 146.417605877, 146.417616606}}, + {"rmz76wced8c", []float64{-12.0472772419, -12.0472759008, 156.557344347, 156.557345688}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"2h", []float64{-22.5, -16.875, -180.0, -168.75}}, + {"ty2764x", []float64{35.7412719727, 35.7426452637, 79.1990661621, 79.2004394531}}, + {"5yh3330r", []float64{-56.0235786438, -56.0234069824, -5.21816253662, -5.21781921387}}, + 
{"9szz", []float64{27.94921875, 28.125, -101.6015625, -101.25}}, + {"x7d41b", []float64{20.0390625, 20.0445556641, 149.139404297, 149.150390625}}, + {"dw", []float64{33.75, 39.375, -67.5, -56.25}}, + {"gnd4cw", []float64{82.0788574219, 82.0843505859, -42.1215820312, -42.1105957031}}, + {"k9bxc2n8", []float64{-33.7939453125, -33.7937736511, 23.2669830322, 23.267326355}}, + {"hump4nk", []float64{-64.8289489746, -64.8275756836, 40.8746337891, 40.8760070801}}, + {"gkz", []float64{71.71875, 73.125, -23.90625, -22.5}}, + {"g9e08yt", []float64{53.5610961914, 53.5624694824, -18.2414245605, -18.2400512695}}, + {"3eyuyzpm", []float64{-23.0319786072, -23.0318069458, -102.701225281, -102.700881958}}, + {"utpuc59p4m", []float64{73.9804154634, 73.9804208279, 33.443852663, 33.4438633919}}, + {"cnqt", []float64{81.03515625, 81.2109375, -125.859375, -125.5078125}}, + {"z05xfy72gk0", []float64{46.3967871666, 46.3967885077, 140.04732728, 140.047328621}}, + {"skr4zd", []float64{24.4006347656, 24.4061279297, 21.4233398438, 21.4343261719}}, + {"h2fe8mdnq", []float64{-85.1347303391, -85.1346874237, 14.7796154022, 14.7796583176}}, + {"z18b", []float64{53.4375, 53.61328125, 136.0546875, 136.40625}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"vh", []float64{67.5, 73.125, 45.0, 56.25}}, + {"v64zfspngxw", []float64{57.6354762912, 57.6354776323, 60.2368220687, 60.2368234098}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"d800", []float64{0.0, 0.17578125, -67.5, -67.1484375}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"bnugeje1", []float64{83.6143684387, 83.6145401001, -173.184356689, -173.184013367}}, + {"46", []float64{-78.75, -73.125, -78.75, -67.5}}, + {"8rsncsbz2mx4", []float64{43.4013903514, 43.401390519, -163.058031946, -163.058031611}}, + {"t4w9wh7f98je", []float64{14.3499474786, 14.3499476463, 54.4095184654, 54.4095188007}}, + {"nsr1tp2", []float64{-65.7902526855, -65.7888793945, 122.563476562, 122.564849854}}, + {"9me", []float64{30.9375, 32.34375, -119.53125, -118.125}}, 
+ {"t8250bh2y", []float64{1.93372249603, 1.93376541138, 67.5390529633, 67.5390958786}}, + {"1", []float64{-90.0, -45.0, -135.0, -90.0}}, + {"uqty0nmvvj0c", []float64{82.652533818, 82.6525339857, 19.3440495059, 19.3440498412}}, + {"7nkxt", []float64{-8.525390625, -8.4814453125, -38.4521484375, -38.408203125}}, + {"jzev", []float64{-46.93359375, -46.7578125, 84.0234375, 84.375}}, + {"dmtj1fktxe", []float64{31.8297261, 31.8297314644, -71.6353440285, -71.6353332996}}, + {"0r", []float64{-50.625, -45.0, -168.75, -157.5}}, + {"5hqv273y", []float64{-65.152015686, -65.1518440247, -35.4944229126, -35.4940795898}}, + {"78k", []float64{-43.59375, -42.1875, -16.875, -15.46875}}, + {"f2krt5", []float64{47.7410888672, 47.7465820312, -72.5537109375, -72.5427246094}}, + {"ffw63hhzm2u", []float64{59.481229037, 59.4812303782, -47.4102383852, -47.4102370441}}, + {"mv3z5k", []float64{-14.2163085938, -14.2108154297, 81.3537597656, 81.3647460938}}, + {"f15", []float64{50.625, 52.03125, -85.78125, -84.375}}, + {"j710re25dhzs", []float64{-73.0625749379, -73.0625747703, 57.9859357327, 57.985936068}}, + {"rtt328k93", []float64{-13.8411855698, -13.8411426544, 164.911007881, 164.911050797}}, + {"d2x003t048", []float64{2.82073974609, 2.82074511051, -68.8882899284, -68.8882791996}}, + {"22uy0", []float64{-39.7265625, -39.6826171875, -162.0703125, -162.026367188}}, + {"tuzxx", []float64{28.037109375, 28.0810546875, 89.6044921875, 89.6484375}}, + {"su44fdqr2pv1", []float64{22.9970443435, 22.9970445111, 36.6809530556, 36.6809533909}}, + {"yttq", []float64{76.9921875, 77.16796875, 119.8828125, 120.234375}}, + {"9fu5vyr", []float64{16.1622619629, 16.1636352539, -95.362701416, -95.361328125}}, + {"zwmzk37wsh97", []float64{81.4386709593, 81.438671127, 165.777684934, 165.77768527}}, + {"hd777ygzewj", []float64{-76.7340624332, -76.7340610921, 27.2404141724, 27.2404155135}}, + {"0b1", []float64{-90.0, -88.59375, -144.84375, -143.4375}}, + {"ejmgd", []float64{30.146484375, 30.1904296875, -36.826171875, 
-36.7822265625}}, + {"rzxh6", []float64{-2.0654296875, -2.021484375, 178.681640625, 178.725585938}}, + {"0rf4f4u", []float64{-45.9077453613, -45.9063720703, -165.844116211, -165.84274292}}, + {"1m23ggbv8", []float64{-60.1395893097, -60.1395463943, -123.23261261, -123.232569695}}, + {"gdvt2y6wfv", []float64{61.4271193743, 61.4271247387, -14.7291147709, -14.7291040421}}, + {"wxb3eqeug0y0", []float64{43.8939468563, 43.8939470239, 112.9996714, 112.999671735}}, + {"516kngbm6", []float64{-82.2441244125, -82.2440814972, -41.5388774872, -41.5388345718}}, + {"xtuwe2zu", []float64{33.4911346436, 33.4913063049, 163.981590271, 163.981933594}}, + {"dhb1c2jrz6c1", []float64{27.027712483, 27.0277126506, -89.9375461042, -89.9375457689}}, + {"23c397n4v8g", []float64{-34.8756225407, -34.8756211996, -166.928776056, -166.928774714}}, + {"1w81bnmc3", []float64{-53.0953359604, -53.095293045, -112.492060661, -112.492017746}}, + {"03hu77r", []float64{-83.6100769043, -83.6087036133, -161.917877197, -161.916503906}}, + {"z2vrm", []float64{50.4931640625, 50.537109375, 153.852539062, 153.896484375}}, + {"q630e", []float64{-32.255859375, -32.2119140625, 102.788085938, 102.83203125}}, + {"h9uzt", []float64{-78.837890625, -78.7939453125, 29.3994140625, 29.443359375}}, + {"x09c9jz", []float64{3.10775756836, 3.10913085938, 137.51449585, 137.515869141}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"xv9ee3gq2j", []float64{31.5634471178, 31.5634524822, 171.006660461, 171.00667119}}, + {"6e8xjp4", []float64{-24.0435791016, -24.0422058105, -66.5744018555, -66.5730285645}}, + {"0m8sj439bys", []float64{-58.3466801047, -58.3466787636, -167.82505095, -167.825049609}}, + {"khf7yq8js6", []float64{-17.5854098797, -17.5854045153, 3.43890309334, 3.43891382217}}, + {"hh", []float64{-67.5, -61.875, 0.0, 11.25}}, + {"kcyx9b0rd65e", []float64{-33.8365919329, -33.8365917653, 42.967973873, 42.9679742083}}, + {"qcy4pees", []float64{-34.7847747803, -34.7846031189, 132.521896362, 132.522239685}}, + {"tc6", 
[]float64{7.03125, 8.4375, 81.5625, 82.96875}}, + {"mxhnk2fkh", []float64{-4.52156066895, -4.5215177536, 73.3150291443, 73.3150720596}}, + {"1mggmg5x0k", []float64{-57.067258358, -57.0672529936, -118.219059706, -118.219048977}}, + {"f1udt102z", []float64{55.2888250351, 55.2888679504, -83.4515047073, -83.451461792}}, + {"jjz", []float64{-57.65625, -56.25, 54.84375, 56.25}}, + {"1q1dg8peze2", []float64{-55.765940398, -55.7659390569, -121.476194859, -121.476193517}}, + {"604unch", []float64{-44.2913818359, -44.2900085449, -85.8306884766, -85.8293151855}}, + {"kt", []float64{-16.875, -11.25, 22.5, 33.75}}, + {"wpbgc", []float64{44.2529296875, 44.296875, 91.0986328125, 91.142578125}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"tz2pky3c", []float64{42.0901679993, 42.0903396606, 78.9611434937, 78.9614868164}}, + {"j96v3y", []float64{-82.0129394531, -82.0074462891, 71.4440917969, 71.455078125}}, + {"vs974dv", []float64{70.8549499512, 70.8563232422, 69.3745422363, 69.3759155273}}, + {"dunwchdt7d6", []float64{23.712155968, 23.7121573091, -47.061843574, -47.0618422329}}, + {"h2u2nkksw", []float64{-85.7571315765, -85.7570886612, 17.5076580048, 17.5077009201}}, + {"z8qtu1", []float64{47.4224853516, 47.4279785156, 166.81640625, 166.827392578}}, + {"2677dsdngh", []float64{-31.7026162148, -31.7026108503, -164.066948891, -164.066938162}}, + {"4yy", []float64{-52.03125, -50.625, -47.8125, -46.40625}}, + {"uym", []float64{80.15625, 81.5625, 40.78125, 42.1875}}, + {"m2mssd", []float64{-42.7917480469, -42.7862548828, 64.1821289062, 64.1931152344}}, + {"xed3b", []float64{19.9951171875, 20.0390625, 160.6640625, 160.708007812}}, + {"rfp4ky", []float64{-33.3215332031, -33.3160400391, 178.802490234, 178.813476562}}, + {"83fmm9e", []float64{10.7748413086, 10.7762145996, -165.340118408, -165.338745117}}, + {"tr7z2dqh", []float64{42.0687103271, 42.0688819885, 61.5536499023, 61.5539932251}}, + {"crsh", []float64{87.890625, 88.06640625, 
-118.125, -117.7734375}}, + {"f", []float64{45.0, 90.0, -90.0, -45.0}}, + {"761m6h", []float64{-32.8051757812, -32.7996826172, -31.904296875, -31.8933105469}}, + {"7p", []float64{-5.625, 0.0, -45.0, -33.75}}, + {"8qu9r5", []float64{38.2049560547, 38.2104492188, -162.114257812, -162.103271484}}, + {"5z", []float64{-50.625, -45.0, -11.25, 0.0}}, + {"kbz", []float64{-40.78125, -39.375, 43.59375, 45.0}}, + {"zdftsw1rbpsf", []float64{61.4698768035, 61.4698769711, 161.21510189, 161.215102226}}, + {"n1m105r", []float64{-82.7751159668, -82.7737426758, 97.0408630371, 97.0422363281}}, + {"xf4kktnxsz", []float64{12.0258611441, 12.0258665085, 172.120946646, 172.120957375}}, + {"kzqyq4m0qtyu", []float64{-3.10768313706, -3.10768296942, 43.5130138323, 43.5130141675}}, + {"1j1vy57w7", []float64{-60.8453321457, -60.8452892303, -132.27045536, -132.270412445}}, + {"nq4u3", []float64{-55.5029296875, -55.458984375, 105.161132812, 105.205078125}}, + {"bhcy", []float64{72.7734375, 72.94921875, -177.5390625, -177.1875}}, + {"vjr", []float64{74.53125, 75.9375, 54.84375, 56.25}}, + {"uc99nrdsntv", []float64{53.6551974714, 53.6551988125, 36.1377520859, 36.137753427}}, + {"3e8zmnz", []float64{-24.0010070801, -23.9996337891, -111.2159729, -111.214599609}}, + {"j0yzdvs9", []float64{-84.4325065613, -84.4323348999, 54.6192169189, 54.6195602417}}, + {"8chvm4", []float64{6.55883789062, 6.56433105469, -139.350585938, -139.339599609}}, + {"ywf6099wx", []float64{83.329668045, 83.3297109604, 115.6883955, 115.688438416}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"zrc", []float64{88.59375, 90.0, 147.65625, 149.0625}}, + {"zq", []float64{78.75, 84.375, 146.25, 157.5}}, + {"7xznu50ru", []float64{-0.201916694641, -0.201873779297, -12.4799537659, -12.4799108505}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"7r", []float64{-5.625, 0.0, -33.75, -22.5}}, + {"f27k", []float64{47.109375, 47.28515625, -74.1796875, -73.828125}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"b4s13188yv", 
[]float64{59.2906218767, 59.2906272411, -174.330078363, -174.330067635}}, + {"q8", []float64{-45.0, -39.375, 112.5, 123.75}}, + {"skn7w", []float64{23.115234375, 23.1591796875, 20.302734375, 20.3466796875}}, + {"1tzqwzyh5vp", []float64{-56.4703863859, -56.4703850448, -101.999646574, -101.999645233}}, + {"r52cmn", []float64{-26.4660644531, -26.4605712891, 136.274414062, 136.285400391}}, + {"7bwvbbbru", []float64{-41.1713075638, -41.1712646484, -1.72433853149, -1.72429561615}}, + {"ruv0b1n", []float64{-18.1439208984, -18.1425476074, 175.789489746, 175.790863037}}, + {"vwf01ujm", []float64{82.9915809631, 82.9917526245, 70.3966140747, 70.3969573975}}, + {"0metgxjtrm91", []float64{-58.0123747699, -58.0123746023, -163.666450828, -163.666450493}}, + {"2w", []float64{-11.25, -5.625, -157.5, -146.25}}, + {"8kmch3", []float64{24.0875244141, 24.0930175781, -160.477294922, -160.466308594}}, + {"g6m", []float64{57.65625, 59.0625, -26.71875, -25.3125}}, + {"t6v4k2", []float64{15.8642578125, 15.8697509766, 63.4680175781, 63.4790039062}}, + {"zr02vfju8hd", []float64{84.5186188817, 84.5186202228, 146.862147152, 146.862148494}}, + {"kb8jn751", []float64{-41.2919425964, -41.2917709351, 34.0287780762, 34.0291213989}}, + {"mj76", []float64{-15.1171875, -14.94140625, 49.5703125, 49.921875}}, + {"f0hwcbkwjnr", []float64{46.1889602244, 46.1889615655, -83.5885669291, -83.588565588}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"6rt9btpxj2p1", []float64{-2.47621519491, -2.47621502727, -70.9831179678, -70.9831176326}}, + {"wh0s6yb6j", []float64{23.2844924927, 23.284535408, 90.8245325089, 90.8245754242}}, + {"65b", []float64{-23.90625, -22.5, -90.0, -88.59375}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"z0tpc5d", []float64{49.1940307617, 49.1954040527, 142.077941895, 142.079315186}}, + {"fgs2w0", []float64{64.775390625, 64.7808837891, -50.009765625, -49.9987792969}}, + {"vfus35qgp", []float64{61.2341880798, 61.2342309952, 85.1316404343, 85.1316833496}}, + {"hywnkqdye", 
[]float64{-52.3020458221, -52.3020029068, 42.3781728745, 42.3782157898}}, + {"qpwxwun0b", []float64{-1.47203922272, -1.47199630737, 99.4454956055, 99.4455385208}}, + {"v88u9rqr24", []float64{48.6445963383, 48.6446017027, 68.6182022095, 68.6182129383}}, + {"17x06x4fyw", []float64{-70.2295982838, -70.2295929193, -113.792331219, -113.79232049}}, + {"3ykkhjyhrt", []float64{-9.1082829237, -9.10827755928, -95.0890946388, -95.08908391}}, + {"mbzkuqw5", []float64{-39.910068512, -39.9098968506, 89.1403198242, 89.140663147}}, + {"yqjf0p3mn", []float64{79.1422462463, 79.1422891617, 109.337911606, 109.337954521}}, + {"2f", []float64{-33.75, -28.125, -146.25, -135.0}}, + {"hqsm3", []float64{-52.5146484375, -52.470703125, 17.2705078125, 17.314453125}}, + {"gp1y4wtft1tp", []float64{85.4658314399, 85.4658316076, -42.4210815132, -42.4210811779}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"kxxruvh", []float64{-1.42272949219, -1.42135620117, 32.9095458984, 32.9109191895}}, + {"08h0ft4vk97r", []float64{-89.839789141, -89.8397889733, -151.761162691, -151.761162356}}, + {"y9u8", []float64{54.84375, 55.01953125, 118.828125, 119.1796875}}, + {"3zhsk", []float64{-4.8779296875, -4.833984375, -94.74609375, -94.7021484375}}, + {"x2", []float64{0.0, 5.625, 146.25, 157.5}}, + {"mr7nnvr", []float64{-3.13522338867, -3.13385009766, 60.7749938965, 60.7763671875}}, + {"d3g0pn54hr", []float64{9.87708985806, 9.87709522247, -74.2193305492, -74.2193198204}}, + {"jv173u", []float64{-61.2817382812, -61.2762451172, 80.5847167969, 80.595703125}}, + {"vte", []float64{75.9375, 77.34375, 71.71875, 73.125}}, + {"303un61j2s", []float64{-42.878715992, -42.8787106276, -132.263009548, -132.262998819}}, + {"m528h", []float64{-26.71875, -26.6748046875, 45.87890625, 45.9228515625}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"p8h", []float64{-90.0, -88.59375, 163.125, 164.53125}}, + {"tzusgm6v33y", []float64{44.4584606588, 44.4584619999, 85.2247855067, 85.2247868478}}, + {"26sre", []float64{-29.619140625, 
-29.5751953125, -162.641601562, -162.59765625}}, + {"q1tcnkmh55b", []float64{-36.3626660407, -36.3626646996, 98.3675909042, 98.3675922453}}, + {"xs4t1g3", []float64{23.3967590332, 23.3981323242, 161.093902588, 161.095275879}}, + {"td", []float64{11.25, 16.875, 67.5, 78.75}}, + {"xdvwxsmsjd", []float64{16.6353714466, 16.635376811, 165.571753979, 165.571764708}}, + {"dnw", []float64{36.5625, 37.96875, -81.5625, -80.15625}}, + {"u7nu2ff43vx", []float64{62.6375922561, 62.6375935972, 20.777977556, 20.7779788971}}, + {"0jy2", []float64{-57.65625, -57.48046875, -171.2109375, -170.859375}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"2yx2k", []float64{-8.3935546875, -8.349609375, -135.87890625, -135.834960938}}, + {"g3", []float64{50.625, 56.25, -33.75, -22.5}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"vrrpw34jfjs", []float64{87.1061190963, 87.1061204374, 66.3712459803, 66.3712473214}}, + {"g2gkzeutg", []float64{50.0752973557, 50.075340271, -28.8437891006, -28.8437461853}}, + {"z8ett4zh", []float64{48.7950897217, 48.7952613831, 162.6512146, 162.651557922}}, + {"cyspv2k0", []float64{82.9261779785, 82.9263496399, -95.3887939453, -95.3884506226}}, + {"fqu7qbshmr", []float64{83.5435527563, 83.5435581207, -72.471088171, -72.4710774422}}, + {"578u4ww7", []float64{-69.5731544495, -69.5729827881, -32.5768661499, -32.5765228271}}, + {"4gzkwxrzb", []float64{-68.0740785599, -68.0740356445, -45.7583999634, -45.758357048}}, + {"g0z038npm1ym", []float64{49.2639500834, 49.263950251, -35.0818693265, -35.0818689913}}, + {"vrse4ey62b1y", []float64{87.7358303592, 87.7358305268, 62.6966058835, 62.6966062188}}, + {"7x2fkbbj6", []float64{-3.81822109222, -3.81817817688, -21.2364864349, -21.2364435196}}, + {"tuvfdxy7b", []float64{27.2014188766, 27.201461792, 86.9543838501, 86.9544267654}}, + {"9qzggb", []float64{38.6279296875, 38.6334228516, -112.686767578, -112.67578125}}, + {"pupf6", []float64{-67.1044921875, -67.060546875, 179.736328125, 179.780273438}}, + {"fknyrbe41k0w", 
[]float64{68.6017451808, 68.6017453484, -68.9130621403, -68.9130618051}}, + {"591vk5jz7x4v", []float64{-83.4343860112, -83.4343858436, -19.8552309349, -19.8552305996}}, + {"n4psdv", []float64{-77.9315185547, -77.9260253906, 100.667724609, 100.678710938}}, + {"zyw2", []float64{81.5625, 81.73828125, 177.5390625, 177.890625}}, + {"r0e29", []float64{-42.099609375, -42.0556640625, 139.614257812, 139.658203125}}, + {"j9cq7f", []float64{-79.0466308594, -79.0411376953, 69.4226074219, 69.43359375}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"m42x20h", []float64{-31.0693359375, -31.0679626465, 45.7086181641, 45.7099914551}}, + {"405zg3kz2j", []float64{-88.6295574903, -88.6295521259, -84.5772171021, -84.5772063732}}, + {"7vu7httsdm", []float64{-12.0978945494, -12.097889185, -5.06803393364, -5.0680232048}}, + {"prznz91", []float64{-45.2142333984, -45.2128601074, 156.424713135, 156.426086426}}, + {"46dupvqwbsuj", []float64{-75.2043508552, -75.2043506876, -74.5332831144, -74.5332827792}}, + {"7s0", []float64{-22.5, -21.09375, -22.5, -21.09375}}, + {"5nhz5er2nnm5", []float64{-55.0016444363, -55.0016442686, -38.1562833488, -38.1562830135}}, + {"5qqp", []float64{-53.61328125, -53.4375, -25.3125, -24.9609375}}, + {"t4dqn2", []float64{15.1171875, 15.1226806641, 48.4387207031, 48.4497070312}}, + {"gmxt", []float64{76.81640625, 76.9921875, -23.203125, -22.8515625}}, + {"syuh0ke1syh", []float64{38.6968839169, 38.696885258, 39.3903154135, 39.3903167546}}, + {"nvr0", []float64{-60.46875, -60.29296875, 133.59375, 133.9453125}}, + {"g4", []float64{56.25, 61.875, -45.0, -33.75}}, + {"gnd2kb6w0s32", []float64{81.6088713706, 81.6088715382, -41.623740904, -41.6237405688}}, + {"x7hptvsgdvu", []float64{18.2242034376, 18.2242047787, 152.134332061, 152.134333402}}, + {"2fc8", []float64{-29.53125, -29.35546875, -144.140625, -143.7890625}}, + {"e3fsu48yte7", []float64{10.693577081, 10.6935784221, -30.057323724, -30.0573223829}}, + {"pbqhjvqh", []float64{-87.8610992432, -87.8609275818, 
177.448425293, 177.448768616}}, + {"sqx1nwgn", []float64{36.7763900757, 36.7765617371, 21.3835144043, 21.3838577271}}, + {"e6g", []float64{15.46875, 16.875, -29.53125, -28.125}}, + {"dt", []float64{28.125, 33.75, -67.5, -56.25}}, + {"02s64yzk", []float64{-86.7981719971, -86.7980003357, -162.642631531, -162.642288208}}, + {"z9", []float64{50.625, 56.25, 157.5, 168.75}}, + {"fjv", []float64{77.34375, 78.75, -82.96875, -81.5625}}, + {"5yc0", []float64{-52.03125, -51.85546875, -9.84375, -9.4921875}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"k3x3j1pmdnfp", []float64{-36.3802440651, -36.3802438974, 21.6750839353, 21.6750842705}}, + {"g2pjds", []float64{45.9887695312, 45.9942626953, -23.7963867188, -23.7854003906}}, + {"ppw0p3q2gzbk", []float64{-47.8054625541, -47.8054623865, 143.764847852, 143.764848188}}, + {"9xwp0kprs0x6", []float64{43.4412318841, 43.4412320517, -104.041375928, -104.041375592}}, + {"0gzww7fv7gj", []float64{-67.7421551943, -67.7421538532, -135.424522609, -135.424521267}}, + {"sgpt2", []float64{17.7978515625, 17.841796875, 44.296875, 44.3408203125}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"rr", []float64{-5.625, 0.0, 146.25, 157.5}}, + {"wkqrtvj5", []float64{25.2525901794, 25.2527618408, 110.298614502, 110.298957825}}, + {"ngtfyx77u", []float64{-69.7886323929, -69.7885894775, 132.126216888, 132.126259804}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"58y", []float64{-85.78125, -84.375, -14.0625, -12.65625}}, + {"mvjrxvx", []float64{-15.5264282227, -15.5250549316, 86.483001709, 86.484375}}, + {"jgve4ttu", []float64{-68.3480072021, -68.3478355408, 86.6021347046, 86.6024780273}}, + {"h9", []float64{-84.375, -78.75, 22.5, 33.75}}, + {"ytf9sb6mj27", []float64{77.609654814, 77.6096561551, 116.227684468, 116.227685809}}, + {"rk9kv3q", []float64{-18.8456726074, -18.8442993164, 148.246765137, 148.248138428}}, + {"kbq8t", []float64{-43.505859375, -43.4619140625, 43.1103515625, 43.154296875}}, + {"j8prqp8kx", []float64{-88.6836147308, 
-88.6835718155, 77.9596281052, 77.9596710205}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"mq53k", []float64{-11.0302734375, -10.986328125, 60.99609375, 61.0400390625}}, + {"d95fw", []float64{6.064453125, 6.1083984375, -61.962890625, -61.9189453125}}, + {"x15j3", []float64{6.5478515625, 6.591796875, 139.262695312, 139.306640625}}, + {"x0yg7k2b", []float64{4.81338500977, 4.81355667114, 144.636039734, 144.636383057}}, + {"dx8x9yxwprk5", []float64{43.5426343046, 43.5426344723, -66.7093545198, -66.7093541846}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"429ptwcscc3", []float64{-85.8312396705, -85.8312383294, -77.0999144018, -77.0999130607}}, + {"ncknjt", []float64{-81.8865966797, -81.8811035156, 129.616699219, 129.627685547}}, + {"7ce8p70yc7h", []float64{-36.5448457003, -36.5448443592, -6.00843250751, -6.00843116641}}, + {"un1", []float64{78.75, 80.15625, 1.40625, 2.8125}}, + {"b6rv0shhd5", []float64{58.5579174757, 58.5579228401, -157.824010849, -157.82400012}}, + {"qyg8d", []float64{-6.943359375, -6.8994140625, 128.759765625, 128.803710938}}, + {"3p7f2mvx6", []float64{-3.79041194916, -3.79036903381, -129.707937241, -129.707894325}}, + {"u0vj", []float64{50.09765625, 50.2734375, 7.03125, 7.3828125}}, + {"49m", []float64{-82.96875, -81.5625, -60.46875, -59.0625}}, + {"7vhu8hp00j2", []float64{-16.0619835556, -16.0619822145, -4.56069946289, -4.56069812179}}, + {"3sh", []float64{-22.5, -21.09375, -106.875, -105.46875}}, + {"sexg40hc", []float64{20.2150154114, 20.2151870728, 33.4928512573, 33.4931945801}}, + {"4uwxw8u", []float64{-63.365020752, -63.3636474609, -46.8182373047, -46.8168640137}}, + {"mr", []float64{-5.625, 0.0, 56.25, 67.5}}, + {"1kymkgft79d3", []float64{-62.3368896358, -62.3368894681, -114.748610817, -114.748610482}}, + {"3tj", []float64{-16.875, -15.46875, -105.46875, -104.0625}}, + {"vxfzth5", []float64{89.9340820312, 89.9354553223, 71.5910339355, 71.5924072266}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"e5r9", []float64{18.45703125, 
18.6328125, -34.453125, -34.1015625}}, + {"hq9bc8pmfk", []float64{-53.3046555519, -53.3046501875, 13.7869083881, 13.786919117}}, + {"05t", []float64{-70.3125, -68.90625, -172.96875, -171.5625}}, + {"ye6yg", []float64{64.4677734375, 64.51171875, 116.499023438, 116.54296875}}, + {"gg4k", []float64{62.578125, 62.75390625, -8.0859375, -7.734375}}, + {"50q2fcp", []float64{-88.4564208984, -88.4550476074, -36.0804748535, -36.0791015625}}, + {"2hu8d3131", []float64{-18.1876945496, -18.1876516342, -173.571238518, -173.571195602}}, + {"08gcdmy84ewg", []float64{-85.4859731533, -85.4859729856, -152.118642814, -152.118642479}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"t05v4wzmqmet", []float64{0.91691667214, 0.916916839778, 50.3935300559, 50.3935303912}}, + {"3nc", []float64{-7.03125, -5.625, -133.59375, -132.1875}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"3gvk6zm1869", []float64{-23.1190833449, -23.1190820038, -93.7394593656, -93.7394580245}}, + {"39rruq", []float64{-36.5734863281, -36.5679931641, -102.117919922, -102.106933594}}, + {"jhkxvqt7q5z", []float64{-64.6951617301, -64.6951603889, 51.5663145483, 51.5663158894}}, + {"bb", []float64{45.0, 50.625, -146.25, -135.0}}, + {"es26styfpb", []float64{24.3776321411, 24.3776375055, -21.9410812855, -21.9410705566}}, + {"500q98r9", []float64{-88.8558769226, -88.8557052612, -44.5722198486, -44.5718765259}}, + {"kv", []float64{-16.875, -11.25, 33.75, 45.0}}, + {"njf", []float64{-57.65625, -56.25, 92.8125, 94.21875}}, + {"whk5vu", []float64{24.5874023438, 24.5928955078, 95.8776855469, 95.888671875}}, + {"w9pq4yk4p4qf", []float64{6.71437550336, 6.714375671, 122.821964733, 122.821965069}}, + {"yt", []float64{73.125, 78.75, 112.5, 123.75}}, + {"ccztdek", []float64{55.8283996582, 55.8297729492, -90.5877685547, -90.5863952637}}, + {"ej33gfth2", []float64{29.8533296585, 29.8533725739, -43.070526123, -43.0704832077}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"hqyfjm", []float64{-51.6522216797, -51.6467285156, 
20.9729003906, 20.9838867188}}, + {"njcr", []float64{-56.42578125, -56.25, 91.7578125, 92.109375}}, + {"48ejbxwrk6", []float64{-86.1343038082, -86.1342984438, -63.2505118847, -63.2505011559}}, + {"78f1r09e5v8", []float64{-40.558232367, -40.5582310259, -19.3776619434, -19.3776606023}}, + {"ged8fvuhk31", []float64{64.8516565561, 64.8516578972, -18.8578484952, -18.8578471541}}, + {"8ss3fn", []float64{25.6530761719, 25.6585693359, -151.435546875, -151.424560547}}, + {"v90sp4e6", []float64{51.3422012329, 51.3423728943, 68.5152053833, 68.5155487061}}, + {"bx00h848", []float64{84.375, 84.3751716614, -157.298812866, -157.298469543}}, + {"9y3", []float64{35.15625, 36.5625, -99.84375, -98.4375}}, + {"ehpkg7", []float64{23.3514404297, 23.3569335938, -34.6618652344, -34.6508789062}}, + {"r38623wxhs", []float64{-36.1575293541, -36.1575239897, 146.621668339, 146.621679068}}, + {"x6yex2zx", []float64{16.0893058777, 16.0894775391, 155.719528198, 155.719871521}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"6w", []float64{-11.25, -5.625, -67.5, -56.25}}, + {"mx6g2d", []float64{-3.63647460938, -3.63098144531, 71.3891601562, 71.4001464844}}, + {"vmsh9b6f", []float64{76.7302322388, 76.7304039001, 61.9556808472, 61.9560241699}}, + {"7uqbsm728bjw", []float64{-20.9769334272, -20.9769332595, -1.56654216349, -1.56654182822}}, + {"nvtqqh0jc3", []float64{-57.9409021139, -57.9408967495, 131.396538019, 131.396548748}}, + {"x8", []float64{0.0, 5.625, 157.5, 168.75}}, + {"nqx5n", []float64{-52.91015625, -52.8662109375, 111.357421875, 111.401367188}}, + {"c4wmv60xtud", []float64{60.0855401158, 60.0855414569, -125.979288518, -125.979287177}}, + {"t9", []float64{5.625, 11.25, 67.5, 78.75}}, + {"e3nqrsk9fqdu", []float64{6.74731470644, 6.74731487408, -24.6250675991, -24.6250672638}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"7x9", []float64{-2.8125, -1.40625, -21.09375, -19.6875}}, + {"760qb6yxf", []float64{-32.5470399857, -32.5469970703, 
-33.3784389496, -33.3783960342}}, + {"x7skftff", []float64{20.5543899536, 20.554561615, 152.340202332, 152.340545654}}, + {"jv774u", []float64{-59.9194335938, -59.9139404297, 83.4411621094, 83.4521484375}}, + {"phh7", []float64{-66.97265625, -66.796875, 140.9765625, 141.328125}}, + {"hv4tt0pymy", []float64{-60.9070980549, -60.9070926905, 37.4962413311, 37.4962520599}}, + {"3zemdehyubj", []float64{-1.82806491852, -1.82806357741, -96.563090533, -96.5630891919}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"dep80ett", []float64{16.8950843811, 16.8952560425, -56.9235992432, -56.9232559204}}, + {"bbj", []float64{45.0, 46.40625, -139.21875, -137.8125}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"9k1t0ecqh3v3", []float64{23.4005451389, 23.4005453065, -121.616746299, -121.616745964}}, + {"mm5", []float64{-16.875, -15.46875, 60.46875, 61.875}}, + {"jz4zmmv2", []float64{-49.3190002441, -49.3188285828, 82.8551101685, 82.8554534912}}, + {"bpshcry", []float64{88.065032959, 88.06640625, -174.311828613, -174.310455322}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"e87b", []float64{1.40625, 1.58203125, -17.2265625, -16.875}}, + {"pcphekd1ph0", []float64{-83.5590720177, -83.5590706766, 178.739619255, 178.739620596}}, + {"nrwq0", []float64{-46.7578125, -46.7138671875, 110.0390625, 110.083007812}}, + {"6k3e7rk823cj", []float64{-20.4825823568, -20.4825821891, -76.4916108549, -76.4916105196}}, + {"kkv059k5v", []float64{-18.2737398148, -18.2736968994, 18.4407663345, 18.4408092499}}, + {"ep7x2t4t6x", []float64{42.084068656, 42.0840740204, -40.0526118279, -40.052601099}}, + {"43hh0", []float64{-83.671875, -83.6279296875, -73.125, -73.0810546875}}, + {"rdhfdg0qee", []float64{-33.2929354906, -33.2929301262, 164.301030636, 164.301041365}}, + {"znku45j7p", []float64{80.8763694763, 80.8764123917, 141.77508831, 141.775131226}}, + {"ju", []float64{-67.5, -61.875, 78.75, 90.0}}, + {"b6zckuz", []float64{60.7145690918, 60.7159423828, 
-157.633209229, -157.631835938}}, + {"fm7m", []float64{75.41015625, 75.5859375, -74.1796875, -73.828125}}, + {"8xg", []float64{43.59375, 45.0, -153.28125, -151.875}}, + {"wfk7q", []float64{13.2275390625, 13.271484375, 129.990234375, 130.034179688}}, + {"6r1p9yz6z9", []float64{-4.26908433437, -4.26907896996, -77.2565674782, -77.2565567493}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"fuzw5", []float64{72.7734375, 72.8173828125, -45.5712890625, -45.52734375}}, + {"m33ehjv", []float64{-37.4098205566, -37.4084472656, 58.5420227051, 58.5433959961}}, + {"s6stp", []float64{14.94140625, 14.9853515625, 17.8857421875, 17.9296875}}, + {"tjxuvxh", []float64{31.8109130859, 31.812286377, 56.1456298828, 56.1470031738}}, + {"0vgxezccsf", []float64{-56.2950503826, -56.2950450182, -141.160722971, -141.160712242}}, + {"0h", []float64{-67.5, -61.875, -180.0, -168.75}}, + {"ge6bb", []float64{63.4130859375, 63.45703125, -18.6328125, -18.5888671875}}, + {"h2gyy9", []float64{-84.5892333984, -84.5837402344, 16.8090820312, 16.8200683594}}, + {"g0f7xg", []float64{49.8504638672, 49.8559570312, -41.4953613281, -41.484375}}, + {"ujfk5", []float64{78.046875, 78.0908203125, 3.2958984375, 3.33984375}}, + {"q0b6rwm0", []float64{-40.3514099121, -40.3512382507, 90.6880187988, 90.6883621216}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"0nyhwsx2g5", []float64{-51.2153702974, -51.215364933, -171.266770363, -171.266759634}}, + {"mjpe", []float64{-16.34765625, -16.171875, 55.546875, 55.8984375}}, + {"mt", []float64{-16.875, -11.25, 67.5, 78.75}}, + {"z49vj0d4zz", []float64{59.9446624517, 59.9446678162, 137.683743238, 137.683753967}}, + {"zry97vd3gs", []float64{88.8440108299, 88.8440161943, 155.55866003, 155.558670759}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"v3syrvc9", []float64{54.5678901672, 54.5680618286, 63.2723236084, 63.2726669312}}, + {"nrer", []float64{-46.58203125, -46.40625, 105.8203125, 106.171875}}, + {"2hqt8nf65v5e", []float64{-20.0895036198, -20.0895034522, 
-170.856119469, -170.856119134}}, + {"wekgdxc", []float64{18.9390563965, 18.9404296875, 119.290924072, 119.292297363}}, + {"6bh43q", []float64{-44.5715332031, -44.5660400391, -50.5700683594, -50.5590820312}}, + {"d564", []float64{18.6328125, 18.80859375, -87.1875, -86.8359375}}, + {"m85", []float64{-45.0, -43.59375, 71.71875, 73.125}}, + {"x4tj5rb77r", []float64{14.9845737219, 14.9845790863, 142.174555063, 142.174565792}}, + {"0bwycz8vr", []float64{-85.9588766098, -85.9588336945, -136.679577827, -136.679534912}}, + {"ehnu2e", []float64{23.2635498047, 23.2690429688, -35.4858398438, -35.4748535156}}, + {"jbe8mk", []float64{-87.1215820312, -87.1160888672, 83.9025878906, 83.9135742188}}, + {"ss", []float64{22.5, 28.125, 22.5, 33.75}}, + {"8edywpv3ygj3", []float64{20.8729668148, 20.8729669824, -153.361634128, -153.361633793}}, + {"xpxu1erq390", []float64{42.9095560312, 42.9095573723, 145.974376202, 145.974377543}}, + {"4043qb91kj", []float64{-89.7772854567, -89.7772800922, -86.5377616882, -86.5377509594}}, + {"2n7q0j0r", []float64{-8.76039505005, -8.76022338867, -175.429344177, -175.429000854}}, + {"cm0n96q3vn8t", []float64{74.2802738585, 74.2802740261, -123.686270043, -123.686269708}}, + {"50hgpxg", []float64{-89.4300842285, -89.4287109375, -37.9866027832, -37.9852294922}}, + {"2m", []float64{-16.875, -11.25, -168.75, -157.5}}, + {"w4s1zj8n9", []float64{14.4014453888, 14.4014883041, 95.9326601028, 95.9327030182}}, + {"hxzh", []float64{-45.703125, -45.52734375, 32.34375, 32.6953125}}, + {"cn", []float64{78.75, 84.375, -135.0, -123.75}}, + {"dpt5k8", []float64{42.7587890625, 42.7642822266, -82.7709960938, -82.7600097656}}, + {"gz", []float64{84.375, 90.0, -11.25, 0.0}}, + {"d09knt", []float64{3.54309082031, 3.54858398438, -87.9565429688, -87.9455566406}}, + {"mrucw", []float64{-1.142578125, -1.0986328125, 63.193359375, 63.2373046875}}, + {"055egqf4y", []float64{-72.4282693863, -72.4282264709, -174.93229866, -174.932255745}}, + {"qphu", []float64{-4.921875, -4.74609375, 
96.6796875, 97.03125}}, + {"dfeh", []float64{14.765625, 14.94140625, -52.03125, -51.6796875}}, + {"00qfn4ctp57", []float64{-88.2262055576, -88.2262042165, -170.241776258, -170.241774917}}, + {"q9xx27p2trn", []float64{-35.2714830637, -35.2714817226, 123.06805104, 123.068052381}}, + {"10bhfgfsj8m", []float64{-84.9250017107, -84.9250003695, -134.875474423, -134.875473082}}, + {"6k", []float64{-22.5, -16.875, -78.75, -67.5}}, + {"zyvjr4m37", []float64{83.9041757584, 83.9042186737, 176.096205711, 176.096248627}}, + {"0c8k7gb", []float64{-80.7948303223, -80.7934570312, -145.733642578, -145.732269287}}, + {"n7k", []float64{-71.71875, -70.3125, 106.875, 108.28125}}, + {"fpj4eecz", []float64{84.8362541199, 84.8364257812, -82.812538147, -82.8121948242}}, + {"ev", []float64{28.125, 33.75, -11.25, 0.0}}, + {"8y", []float64{33.75, 39.375, -146.25, -135.0}}, + {"x9xd1q", []float64{8.82202148438, 8.82751464844, 168.101806641, 168.112792969}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"d1pmefm0t", []float64{6.60424232483, 6.60428524017, -79.6328115463, -79.632768631}}, + {"tmy", []float64{32.34375, 33.75, 64.6875, 66.09375}}, + {"phu6jzv0z", []float64{-62.8869867325, -62.8869438171, 141.236414909, 141.236457825}}, + {"zv7pjxpuwjbq", []float64{75.8009752259, 75.8009753935, 173.221350051, 173.221350387}}, + {"9gegu1", []float64{20.3521728516, 20.3576660156, -95.80078125, -95.7897949219}}, + {"tq33", []float64{35.33203125, 35.5078125, 58.0078125, 58.359375}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + {"y6z1sy", []float64{60.7653808594, 60.7708740234, 111.302490234, 111.313476562}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"q", []float64{-45.0, 0.0, 90.0, 135.0}}, + {"mtyyw7", []float64{-11.4971923828, -11.4916992188, 77.2668457031, 77.2778320312}}, + {"2sqmb", []float64{-20.0830078125, -20.0390625, -148.7109375, -148.666992188}}, + {"yp487k", []float64{84.4409179688, 84.4464111328, 93.6584472656, 93.6694335938}}, + {"nyn5k1fc", []float64{-55.668926239, 
-55.6687545776, 132.3670578, 132.367401123}}, + {"gnqk77v", []float64{80.9239196777, 80.9252929688, -36.0612487793, -36.0598754883}}, + {"yw12", []float64{78.75, 78.92578125, 114.2578125, 114.609375}}, + {"vcgqqmd", []float64{55.9725952148, 55.9739685059, 83.5977172852, 83.5990905762}}, + {"27zp3q", []float64{-22.5988769531, -22.5933837891, -158.851318359, -158.840332031}}, + {"cg4tb6", []float64{62.8967285156, 62.9022216797, -97.7233886719, -97.7124023438}}, + {"w5njb9rnp5e", []float64{17.8936573863, 17.8936587274, 98.4693901241, 98.4693914652}}, + {"9y6rp4pptv", []float64{36.3990193605, 36.399024725, -97.7684605122, -97.7684497833}}, + {"pjup17j08hup", []float64{-56.4091892727, -56.409189105, 140.68680346, 140.686803795}}, + {"vz52f33z9mu", []float64{84.5150206983, 84.5150220394, 83.421651721, 83.4216530621}}, + {"zd", []float64{56.25, 61.875, 157.5, 168.75}}, + {"q", []float64{-45.0, 0.0, 90.0, 135.0}}, + {"dm46s", []float64{28.564453125, 28.6083984375, -75.41015625, -75.3662109375}}, + {"d7dnvbu", []float64{20.8781433105, 20.8795166016, -75.6793212891, -75.677947998}}, + {"02ercsvx0h", []float64{-85.7978796959, -85.7978743315, -164.106216431, -164.106205702}}, + {"hnfx5f", []float64{-50.7897949219, -50.7843017578, 3.68041992188, 3.69140625}}, + {"evb677f6t", []float64{32.7602863312, 32.7603292465, -10.7523107529, -10.7522678375}}, + {"sg43r7p151", []float64{17.1113830805, 17.1113884449, 37.2424077988, 37.2424185276}}, + {"441mdz5", []float64{-77.7447509766, -77.7433776855, -88.1172180176, -88.1158447266}}, + {"k2qrsc7hr81", []float64{-42.2677946091, -42.267793268, 20.2522458136, 20.2522471547}}, + {"d0ydzxhjwr", []float64{4.74158227444, 4.74158763885, -80.5240237713, -80.5240130424}}, + {"4cn", []float64{-84.375, -82.96875, -47.8125, -46.40625}}, + {"ds2", []float64{23.90625, 25.3125, -67.5, -66.09375}}, + {"rxvywy", []float64{-0.230712890625, -0.225219726562, 165.882568359, 165.893554688}}, + {"zs2k", []float64{69.609375, 69.78515625, 157.8515625, 
158.203125}}, + {"kg63", []float64{-26.54296875, -26.3671875, 36.9140625, 37.265625}}, + {"hmxh64j", []float64{-58.3044433594, -58.3030700684, 21.1885070801, 21.1898803711}}, + {"d5v3", []float64{21.26953125, 21.4453125, -82.6171875, -82.265625}}, + {"ddg1", []float64{15.64453125, 15.8203125, -63.28125, -62.9296875}}, + {"stf4tug", []float64{32.8092956543, 32.8106689453, 25.5693054199, 25.5706787109}}, + {"vgc", []float64{66.09375, 67.5, 80.15625, 81.5625}}, + {"jby3xf", []float64{-85.5065917969, -85.5010986328, 87.8796386719, 87.890625}}, + {"9b1f419tdjf", []float64{0.360777229071, 0.360778570175, -98.6990234256, -98.6990220845}}, + {"0zqp4", []float64{-47.98828125, -47.9443359375, -137.724609375, -137.680664062}}, + {"gg292c4wd", []float64{63.5075855255, 63.5076284409, -10.5103969574, -10.5103540421}}, + {"zy96qbt", []float64{81.9607543945, 81.9621276855, 170.811309814, 170.812683105}}, + {"tz9x7f5c70", []float64{43.4731149673, 43.4731203318, 81.0294485092, 81.0294592381}}, + {"rq", []float64{-11.25, -5.625, 146.25, 157.5}}, + {"94tt", []float64{14.94140625, 15.1171875, -127.265625, -126.9140625}}, + {"h7vnwf8", []float64{-67.7499389648, -67.7485656738, 18.5778808594, 18.5792541504}}, + {"4f", []float64{-78.75, -73.125, -56.25, -45.0}}, + {"kj3t", []float64{-14.58984375, -14.4140625, 2.109375, 2.4609375}}, + {"qspj", []float64{-21.62109375, -21.4453125, 122.34375, 122.6953125}}, + {"4y9", []float64{-53.4375, -52.03125, -54.84375, -53.4375}}, + {"b05kqcvsm8", []float64{45.7574129105, 45.7574182749, -175.125267506, -175.125256777}}, + {"p5zq4f", []float64{-67.8405761719, -67.8350830078, 145.316162109, 145.327148438}}, + {"1cgx", []float64{-78.92578125, -78.75, -96.328125, -95.9765625}}, + {"m2", []float64{-45.0, -39.375, 56.25, 67.5}}, + {"j150xkd492", []float64{-84.2619609833, -84.2619556189, 49.5401537418, 49.5401644707}}, + {"05rs", []float64{-71.015625, -70.83984375, -169.453125, -169.1015625}}, + {"ve8", []float64{64.6875, 66.09375, 67.5, 68.90625}}, + 
{"r9tv", []float64{-35.68359375, -35.5078125, 165.5859375, 165.9375}}, + {"d71r07", []float64{18.1219482422, 18.1274414062, -76.9812011719, -76.9702148438}}, + {"b6hepfqk6", []float64{56.79043293, 56.7904758453, -162.072629929, -162.072587013}}, + {"md8y86mqjd", []float64{-29.7815215588, -29.7815161943, 68.5731196404, 68.5731303692}}, + {"bcgcyq", []float64{55.1843261719, 55.1898193359, -140.701904297, -140.690917969}}, + {"e3hpu9352", []float64{6.99472904205, 6.9947719574, -27.9258728027, -27.9258298874}}, + {"hsbsq2rkmg", []float64{-62.5320607424, -62.532055378, 23.4879863262, 23.4879970551}}, + {"ub2r", []float64{47.63671875, 47.8125, 34.1015625, 34.453125}}, + {"d8", []float64{0.0, 5.625, -67.5, -56.25}}, + {"gexm9j6y088", []float64{65.6841686368, 65.6841699779, -12.2569441795, -12.2569428384}}, + {"15cdq", []float64{-68.5107421875, -68.466796875, -132.626953125, -132.583007812}}, + {"9zud17gy", []float64{43.9669418335, 43.9671134949, -94.8617935181, -94.8614501953}}, + {"4q3y", []float64{-53.7890625, -53.61328125, -76.2890625, -75.9375}}, + {"gph138", []float64{84.5947265625, 84.6002197266, -39.3090820312, -39.2980957031}}, + {"m09d8ju2b", []float64{-41.7163324356, -41.7162895203, 47.1152114868, 47.1152544022}}, + {"8mszup4gk", []float64{32.3388147354, 32.3388576508, -161.890583038, -161.890540123}}, + {"dyrfvtvqh", []float64{35.6722640991, 35.6723070145, -45.102481842, -45.1024389267}}, + {"3h9tgkp", []float64{-18.6547851562, -18.6534118652, -132.738189697, -132.736816406}}, + {"66gdty14u", []float64{-29.0583658218, -29.0583229065, -73.5738945007, -73.5738515854}}, + {"83zp1d", []float64{11.0852050781, 11.0906982422, -158.840332031, -158.829345703}}, + {"e7gp0", []float64{22.32421875, 22.3681640625, -29.53125, -29.4873046875}}, + {"s0ykfgceytk", []float64{5.07498219609, 5.0749835372, 8.91225636005, 8.91225770116}}, + {"zfe7", []float64{59.58984375, 59.765625, 173.3203125, 173.671875}}, + {"cr9", []float64{87.1875, 88.59375, -122.34375, -120.9375}}, + 
{"9ugr3kq", []float64{28.0165100098, 28.0178833008, -96.6165161133, -96.6151428223}}, + {"2grcq0", []float64{-26.4990234375, -26.4935302734, -135.087890625, -135.076904297}}, + {"50bb31vkds3p", []float64{-85.726895202, -85.7268950343, -43.8940487802, -43.8940484449}}, + {"qhxdv1hcqe3", []float64{-19.1983763874, -19.1983750463, 100.773404986, 100.773406327}}, + {"k2cv0gp2vk", []float64{-39.8857140541, -39.8857086897, 13.7540781498, 13.7540888786}}, + {"y5jd", []float64{62.2265625, 62.40234375, 97.734375, 98.0859375}}, + {"gnvg3p", []float64{83.5784912109, 83.583984375, -36.8701171875, -36.8591308594}}, + {"9g70w", []float64{18.369140625, 18.4130859375, -96.767578125, -96.7236328125}}, + {"9g7qsb", []float64{19.423828125, 19.4293212891, -96.4709472656, -96.4599609375}}, + {"1zq2u", []float64{-49.0869140625, -49.04296875, -92.28515625, -92.2412109375}}, + {"tr", []float64{39.375, 45.0, 56.25, 67.5}}, + {"4wmddz9mgk38", []float64{-54.3620882928, -54.3620881252, -59.6429172903, -59.6429169551}}, + {"wkcdsn8nbz", []float64{27.1951049566, 27.195110321, 103.535188437, 103.535199165}}, + {"1r198wxj", []float64{-50.3247642517, -50.3245925903, -121.609039307, -121.608695984}}, + {"eu", []float64{22.5, 28.125, -11.25, 0.0}}, + {"k2y7mk6sd0wz", []float64{-40.1858386584, -40.1858384907, 20.2733035013, 20.2733038366}}, + {"gms9ytw4v5", []float64{76.2758177519, 76.2758231163, -27.1277761459, -27.1277654171}}, + {"2vdkc", []float64{-13.2275390625, -13.18359375, -143.041992188, -142.998046875}}, + {"bke7fx3", []float64{71.011505127, 71.012878418, -164.068450928, -164.067077637}}, + {"tvnxu7jt8", []float64{29.5047283173, 29.5047712326, 88.0849456787, 88.0849885941}}, + {"f864yp3c", []float64{46.9296455383, 46.9298171997, -64.4214248657, -64.421081543}}, + {"g8hxr7x150d7", []float64{46.2938149832, 46.2938151509, -15.8435266837, -15.8435263485}}, + {"zmk4kmh", []float64{74.9542236328, 74.9555969238, 152.067260742, 152.068634033}}, + {"gtqsvep4", []float64{75.3830337524, 75.3832054138, 
-13.1080627441, -13.1077194214}}, + {"trsvy0", []float64{43.1982421875, 43.2037353516, 63.193359375, 63.2043457031}}, + {"bevjfs", []float64{67.1264648438, 67.1319580078, -150.358886719, -150.347900391}}, + {"ktrb", []float64{-15.46875, -15.29296875, 33.3984375, 33.75}}, + {"dn20q1pv", []float64{35.2065467834, 35.2067184448, -89.7256851196, -89.7253417969}}, + {"8n3wy5g2", []float64{36.3633728027, 36.3635444641, -177.622489929, -177.622146606}}, + {"vyzft", []float64{83.408203125, 83.4521484375, 89.8681640625, 89.912109375}}, + {"gwuedjbs8222", []float64{83.6163438857, 83.6163440533, -16.0832866654, -16.0832863301}}, + {"fpb89dkktj83", []float64{88.6948023923, 88.6948025599, -89.2249056324, -89.2249052972}}, + {"wjjk3", []float64{28.8720703125, 28.916015625, 97.4267578125, 97.470703125}}, + {"wx6", []float64{40.78125, 42.1875, 115.3125, 116.71875}}, + {"yzpuuv3w0pw", []float64{85.2398702502, 85.2398715913, 134.859245718, 134.859247059}}, + {"k2518ucy", []float64{-44.7092056274, -44.7090339661, 15.5041122437, 15.5044555664}}, + {"hkjk1075", []float64{-66.7949867249, -66.7948150635, 18.6808776855, 18.6812210083}}, + {"btmd", []float64{74.8828125, 75.05859375, -149.765625, -149.4140625}}, + {"ucbvmyuw9z", []float64{55.8048337698, 55.8048391342, 35.0636279583, 35.0636386871}}, + {"wf86ytd34", []float64{14.5762825012, 14.5763254166, 124.390382767, 124.390425682}}, + {"9zjbws3u", []float64{39.4869232178, 39.4870948792, -92.8760147095, -92.8756713867}}, + {"rqc", []float64{-7.03125, -5.625, 147.65625, 149.0625}}, + {"pwqw8gh8x74", []float64{-53.6845904589, -53.6845891178, 166.680077612, 166.680078954}}, + {"5ekn", []float64{-70.6640625, -70.48828125, -16.875, -16.5234375}}, + {"mxx0b3mzwxp", []float64{-2.67247259617, -2.67247125506, 77.3629210889, 77.36292243}}, + {"tn", []float64{33.75, 39.375, 45.0, 56.25}}, + {"ju59u9b8grr", []float64{-67.1826021373, -67.1826007962, 83.8704644144, 83.8704657555}}, + {"hmte6v6jzq84", []float64{-58.4613495693, -58.4613494016, 
19.1082823277, 19.1082826629}}, + {"t3", []float64{5.625, 11.25, 56.25, 67.5}}, + {"es9g6", []float64{25.8837890625, 25.927734375, -19.951171875, -19.9072265625}}, + {"2bwcdk6y", []float64{-41.8994522095, -41.8992805481, -136.655158997, -136.654815674}}, + {"4ew3jn8umh", []float64{-70.1002621651, -70.1002568007, -58.4899663925, -58.4899556637}}, + {"0meekufm4x3", []float64{-58.4642212093, -58.4642198682, -163.616186231, -163.61618489}}, + {"02", []float64{-90.0, -84.375, -168.75, -157.5}}, + {"9yuv0t43sv0", []float64{38.8754063845, 38.8754077256, -94.5450460911, -94.54504475}}, + {"u0g7y8444", []float64{49.8782730103, 49.8783159256, 4.85878944397, 4.85883235931}}, + {"r4", []float64{-33.75, -28.125, 135.0, 146.25}}, + {"1ps631dwde", []float64{-47.4076205492, -47.4076151848, -128.975951672, -128.975940943}}, + {"vbmfdm", []float64{46.8731689453, 46.8786621094, 86.9348144531, 86.9458007812}}, + {"s7gmxm8vhn5v", []float64{22.0916506089, 22.0916507766, 16.1401226744, 16.1401230097}}, + {"mcwrh68t", []float64{-35.317440033, -35.3172683716, 87.7265167236, 87.7268600464}}, + {"9yt5645jq9r9", []float64{37.145683486, 37.1456836537, -94.1264504939, -94.1264501587}}, + {"61t1x3z", []float64{-36.2892150879, -36.2878417969, -82.6405334473, -82.6391601562}}, + {"wmzkmbm", []float64{33.0921936035, 33.0935668945, 111.704864502, 111.706237793}}, + {"jy", []float64{-56.25, -50.625, 78.75, 90.0}}, + {"9ctckntmvgkr", []float64{8.69393778965, 8.69393795729, -92.9808190092, -92.980818674}}, + {"8rnugbss4gd", []float64{40.2134129405, 40.2134142816, -159.086717069, -159.086715728}}, + {"ek8", []float64{25.3125, 26.71875, -33.75, -32.34375}}, + {"nvqegbu", []float64{-59.8054504395, -59.8040771484, 133.060913086, 133.062286377}}, + {"h0u7v6pujp5", []float64{-85.1103597879, -85.1103584468, 6.21813699603, 6.21813833714}}, + {"2rj", []float64{-5.625, -4.21875, -161.71875, -160.3125}}, + {"jk38", []float64{-66.09375, -65.91796875, 58.359375, 58.7109375}}, + {"c67qe30m", []float64{58.8051795959, 
58.8053512573, -119.036521912, -119.036178589}}, + {"xhtwk", []float64{26.4111328125, 26.455078125, 142.91015625, 142.954101562}}, + {"pw6ue97rjn11", []float64{-54.0446339361, -54.0446337685, 161.525675207, 161.525675543}}, + {"xpmw424pk6", []float64{41.8371927738, 41.8371981382, 142.836180925, 142.836191654}}, + {"rk", []float64{-22.5, -16.875, 146.25, 157.5}}, + {"hmfmn5uwdh2", []float64{-56.755605787, -56.7556044459, 14.6840000153, 14.6840013564}}, + {"defvfj", []float64{22.1319580078, 22.1374511719, -63.544921875, -63.5339355469}}, + {"sg7jd89w", []float64{19.2518234253, 19.2519950867, 38.0806732178, 38.0810165405}}, + {"yn0", []float64{78.75, 80.15625, 90.0, 91.40625}}, + {"3n7re6gv2e92", []float64{-8.50936442614, -8.5093642585, -130.281692259, -130.281691924}}, + {"vyj", []float64{78.75, 80.15625, 85.78125, 87.1875}}, + {"9cntsz", []float64{6.63024902344, 6.6357421875, -91.9006347656, -91.8896484375}}, + {"w4uwq3d", []float64{16.5756225586, 16.5769958496, 96.6055297852, 96.6069030762}}, + {"zffrf", []float64{61.8310546875, 61.875, 172.001953125, 172.045898438}}, + {"hq6142s", []float64{-54.665222168, -54.663848877, 14.1668701172, 14.1682434082}}, + {"m7srp977", []float64{-24.0746498108, -24.0744781494, 62.5606155396, 62.5609588623}}, + {"8v6hxy3sz", []float64{30.3574132919, 30.3574562073, -143.094563484, -143.094520569}}, + {"rf4snrh70h", []float64{-33.0078864098, -33.0078810453, 172.54611969, 172.546130419}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"ssr6y", []float64{24.3896484375, 24.43359375, 32.958984375, 33.0029296875}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"yz", []float64{84.375, 90.0, 123.75, 135.0}}, + {"n2", []float64{-90.0, -84.375, 101.25, 112.5}}, + {"vfe", []float64{59.0625, 60.46875, 82.96875, 84.375}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"z3", []float64{50.625, 56.25, 146.25, 157.5}}, + {"z2web5usz", []float64{48.4930944443, 48.4931373596, 155.397105217, 155.397148132}}, + {"8gkc", []float64{18.45703125, 
18.6328125, -139.5703125, -139.21875}}, + {"17de4", []float64{-69.78515625, -69.7412109375, -120.146484375, -120.102539062}}, + {"ky71b3fwd2vv", []float64{-9.52539911494, -9.5253989473, 37.9832738265, 37.9832741618}}, + {"e4gfcm", []float64{15.9796142578, 15.9851074219, -39.6716308594, -39.6606445312}}, + {"kdgwxnnb", []float64{-28.3557128906, -28.3555412292, 27.7387619019, 27.7391052246}}, + {"4nyn2chy", []float64{-50.9260940552, -50.9259223938, -81.5230178833, -81.5226745605}}, + {"u6je3kk57", []float64{56.8451929092, 56.8452358246, 19.0449285507, 19.0449714661}}, + {"nrs9j1hj5tj", []float64{-47.630340457, -47.6303391159, 107.803501636, 107.803502977}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"krqrcc2hm9uq", []float64{-2.84883890301, -2.84883873537, 20.116208531, 20.1162088662}}, + {"fmceu", []float64{78.0029296875, 78.046875, -76.46484375, -76.4208984375}}, + {"q3w576", []float64{-35.9802246094, -35.9747314453, 109.830322266, 109.841308594}}, + {"19ehn", []float64{-80.859375, -80.8154296875, -108.017578125, -107.973632812}}, + {"zpkvjk", []float64{86.6821289062, 86.6876220703, 141.910400391, 141.921386719}}, + {"7cgwy9jm", []float64{-33.9633750916, -33.9632034302, -6.03527069092, -6.03492736816}}, + {"jju1cefk5v1", []float64{-57.3273199797, -57.3273186386, 50.6941701472, 50.6941714883}}, + {"5tfpq6", []float64{-56.3708496094, -56.3653564453, -19.4128417969, -19.4018554688}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"dm0vh8", []float64{29.00390625, 29.0093994141, -77.4975585938, -77.4865722656}}, + {"jt5c8hhd58b", []float64{-61.5890081227, -61.5890067816, 72.7797675133, 72.7797688544}}, + {"4ttr", []float64{-57.83203125, -57.65625, -60.1171875, -59.765625}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"sp2d3v6wz", []float64{41.2067556381, 41.2067985535, 0.783762931824, 0.783805847168}}, + {"up1yhbspqy", []float64{85.4337108135, 85.4337161779, 2.67546057701, 2.67547130585}}, + {"1z4bx8dp1ex1", []float64{-50.5331422202, -50.5331420526, 
-97.0504023135, -97.0504019782}}, + {"76qbnz5n1937", []float64{-32.3042606749, -32.3042605072, -23.9569957182, -23.9569953829}}, + {"0w1bvcjb40xd", []float64{-56.112667881, -56.1126677133, -154.778384641, -154.778384306}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"yqbjwyvsmc", []float64{83.9733606577, 83.9733660221, 101.554430723, 101.554441452}}, + {"0hdp2tdy5", []float64{-63.3818435669, -63.3818006516, -177.161622047, -177.161579132}}, + {"s8yv34v", []float64{5.15670776367, 5.15808105469, 32.0429992676, 32.0443725586}}, + {"uc60k7w6", []float64{52.0947647095, 52.0949363708, 36.757850647, 36.7581939697}}, + {"en6h7me8", []float64{35.9335327148, 35.9337043762, -42.0398712158, -42.0395278931}}, + {"1bks34", []float64{-87.8356933594, -87.8302001953, -94.8779296875, -94.8669433594}}, + {"65", []float64{-28.125, -22.5, -90.0, -78.75}}, + {"qwrquvr", []float64{-8.62838745117, -8.62701416016, 122.913665771, 122.915039062}}, + {"3dmchcusw83", []float64{-32.1575818956, -32.1575805545, -104.198862165, -104.198860824}}, + {"urfeh", []float64{89.12109375, 89.1650390625, 14.94140625, 14.9853515625}}, + {"d3g6rs", []float64{10.2612304688, 10.2667236328, -73.8500976562, -73.8391113281}}, + {"wx0v", []float64{40.25390625, 40.4296875, 113.5546875, 113.90625}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"et", []float64{28.125, 33.75, -22.5, -11.25}}, + {"dqtd0", []float64{36.9140625, 36.9580078125, -71.015625, -70.9716796875}}, + {"vtzuwhxhgjv8", []float64{78.1603311002, 78.1603312679, 78.6718585342, 78.6718588695}}, + {"bn9", []float64{81.5625, 82.96875, -178.59375, -177.1875}}, + {"685n", []float64{-43.9453125, -43.76953125, -63.28125, -62.9296875}}, + {"fqy4nhy", []float64{83.3464050293, 83.3477783203, -70.0405883789, -70.0392150879}}, + {"5q1dw", []float64{-55.810546875, -55.7666015625, -31.376953125, -31.3330078125}}, + {"0sv8m11dgkf", []float64{-63.2313139737, -63.2313126326, -149.543696344, -149.543695003}}, + {"vp435nn", []float64{84.5837402344, 84.5851135254, 
48.3041381836, 48.3055114746}}, + {"wydj", []float64{37.44140625, 37.6171875, 126.5625, 126.9140625}}, + {"ebg8", []float64{4.21875, 4.39453125, -6.328125, -5.9765625}}, + {"ksmb6pfxe4", []float64{-21.0059344769, -21.0059291124, 30.6773900986, 30.6774008274}}, + {"8bv63qv96dp", []float64{4.65156197548, 4.65156331658, -138.804586083, -138.804584742}}, + {"0s0d53hd", []float64{-67.1426010132, -67.1424293518, -156.647872925, -156.647529602}}, + {"1yjf34ubpm", []float64{-55.8393591642, -55.8393537998, -93.1132829189, -93.1132721901}}, + {"823y79hmfe", []float64{2.51137912273, 2.51138448715, -166.129310131, -166.129299402}}, + {"xynnrv", []float64{34.8760986328, 34.8815917969, 177.528076172, 177.5390625}}, + {"9b9ejqcqqdu", []float64{3.37801024318, 3.37801158428, -98.9079111814, -98.9079098403}}, + {"cuuhfkd", []float64{72.5784301758, 72.5798034668, -95.5233764648, -95.5220031738}}, + {"khwceh", []float64{-19.4018554688, -19.3963623047, 9.6240234375, 9.63500976562}}, + {"z8vub32sy6", []float64{50.061403513, 50.0614088774, 165.597878695, 165.597889423}}, + {"4s", []float64{-67.5, -61.875, -67.5, -56.25}}, + {"bsrb4qeqt7eq", []float64{68.9430911466, 68.9430913143, -146.497992687, -146.497992352}}, + {"b1x0sc", []float64{53.5308837891, 53.5363769531, -169.947509766, -169.936523438}}, + {"1ngn2bn2vub", []float64{-50.9324629605, -50.9324616194, -130.739461184, -130.739459842}}, + {"bsm", []float64{68.90625, 70.3125, -150.46875, -149.0625}}, + {"xyzd7", []float64{38.3642578125, 38.408203125, 179.428710938, 179.47265625}}, + {"cvvf1tqs", []float64{77.7248382568, 77.7250099182, -93.0892181396, -93.0888748169}}, + {"fz2tpb1xj", []float64{86.6613578796, 86.661400795, -55.2040243149, -55.2039813995}}, + {"r4zqr7", []float64{-28.4161376953, -28.4106445312, 145.513916016, 145.524902344}}, + {"wth", []float64{28.125, 29.53125, 118.125, 119.53125}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"4j0ff5xs58h", []float64{-61.3716888428, -61.3716875017, -88.8469666243, 
-88.8469652832}}, + {"c15", []float64{50.625, 52.03125, -130.78125, -129.375}}, + {"tnkgyg", []float64{35.8319091797, 35.8374023438, 51.9763183594, 51.9873046875}}, + {"e99", []float64{8.4375, 9.84375, -21.09375, -19.6875}}, + {"bcrz69", []float64{53.3111572266, 53.3166503906, -135.241699219, -135.230712891}}, + {"e5w29f6", []float64{19.7877502441, 19.7891235352, -36.1312866211, -36.1299133301}}, + {"gykjjcq656b5", []float64{81.0423812829, 81.0423814505, -5.36359190941, -5.36359157413}}, + {"7j62nzypd3be", []float64{-15.4248806275, -15.4248804599, -41.5309696645, -41.5309693292}}, + {"rrsgbbvwt9r7", []float64{-2.14807743207, -2.14807726443, 152.970445342, 152.970445678}}, + {"93kub", []float64{7.8662109375, 7.91015625, -117.0703125, -117.026367188}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"02bcjtw5f84", []float64{-85.5746126175, -85.5746112764, -167.445263565, -167.445262223}}, + {"262dqsqs1", []float64{-31.9242095947, -31.9241666794, -167.752261162, -167.752218246}}, + {"185ss806c", []float64{-89.2085123062, -89.2084693909, -107.379984856, -107.37994194}}, + {"9s6", []float64{23.90625, 25.3125, -109.6875, -108.28125}}, + {"ych1d25e", []float64{50.8891868591, 50.8893585205, 129.478683472, 129.479026794}}, + {"7f2qcykht0d", []float64{-31.1221191287, -31.1221177876, -10.8158227801, -10.815821439}}, + {"5gk7qw", []float64{-71.1145019531, -71.1090087891, -4.98779296875, -4.97680664062}}, + {"7kjutjpu98", []float64{-21.6807460785, -21.6807407141, -25.4336285591, -25.4336178303}}, + {"h64", []float64{-78.75, -77.34375, 14.0625, 15.46875}}, + {"uy57", []float64{79.27734375, 79.453125, 38.3203125, 38.671875}}, + {"r5dtqkydk796", []float64{-24.3631505594, -24.3631503917, 138.799393661, 138.799393997}}, + {"5j", []float64{-61.875, -56.25, -45.0, -33.75}}, + {"xzbszzeu", []float64{44.4705963135, 44.4707679749, 169.798851013, 169.799194336}}, + {"wqjz0fj5e", []float64{34.9920558929, 34.9920988083, 109.375891685, 109.375934601}}, + {"dekz", []float64{19.51171875, 
19.6875, -60.8203125, -60.46875}}, + {"bbyux", []float64{50.009765625, 50.0537109375, -136.450195312, -136.40625}}, + {"rctysz36", []float64{-35.3797531128, -35.3795814514, 177.046394348, 177.046737671}}, + {"xmhvqm1wcw7", []float64{29.0765096247, 29.0765109658, 153.206474036, 153.206475377}}, + {"nhw6c", []float64{-64.2041015625, -64.16015625, 98.8330078125, 98.876953125}}, + {"u9d3gdu", []float64{53.7602233887, 53.7615966797, 25.8233642578, 25.8247375488}}, + {"xenu1tyd0q", []float64{17.6100862026, 17.610091567, 167.067042589, 167.067053318}}, + {"qm70t", []float64{-15.380859375, -15.3369140625, 105.688476562, 105.732421875}}, + {"3g0", []float64{-28.125, -26.71875, -101.25, -99.84375}}, + {"fg", []float64{61.875, 67.5, -56.25, -45.0}}, + {"jq1tn6nn0hfs", []float64{-55.3590513021, -55.3590511344, 58.642276302, 58.6422766373}}, + {"b9kmw", []float64{52.998046875, 53.0419921875, -151.259765625, -151.215820312}}, + {"z7f9pj6", []float64{66.2983703613, 66.2997436523, 150.07598877, 150.077362061}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"hqbvrz4x", []float64{-51.0687446594, -51.068572998, 12.6486968994, 12.6490402222}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"81es", []float64{9.140625, 9.31640625, -175.078125, -174.7265625}}, + {"nyn6mf2", []float64{-55.8421325684, -55.8407592773, 132.791748047, 132.793121338}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"46hfqnuv", []float64{-78.3165550232, -78.3163833618, -71.8001174927, -71.7997741699}}, + {"7tne", []float64{-16.34765625, -16.171875, -13.359375, -13.0078125}}, + {"z7pkduh4g2c", []float64{62.6884643734, 62.6884657145, 156.571796089, 156.571797431}}, + {"xgmm", []float64{19.16015625, 19.3359375, 176.1328125, 176.484375}}, + {"he054x", []float64{-72.5592041016, -72.5537109375, 22.6098632812, 22.6208496094}}, + {"rqnu21r8g", []float64{-10.4959344864, -10.495891571, 155.752615929, 155.752658844}}, + {"k8xg8n36", []float64{-41.5375900269, -41.5374183655, 33.4001541138, 33.4004974365}}, + 
{"d60md", []float64{12.216796875, 12.2607421875, -78.310546875, -78.2666015625}}, + {"50tqctv", []float64{-85.9693908691, -85.9680175781, -37.5444030762, -37.5430297852}}, + {"yxknv54eqnz", []float64{86.984847039, 86.9848483801, 118.34842667, 118.348428011}}, + {"5zpczbcjgj", []float64{-50.3122490644, -50.3122437, -0.00948429107666, -0.0094735622406}}, + {"yp7nz0rr7d8", []float64{86.9704046845, 86.9704060256, 94.5364737511, 94.5364750922}}, + {"kfjb9s772f", []float64{-33.6381947994, -33.638189435, 41.9063508511, 41.9063615799}}, + {"q", []float64{-45.0, 0.0, 90.0, 135.0}}, + {"cx1f", []float64{84.7265625, 84.90234375, -110.0390625, -109.6875}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"hdtcdg62524", []float64{-75.6559753418, -75.6559740007, 30.7100191712, 30.7100205123}}, + {"fpy32", []float64{88.8134765625, 88.857421875, -81.2109375, -81.1669921875}}, + {"256qs", []float64{-25.576171875, -25.5322265625, -176.66015625, -176.616210938}}, + {"6rgzd89v3r48", []float64{-0.0842052698135, -0.0842051021755, -73.3642389625, -73.3642386273}}, + {"c8430hd23", []float64{45.2005434036, 45.200586319, -109.33280468, -109.332761765}}, + {"xtn", []float64{28.125, 29.53125, 165.9375, 167.34375}}, + {"1u2s9", []float64{-65.302734375, -65.2587890625, -100.502929688, -100.458984375}}, + {"5c80k35m35p", []float64{-81.512144208, -81.5121428668, -11.058716923, -11.0587155819}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"u7c0rfsdjn", []float64{66.1518037319, 66.1518090963, 13.0032205582, 13.003231287}}, + {"f4ce8gy", []float64{61.1045837402, 61.1059570312, -87.8494262695, -87.8480529785}}, + {"pbd4y02", []float64{-86.7027282715, -86.7013549805, 171.826171875, 171.827545166}}, + {"9n", []float64{33.75, 39.375, -135.0, -123.75}}, + {"ztybk5c8mw", []float64{77.4083697796, 77.408375144, 167.170264721, 167.17027545}}, + {"ks8pv0cv5u6j", []float64{-18.3201934956, -18.320193328, 22.7222934365, 22.7222937718}}, + {"nuh", []float64{-67.5, -66.09375, 129.375, 130.78125}}, + 
{"6khcf5xumxz", []float64{-22.1723856032, -22.1723842621, -71.9715334475, -71.9715321064}}, + {"nj2", []float64{-60.46875, -59.0625, 90.0, 91.40625}}, + {"mdd459nebt64", []float64{-30.5797721073, -30.5797719397, 70.4752591252, 70.4752594605}}, + {"e7ujxh", []float64{22.0825195312, 22.0880126953, -27.8173828125, -27.8063964844}}, + {"eb1zvy9n", []float64{1.39904022217, 1.39921188354, -8.53500366211, -8.53466033936}}, + {"1huxmt", []float64{-61.9793701172, -61.9738769531, -128.430175781, -128.419189453}}, + {"v4cyrgyg", []float64{61.5884971619, 61.5886688232, 47.8107833862, 47.811126709}}, + {"j6h0dvut", []float64{-78.6296653748, -78.6294937134, 62.0020294189, 62.0023727417}}, + {"r8j", []float64{-45.0, -43.59375, 164.53125, 165.9375}}, + {"feyj018b7", []float64{66.9809389114, 66.9809818268, -59.0613412857, -59.0612983704}}, + {"tw33887htf7j", []float64{35.4220805503, 35.422080718, 69.2841558158, 69.2841561511}}, + {"3rbgkj8z4cy8", []float64{-0.803537517786, -0.803537350148, -122.518374547, -122.518374212}}, + {"gq30gnn9dg", []float64{80.3213185072, 80.3213238716, -32.2028696537, -32.2028589249}}, + {"rek4qy2", []float64{-26.2889099121, -26.2875366211, 163.421630859, 163.42300415}}, + {"j8", []float64{-90.0, -84.375, 67.5, 78.75}}, + {"f2rev1d7", []float64{47.0741844177, 47.0743560791, -67.9803085327, -67.97996521}}, + {"rw1sg2f78x", []float64{-10.4102808237, -10.4102754593, 159.755308628, 159.755319357}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"8xfw", []float64{44.6484375, 44.82421875, -153.984375, -153.6328125}}, + {"fj20", []float64{74.53125, 74.70703125, -90.0, -89.6484375}}, + {"m76w867pt9yj", []float64{-25.5625145696, -25.562514402, 59.7809752822, 59.7809756175}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"wdb4v9vwt4ez", []float64{15.9628918581, 15.9628920257, 112.749471925, 112.74947226}}, + {"m53rvw", []float64{-25.3234863281, -25.3179931641, 46.9995117188, 47.0104980469}}, + {"f33", []float64{52.03125, 53.4375, -77.34375, -75.9375}}, + 
{"0t36bun98", []float64{-59.9631214142, -59.9630784988, -155.700302124, -155.700259209}}, + {"ezs1", []float64{42.36328125, 42.5390625, -5.625, -5.2734375}}, + {"jb", []float64{-90.0, -84.375, 78.75, 90.0}}, + {"vd0k", []float64{56.953125, 57.12890625, 67.8515625, 68.203125}}, + {"39cqr9bd", []float64{-34.0476608276, -34.0474891663, -110.411911011, -110.411567688}}, + {"n8r", []float64{-88.59375, -87.1875, 122.34375, 123.75}}, + {"1b6", []float64{-88.59375, -87.1875, -98.4375, -97.03125}}, + {"358wn7v3tch", []float64{-24.2369502783, -24.2369489372, -134.014754891, -134.01475355}}, + {"d506n2v77qf", []float64{17.2312764823, 17.2312778234, -89.366427362, -89.3664260209}}, + {"qb", []float64{-45.0, -39.375, 123.75, 135.0}}, + {"sn", []float64{33.75, 39.375, 0.0, 11.25}}, + {"5rj", []float64{-50.625, -49.21875, -26.71875, -25.3125}}, + {"0y51c", []float64{-55.9423828125, -55.8984375, -141.987304688, -141.943359375}}, + {"r85z8", []float64{-43.681640625, -43.6376953125, 162.7734375, 162.817382812}}, + {"rk5z", []float64{-21.26953125, -21.09375, 151.5234375, 151.875}}, + {"vzn2", []float64{84.375, 84.55078125, 87.5390625, 87.890625}}, + {"bjvk", []float64{78.046875, 78.22265625, -172.6171875, -172.265625}}, + {"b9f", []float64{54.84375, 56.25, -154.6875, -153.28125}}, + {"q0u1y3mu4k", []float64{-40.4660582542, -40.4660528898, 95.907651186, 95.9076619148}}, + {"r7", []float64{-28.125, -22.5, 146.25, 157.5}}, + {"cv90dz31", []float64{76.0653877258, 76.0655593872, -99.7215270996, -99.7211837769}}, + {"gxfeekgv9sng", []float64{89.2360430025, 89.2360431701, -18.8363294676, -18.8363291323}}, + {"92t", []float64{2.8125, 4.21875, -116.71875, -115.3125}}, + {"m06", []float64{-43.59375, -42.1875, 47.8125, 49.21875}}, + {"n27p3f3c", []float64{-87.306804657, -87.3066329956, 105.548057556, 105.548400879}}, + {"jjptjwvt2", []float64{-60.9581136703, -60.958070755, 55.7961273193, 55.7961702347}}, + {"u258hxwr", []float64{45.0424003601, 45.0425720215, 16.3782119751, 16.3785552979}}, + 
{"t47", []float64{12.65625, 14.0625, 49.21875, 50.625}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + {"0s", []float64{-67.5, -61.875, -157.5, -146.25}}, + {"4drwu7v94g3", []float64{-76.1364381015, -76.1364367604, -56.758684963, -56.7586836219}}, + {"wk14pc", []float64{22.8570556641, 22.8625488281, 102.996826172, 103.0078125}}, + {"w5xg", []float64{20.21484375, 20.390625, 100.8984375, 101.25}}, + {"f2z2dvf7", []float64{49.3387413025, 49.3389129639, -68.4307479858, -68.4304046631}}, + {"duk0212sgtp", []float64{23.9579039812, 23.9579053223, -50.6241537631, -50.624152422}}, + {"h7z", []float64{-68.90625, -67.5, 21.09375, 22.5}}, + {"31k02yzy90", []float64{-37.8866100311, -37.8866046667, -129.331355095, -129.331344366}}, + {"vus0k", []float64{70.3564453125, 70.400390625, 84.55078125, 84.5947265625}}, + {"5m0sjm4rrvf", []float64{-61.1431337893, -61.1431324482, -32.8127369285, -32.8127355874}}, + {"4syd3n9", []float64{-62.8500366211, -62.8486633301, -58.3140563965, -58.3126831055}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"ntn69zcu", []float64{-61.392288208, -61.3921165466, 121.368370056, 121.368713379}}, + {"vnc4m", []float64{83.3642578125, 83.408203125, 46.6259765625, 46.669921875}}, + {"sy848jtk7wdj", []float64{37.0329307951, 37.0329309627, 33.7573626637, 33.757362999}}, + {"ry7y", []float64{-8.7890625, -8.61328125, 174.0234375, 174.375}}, + {"5k7gesef08j", []float64{-65.453453064, -65.4534517229, -28.3175759017, -28.3175745606}}, + {"n5gnev", []float64{-67.7362060547, -67.7307128906, 94.3835449219, 94.39453125}}, + {"kz3eeg1trf7", []float64{-3.58612284064, -3.58612149954, 36.0265664756, 36.0265678167}}, + {"t55rxfr7", []float64{18.2062339783, 18.2064056396, 49.9208450317, 49.9211883545}}, + {"tm9", []float64{30.9375, 32.34375, 57.65625, 59.0625}}, + {"pjw", []float64{-59.0625, -57.65625, 143.4375, 144.84375}}, + {"f", []float64{45.0, 90.0, -90.0, -45.0}}, + {"2my", []float64{-12.65625, -11.25, -160.3125, 
-158.90625}}, + {"3w6bwyt", []float64{-9.72015380859, -9.71878051758, -108.329315186, -108.327941895}}, + {"v4s8r3q", []float64{59.1133117676, 59.1146850586, 51.6549682617, 51.6563415527}}, + {"ggx8be488kyn", []float64{64.8359277472, 64.8359279148, -0.677700340748, -0.677700005472}}, + {"95w", []float64{19.6875, 21.09375, -126.5625, -125.15625}}, + {"9jck7spubf", []float64{33.1136190891, 33.1136244535, -133.077703714, -133.077692986}}, + {"f1zfe", []float64{55.283203125, 55.3271484375, -78.9697265625, -78.92578125}}, + {"p5ycnk9j", []float64{-68.7048912048, -68.7047195435, 144.768218994, 144.768562317}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"zk", []float64{67.5, 73.125, 146.25, 157.5}}, + {"rkq7z", []float64{-20.4345703125, -20.390625, 155.346679688, 155.390625}}, + {"j5wsc3t4", []float64{-69.4689559937, -69.4687843323, 54.2024230957, 54.2027664185}}, + {"56szw", []float64{-74.619140625, -74.5751953125, -26.806640625, -26.7626953125}}, + {"6xp21s7", []float64{-5.60165405273, -5.60028076172, -57.2346496582, -57.2332763672}}, + {"gbmymdgc5x", []float64{47.520198226, 47.5202035904, -2.91706323624, -2.9170525074}}, + {"bmm935md5m", []float64{74.7691994905, 74.769204855, -160.963987112, -160.963976383}}, + {"z2v55d8", []float64{49.7598266602, 49.7611999512, 153.435058594, 153.436431885}}, + {"q232vgb0", []float64{-43.4413146973, -43.4411430359, 103.260498047, 103.26084137}}, + {"b7kf7b5uz4", []float64{63.6775839329, 63.6775892973, -161.900067329, -161.900056601}}, + {"y6", []float64{56.25, 61.875, 101.25, 112.5}}, + {"3xdf2ts4d", []float64{-2.38635063171, -2.38630771637, -108.605260849, -108.605217934}}, + {"0szy1551", []float64{-62.2099113464, -62.2097396851, -146.553497314, -146.553153992}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"kk86s", []float64{-19.248046875, -19.2041015625, 11.77734375, 11.8212890625}}, + {"tqq", []float64{35.15625, 36.5625, 64.6875, 66.09375}}, + {"3gcy97b", []float64{-22.7430725098, -22.7416992188, -98.7341308594, 
-98.7327575684}}, + {"5dnz5z1d", []float64{-77.4807357788, -77.4805641174, -12.8409576416, -12.8406143188}}, + {"eumv1pe", []float64{24.8263549805, 24.8277282715, -3.11599731445, -3.11462402344}}, + {"2kxmbgqtfc3", []float64{-18.6579112709, -18.6579099298, -158.512682766, -158.512681425}}, + {"hc5uf211rpb", []float64{-83.5397829115, -83.5397815704, 39.1239881516, 39.1239894927}}, + {"p4khbd21", []float64{-76.496257782, -76.4960861206, 140.646972656, 140.647315979}}, + {"xuq", []float64{23.90625, 25.3125, 177.1875, 178.59375}}, + {"gn63cf91nhf", []float64{80.47779724, 80.4777985811, -41.7573997378, -41.7573983967}}, + {"5hd1nh3", []float64{-64.4883728027, -64.4869995117, -41.922454834, -41.921081543}}, + {"ustjn5", []float64{71.2078857422, 71.2133789062, 29.794921875, 29.8059082031}}, + {"btv2", []float64{77.34375, 77.51953125, -150.1171875, -149.765625}}, + {"7ds5rkh3u1", []float64{-30.3439325094, -30.343927145, -16.5503883362, -16.5503776073}}, + {"54", []float64{-78.75, -73.125, -45.0, -33.75}}, + {"7sr", []float64{-21.09375, -19.6875, -12.65625, -11.25}}, + {"kr552fbb", []float64{-5.03860473633, -5.03843307495, 15.5027389526, 15.5030822754}}, + {"8uc453nv4qq4", []float64{27.0766978338, 27.0766980015, -144.691553414, -144.691553079}}, + {"6n3m", []float64{-8.96484375, -8.7890625, -88.2421875, -87.890625}}, + {"rptzf8", []float64{-1.4501953125, -1.44470214844, 143.195800781, 143.206787109}}, + {"x63b", []float64{12.65625, 12.83203125, 148.7109375, 149.0625}}, + {"qwj7353", []float64{-10.6608581543, -10.6594848633, 119.928131104, 119.929504395}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"v6", []float64{56.25, 61.875, 56.25, 67.5}}, + {"5mpwtp", []float64{-60.6939697266, -60.6884765625, -22.9833984375, -22.9724121094}}, + {"ukgs", []float64{72.421875, 72.59765625, 16.171875, 16.5234375}}, + {"qxyb4g6qf", []float64{-1.3872385025, -1.38719558716, 122.116212845, 122.11625576}}, + {"qhww7zdb5v", []float64{-18.5476416349, -18.5476362705, 99.3093574047, 
99.3093681335}}, + {"wfwffcw5vd", []float64{14.5547926426, 14.554798007, 133.37151289, 133.371523619}}, + {"rynp", []float64{-10.01953125, -9.84375, 177.1875, 177.5390625}}, + {"fb57ykxryn82", []float64{45.6852641702, 45.6852643378, -51.3948151097, -51.3948147744}}, + {"30sq", []float64{-41.1328125, -40.95703125, -129.0234375, -128.671875}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"e5gjz7", []float64{22.1209716797, 22.1264648438, -40.4626464844, -40.4516601562}}, + {"t6vkbx", []float64{16.3421630859, 16.34765625, 63.6547851562, 63.6657714844}}, + {"3e", []float64{-28.125, -22.5, -112.5, -101.25}}, + {"th", []float64{22.5, 28.125, 45.0, 56.25}}, + {"7j", []float64{-16.875, -11.25, -45.0, -33.75}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"1r15z2z5", []float64{-49.9611854553, -49.9610137939, -122.015533447, -122.015190125}}, + {"k9e57kg2f3", []float64{-35.9649842978, -35.9649789333, 26.866132021, 26.8661427498}}, + {"4w", []float64{-56.25, -50.625, -67.5, -56.25}}, + {"82c582qhsx7", []float64{4.83616903424, 4.83617037535, -167.324326783, -167.324325442}}, + {"qzdzkwwdc", []float64{-1.50190830231, -1.50186538696, 127.823910713, 127.823953629}}, + {"6xn26p6fnr0", []float64{-5.54084837437, -5.54084703326, -58.6190021038, -58.6190007627}}, + {"wm4y0xu8ee", []float64{29.2223614454, 29.2223668098, 105.14549017, 105.145500898}}, + {"rke5rewv", []float64{-19.0961265564, -19.095954895, 150.807609558, 150.807952881}}, + {"0bn3vwcjj", []float64{-89.6544456482, -89.6544027328, -137.217650414, -137.217607498}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"red", []float64{-25.3125, -23.90625, 160.3125, 161.71875}}, + {"r2", []float64{-45.0, -39.375, 146.25, 157.5}}, + {"v7qptt2u5k", []float64{64.6291565895, 64.6291619539, 64.9303686619, 64.9303793907}}, + {"pey", []float64{-68.90625, -67.5, 165.9375, 167.34375}}, + {"r7cg5cz8", []float64{-23.3692932129, -23.3691215515, 148.886032104, 148.886375427}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + 
{"qqr5mxz", []float64{-9.22988891602, -9.228515625, 111.345062256, 111.346435547}}, + {"wuuvdz7x2", []float64{27.7266168594, 27.7266597748, 130.555343628, 130.555386543}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"4mgwkd9yp3r5", []float64{-56.5428471006, -56.542846933, -73.6276473105, -73.6276469752}}, + {"dpk6jbemhyvh", []float64{41.1364542693, 41.1364544369, -83.7660782039, -83.7660778686}}, + {"768py6th4818", []float64{-29.5607757568, -29.5607755892, -33.4683660418, -33.4683657065}}, + {"q8", []float64{-45.0, -39.375, 112.5, 123.75}}, + {"xmgvmq9cf", []float64{33.3026075363, 33.3026504517, 151.756639481, 151.756682396}}, + {"6rhhdy3", []float64{-4.79965209961, -4.79827880859, -73.0027770996, -73.0014038086}}, + {"05dd6myj4xqj", []float64{-69.884508457, -69.8845082894, -176.377142966, -176.377142631}}, + {"yw6", []float64{80.15625, 81.5625, 115.3125, 116.71875}}, + {"6mt6h4jrw", []float64{-13.6986637115, -13.6986207962, -71.1839389801, -71.1838960648}}, + {"86ymqv2s", []float64{16.4211273193, 16.4212989807, -159.663619995, -159.663276672}}, + {"vygq9vr6umc", []float64{84.1406701505, 84.1406714916, 83.4073568881, 83.4073582292}}, + {"89g6sp8p81", []float64{10.3256946802, 10.3257000446, -152.75390625, -152.753895521}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"6st", []float64{-19.6875, -18.28125, -60.46875, -59.0625}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"sk", []float64{22.5, 28.125, 11.25, 22.5}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"f3pf69r61v4", []float64{51.0277444124, 51.0277457535, -67.7316650748, -67.7316637337}}, + {"r9qde0cj", []float64{-37.5243186951, -37.5241470337, 166.773834229, 166.774177551}}, + {"nbm5pqh", []float64{-88.0334472656, -88.0320739746, 131.10534668, 131.106719971}}, + {"u7f9fh2dzpw9", []float64{66.4252256043, 66.425225772, 14.8545113951, 14.8545117304}}, + {"pnzr19j29", []float64{-50.7952022552, -50.7951593399, 145.268483162, 145.268526077}}, + 
{"3e7rf8ww4fe5", []float64{-25.3526548482, -25.3526546806, -107.810775787, -107.810775451}}, + {"1x", []float64{-50.625, -45.0, -112.5, -101.25}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"4h4swejxpzv", []float64{-66.6912616789, -66.6912603378, -86.1908380687, -86.1908367276}}, + {"hy6gnf0u", []float64{-54.3047332764, -54.304561615, 37.9148483276, 37.9151916504}}, + {"xnycujudf", []float64{38.3084249496, 38.308467865, 144.67423439, 144.674277306}}, + {"t6bypmux9z54", []float64{16.5563485399, 16.5563487075, 57.6295499504, 57.6295502856}}, + {"ufw1c6xp2t", []float64{59.3851214647, 59.3851268291, 42.2520661354, 42.2520768642}}, + {"42jpxwe9byn", []float64{-88.6456024647, -88.6456011236, -71.3843134046, -71.3843120635}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"tj0de", []float64{28.564453125, 28.6083984375, 45.8349609375, 45.87890625}}, + {"e0kqd", []float64{2.548828125, 2.5927734375, -38.935546875, -38.8916015625}}, + {"bzysgj0rr", []float64{89.4574213028, 89.4574642181, -136.976895332, -136.976852417}}, + {"bxdp8ycgtcqt", []float64{88.543546591, 88.5435467586, -154.651882276, -154.651881941}}, + {"k0kx2785", []float64{-42.2995948792, -42.2994232178, 6.33911132812, 6.33945465088}}, + {"75ugg", []float64{-23.2470703125, -23.203125, -38.1884765625, -38.14453125}}, + {"sbbsbv2r", []float64{5.08375167847, 5.08392333984, 34.4864273071, 34.4867706299}}, + {"u7vunvq7rn", []float64{66.8263041973, 66.8263095617, 19.6414518356, 19.6414625645}}, + {"w4m7uexcx", []float64{13.3349132538, 13.3349561691, 97.591509819, 97.5915527344}}, + {"350g", []float64{-27.59765625, -27.421875, -133.9453125, -133.59375}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + {"t8w2g1m1", []float64{2.95137405396, 2.95154571533, 76.4277648926, 76.4281082153}}, + {"96k738f", []float64{13.2316589355, 13.2330322266, -117.704772949, -117.703399658}}, + {"c26nv174", []float64{47.5999832153, 47.6001548767, -120.713653564, -120.713310242}}, + {"s67g9ehvds", []float64{13.2889294624, 
13.2889348269, 16.5959858894, 16.5959966183}}, + {"4ybt4sw", []float64{-51.1276245117, -51.1262512207, -55.4287719727, -55.4273986816}}, + {"5jqnz9zqyk12", []float64{-59.2714333534, -59.2714331858, -36.2226838991, -36.2226835638}}, + {"31d5nguy", []float64{-36.0135269165, -36.0133552551, -131.884346008, -131.884002686}}, + {"m4bbcg", []float64{-29.3829345703, -29.3774414062, 46.1315917969, 46.142578125}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"mkx0hzzq", []float64{-19.6438980103, -19.6437263489, 66.3124465942, 66.312789917}}, + {"9bv7x0", []float64{4.833984375, 4.83947753906, -93.5595703125, -93.5485839844}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"0x7pv0083h", []float64{-47.8563809395, -47.8563755751, -153.060793877, -153.060783148}}, + {"ruqzw0ztgw", []float64{-19.7702515125, -19.7702461481, 178.516309261, 178.51631999}}, + {"xpy5c3c", []float64{44.2625427246, 44.2639160156, 143.493804932, 143.495178223}}, + {"bnn6fxqm", []float64{79.2740821838, 79.2742538452, -171.09249115, -171.092147827}}, + {"fnm9u1t", []float64{80.4721069336, 80.4734802246, -82.0829772949, -82.0816040039}}, + {"jfvn54", []float64{-73.4655761719, -73.4600830078, 85.9130859375, 85.9240722656}}, + {"dj", []float64{28.125, 33.75, -90.0, -78.75}}, + {"mxfstk2s", []float64{-0.591201782227, -0.59103012085, 71.2470245361, 71.2473678589}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"4xy4p", []float64{-46.0546875, -46.0107421875, -58.7548828125, -58.7109375}}, + {"cbdg47d", []float64{48.3590698242, 48.3604431152, -97.2811889648, -97.2798156738}}, + {"1ddv9", []float64{-74.970703125, -74.9267578125, -108.588867188, -108.544921875}}, + {"4cn07cd", []float64{-84.3228149414, -84.3214416504, -47.6449584961, -47.6435852051}}, + {"dq8fjtv64n", []float64{36.9460237026, 36.946029067, -77.4463176727, -77.4463069439}}, + {"gx2qc4", []float64{86.9787597656, 86.9842529297, -22.1044921875, -22.0935058594}}, + {"yx", []float64{84.375, 90.0, 112.5, 123.75}}, + {"44", []float64{-78.75, -73.125, 
-90.0, -78.75}}, + {"zz679sbs", []float64{86.4232635498, 86.4234352112, 171.980667114, 171.981010437}}, + {"2fdh2u769u7y", []float64{-30.1666307822, -30.1666306145, -143.399997689, -143.399997354}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"11cxx", []float64{-78.837890625, -78.7939453125, -132.583007812, -132.5390625}}, + {"ygf2", []float64{66.09375, 66.26953125, 126.9140625, 127.265625}}, + {"m0uscc", []float64{-39.9407958984, -39.9353027344, 51.4050292969, 51.416015625}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"kn2m", []float64{-8.96484375, -8.7890625, 0.3515625, 0.703125}}, + {"k56d043m45", []float64{-26.3539534807, -26.3539481163, 3.51742744446, 3.51743817329}}, + {"m7", []float64{-28.125, -22.5, 56.25, 67.5}}, + {"pwe", []float64{-53.4375, -52.03125, 161.71875, 163.125}}, + {"1j7q0hs8u", []float64{-59.3892145157, -59.3891716003, -130.423336029, -130.423293114}}, + {"f264077wzn", []float64{46.776856184, 46.7768615484, -75.9214067459, -75.9213960171}}, + {"2jvn", []float64{-11.6015625, -11.42578125, -172.96875, -172.6171875}}, + {"4d4ghmzzsjb9", []float64{-78.1897520833, -78.1897519156, -63.4352295846, -63.4352292493}}, + {"1h2n6vef0we", []float64{-64.9645265937, -64.9645252526, -134.873975068, -134.873973727}}, + {"vnfc1v2yh", []float64{83.1744003296, 83.1744432449, 48.9452934265, 48.9453363419}}, + {"2b5brk", []float64{-44.9340820312, -44.9285888672, -140.657958984, -140.646972656}}, + {"yntb", []float64{81.5625, 81.73828125, 98.0859375, 98.4375}}, + {"5fj85yy", []float64{-78.7129211426, -78.7115478516, -3.34259033203, -3.34121704102}}, + {"cm36wevqn", []float64{74.9923324585, 74.9923753738, -121.699075699, -121.699032784}}, + {"7hf52n", []float64{-17.6770019531, -17.6715087891, -42.1875, -42.1765136719}}, + {"dh1sz", []float64{23.3349609375, 23.37890625, -87.5830078125, -87.5390625}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"7ny5k0k28", []float64{-6.4585018158, -6.45845890045, -36.3808822632, -36.3808393478}}, + {"w7", []float64{16.875, 
22.5, 101.25, 112.5}}, + {"f9j0296wjz", []float64{50.6768792868, 50.6768846512, -60.443097353, -60.4430866241}}, + {"v316n9gu7y5", []float64{50.9869372845, 50.9869386256, 58.2987718284, 58.2987731695}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"0jt97bxt", []float64{-58.8391685486, -58.8389968872, -172.090530396, -172.090187073}}, + {"cfdbvrc", []float64{59.236907959, 59.23828125, -97.1507263184, -97.1493530273}}, + {"95d", []float64{19.6875, 21.09375, -132.1875, -130.78125}}, + {"my", []float64{-11.25, -5.625, 78.75, 90.0}}, + {"5t", []float64{-61.875, -56.25, -22.5, -11.25}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"k2j", []float64{-45.0, -43.59375, 18.28125, 19.6875}}, + {"rjygh", []float64{-12.12890625, -12.0849609375, 144.66796875, 144.711914062}}, + {"xs79f", []float64{24.2138671875, 24.2578125, 162.509765625, 162.553710938}}, + {"nmz3x305vedq", []float64{-57.3864214495, -57.3864212818, 111.764155068, 111.764155403}}, + {"u1hq", []float64{51.6796875, 51.85546875, 5.9765625, 6.328125}}, + {"2v8jpgs92zxm", []float64{-13.1641120277, -13.1641118601, -145.903202109, -145.903201774}}, + {"f54n04uq", []float64{62.9458236694, 62.9459953308, -87.1816635132, -87.1813201904}}, + {"8zkv13p8", []float64{41.6656494141, 41.6658210754, -139.505081177, -139.504737854}}, + {"z03j6ktm2", []float64{47.354722023, 47.3547649384, 136.512336731, 136.512379646}}, + {"qd266u", []float64{-31.9262695312, -31.9207763672, 112.972412109, 112.983398438}}, + {"783q5", []float64{-42.5390625, -42.4951171875, -20.6103515625, -20.56640625}}, + {"ynqe7fy5s9m6", []float64{80.7432531193, 80.7432532869, 99.3138598278, 99.3138601631}}, + {"pp", []float64{-50.625, -45.0, 135.0, 146.25}}, + {"bd", []float64{56.25, 61.875, -157.5, -146.25}}, + {"8573xzmt2qy", []float64{18.5856847465, 18.5856860876, -175.081539452, -175.081538111}}, + {"gwhbzrftu0", []float64{78.9253950119, 78.9254003763, -15.4981040955, -15.4980933666}}, + {"h42w9e805", 
[]float64{-76.1819458008, -76.1819028854, 0.769171714783, 0.769214630127}}, + {"uzbhd66135d1", []float64{89.397358764, 89.3973589316, 33.8516691327, 33.851669468}}, + {"zf", []float64{56.25, 61.875, 168.75, 180.0}}, + {"6q2r", []float64{-8.61328125, -8.4375, -78.3984375, -78.046875}}, + {"qtxy7v4w9", []float64{-12.9352855682, -12.9352426529, 123.566708565, 123.56675148}}, + {"58", []float64{-90.0, -84.375, -22.5, -11.25}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"8u9qjb5qw", []float64{26.368303299, 26.3683462143, -144.234781265, -144.23473835}}, + {"48sx", []float64{-85.95703125, -85.78125, -61.171875, -60.8203125}}, + {"690tdt6", []float64{-38.3793640137, -38.3779907227, -66.6842651367, -66.6828918457}}, + {"qm8", []float64{-14.0625, -12.65625, 101.25, 102.65625}}, + {"2mj", []float64{-16.875, -15.46875, -161.71875, -160.3125}}, + {"3e5", []float64{-28.125, -26.71875, -108.28125, -106.875}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"f6dg1tndqxy", []float64{59.6177373827, 59.6177387238, -74.8076811433, -74.8076798022}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"9f3eqkhmf", []float64{13.2504987717, 13.250541687, -98.8600444794, -98.860001564}}, + {"yuqghk1x5z", []float64{69.4568055868, 69.4568109512, 133.431175947, 133.431186676}}, + {"w30rynrjmy", []float64{7.02257037163, 7.02257573605, 101.875094175, 101.875104904}}, + {"m25ergh6", []float64{-44.4118881226, -44.4117164612, 61.5182876587, 61.5186309814}}, + {"jqznhq", []float64{-50.9436035156, -50.9381103516, 66.2805175781, 66.2915039062}}, + {"3u4", []float64{-22.5, -21.09375, -98.4375, -97.03125}}, + {"fzr", []float64{85.78125, 87.1875, -46.40625, -45.0}}, + {"je", []float64{-73.125, -67.5, 67.5, 78.75}}, + {"pmztf5q7e7d4", []float64{-56.6270351037, -56.6270349361, 156.893490851, 156.893491186}}, + {"xdknsz", []float64{13.8372802734, 13.8427734375, 163.333740234, 163.344726562}}, + {"736h1c7d5h", []float64{-37.2583937645, -37.2583884001, -30.8556604385, -30.8556497097}}, + {"c57pmmv9u", 
[]float64{64.5875501633, 64.5875930786, -130.542812347, -130.542769432}}, + {"80qnjsh", []float64{2.48291015625, 2.48428344727, -171.315307617, -171.313934326}}, + {"kp", []float64{-5.625, 0.0, 0.0, 11.25}}, + {"u6yufbthbdw", []float64{61.3072863221, 61.3072876632, 20.8699330688, 20.8699344099}}, + {"c4yj93f16ys", []float64{61.4454093575, 61.4454106987, -126.504698396, -126.504697055}}, + {"ygkp", []float64{64.51171875, 64.6875, 129.375, 129.7265625}}, + {"h1ypf", []float64{-78.7939453125, -78.75, 8.525390625, 8.5693359375}}, + {"hz76my5h4qe", []float64{-48.7895616889, -48.7895603478, 38.5772185028, 38.5772198439}}, + {"zh1cw", []float64{67.763671875, 67.8076171875, 137.724609375, 137.768554688}}, + {"00", []float64{-90.0, -84.375, -180.0, -168.75}}, + {"h9be5u0k", []float64{-79.6062469482, -79.6060752869, 23.3682632446, 23.3686065674}}, + {"4btv", []float64{-86.30859375, -86.1328125, -48.1640625, -47.8125}}, + {"42hz7pm83dt", []float64{-88.6857041717, -88.6857028306, -71.9308523834, -71.9308510423}}, + {"qv49", []float64{-16.69921875, -16.5234375, 127.265625, 127.6171875}}, + {"0nwzwd", []float64{-52.1081542969, -52.1026611328, -170.222167969, -170.211181641}}, + {"jhkmc", []float64{-65.0830078125, -65.0390625, 51.0205078125, 51.064453125}}, + {"sysens0py", []float64{37.1131467819, 37.1131896973, 40.3640270233, 40.3640699387}}, + {"5q792kbf", []float64{-54.5975875854, -54.5974159241, -28.8161087036, -28.8157653809}}, + {"624gbe", []float64{-44.3243408203, -44.3188476562, -74.8608398438, -74.8498535156}}, + {"gtkjfqg", []float64{75.5790710449, 75.5804443359, -16.7720031738, -16.7706298828}}, + {"nv4", []float64{-61.875, -60.46875, 126.5625, 127.96875}}, + {"dcwv6uu0", []float64{9.3864440918, 9.38661575317, -46.6314697266, -46.6311264038}}, + {"vvgtyfv9", []float64{78.36977005, 78.3699417114, 83.97605896, 83.9764022827}}, + {"53rjpqr6zt9", []float64{-82.0550099015, -82.0550085604, -23.5773669183, -23.5773655772}}, + {"vmyp4dnemp6", []float64{78.5858018696, 
78.5858032107, 64.8065069318, 64.8065082729}}, + {"t9xqjxjpv", []float64{9.53197002411, 9.53201293945, 77.9440927505, 77.9441356659}}, + {"sby32e", []float64{4.45495605469, 4.46044921875, 42.5610351562, 42.5720214844}}, + {"sjfgy", []float64{33.0029296875, 33.046875, 4.130859375, 4.1748046875}}, + {"k7q0z2b2du", []float64{-26.5826869011, -26.5826815367, 20.0065648556, 20.0065755844}}, + {"nt", []float64{-61.875, -56.25, 112.5, 123.75}}, + {"1", []float64{-90.0, -45.0, -135.0, -90.0}}, + {"mpfpfd32e", []float64{-0.0314998626709, -0.0314569473267, 47.9242086411, 47.9242515564}}, + {"hqjqn", []float64{-55.1953125, -55.1513671875, 18.896484375, 18.9404296875}}, + {"9q7chj6u2uw", []float64{35.3616240621, 35.3616254032, -118.296964467, -118.296963125}}, + {"0wsf47v9c", []float64{-53.0650377274, -53.064994812, -150.713839531, -150.713796616}}, + {"kdv72env8xke", []float64{-28.9424979128, -28.9424977452, 29.9140823632, 29.9140826985}}, + {"trfx", []float64{44.82421875, 45.0, 59.765625, 60.1171875}}, + {"02uttm", []float64{-84.7869873047, -84.7814941406, -162.191162109, -162.180175781}}, + {"hhjgb5s3vv39", []float64{-66.8212655, -66.8212653324, 8.0920227617, 8.09202309698}}, + {"r16", []float64{-37.96875, -36.5625, 137.8125, 139.21875}}, + {"4xy44eer4t3", []float64{-46.0342316329, -46.0342302918, -58.9480648935, -58.9480635524}}, + {"8b", []float64{0.0, 5.625, -146.25, -135.0}}, + {"zd", []float64{56.25, 61.875, 157.5, 168.75}}, + {"z0x", []float64{47.8125, 49.21875, 144.84375, 146.25}}, + {"4967", []float64{-82.44140625, -82.265625, -64.3359375, -63.984375}}, + {"2vf4", []float64{-12.3046875, -12.12890625, -143.4375, -143.0859375}}, + {"tzp3t0rtg", []float64{39.6410322189, 39.6410751343, 89.1754674911, 89.1755104065}}, + {"75yry", []float64{-22.5439453125, -22.5, -35.947265625, -35.9033203125}}, + {"bdgtu", []float64{61.4794921875, 61.5234375, -152.40234375, -152.358398438}}, + {"u1", []float64{50.625, 56.25, 0.0, 11.25}}, + {"rz2bgp7hds", []float64{-4.04629468918, 
-4.04628932476, 169.940750599, 169.940761328}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"psptppx32e", []float64{-66.5796643496, -66.5796589851, 168.364470005, 168.364480734}}, + {"gshfctpwf1", []float64{68.0120283365, 68.0120337009, -15.7440090179, -15.7439982891}}, + {"3yq", []float64{-9.84375, -8.4375, -92.8125, -91.40625}}, + {"685zv36e0fd2", []float64{-43.6303004622, -43.6303002946, -61.9923811778, -61.9923808426}}, + {"7gf5fd", []float64{-23.2360839844, -23.2305908203, -8.32763671875, -8.31665039062}}, + {"bmmzfq", []float64{75.9265136719, 75.9320068359, -160.565185547, -160.554199219}}, + {"m40", []float64{-33.75, -32.34375, 45.0, 46.40625}}, + {"tx45501g7v", []float64{39.9029284716, 39.902933836, 70.4469001293, 70.4469108582}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"ej054jn", []float64{28.6798095703, 28.6811828613, -44.9038696289, -44.9024963379}}, + {"n1g7d", []float64{-79.541015625, -79.4970703125, 94.658203125, 94.7021484375}}, + {"nn6ejehuf", []float64{-54.2991113663, -54.2990684509, 93.7639331818, 93.7639760971}}, + {"qs8e93xwrrc", []float64{-19.0629114211, -19.06291008, 113.268668801, 113.268670142}}, + {"f2", []float64{45.0, 50.625, -78.75, -67.5}}, + {"gm", []float64{73.125, 78.75, -33.75, -22.5}}, + {"npp4rnm", []float64{-50.1951599121, -50.1937866211, 100.158233643, 100.159606934}}, + {"6t", []float64{-16.875, -11.25, -67.5, -56.25}}, + {"2f4fe3d5", []float64{-33.3017921448, -33.3016204834, -142.237243652, -142.23690033}}, + {"s7r00k0196", []float64{18.3034908772, 18.3034962416, 21.1047899723, 21.1048007011}}, + {"st084", []float64{28.125, 28.1689453125, 23.291015625, 23.3349609375}}, + {"p6f3bv9f1", []float64{-74.1930770874, -74.1930341721, 149.449467659, 149.449510574}}, + {"fgk5j", []float64{63.80859375, 63.8525390625, -50.4052734375, -50.361328125}}, + {"yeu22jjtp", []float64{66.1660194397, 66.166062355, 118.484416008, 118.484458923}}, + {"3bcn7gkusn25", []float64{-39.6639578976, -39.6639577299, -99.6722602844, -99.6722599491}}, 
+ {"1", []float64{-90.0, -45.0, -135.0, -90.0}}, + {"6q4dh71sqn", []float64{-10.8811962605, -10.881190896, -75.0452899933, -75.0452792645}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + {"6uy2kbef9yg", []float64{-18.2340927422, -18.2340914011, -47.2469682992, -47.246966958}}, + {"58j", []float64{-90.0, -88.59375, -15.46875, -14.0625}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"mh6x", []float64{-19.86328125, -19.6875, 48.515625, 48.8671875}}, + {"cgq4xrjz", []float64{63.7603569031, 63.7605285645, -92.486000061, -92.4856567383}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"ppqg570sk3n", []float64{-48.6741918325, -48.6741904914, 144.635886848, 144.635888189}}, + {"1suyu1k", []float64{-62.0878601074, -62.0864868164, -105.639038086, -105.637664795}}, + {"xm4xkzsnvux", []float64{29.4417956471, 29.4417969882, 149.980114549, 149.980115891}}, + {"8p3xj", []float64{42.01171875, 42.0556640625, -177.670898438, -177.626953125}}, + {"ef92nkk", []float64{14.0858459473, 14.0872192383, -9.21203613281, -9.2106628418}}, + {"qnrf101", []float64{-9.4921875, -9.49081420898, 100.943756104, 100.945129395}}, + {"2qt", []float64{-8.4375, -7.03125, -161.71875, -160.3125}}, + {"c2q7e", []float64{47.021484375, 47.0654296875, -114.829101562, -114.78515625}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"j6t0bwpjpusg", []float64{-75.7718221284, -75.7718219608, 63.3131746575, 63.3131749928}}, + {"wrq", []float64{40.78125, 42.1875, 109.6875, 111.09375}}, + {"xvf2jk", []float64{32.3657226562, 32.3712158203, 172.144775391, 172.155761719}}, + {"xy0p5y", []float64{35.0134277344, 35.0189208984, 168.914794922, 168.92578125}}, + {"bsh9xbd", []float64{67.766418457, 67.767791748, -150.828552246, -150.827178955}}, + {"g675yc", []float64{58.3209228516, 58.3264160156, -29.2346191406, -29.2236328125}}, + {"dkrnq7", []float64{25.0213623047, 25.0268554688, -68.6315917969, -68.6206054688}}, + {"6q4uk3dyk5", []float64{-10.4936009645, -10.4935956001, -74.6920967102, -74.6920859814}}, + 
{"t58tp1kxb54p", []float64{20.5746203475, 20.5746205151, 46.0169246793, 46.0169250146}}, + {"bbw73yzeuy", []float64{48.4215438366, 48.421549201, -137.373529673, -137.373518944}}, + {"gnq", []float64{80.15625, 81.5625, -36.5625, -35.15625}}, + {"3j", []float64{-16.875, -11.25, -135.0, -123.75}}, + {"7dx2c", []float64{-30.8056640625, -30.76171875, -12.2607421875, -12.216796875}}, + {"vn9", []float64{81.5625, 82.96875, 46.40625, 47.8125}}, + {"4kj", []float64{-67.5, -66.09375, -71.71875, -70.3125}}, + {"cuvj4trg5nb8", []float64{72.6270465553, 72.6270467229, -94.0981142968, -94.0981139615}}, + {"uetmbuswe", []float64{65.7240772247, 65.7241201401, 29.92208004, 29.9221229553}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"f", []float64{45.0, 90.0, -90.0, -45.0}}, + {"jg", []float64{-73.125, -67.5, 78.75, 90.0}}, + {"ycz", []float64{54.84375, 56.25, 133.59375, 135.0}}, + {"pevtd", []float64{-67.939453125, -67.8955078125, 165.322265625, 165.366210938}}, + {"gf7fm3hmb8", []float64{58.0582380295, 58.0582433939, -5.73999166489, -5.73998093605}}, + {"w7zwjnh24qd", []float64{22.1814313531, 22.1814326942, 112.022537291, 112.022538632}}, + {"nfesgy", []float64{-75.0695800781, -75.0640869141, 128.836669922, 128.84765625}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"efq0q0dg0g0n", []float64{12.7034739777, 12.7034741454, -2.5450193882, -2.54501905292}}, + {"kkucr8pk", []float64{-18.060836792, -18.0606651306, 18.2692337036, 18.2695770264}}, + {"5zdg4zvz", []float64{-47.2413825989, -47.2412109375, -7.25406646729, -7.25372314453}}, + {"fuw", []float64{70.3125, 71.71875, -47.8125, -46.40625}}, + {"x51mnftp8", []float64{17.7689266205, 17.7689695358, 137.061309814, 137.06135273}}, + {"y0", []float64{45.0, 50.625, 90.0, 101.25}}, + {"ndufku4sr", []float64{-74.1130399704, -74.1129970551, 119.392161369, 119.392204285}}, + {"ydwndhywhg8", []float64{60.232219398, 60.2322207391, 121.034520864, 121.034522206}}, + {"gj6ehkq0", []float64{75.0819396973, 75.0821113586, -41.2893676758, 
-41.289024353}}, + {"m3hfct0", []float64{-38.8641357422, -38.8627624512, 62.9956054688, 62.9969787598}}, + {"6745yupp70qu", []float64{-27.4426010996, -27.442600932, -75.631118305, -75.6311179698}}, + {"d7b0m9dzj213", []float64{21.1471368559, 21.1471370235, -78.504297249, -78.5042969137}}, + {"py", []float64{-56.25, -50.625, 168.75, 180.0}}, + {"4vhrpypw", []float64{-60.6105422974, -60.610370636, -49.9225616455, -49.9222183228}}, + {"xwyyj1xz5kzw", []float64{39.0329053625, 39.0329055302, 167.222706601, 167.222706936}}, + {"18ht0j2w8ste", []float64{-89.0911141969, -89.0911140293, -106.171159521, -106.171159185}}, + {"vynwqurve", []float64{79.8729228973, 79.8729658127, 88.1980276108, 88.1980705261}}, + {"s77hhn", []float64{19.0173339844, 19.0228271484, 15.64453125, 15.6555175781}}, + {"hj66tgs86", []float64{-60.0100278854, -60.0099849701, 3.42301368713, 3.42305660248}}, + {"e5nh4k", []float64{17.6000976562, 17.6055908203, -36.4636230469, -36.4526367188}}, + {"jk", []float64{-67.5, -61.875, 56.25, 67.5}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"f0p3v5", []float64{45.3240966797, 45.3295898438, -79.5849609375, -79.5739746094}}, + {"numc175r1", []float64{-65.9002876282, -65.9002447128, 131.895375252, 131.895418167}}, + {"7pc", []float64{-1.40625, 0.0, -43.59375, -42.1875}}, + {"b7qw82mfqc4", []float64{64.4255930185, 64.4255943596, -159.590199888, -159.590198547}}, + {"qfe", []float64{-30.9375, -29.53125, 127.96875, 129.375}}, + {"mw9kj6nrue61", []float64{-7.72204069421, -7.72204052657, 69.4973042607, 69.497304596}}, + {"6en5d6psj", []float64{-27.4980926514, -27.498049736, -58.9531087875, -58.9530658722}}, + {"mk80", []float64{-19.6875, -19.51171875, 56.25, 56.6015625}}, + {"d2fbpmpyjv", []float64{4.24727261066, 4.24727797508, -74.5533192158, -74.5533084869}}, + {"pf84wbwguh9", []float64{-75.4946324229, -75.4946310818, 169.056073576, 169.056074917}}, + {"ncj8287", []float64{-84.3296813965, -84.3283081055, 131.510467529, 131.51184082}}, + {"smd4t", 
[]float64{31.376953125, 31.4208984375, 14.2822265625, 14.326171875}}, + {"4ryj3jjxrd", []float64{-45.4546773434, -45.454671979, -70.2606797218, -70.260668993}}, + {"udffsxnn8", []float64{60.9477710724, 60.9478139877, 26.5731811523, 26.5732240677}}, + {"cub7vr", []float64{72.4163818359, 72.421875, -100.667724609, -100.656738281}}, + {"y7c6s4", []float64{66.5441894531, 66.5496826172, 103.18359375, 103.194580078}}, + {"t253", []float64{0.17578125, 0.3515625, 60.8203125, 61.171875}}, + {"1e2bhmk9ybw", []float64{-71.6896077991, -71.6896064579, -111.252067387, -111.252066046}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"99r75", []float64{7.55859375, 7.6025390625, -102.172851562, -102.12890625}}, + {"knbr2kzz1791", []float64{-5.72952283546, -5.72952266783, 0.373246818781, 0.373247154057}}, + {"v8h", []float64{45.0, 46.40625, 73.125, 74.53125}}, + {"sm6xvf3bc", []float64{30.9060430527, 30.906085968, 15.0207567215, 15.0207996368}}, + {"vu", []float64{67.5, 73.125, 78.75, 90.0}}, + {"w56htc6", []float64{19.0791320801, 19.0805053711, 93.0679321289, 93.0693054199}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"5hm57j8", []float64{-65.4922485352, -65.4908752441, -37.8369140625, -37.8355407715}}, + {"k8qwr0e", []float64{-42.4923706055, -42.4909973145, 31.9523620605, 31.9537353516}}, + {"716e0", []float64{-37.44140625, -37.3974609375, -41.484375, -41.4404296875}}, + {"wz71b", []float64{41.0888671875, 41.1328125, 127.96875, 128.012695312}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"89x2", []float64{8.4375, 8.61328125, -147.3046875, -146.953125}}, + {"rcr37p", []float64{-37.7105712891, -37.705078125, 179.077148438, 179.088134766}}, + {"4xzjc7vmstr", []float64{-45.3739361465, -45.3739348054, -57.5939060748, -57.5939047337}}, + {"tv07ndf8", []float64{28.6674499512, 28.6676216125, 79.3906402588, 79.3909835815}}, + {"qb2z", []float64{-42.36328125, -42.1875, 124.8046875, 125.15625}}, + 
{"xjq0fjferr", []float64{29.6952670813, 29.6952724457, 143.529134989, 143.529145718}}, + {"zwn6", []float64{79.1015625, 79.27734375, 166.2890625, 166.640625}}, + {"7xc", []float64{-1.40625, 0.0, -21.09375, -19.6875}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"vw4sy8z", []float64{79.5890808105, 79.5904541016, 71.3108825684, 71.3122558594}}, + {"djg8s3pre", []float64{32.4384212494, 32.4384641647, -84.881272316, -84.8812294006}}, + {"vpn8t", []float64{84.462890625, 84.5068359375, 54.3603515625, 54.404296875}}, + {"1sse8x6", []float64{-64.0324401855, -64.0310668945, -106.147155762, -106.145782471}}, + {"snm4", []float64{35.5078125, 35.68359375, 7.03125, 7.3828125}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"5rb", []float64{-46.40625, -45.0, -33.75, -32.34375}}, + {"q7", []float64{-28.125, -22.5, 101.25, 112.5}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"hn5sw9rbvjk3", []float64{-55.4519608431, -55.4519606754, 5.21838281304, 5.21838314831}}, + {"y0f7w5tep1f", []float64{49.8537348211, 49.8537361622, 93.4355905652, 93.4355919063}}, + {"xts0uk", []float64{31.0913085938, 31.0968017578, 163.311767578, 163.322753906}}, + {"ftybqfey", []float64{77.4024581909, 77.4026298523, -57.7060317993, -57.7056884766}}, + {"eqvhf", []float64{38.8037109375, 38.84765625, -26.630859375, -26.5869140625}}, + {"ctpbp73", []float64{73.1428527832, 73.1442260742, -101.281585693, -101.280212402}}, + {"15czhy35", []float64{-67.6409339905, -67.6407623291, -132.328948975, -132.328605652}}, + {"1", []float64{-90.0, -45.0, -135.0, -90.0}}, + {"vk6mwwrysbf", []float64{69.9084989727, 69.9085003138, 59.7105565667, 59.7105579078}}, + {"4yfcbqvevqy", []float64{-51.6858740151, -51.685872674, -52.3640397191, -52.364038378}}, + {"d6qm1q41g", []float64{13.5684156418, 13.5684585571, -69.9031305313, -69.903087616}}, + {"kjqew1cty", []float64{-14.842915535, -14.8428726196, 9.40661430359, 9.40665721893}}, + {"hf9zn39ewerv", []float64{-74.6981724165, -74.6981722489, 36.4879449829, 36.4879453182}}, 
+ {"1j", []float64{-61.875, -56.25, -135.0, -123.75}}, + {"u41", []float64{56.25, 57.65625, 1.40625, 2.8125}}, + {"pd8sbu5sk", []float64{-75.0798368454, -75.0797939301, 158.241062164, 158.24110508}}, + {"k7", []float64{-28.125, -22.5, 11.25, 22.5}}, + {"fx6xcm", []float64{87.1710205078, 87.1765136719, -63.9294433594, -63.9184570312}}, + {"k1nwc4mun", []float64{-38.1754302979, -38.1753873825, 9.19272422791, 9.19276714325}}, + {"nechx1mg", []float64{-68.1078529358, -68.1076812744, 114.221763611, 114.222106934}}, + {"8et6dbj4g", []float64{20.1274251938, 20.1274681091, -149.98934269, -149.989299774}}, + {"7e", []float64{-28.125, -22.5, -22.5, -11.25}}, + {"vqcthybtw0", []float64{83.885679245, 83.8856846094, 58.5690593719, 58.5690701008}}, + {"r6qdv32n", []float64{-31.8524551392, -31.8522834778, 155.621337891, 155.621681213}}, + {"tbhh", []float64{0.703125, 0.87890625, 84.375, 84.7265625}}, + {"0c5fpu", []float64{-84.0014648438, -83.9959716797, -140.635986328, -140.625}}, + {"7b", []float64{-45.0, -39.375, -11.25, 0.0}}, + {"9vzmkfvug", []float64{33.2825231552, 33.2825660706, -90.8379220963, -90.8378791809}}, + {"68t", []float64{-42.1875, -40.78125, -60.46875, -59.0625}}, + {"ef1szshm45h", []float64{12.1078079939, 12.107809335, -8.80510747433, -8.80510613322}}, + {"21dgj4", []float64{-36.0241699219, -36.0186767578, -175.913085938, -175.902099609}}, + {"109q9yt", []float64{-86.0092163086, -86.0078430176, -133.158416748, -133.157043457}}, + {"nhj3b9vc", []float64{-67.182598114, -67.1824264526, 97.4126815796, 97.4130249023}}, + {"nye5uzgb", []float64{-52.735748291, -52.7355766296, 128.182640076, 128.182983398}}, + {"dhz5f", []float64{27.3779296875, 27.421875, -80.068359375, -80.0244140625}}, + {"g1verehd", []float64{55.4318618774, 55.4320335388, -36.9298553467, -36.9295120239}}, + {"jtr", []float64{-60.46875, -59.0625, 77.34375, 78.75}}, + {"m5nbruj", []float64{-28.0590820312, -28.0577087402, 54.839630127, 54.841003418}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + 
{"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"bm3gr", []float64{75.1025390625, 75.146484375, -165.981445312, -165.9375}}, + {"e7my1m0qp", []float64{19.3644332886, 19.3644762039, -25.6084871292, -25.6084442139}}, + {"fzue", []float64{89.12109375, 89.296875, -49.921875, -49.5703125}}, + {"q70", []float64{-28.125, -26.71875, 101.25, 102.65625}}, + {"sjeed", []float64{31.552734375, 31.5966796875, 5.009765625, 5.0537109375}}, + {"cvsuyyw", []float64{76.8081665039, 76.8095397949, -94.2654418945, -94.2640686035}}, + {"7dnp", []float64{-32.51953125, -32.34375, -14.0625, -13.7109375}}, + {"tf9kr1u", []float64{14.8191833496, 14.8205566406, 80.8209228516, 80.8222961426}}, + {"j38nduwqew", []float64{-80.3940546513, -80.3940492868, 56.3795828819, 56.3795936108}}, + {"444y82r", []float64{-77.606048584, -77.604675293, -86.1122131348, -86.1108398438}}, + {"1rwzsww", []float64{-46.4584350586, -46.4570617676, -114.051818848, -114.050445557}}, + {"98vu", []float64{4.921875, 5.09765625, -104.4140625, -104.0625}}, + {"f0hu79k2y84", []float64{45.7540655136, 45.7540668547, -83.1603857875, -83.1603844464}}, + {"35399zm2gnn", []float64{-26.415091753, -26.4150904119, -132.806374133, -132.806372792}}, + {"qzxy", []float64{-1.7578125, -1.58203125, 134.6484375, 135.0}}, + {"7gpr25", []float64{-26.8341064453, -26.8286132812, -1.0546875, -1.04370117188}}, + {"xucdp", []float64{27.0703125, 27.1142578125, 171.166992188, 171.2109375}}, + {"db3mpnz89zq", []float64{2.32235983014, 2.32236117125, -54.1741874814, -54.1741861403}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + {"94m52", []float64{13.2275390625, 13.271484375, -127.96875, -127.924804688}}, + {"u7ucp", []float64{66.26953125, 66.3134765625, 18.2373046875, 18.28125}}, + {"81qq43p", []float64{8.09143066406, 8.09280395508, -171.10244751, -171.101074219}}, + {"f80w8", []float64{46.142578125, 46.1865234375, -66.796875, -66.7529296875}}, + {"8j5z", []float64{29.35546875, 29.53125, -174.7265625, -174.375}}, + {"56q", []float64{-77.34375, 
-75.9375, -25.3125, -23.90625}}, + {"b72vvhj", []float64{64.3139648438, 64.3153381348, -167.468719482, -167.467346191}}, + {"5j", []float64{-61.875, -56.25, -45.0, -33.75}}, + {"42hm9tj0rn", []float64{-89.0056622028, -89.0056568384, -72.7003526688, -72.7003419399}}, + {"cbxx9q2c89", []float64{49.1654545069, 49.1654598713, -90.6471419334, -90.6471312046}}, + {"43", []float64{-84.375, -78.75, -78.75, -67.5}}, + {"rvmw", []float64{-14.4140625, -14.23828125, 176.484375, 176.8359375}}, + {"jwmeyr4hj7", []float64{-54.1454154253, -54.1454100609, 75.5120050907, 75.5120158195}}, + {"b3y", []float64{54.84375, 56.25, -160.3125, -158.90625}}, + {"4y3n0e7u8j", []float64{-53.7704104185, -53.7704050541, -54.8166275024, -54.8166167736}}, + {"m0k0x", []float64{-43.505859375, -43.4619140625, 50.9326171875, 50.9765625}}, + {"2zc1v219ev8z", []float64{-1.09834464267, -1.09834447503, -144.610815234, -144.610814899}}, + {"3rvj3ezyffez", []float64{-0.46162577346, -0.461625605822, -116.64206598, -116.642065644}}, + {"35bdpq", []float64{-23.5217285156, -23.5162353516, -133.978271484, -133.967285156}}, + {"qdqrzp2e7b", []float64{-30.9410619736, -30.9410566092, 121.597527266, 121.597537994}}, + {"vmrsrejf", []float64{75.2951431274, 75.2953147888, 67.1343612671, 67.1347045898}}, + {"up", []float64{84.375, 90.0, 0.0, 11.25}}, + {"bzy", []float64{88.59375, 90.0, -137.8125, -136.40625}}, + {"3rnm42gs62", []float64{-4.7412443161, -4.74123895168, -114.857157469, -114.85714674}}, + {"yhekty3621c", []float64{71.1382435262, 71.1382448673, 94.8247160017, 94.8247173429}}, + {"ektx", []float64{26.54296875, 26.71875, -26.015625, -25.6640625}}, + {"9nxkb4u9f6", []float64{37.4128782749, 37.4128836393, -124.798411131, -124.798400402}}, + {"fg", []float64{61.875, 67.5, -56.25, -45.0}}, + {"66e4x10k68", []float64{-30.4918241501, -30.4918187857, -74.2231822014, -74.2231714725}}, + {"me", []float64{-28.125, -22.5, 67.5, 78.75}}, + {"r385f5q9", []float64{-35.8852958679, -35.8851242065, 146.346817017, 
146.347160339}}, + {"xbdc8wgn0", []float64{3.11428070068, 3.11432361603, 172.643280029, 172.643322945}}, + {"74s", []float64{-30.9375, -29.53125, -39.375, -37.96875}}, + {"dg8t7", []float64{20.6103515625, 20.654296875, -55.4150390625, -55.37109375}}, + {"nf7", []float64{-77.34375, -75.9375, 127.96875, 129.375}}, + {"6nzfqpxy", []float64{-6.59351348877, -6.59334182739, -78.8272476196, -78.8269042969}}, + {"0ux06p9jktht", []float64{-64.6014270745, -64.6014269069, -136.31678693, -136.316786595}}, + {"nb8pxznpjh", []float64{-85.8294653893, -85.8294600248, 124.099030495, 124.099041224}}, + {"6qks2x97hzm", []float64{-9.05492708087, -9.05492573977, -72.3979751766, -72.3979738355}}, + {"us6qrd43em", []float64{70.0161534548, 70.0161588192, 25.9968817234, 25.9968924522}}, + {"tp2eh", []float64{41.30859375, 41.3525390625, 45.87890625, 45.9228515625}}, + {"vcgf16q2", []float64{55.2076721191, 55.2078437805, 84.0869522095, 84.0872955322}}, + {"qt15nkc82kz", []float64{-16.3214953244, -16.3214939833, 114.182988256, 114.182989597}}, + {"t6t", []float64{14.0625, 15.46875, 63.28125, 64.6875}}, + {"yx53b3kj32", []float64{84.6903848648, 84.6903902292, 117.086845636, 117.086856365}}, + {"twqdxev4cx", []float64{35.6168121099, 35.6168174744, 76.9771456718, 76.9771564007}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + {"4gty084hgddy", []float64{-69.2569826916, -69.2569825239, -48.13918937, -48.1391890347}}, + {"c7", []float64{61.875, 67.5, -123.75, -112.5}}, + {"ywffc641", []float64{83.463306427, 83.4634780884, 116.424865723, 116.425209045}}, + {"k0km", []float64{-42.71484375, -42.5390625, 5.9765625, 6.328125}}, + {"17k4fg", []float64{-71.2188720703, -71.2133789062, -118.004150391, -117.993164062}}, + {"9fr", []float64{12.65625, 14.0625, -91.40625, -90.0}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"h08scsx", []float64{-86.3278198242, -86.3264465332, 0.778656005859, 0.780029296875}}, + {"8f8nq48", []float64{15.1748657227, 15.1762390137, -145.986328125, -145.984954834}}, + 
{"hecr6qx49k0", []float64{-67.59567976, -67.5956784189, 24.3663561344, 24.3663574755}}, + {"jn", []float64{-56.25, -50.625, 45.0, 56.25}}, + {"qwx7j", []float64{-7.91015625, -7.8662109375, 122.915039062, 122.958984375}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"wxcj4", []float64{44.47265625, 44.5166015625, 113.994140625, 114.038085938}}, + {"gw63h", []float64{80.33203125, 80.3759765625, -19.16015625, -19.1162109375}}, + {"hp7b6f27tq6p", []float64{-49.1618095525, -49.1618093848, 5.3948584199, 5.39485875517}}, + {"kd", []float64{-33.75, -28.125, 22.5, 33.75}}, + {"fbweu", []float64{48.4716796875, 48.515625, -46.93359375, -46.8896484375}}, + {"m1fcuue7nm1g", []float64{-34.8233712651, -34.8233710974, 49.080661498, 49.0806618333}}, + {"h9j", []float64{-84.375, -82.96875, 29.53125, 30.9375}}, + {"n9d3g5uv", []float64{-81.2334251404, -81.233253479, 115.80242157, 115.802764893}}, + {"nhpp1spf", []float64{-66.247215271, -66.2470436096, 99.9203109741, 99.9206542969}}, + {"7jg2w13b23h", []float64{-12.5614446402, -12.5614432991, -40.1635962725, -40.1635949314}}, + {"6q4z0ebf3", []float64{-9.99854564667, -9.99850273132, -74.8597669601, -74.8597240448}}, + {"sv9vqgeecr", []float64{31.8802589178, 31.8802642822, 36.5124285221, 36.5124392509}}, + {"wqd4", []float64{36.9140625, 37.08984375, 104.0625, 104.4140625}}, + {"bwqgj", []float64{80.68359375, 80.7275390625, -147.788085938, -147.744140625}}, + {"hk73qx", []float64{-65.8355712891, -65.830078125, 16.1059570312, 16.1169433594}}, + {"gdx1d6hr76v", []float64{59.3384175003, 59.3384188414, -12.5513903797, -12.5513890386}}, + {"47czd", []float64{-67.587890625, -67.5439453125, -76.201171875, -76.1572265625}}, + {"1kpebdk50", []float64{-66.8279457092, -66.8279027939, -113.17565918, -113.175616264}}, + {"g3z7", []float64{55.37109375, 55.546875, -23.5546875, -23.203125}}, + {"8x", []float64{39.375, 45.0, -157.5, -146.25}}, + {"nczvrs", []float64{-79.2114257812, -79.2059326172, 134.978027344, 134.989013672}}, + {"fbyjmwc6", 
[]float64{50.1790237427, 50.1791954041, -47.5690841675, -47.5687408447}}, + {"yhz41tus5wm", []float64{72.1026183665, 72.1026197076, 99.9160046875, 99.9160060287}}, + {"uktff1pp0", []float64{70.8025932312, 70.8026361465, 19.4334411621, 19.4334840775}}, + {"hrt8", []float64{-47.8125, -47.63671875, 18.984375, 19.3359375}}, + {"b5vkzheshz3", []float64{66.9541557133, 66.9541570544, -172.304558605, -172.304557264}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"h6mvgwv1kmp", []float64{-76.2956875563, -76.2956862152, 19.4968043268, 19.4968056679}}, + {"etzq7udz", []float64{33.4683036804, 33.4684753418, -12.1361160278, -12.1357727051}}, + {"rf2x5pd6", []float64{-31.0717391968, -31.0715675354, 169.588050842, 169.588394165}}, + {"k8kquxqbt2", []float64{-42.3673152924, -42.3673099279, 28.6838114262, 28.683822155}}, + {"bz91jncb2c7", []float64{87.4004097283, 87.4004110694, -144.621583968, -144.621582627}}, + {"3uk4y5r6sjwr", []float64{-20.5920389481, -20.5920387805, -95.3511917219, -95.3511913866}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"y95n1hd", []float64{51.7044067383, 51.7057800293, 116.765441895, 116.766815186}}, + {"629k5b3kbkc", []float64{-41.4821608365, -41.4821594954, -76.8256638944, -76.8256625533}}, + {"pp42st7vp5", []float64{-50.5073958635, -50.5073904991, 138.367266655, 138.367277384}}, + {"u17e", []float64{52.55859375, 52.734375, 4.921875, 5.2734375}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"ex5w6znubms", []float64{40.5129298568, 40.5129311979, -17.447989583, -17.4479882419}}, + {"8jsn", []float64{31.9921875, 32.16796875, -174.375, -174.0234375}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"mht9efkj", []float64{-19.410610199, -19.4104385376, 52.9046630859, 52.9050064087}}, + {"kkbcqqfcsz", []float64{-18.0241495371, -18.0241441727, 12.5833261013, 12.5833368301}}, + {"866rppwznm", []float64{13.9291459322, 13.9291512966, -165.268782377, -165.268771648}}, + {"96wj53wczp0c", []float64{14.9499841221, 14.9499842897, -115.160106607, 
-115.160106272}}, + {"9ctzrj", []float64{9.73937988281, 9.74487304688, -92.8564453125, -92.8454589844}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"wfq5hd", []float64{13.1945800781, 13.2000732422, 132.385253906, 132.396240234}}, + {"9y6vu2v8wm5", []float64{36.1712247133, 36.1712260544, -97.1882195771, -97.188218236}}, + {"6xcpg", []float64{-0.0439453125, 0.0, -65.9619140625, -65.91796875}}, + {"rxqgqmc", []float64{-3.61587524414, -3.61450195312, 167.268218994, 167.269592285}}, + {"yye", []float64{81.5625, 82.96875, 127.96875, 129.375}}, + {"r3", []float64{-39.375, -33.75, 146.25, 157.5}}, + {"x7t", []float64{19.6875, 21.09375, 153.28125, 154.6875}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"g9mmh9sku8h", []float64{52.9192113876, 52.9192127287, -14.9133986235, -14.9133972824}}, + {"v2q6qu3", []float64{46.8251037598, 46.8264770508, 65.3370666504, 65.3384399414}}, + {"7j9ckmu00j", []float64{-13.8111609221, -13.8111555576, -42.3468017578, -42.346791029}}, + {"4q3td", []float64{-53.876953125, -53.8330078125, -76.552734375, -76.5087890625}}, + {"9ve2c92z33ke", []float64{31.077454146, 31.0774543136, -96.6126798838, -96.6126795486}}, + {"9sscvm0mw1kw", []float64{25.6485348567, 25.6485350244, -105.58899276, -105.588992424}}, + {"9u", []float64{22.5, 28.125, -101.25, -90.0}}, + {"nv4j7yb8mjjy", []float64{-60.9149988368, -60.9149986692, 126.728203855, 126.728204191}}, + {"80w8", []float64{2.8125, 2.98828125, -170.859375, -170.5078125}}, + {"q06ch78z1", []float64{-43.3975410461, -43.3974981308, 94.0550279617, 94.0550708771}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"gzw", []float64{87.1875, 88.59375, -2.8125, -1.40625}}, + {"1", []float64{-90.0, -45.0, -135.0, -90.0}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"furb", []float64{68.90625, 69.08203125, -45.3515625, -45.0}}, + {"xen8pyc36", []float64{16.9122934341, 16.9123363495, 166.983003616, 166.983046532}}, + {"n1gc29sd", []float64{-79.9279403687, -79.9277687073, 95.3015899658, 95.3019332886}}, + 
{"cjvu", []float64{78.046875, 78.22265625, -126.9140625, -126.5625}}, + {"w7x53f7fb0", []float64{20.2716207504, 20.2716261148, 111.175804138, 111.175814867}}, + {"hz", []float64{-50.625, -45.0, 33.75, 45.0}}, + {"8tz14", []float64{32.51953125, 32.5634765625, -147.568359375, -147.524414062}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"pyfunm", []float64{-51.3006591797, -51.2951660156, 172.891845703, 172.902832031}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"7xre21ceuxv", []float64{-3.63716259599, -3.63716125488, -11.9508652389, -11.9508638978}}, + {"17", []float64{-73.125, -67.5, -123.75, -112.5}}, + {"ru", []float64{-22.5, -16.875, 168.75, 180.0}}, + {"bdjs6y77m", []float64{57.0319604874, 57.0320034027, -149.640097618, -149.640054703}}, + {"u1vsgx2", []float64{55.718536377, 55.719909668, 7.88818359375, 7.88955688477}}, + {"pj5e80uz", []float64{-61.2544441223, -61.2542724609, 139.928398132, 139.928741455}}, + {"ju01xcg9", []float64{-67.2265434265, -67.2263717651, 79.0953826904, 79.0957260132}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"k05kz1", []float64{-44.1595458984, -44.1540527344, 4.8779296875, 4.88891601562}}, + {"ru2h1xc", []float64{-20.3480529785, -20.3466796875, 168.81729126, 168.818664551}}, + {"ud2s6zr", []float64{58.443145752, 58.444519043, 23.3335876465, 23.3349609375}}, + {"mrq5fm1zrqp", []float64{-3.5308277607, -3.53082641959, 64.7891007364, 64.7891020775}}, + {"0y2u0dg2rg", []float64{-54.1254597902, -54.1254544258, -145.168544054, -145.168533325}}, + {"9nkt0rnyx3", []float64{36.0747295618, 36.0747349262, -128.651307821, -128.651297092}}, + {"77q", []float64{-26.71875, -25.3125, -25.3125, -23.90625}}, + {"ng76t7", []float64{-71.2628173828, -71.2573242188, 128.551025391, 128.562011719}}, + {"4ewypr0y27up", []float64{-69.2182661779, -69.2182660103, -57.6881629229, -57.6881625876}}, + {"ge7vkm2x73", []float64{64.2341905832, 64.2341959476, -17.0389688015, -17.0389580727}}, + {"3qm4d", []float64{-9.404296875, -9.3603515625, 
-116.630859375, -116.586914062}}, + {"6gqw9", []float64{-25.576171875, -25.5322265625, -47.0654296875, -47.021484375}}, + {"32", []float64{-45.0, -39.375, -123.75, -112.5}}, + {"ns85", []float64{-64.16015625, -63.984375, 112.5, 112.8515625}}, + {"hzy00b4j5tcj", []float64{-46.4053600095, -46.4053598419, 42.2233571112, 42.2233574465}}, + {"7qrk5nt", []float64{-9.10491943359, -9.10354614258, -23.4159851074, -23.4146118164}}, + {"vd4219t7th", []float64{56.2588620186, 56.258867383, 70.7374048233, 70.7374155521}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"pq1p16t", []float64{-55.0057983398, -55.0044250488, 147.718048096, 147.719421387}}, + {"gryfsc3wgn", []float64{89.0412604809, 89.0412658453, -24.0468835831, -24.0468728542}}, + {"np0u", []float64{-49.921875, -49.74609375, 91.0546875, 91.40625}}, + {"u87", []float64{46.40625, 47.8125, 26.71875, 28.125}}, + {"9qz2", []float64{37.96875, 38.14453125, -113.5546875, -113.203125}}, + {"xunf", []float64{22.8515625, 23.02734375, 178.2421875, 178.59375}}, + {"ve", []float64{61.875, 67.5, 67.5, 78.75}}, + {"s2c1tf2bvp4s", []float64{4.49494846165, 4.49494862929, 12.9101834446, 12.9101837799}}, + {"5znd0f", []float64{-50.2624511719, -50.2569580078, -2.07641601562, -2.0654296875}}, + {"dn90ug", []float64{36.7108154297, 36.7163085938, -88.3850097656, -88.3740234375}}, + {"24bg3", []float64{-28.9599609375, -28.916015625, -178.901367188, -178.857421875}}, + {"x46xpb2", []float64{13.888092041, 13.889465332, 138.856201172, 138.857574463}}, + {"83q", []float64{7.03125, 8.4375, -160.3125, -158.90625}}, + {"2pup20sqj", []float64{-0.128059387207, -0.128016471863, -174.368948936, -174.368906021}}, + {"07", []float64{-73.125, -67.5, -168.75, -157.5}}, + {"jj7nh21g4zk", []float64{-59.4135086238, -59.4135072827, 49.408044219, 49.4080455601}}, + {"19up4rw9", []float64{-78.8844108582, -78.8842391968, -106.767196655, -106.766853333}}, + {"2j", []float64{-16.875, -11.25, -180.0, -168.75}}, + {"14uexty", []float64{-73.8844299316, -73.8830566406, 
-128.33404541, -128.332672119}}, + {"t3wtb", []float64{9.4482421875, 9.4921875, 65.390625, 65.4345703125}}, + {"wv0z", []float64{29.35546875, 29.53125, 124.8046875, 125.15625}}, + {"jj6gcte19u", []float64{-59.7790789604, -59.779073596, 48.9373004436, 48.9373111725}}, + {"xz0wc0te1bbe", []float64{40.5647895299, 40.5647896975, 169.504699185, 169.504699521}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"zyejp1um86c", []float64{82.4519781768, 82.4519795179, 173.282215744, 173.282217085}}, + {"ft915xruxj8n", []float64{76.1539096758, 76.1539098434, -65.9289979935, -65.9289976582}}, + {"vchvx0yp5c", []float64{51.5971237421, 51.5971291065, 85.7457053661, 85.745716095}}, + {"x5x", []float64{19.6875, 21.09375, 144.84375, 146.25}}, + {"0ykju58", []float64{-53.8137817383, -53.8124084473, -140.44921875, -140.447845459}}, + {"d2yk35fd", []float64{4.98676300049, 4.98693466187, -69.91355896, -69.9132156372}}, + {"6ymr7k8", []float64{-8.54461669922, -8.5432434082, -48.7243652344, -48.7229919434}}, + {"pjsxb9f", []float64{-57.6905822754, -57.6892089844, 141.352844238, 141.354217529}}, + {"trydkh27gn2t", []float64{44.0132818557, 44.0132820234, 65.5668789893, 65.5668793246}}, + {"k72c0uqqw2", []float64{-26.5185070038, -26.5185016394, 12.3464977741, 12.346508503}}, + {"s8trjfz", []float64{4.05807495117, 4.05944824219, 30.145111084, 30.146484375}}, + {"57ffkwfnrh1", []float64{-68.4725689888, -68.4725676477, -29.6820102632, -29.6820089221}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"u2k", []float64{46.40625, 47.8125, 16.875, 18.28125}}, + {"nndqt", []float64{-52.294921875, -52.2509765625, 93.3837890625, 93.427734375}}, + {"w7", []float64{16.875, 22.5, 101.25, 112.5}}, + {"r6c7x00p6", []float64{-28.91477108, -28.9147281647, 148.315515518, 148.315558434}}, + {"mdrgz", []float64{-31.6845703125, -31.640625, 78.7060546875, 78.75}}, + {"f1dzhud0j", []float64{54.6926879883, 54.6927309036, -85.9211111069, -85.9210681915}}, + {"yqh", []float64{78.75, 80.15625, 106.875, 108.28125}}, + 
{"9jp43kj", []float64{28.5424804688, 28.5438537598, -125.094451904, -125.093078613}}, + {"14pb4v5q", []float64{-78.7215042114, -78.72133255, -123.976249695, -123.975906372}}, + {"bjzkjzr9hsvc", []float64{78.0868977495, 78.0868979171, -169.54150144, -169.541501105}}, + {"svjbhs9e9g8", []float64{28.1503388286, 28.1503401697, 42.0358264446, 42.0358277857}}, + {"guuxc8c5v", []float64{73.0858182907, 73.0858612061, -4.85436916351, -4.85432624817}}, + {"utu2603h0ru5", []float64{77.3897973262, 77.3897974938, 28.5658425093, 28.5658428445}}, + {"bq", []float64{78.75, 84.375, -168.75, -157.5}}, + {"kk", []float64{-22.5, -16.875, 11.25, 22.5}}, + {"6vxq65mhx", []float64{-12.9452419281, -12.9451990128, -45.9596300125, -45.9595870972}}, + {"f4sb", []float64{59.0625, 59.23828125, -83.3203125, -82.96875}}, + {"y5p4", []float64{62.2265625, 62.40234375, 99.84375, 100.1953125}}, + {"bs6cju", []float64{69.1040039062, 69.1094970703, -153.380126953, -153.369140625}}, + {"5j", []float64{-61.875, -56.25, -45.0, -33.75}}, + {"4e8z0qpsppx", []float64{-69.048345387, -69.0483440459, -66.4237166941, -66.423715353}}, + {"nbyg", []float64{-85.25390625, -85.078125, 133.2421875, 133.59375}}, + {"8jn2dnxvbd", []float64{28.2495939732, 28.2495993376, -171.112382412, -171.112371683}}, + {"0h6ej9g", []float64{-65.5567932129, -65.5554199219, -176.238555908, -176.237182617}}, + {"9j18njf9mc", []float64{28.1568056345, 28.1568109989, -132.623273134, -132.623262405}}, + {"pf93jtqd2xm9", []float64{-75.7324543409, -75.7324541733, 170.758466944, 170.758467279}}, + {"hc2", []float64{-82.96875, -81.5625, 33.75, 35.15625}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"cewhub", []float64{65.5224609375, 65.5279541016, -103.853759766, -103.842773438}}, + {"2vcrfbgsv2sx", []float64{-11.2890061922, -11.2890060246, -144.366300032, -144.366299696}}, + {"mxdsnv", []float64{-2.08190917969, -2.07641601562, 71.3122558594, 71.3232421875}}, + {"03", []float64{-84.375, -78.75, -168.75, -157.5}}, + {"u73kybuxbq", 
[]float64{64.1216933727, 64.1216987371, 13.3106338978, 13.3106446266}}, + {"uz2", []float64{85.78125, 87.1875, 33.75, 35.15625}}, + {"3w", []float64{-11.25, -5.625, -112.5, -101.25}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"q8kttcg9t", []float64{-42.6170825958, -42.6170396805, 119.085831642, 119.085874557}}, + {"j8w8vhmh9d", []float64{-87.0315349102, -87.0315295458, 76.8672823906, 76.8672931194}}, + {"qxn54fn91g", []float64{-5.08648216724, -5.08647680283, 121.067351103, 121.067361832}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"3h2p4vmu", []float64{-19.8337554932, -19.8335838318, -134.871253967, -134.870910645}}, + {"j4re6xcw", []float64{-76.7288589478, -76.7286872864, 55.6587982178, 55.6591415405}}, + {"ugkmrc1vj", []float64{64.2104530334, 64.2104959488, 40.0697565079, 40.0697994232}}, + {"n1mj98dq", []float64{-81.9981765747, -81.9980049133, 97.1002578735, 97.1006011963}}, + {"r6c", []float64{-29.53125, -28.125, 147.65625, 149.0625}}, + {"9uksmr", []float64{24.6917724609, 24.697265625, -94.6911621094, -94.6801757812}}, + {"dmve", []float64{32.87109375, 33.046875, -71.015625, -70.6640625}}, + {"jgrdkvuq", []float64{-71.2906265259, -71.2904548645, 89.5114517212, 89.5117950439}}, + {"94g", []float64{15.46875, 16.875, -130.78125, -129.375}}, + {"2vj4gt", []float64{-16.3641357422, -16.3586425781, -139.064941406, -139.053955078}}, + {"q39q2pm5kxt", []float64{-35.4234436154, -35.4234422743, 103.01487878, 103.014880121}}, + {"qcuy493hpwdf", []float64{-34.0939741954, -34.0939740278, 130.541249625, 130.541249961}}, + {"nhfpjqpyqnv3", []float64{-62.0167130046, -62.0167128369, 93.0541204289, 93.0541207641}}, + {"838kk", []float64{9.1845703125, 9.228515625, -168.22265625, -168.178710938}}, + {"zx2fdg", []float64{86.2371826172, 86.2426757812, 158.675537109, 158.686523438}}, + {"j7ktd1g4", []float64{-70.7419967651, -70.7418251038, 62.670135498, 62.6704788208}}, + {"yzp", []float64{84.375, 85.78125, 133.59375, 135.0}}, + {"76kf7tcsnb94", 
[]float64{-31.9159668311, -31.9159666635, -26.91415295, -26.9141526148}}, + {"9jb", []float64{32.34375, 33.75, -135.0, -133.59375}}, + {"6w", []float64{-11.25, -5.625, -67.5, -56.25}}, + {"f2zs58", []float64{49.921875, 49.9273681641, -68.0493164062, -68.0383300781}}, + {"f0", []float64{45.0, 50.625, -90.0, -78.75}}, + {"mnqum74bk", []float64{-9.08015727997, -9.08011436462, 54.7268486023, 54.7268915176}}, + {"t6rhggbn5", []float64{13.512840271, 13.5128831863, 66.2586736679, 66.2587165833}}, + {"g9q", []float64{52.03125, 53.4375, -14.0625, -12.65625}}, + {"7vr", []float64{-15.46875, -14.0625, -1.40625, 0.0}}, + {"t6sr47h", []float64{15.3094482422, 15.3108215332, 62.3309326172, 62.3323059082}}, + {"076vzrw", []float64{-70.666809082, -70.665435791, -164.555969238, -164.554595947}}, + {"s2", []float64{0.0, 5.625, 11.25, 22.5}}, + {"gd7350xxrrs", []float64{57.8360626101, 57.8360639513, -17.7872353792, -17.7872340381}}, + {"0p8sgbg", []float64{-46.9734191895, -46.9720458984, -179.127960205, -179.126586914}}, + {"5zsn39", []float64{-46.7083740234, -46.7028808594, -5.55908203125, -5.54809570312}}, + {"80f", []float64{4.21875, 5.625, -177.1875, -175.78125}}, + {"cymr4xm05c0", []float64{81.4265495539, 81.426550895, -93.7502968311, -93.75029549}}, + {"qmjmz", []float64{-15.8642578125, -15.8203125, 108.940429688, 108.984375}}, + {"39c0", []float64{-35.15625, -34.98046875, -111.09375, -110.7421875}}, + {"pgxnue0jpfn", []float64{-69.1086280346, -69.1086266935, 178.791844547, 178.791845888}}, + {"nytxjp", []float64{-52.1685791016, -52.1630859375, 131.704101562, 131.715087891}}, + {"q1mvpgze", []float64{-37.0687294006, -37.0685577393, 98.4368133545, 98.4371566772}}, + {"tqjgn4h", []float64{34.2883300781, 34.2897033691, 64.6051025391, 64.6064758301}}, + {"tjn18qrtdk", []float64{28.4239697456, 28.4239751101, 53.4588825703, 53.4588932991}}, + {"qq3w12r", []float64{-8.78768920898, -8.78631591797, 103.423919678, 103.425292969}}, + {"zzdu8g6", []float64{87.9963684082, 87.9977416992, 
172.652893066, 172.654266357}}, + {"mz72xeccms3t", []float64{-4.11002179608, -4.11002162844, 83.6525436491, 83.6525439844}}, + {"s7fnw5nzhbs4", []float64{22.2540122643, 22.2540124319, 14.3356508017, 14.3356511369}}, + {"76rtxb6nkdm", []float64{-31.3744948804, -31.3744935393, -22.8596024215, -22.8596010804}}, + {"znbejer93dr", []float64{83.5141731799, 83.514174521, 135.955197662, 135.955199003}}, + {"smk7u7bz27", []float64{30.212289691, 30.2122950554, 17.4143707752, 17.4143815041}}, + {"yvmurs", []float64{75.3002929688, 75.3057861328, 132.165527344, 132.176513672}}, + {"tnjn6dgd8", []float64{34.8641681671, 34.8642110825, 52.1459197998, 52.1459627151}}, + {"gyee1hnu", []float64{82.1125030518, 82.1126747131, -6.27490997314, -6.27456665039}}, + {"gjwh9n02wfmc", []float64{76.7615726776, 76.7615728453, -36.5179139748, -36.5179136395}}, + {"n6jh51hrnfws", []float64{-78.0401661247, -78.0401659571, 108.41922082, 108.419221155}}, + {"shgszu46pudj", []float64{27.5760518946, 27.5760520622, 5.26587635279, 5.26587668806}}, + {"3c", []float64{-39.375, -33.75, -101.25, -90.0}}, + {"fy", []float64{78.75, 84.375, -56.25, -45.0}}, + {"s75d5tn3m", []float64{17.254242897, 17.2542858124, 16.3344812393, 16.3345241547}}, + {"2c1c9kkw8dk5", []float64{-39.0868538059, -39.0868536383, -143.727924228, -143.727923892}}, + {"5yugurey8e2", []float64{-51.3297383487, -51.3297370076, -4.37837362289, -4.37837228179}}, + {"rd", []float64{-33.75, -28.125, 157.5, 168.75}}, + {"um", []float64{73.125, 78.75, 11.25, 22.5}}, + {"bkgc", []float64{71.89453125, 72.0703125, -163.4765625, -163.125}}, + {"8cfxnrphhu0", []float64{11.1133790016, 11.1133803427, -142.449899912, -142.449898571}}, + {"tdm", []float64{12.65625, 14.0625, 74.53125, 75.9375}}, + {"y9usehucn02q", []float64{55.6610321626, 55.6610323302, 118.966741897, 118.966742232}}, + {"vk7kbfk0vf33", []float64{69.7537115403, 69.7537117079, 60.859013088, 60.8590134233}}, + {"ewyx82", []float64{39.287109375, 39.2926025391, -13.3483886719, -13.3374023438}}, 
+ {"ev8c3", []float64{31.1572265625, 31.201171875, -10.1513671875, -10.107421875}}, + {"37p4fhuc", []float64{-27.6153373718, -27.6151657104, -113.811836243, -113.81149292}}, + {"0yvh2p9wbu", []float64{-51.2418007851, -51.2417954206, -139.216657877, -139.216647148}}, + {"81k", []float64{7.03125, 8.4375, -174.375, -172.96875}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"5c67rs", []float64{-82.3754882812, -82.3699951172, -7.75634765625, -7.74536132812}}, + {"udjvtg", []float64{57.2332763672, 57.2387695312, 30.8386230469, 30.849609375}}, + {"8b56vfqrppu", []float64{0.497001260519, 0.497002601624, -141.418113112, -141.418111771}}, + {"xv50", []float64{28.125, 28.30078125, 172.96875, 173.3203125}}, + {"7ep", []float64{-28.125, -26.71875, -12.65625, -11.25}}, + {"bxzp4746", []float64{89.8410415649, 89.8412132263, -147.554283142, -147.553939819}}, + {"5d54r", []float64{-78.3544921875, -78.310546875, -17.9736328125, -17.9296875}}, + {"hhknsytff", []float64{-64.9149942398, -64.9149513245, 5.8417224884, 5.84176540375}}, + {"gvjjq75", []float64{74.0643310547, 74.0657043457, -3.93997192383, -3.93859863281}}, + {"6ryrt5", []float64{-0.0714111328125, -0.06591796875, -69.7412109375, -69.7302246094}}, + {"tykj1vq1gj", []float64{36.0643225908, 36.0643279552, 84.460272789, 84.4602835178}}, + {"20tdw84b1w", []float64{-41.7480146885, -41.7480093241, -171.976139545, -171.976128817}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"sbwdq1c4355", []float64{3.21802318096, 3.21802452207, 43.1557171047, 43.1557184458}}, + {"rwhkzg", []float64{-10.3985595703, -10.3930664062, 163.817138672, 163.828125}}, + {"wrzphwjw57", []float64{44.8582237959, 44.8582291603, 111.299196482, 111.299207211}}, + {"674", []float64{-28.125, -26.71875, -75.9375, -74.53125}}, + {"z8kb", []float64{46.40625, 46.58203125, 164.1796875, 164.53125}}, + {"pmudq9vs33", []float64{-57.2503942251, -57.2503888607, 152.871376276, 152.871387005}}, + {"j1br4n9", []float64{-78.8900756836, -78.8887023926, 45.440826416, 
45.442199707}}, + {"ccc", []float64{54.84375, 56.25, -99.84375, -98.4375}}, + {"src", []float64{43.59375, 45.0, 12.65625, 14.0625}}, + {"cc51keq", []float64{50.8625793457, 50.8639526367, -96.8252563477, -96.8238830566}}, + {"pr", []float64{-50.625, -45.0, 146.25, 157.5}}, + {"tvd9", []float64{31.11328125, 31.2890625, 82.265625, 82.6171875}}, + {"489bdms44x", []float64{-87.069016099, -87.0690107346, -64.9345850945, -64.9345743656}}, + {"cmn23svsyv", []float64{73.1958800554, 73.1958854198, -114.887176752, -114.887166023}}, + {"dm9vug1194", []float64{31.9649899006, 31.964995265, -76.0789060593, -76.0788953304}}, + {"45f7", []float64{-68.37890625, -68.203125, -86.8359375, -86.484375}}, + {"mxuxkdtgy50", []float64{-0.117443203926, -0.117441862822, 74.0340328217, 74.0340341628}}, + {"tdwd7", []float64{14.4580078125, 14.501953125, 76.7724609375, 76.81640625}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"jss4y", []float64{-64.2041015625, -64.16015625, 73.388671875, 73.4326171875}}, + {"tmcs", []float64{33.046875, 33.22265625, 58.359375, 58.7109375}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"gc61ncnxuec", []float64{52.2138749063, 52.2138762474, -8.13174828887, -8.13174694777}}, + {"jwpc", []float64{-56.07421875, -55.8984375, 78.3984375, 78.75}}, + {"yq5gn", []float64{79.27734375, 79.3212890625, 106.787109375, 106.831054688}}, + {"uhtmex", []float64{71.3177490234, 71.3232421875, 7.53662109375, 7.54760742188}}, + {"n0hbvyc", []float64{-89.8310852051, -89.8297119141, 96.9337463379, 96.9351196289}}, + {"kp8q0gk0h", []float64{-1.7399597168, -1.73991680145, 0.390186309814, 0.390229225159}}, + {"yjzduj46p", []float64{77.8549575806, 77.8550004959, 100.726046562, 100.726089478}}, + {"8hhkjyy4v5s", []float64{23.2406947017, 23.2406960428, -173.762292266, -173.762290925}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"wwu63", []float64{38.3642578125, 38.408203125, 118.520507812, 118.564453125}}, + {"z9rnym", 
[]float64{53.2452392578, 53.2507324219, 167.618408203, 167.629394531}}, + {"78fnfc", []float64{-39.5892333984, -39.5837402344, -19.5666503906, -19.5556640625}}, + {"8dc1mqyer", []float64{15.7261133194, 15.7261562347, -155.85381031, -155.853767395}}, + {"b5", []float64{61.875, 67.5, -180.0, -168.75}}, + {"q3zq3gz6w7", []float64{-34.0365725756, -34.0365672112, 111.532441378, 111.532452106}}, + {"xx6", []float64{40.78125, 42.1875, 160.3125, 161.71875}}, + {"r3", []float64{-39.375, -33.75, 146.25, 157.5}}, + {"dytz0swq74e", []float64{37.8187742829, 37.818775624, -48.1333740056, -48.1333726645}}, + {"gpwu", []float64{87.890625, 88.06640625, -35.5078125, -35.15625}}, + {"9ywdf6cr2w7", []float64{37.0622827113, 37.0622840524, -92.0087559521, -92.008754611}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"ues", []float64{64.6875, 66.09375, 28.125, 29.53125}}, + {"qggvs5q7pr", []float64{-22.9210478067, -22.9210424423, 129.208112955, 129.208123684}}, + {"fy9d8y2mv2", []float64{82.0372724533, 82.0372778177, -54.1070973873, -54.1070866585}}, + {"bxx92bb60s", []float64{87.411711216, 87.4117165804, -146.919801235, -146.919790506}}, + {"uugkmp4jkm0", []float64{72.5052005053, 72.5052018464, 38.5429680347, 38.5429693758}}, + {"7mmd13sd30", []float64{-15.1085615158, -15.1085561514, -25.9544706345, -25.9544599056}}, + {"5nn2", []float64{-56.25, -56.07421875, -36.2109375, -35.859375}}, + {"jf9sz2r", []float64{-75.1011657715, -75.0997924805, 81.1875915527, 81.1889648438}}, + {"3r", []float64{-5.625, 0.0, -123.75, -112.5}}, + {"yw", []float64{78.75, 84.375, 112.5, 123.75}}, + {"yt31y5hwc3c5", []float64{74.8565152846, 74.8565154523, 114.17615667, 114.176157005}}, + {"7vgv9xhmn", []float64{-11.6501426697, -11.6500997543, -5.90455055237, -5.90450763702}}, + {"f10tgm4q6", []float64{51.6642808914, 51.6643238068, -89.1508769989, -89.1508340836}}, + {"hepj84tfcj", []float64{-72.143971324, -72.1439659595, 32.3516893387, 32.3517000675}}, + {"zg", []float64{61.875, 67.5, 168.75, 180.0}}, + 
{"by12", []float64{78.75, 78.92578125, -144.4921875, -144.140625}}, + {"51", []float64{-84.375, -78.75, -45.0, -33.75}}, + {"w44s78ur", []float64{12.0023918152, 12.0025634766, 93.6752700806, 93.6756134033}}, + {"2tcr0", []float64{-11.42578125, -11.3818359375, -155.7421875, -155.698242188}}, + {"p0n", []float64{-90.0, -88.59375, 143.4375, 144.84375}}, + {"u1", []float64{50.625, 56.25, 0.0, 11.25}}, + {"nygu1mshqxku", []float64{-51.2971434742, -51.2971433066, 129.084147625, 129.08414796}}, + {"3khrs6gty5", []float64{-21.1655312777, -21.1655259132, -117.581605911, -117.581595182}}, + {"4n", []float64{-56.25, -50.625, -90.0, -78.75}}, + {"tj8pjxb5", []float64{32.2110557556, 32.211227417, 45.2416992188, 45.2420425415}}, + {"7nhpg3stdjgu", []float64{-9.87847991288, -9.87847974524, -39.225907065, -39.2259067297}}, + {"rhtcg24t2s50", []float64{-19.3789601326, -19.378959965, 143.232218474, 143.232218809}}, + {"5vx7c75x", []float64{-58.3856391907, -58.3854675293, -0.99494934082, -0.994606018066}}, + {"cs4kgc65z", []float64{68.3424711227, 68.3425140381, -109.168095589, -109.168052673}}, + {"k3sn2mb", []float64{-35.4322814941, -35.4309082031, 16.8859863281, 16.8873596191}}, + {"ud7s8v", []float64{58.4747314453, 58.4802246094, 27.4548339844, 27.4658203125}}, + {"8qqg2ue1mr6b", []float64{35.7525117695, 35.7525119372, -159.220504649, -159.220504314}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"kwtg55d3me", []float64{-7.89069950581, -7.89069414139, 30.7210993767, 30.7211101055}}, + {"mqsedtkq3", []float64{-7.79235363007, -7.79231071472, 62.6938676834, 62.6939105988}}, + {"94j6tg", []float64{11.7059326172, 11.7114257812, -127.364501953, -127.353515625}}, + {"wd45s", []float64{11.865234375, 11.9091796875, 115.48828125, 115.532226562}}, + {"fwgxjq0n2", []float64{84.233250618, 84.2332935333, -62.3474121094, -62.347369194}}, + {"1k8fh1", []float64{-64.3304443359, -64.3249511719, -122.51953125, -122.508544922}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"f", []float64{45.0, 
90.0, -90.0, -45.0}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"qh", []float64{-22.5, -16.875, 90.0, 101.25}}, + {"24nszw5", []float64{-32.8820800781, -32.8807067871, -170.525665283, -170.524291992}}, + {"p8rfy", []float64{-88.1103515625, -88.06640625, 168.662109375, 168.706054688}}, + {"st23m", []float64{29.7509765625, 29.794921875, 23.0712890625, 23.115234375}}, + {"zgg3", []float64{66.26953125, 66.4453125, 173.3203125, 173.671875}}, + {"zgsgk7bcz8k", []float64{65.2796901762, 65.2796915174, 175.617812276, 175.617813617}}, + {"js0", []float64{-67.5, -66.09375, 67.5, 68.90625}}, + {"9zs3bh", []float64{42.5170898438, 42.5225830078, -95.2734375, -95.2624511719}}, + {"k8qd0d6mqfz", []float64{-43.2289119065, -43.2289105654, 31.6659866273, 31.6659879684}}, + {"xrj", []float64{39.375, 40.78125, 153.28125, 154.6875}}, + {"0mbxv2r4", []float64{-56.2922286987, -56.2920570374, -167.806549072, -167.80620575}}, + {"bdf8wz8g1", []float64{60.5983543396, 60.5983972549, -153.686671257, -153.686628342}}, + {"emgeh3mkyu", []float64{32.8787970543, 32.8788024187, -28.6338579655, -28.6338472366}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"156p3nxfwq", []float64{-70.4081690311, -70.4081636667, -132.132643461, -132.132632732}}, + {"stduvpcnp4", []float64{31.8160736561, 31.8160790205, 26.5885877609, 26.5885984898}}, + {"shk8evrmmbgr", []float64{24.0238861553, 24.023886323, 6.50312740356, 6.50312773883}}, + {"7ynw", []float64{-10.1953125, -10.01953125, -2.109375, -1.7578125}}, + {"r4zgvkp", []float64{-28.8500976562, -28.8487243652, 146.138763428, 146.140136719}}, + {"5b9p", []float64{-85.95703125, -85.78125, -9.84375, -9.4921875}}, + {"gsp", []float64{67.5, 68.90625, -12.65625, -11.25}}, + {"ek4mmvr", []float64{23.4516906738, 23.4530639648, -30.323638916, -30.322265625}}, + {"hd2nu", []float64{-76.1572265625, -76.11328125, 22.67578125, 22.7197265625}}, + {"xzp0t", []float64{39.462890625, 39.5068359375, 178.813476562, 178.857421875}}, + {"fycjx9", []float64{83.9410400391, 
83.9465332031, -54.5141601562, -54.5031738281}}, + {"x4ygy62x5", []float64{16.1414909363, 16.1415338516, 144.767661095, 144.76770401}}, + {"37", []float64{-28.125, -22.5, -123.75, -112.5}}, + {"m40", []float64{-33.75, -32.34375, 45.0, 46.40625}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"b5", []float64{61.875, 67.5, -180.0, -168.75}}, + {"zwd", []float64{81.5625, 82.96875, 160.3125, 161.71875}}, + {"qgnb2g", []float64{-28.0645751953, -28.0590820312, 133.275146484, 133.286132812}}, + {"7w", []float64{-11.25, -5.625, -22.5, -11.25}}, + {"v6x", []float64{59.0625, 60.46875, 66.09375, 67.5}}, + {"018v8j6y", []float64{-80.5658340454, -80.565662384, -178.94153595, -178.941192627}}, + {"mm4p02h18xc", []float64{-15.6442321837, -15.6442308426, 59.079002291, 59.0790036321}}, + {"6cty1v7", []float64{-35.4789733887, -35.4776000977, -48.0830383301, -48.0816650391}}, + {"g6rzs", []float64{58.974609375, 59.0185546875, -22.67578125, -22.6318359375}}, + {"58qb", []float64{-88.59375, -88.41796875, -13.0078125, -12.65625}}, + {"n8v", []float64{-85.78125, -84.375, 119.53125, 120.9375}}, + {"h1nj4b1uv", []float64{-83.4952783585, -83.4952354431, 8.56096744537, 8.56101036072}}, + {"qt4ukphd", []float64{-16.0891342163, -16.0889625549, 116.54914856, 116.549491882}}, + {"n2", []float64{-90.0, -84.375, 101.25, 112.5}}, + {"50sux01hejpj", []float64{-86.3956842385, -86.3956840709, -38.0111838877, -38.0111835524}}, + {"4exy", []float64{-69.2578125, -69.08203125, -56.6015625, -56.25}}, + {"x9t8r", []float64{8.4814453125, 8.525390625, 165.541992188, 165.5859375}}, + {"785m2wrxgw2", []float64{-44.0414522588, -44.0414509177, -17.8972649574, -17.8972636163}}, + {"0exegdddqf", []float64{-69.6391904354, -69.639185071, -146.7955935, -146.795582771}}, + {"ynrw", []float64{81.2109375, 81.38671875, 100.546875, 100.8984375}}, + {"uqdmuzrz6", []float64{82.6143121719, 82.6143550873, 14.6335315704, 14.6335744858}}, + {"gchp3yu15c", []float64{51.9366699457, 51.9366753101, -5.54244160652, -5.54243087769}}, 
+ {"415qzwpj2r", []float64{-83.154578805, -83.1545734406, -85.0904738903, -85.0904631615}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"vzq07z4", []float64{85.8636474609, 85.865020752, 87.3550415039, 87.3564147949}}, + {"w43s", []float64{13.359375, 13.53515625, 92.109375, 92.4609375}}, + {"b7td", []float64{65.0390625, 65.21484375, -161.015625, -160.6640625}}, + {"2fpe", []float64{-33.22265625, -33.046875, -135.703125, -135.3515625}}, + {"nyt", []float64{-53.4375, -52.03125, 130.78125, 132.1875}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"dkzk", []float64{27.421875, 27.59765625, -68.5546875, -68.203125}}, + {"r3925m207w", []float64{-36.5335857868, -36.5335804224, 148.150784969, 148.150795698}}, + {"0zy5gndwbzt3", []float64{-45.710165631, -45.7101654634, -137.677191608, -137.677191272}}, + {"46ex1dmu9", []float64{-74.6938991547, -74.6938562393, -73.7542676926, -73.7542247772}}, + {"jf74h9rrvnz8", []float64{-76.9839544594, -76.9839542918, 83.1766849011, 83.1766852364}}, + {"6vj1jtgv", []float64{-16.6667747498, -16.6666030884, -48.9719009399, -48.9715576172}}, + {"ntsx", []float64{-57.83203125, -57.65625, 118.828125, 119.1796875}}, + {"ehr3ejb", []float64{24.2015075684, 24.2028808594, -34.6728515625, -34.6714782715}}, + {"p7", []float64{-73.125, -67.5, 146.25, 157.5}}, + {"re7", []float64{-26.71875, -25.3125, 161.71875, 163.125}}, + {"66x6j7sc0t", []float64{-30.5665129423, -30.5665075779, -68.3174300194, -68.3174192905}}, + {"mywcjb2q", []float64{-8.25931549072, -8.25914382935, 88.4952163696, 88.4955596924}}, + {"f88jrw", []float64{48.7683105469, 48.7738037109, -67.1704101562, -67.1594238281}}, + {"bjty3ef9n3y6", []float64{77.0569135621, 77.0569137298, -171.844434701, -171.844434366}}, + {"jhz66rh4fj7s", []float64{-62.8467891365, -62.8467889689, 55.2997731417, 55.299773477}}, + {"u16r3nm", []float64{53.3399963379, 53.3413696289, 3.21487426758, 3.21624755859}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + 
{"q5rwnj4", []float64{-25.6365966797, -25.6352233887, 100.813293457, 100.814666748}}, + {"dsqd4vc85c", []float64{24.2894035578, 24.2894089222, -58.2363045216, -58.2362937927}}, + {"bzpu7z2qxf9", []float64{85.1630249619, 85.1630263031, -135.18609032, -135.186088979}}, + {"e805cvqt", []float64{0.688877105713, 0.68904876709, -22.4141693115, -22.4138259888}}, + {"y9su01vg", []float64{54.1507530212, 54.1509246826, 119.187583923, 119.187927246}}, + {"41", []float64{-84.375, -78.75, -90.0, -78.75}}, + {"vu3f", []float64{69.2578125, 69.43359375, 81.2109375, 81.5625}}, + {"86", []float64{11.25, 16.875, -168.75, -157.5}}, + {"wpuksnn65", []float64{44.4180679321, 44.4181108475, 96.1610555649, 96.1610984802}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"nrqk483fq0kp", []float64{-48.5138629563, -48.5138627887, 110.151591897, 110.151592232}}, + {"cg0mc", []float64{62.8857421875, 62.9296875, -100.854492188, -100.810546875}}, + {"myt75g", []float64{-7.89367675781, -7.88818359375, 86.2976074219, 86.30859375}}, + {"ugxe27b", []float64{65.2793884277, 65.2807617188, 44.3078613281, 44.3092346191}}, + {"wd845dtbhs8", []float64{14.42781955, 14.4278208911, 112.661898136, 112.661899477}}, + {"c8ef9h6mr", []float64{48.2762002945, 48.2762432098, -107.179226875, -107.17918396}}, + {"bx", []float64{84.375, 90.0, -157.5, -146.25}}, + {"qduv3", []float64{-28.6083984375, -28.564453125, 119.223632812, 119.267578125}}, + {"j86depxm", []float64{-88.1122398376, -88.1120681763, 71.1574172974, 71.1577606201}}, + {"4semjs5hr9p", []float64{-63.7858861685, -63.7858848274, -62.6835371554, -62.6835358143}}, + {"ee", []float64{16.875, 22.5, -22.5, -11.25}}, + {"jxv25m96n", []float64{-46.3756942749, -46.3756513596, 75.0276088715, 75.0276517868}}, + {"vx7gj0006xwe", []float64{86.3086774014, 86.308677569, 72.993280068, 72.9932804033}}, + {"6c", []float64{-39.375, -33.75, -56.25, -45.0}}, + {"ukstfgt78f", []float64{71.3430798054, 71.3430851698, 17.7062165737, 17.7062273026}}, + {"g1h0ye", 
[]float64{50.7733154297, 50.7788085938, -39.0893554688, -39.0783691406}}, + {"5s3j5er", []float64{-65.1969909668, -65.1956176758, -20.9303283691, -20.9289550781}}, + {"6yrnnwq1", []float64{-8.75455856323, -8.75438690186, -46.1123657227, -46.1120223999}}, + {"20fjy", []float64{-39.7705078125, -39.7265625, -176.923828125, -176.879882812}}, + {"1tt417q", []float64{-58.6930847168, -58.6917114258, -105.405578613, -105.404205322}}, + {"hh68", []float64{-66.09375, -65.91796875, 3.515625, 3.8671875}}, + {"85s823r7j", []float64{19.7388267517, 19.7388696671, -173.650717735, -173.65067482}}, + {"3u6n", []float64{-20.0390625, -19.86328125, -98.4375, -98.0859375}}, + {"7w5y8", []float64{-10.107421875, -10.0634765625, -17.2265625, -17.1826171875}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"zk8j", []float64{71.19140625, 71.3671875, 146.25, 146.6015625}}, + {"mgm1evw0", []float64{-26.4248657227, -26.4246940613, 85.954284668, 85.9546279907}}, + {"m7nzv", []float64{-26.7626953125, -26.71875, 65.9619140625, 66.005859375}}, + {"ev89re48", []float64{31.1737060547, 31.1738777161, -10.2138519287, -10.213508606}}, + {"dnc7mrxvb3", []float64{38.5822302103, 38.5822355747, -88.0008208752, -88.0008101463}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"wbksmz281", []float64{2.19314575195, 2.1931886673, 130.331540108, 130.331583023}}, + {"0x3zqt", []float64{-47.9168701172, -47.9113769531, -154.753417969, -154.742431641}}, + {"h97q", []float64{-81.9140625, -81.73828125, 27.0703125, 27.421875}}, + {"ypjt38wtc", []float64{85.3015851974, 85.3016281128, 97.8092622757, 97.809305191}}, + {"d3ey054b7p", []float64{9.50874745846, 9.50875282288, -73.4726572037, -73.4726464748}}, + {"zpbkps3qd9r", []float64{89.3213434517, 89.3213447928, 135.682985634, 135.682986975}}, + {"sqxhhhs68mxy", []float64{37.2908039019, 37.2908040695, 21.2753888592, 21.2753891945}}, + {"293cyj4g6q", []float64{-37.6330769062, -37.6330715418, -154.771517515, -154.771506786}}, + {"w3", []float64{5.625, 11.25, 101.25, 
112.5}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"q69s", []float64{-30.234375, -30.05859375, 103.359375, 103.7109375}}, + {"fy1qerq2ven", []float64{79.9325484037, 79.9325497448, -54.3405380845, -54.3405367434}}, + {"62", []float64{-45.0, -39.375, -78.75, -67.5}}, + {"yke1jyek0fg", []float64{70.5246882141, 70.5246895552, 105.725934952, 105.725936294}}, + {"2rcbq1z5c", []float64{-1.35204792023, -1.35200500488, -166.015734673, -166.015691757}}, + {"gue", []float64{70.3125, 71.71875, -7.03125, -5.625}}, + {"t8kqv", []float64{2.5927734375, 2.63671875, 73.6962890625, 73.740234375}}, + {"bgtecc1b", []float64{65.3521728516, 65.3523445129, -138.436317444, -138.435974121}}, + {"8s550", []float64{23.02734375, 23.0712890625, -153.28125, -153.237304688}}, + {"j655kpm1w0b", []float64{-78.1386239827, -78.1386226416, 60.6516551971, 60.6516565382}}, + {"980h", []float64{0.703125, 0.87890625, -112.5, -112.1484375}}, + {"ssywj", []float64{27.7734375, 27.8173828125, 31.8603515625, 31.904296875}}, + {"hrvu", []float64{-45.703125, -45.52734375, 19.3359375, 19.6875}}, + {"3ftuv", []float64{-30.1025390625, -30.05859375, -92.9443359375, -92.900390625}}, + {"zcphg93jux", []float64{51.4678519964, 51.4678573608, 178.749125004, 178.749135733}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"vjxk", []float64{76.640625, 76.81640625, 55.1953125, 55.546875}}, + {"t6cgynpnh4", []float64{16.161929369, 16.1619347334, 58.9843940735, 58.9844048023}}, + {"jspc", []float64{-67.32421875, -67.1484375, 78.3984375, 78.75}}, + {"6w7k2v", []float64{-9.06921386719, -9.06372070312, -62.8967285156, -62.8857421875}}, + {"n6z4", []float64{-74.1796875, -74.00390625, 111.09375, 111.4453125}}, + {"y507hx0", []float64{62.4407958984, 62.4421691895, 90.5493164062, 90.5506896973}}, + {"z17p35rnsc6v", []float64{53.3246401884, 53.324640356, 139.272515886, 139.272516221}}, + {"p8uj5760", []float64{-84.8844909668, -84.8843193054, 163.270568848, 163.27091217}}, + {"8pcszdrxwp", []float64{44.4423955679, 
44.4424009323, -177.550477982, -177.550467253}}, + {"mckjc7q4r", []float64{-36.9397687912, -36.9397258759, 84.4384717941, 84.4385147095}}, + {"p6sngm70", []float64{-74.7221374512, -74.7219657898, 152.021942139, 152.022285461}}, + {"59gcptbk", []float64{-79.9481964111, -79.9480247498, -16.8966293335, -16.8962860107}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"fmnv3ug5m", []float64{74.0745019913, 74.0745449066, -69.1765737534, -69.176530838}}, + {"qc8kkbc0xev", []float64{-35.8112038672, -35.8112025261, 124.312004596, 124.312005937}}, + {"b4", []float64{56.25, 61.875, -180.0, -168.75}}, + {"vkxeh", []float64{70.83984375, 70.8837890625, 66.97265625, 67.0166015625}}, + {"399qcn2zv", []float64{-35.3403139114, -35.3402709961, -110.696997643, -110.696954727}}, + {"ybz3", []float64{49.39453125, 49.5703125, 133.9453125, 134.296875}}, + {"d5e25", []float64{19.6875, 19.7314453125, -85.2978515625, -85.25390625}}, + {"5n", []float64{-56.25, -50.625, -45.0, -33.75}}, + {"wdw", []float64{14.0625, 15.46875, 120.9375, 122.34375}}, + {"29q7", []float64{-37.44140625, -37.265625, -148.7109375, -148.359375}}, + {"tqe1y4trw", []float64{36.885137558, 36.8851804733, 60.7398891449, 60.7399320602}}, + {"zfwy172d", []float64{60.135383606, 60.1355552673, 178.297805786, 178.298149109}}, + {"tfd57f", []float64{14.6447753906, 14.6502685547, 81.7272949219, 81.73828125}}, + {"27f6s6h", []float64{-23.4558105469, -23.4544372559, -165.393676758, -165.392303467}}, + {"zk2q2ph7db", []float64{70.0439357758, 70.0439411402, 146.607517004, 146.607527733}}, + {"ptp", []float64{-61.875, -60.46875, 167.34375, 168.75}}, + {"7pcgp042wz", []float64{-0.878782868385, -0.878777503967, -42.2280657291, -42.2280550003}}, + {"9t", []float64{28.125, 33.75, -112.5, -101.25}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"qd0pwby4y", []float64{-32.4270486832, -32.4270057678, 112.805128098, 112.805171013}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"yet", []float64{64.6875, 66.09375, 119.53125, 
120.9375}}, + {"v4pbsh7cp", []float64{56.3614082336, 56.361451149, 56.0796689987, 56.0797119141}}, + {"kvr7u7pm7jeu", []float64{-14.7921594232, -14.7921592556, 44.1421702132, 44.1421705484}}, + {"687jj", []float64{-42.71484375, -42.6708984375, -63.0615234375, -63.017578125}}, + {"4w29", []float64{-54.66796875, -54.4921875, -66.796875, -66.4453125}}, + {"6bz45dve", []float64{-40.4140663147, -40.4138946533, -46.2448883057, -46.2445449829}}, + {"gfysykk0nw29", []float64{61.32709058, 61.3270907477, -1.82894401252, -1.82894367725}}, + {"pdw4scw", []float64{-75.4898071289, -75.4884338379, 166.15447998, 166.155853271}}, + {"65f4", []float64{-23.5546875, -23.37890625, -87.1875, -86.8359375}}, + {"jc9g6tpd08j", []float64{-80.9634017944, -80.9634004533, 81.3311286271, 81.3311299682}}, + {"ckw1tvf18r09", []float64{70.608052779, 70.6080529466, -115.057056472, -115.057056136}}, + {"2cbtp", []float64{-34.27734375, -34.2333984375, -145.239257812, -145.1953125}}, + {"54myf", []float64{-76.1572265625, -76.11328125, -36.826171875, -36.7822265625}}, + {"kw", []float64{-11.25, -5.625, 22.5, 33.75}}, + {"mw", []float64{-11.25, -5.625, 67.5, 78.75}}, + {"2bjvee8sgek", []float64{-44.0131442249, -44.0131428838, -138.009411693, -138.009410352}}, + {"yj6r4eyxuv1", []float64{75.783675313, 75.7836766541, 93.2830573618, 93.2830587029}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"pkec4ng", []float64{-64.4746398926, -64.4732666016, 151.615447998, 151.616821289}}, + {"uvpynzd0v", []float64{74.2210149765, 74.2210578918, 44.9480295181, 44.9480724335}}, + {"grxrc9by8g4", []float64{88.5605496168, 88.5605509579, -23.4877046943, -23.4877033532}}, + {"6z0", []float64{-5.625, -4.21875, -56.25, -54.84375}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"6npe6", []float64{-10.6787109375, -10.634765625, -79.365234375, -79.3212890625}}, + {"04wzmgz23", []float64{-74.6424436569, -74.6424007416, -170.245127678, -170.245084763}}, + {"dmfzk9", []float64{33.6236572266, 33.6291503906, -74.6850585938, 
-74.6740722656}}, + {"eeu", []float64{21.09375, 22.5, -16.875, -15.46875}}, + {"84bd9k3t", []float64{15.9324073792, 15.9325790405, -179.239883423, -179.2395401}}, + {"7q6ywwxq2kc", []float64{-8.664367944, -8.6643666029, -29.5871995389, -29.5871981978}}, + {"bve6jyuwk", []float64{76.327214241, 76.3272571564, -141.420650482, -141.420607567}}, + {"h558", []float64{-73.125, -72.94921875, 4.921875, 5.2734375}}, + {"sk4f3d2h1vq", []float64{22.9085822403, 22.9085835814, 15.1831886172, 15.1831899583}}, + {"tffnvn", []float64{16.6882324219, 16.6937255859, 81.7822265625, 81.7932128906}}, + {"eppvmm886m", []float64{40.3281337023, 40.3281390667, -33.8700664043, -33.8700556755}}, + {"d2w", []float64{2.8125, 4.21875, -70.3125, -68.90625}}, + {"sd3", []float64{12.65625, 14.0625, 23.90625, 25.3125}}, + {"q73kk", []float64{-25.9716796875, -25.927734375, 103.18359375, 103.227539062}}, + {"wtz5yx", []float64{33.0413818359, 33.046875, 122.629394531, 122.640380859}}, + {"xw8z", []float64{37.79296875, 37.96875, 158.5546875, 158.90625}}, + {"dyhuchkt7qr", []float64{34.6092416346, 34.6092429757, -49.5200385153, -49.5200371742}}, + {"pyb1es47f", []float64{-51.7449617386, -51.7449188232, 168.906984329, 168.907027245}}, + {"v6jcgfj1nus", []float64{56.5687993169, 56.568800658, 64.5078939199, 64.507895261}}, + {"xxe", []float64{42.1875, 43.59375, 161.71875, 163.125}}, + {"7xdvcext7rq", []float64{-1.78159162402, -1.78159028292, -18.5564473271, -18.556445986}}, + {"1f35b7w", []float64{-76.6653442383, -76.6639709473, -99.8245239258, -99.8231506348}}, + {"he675b8pjek", []float64{-71.187440604, -71.1874392629, 25.8290988207, 25.8291001618}}, + {"pzj3", []float64{-50.44921875, -50.2734375, 176.1328125, 176.484375}}, + {"1s1m8j10zwq3", []float64{-66.5055748634, -66.5055746958, -110.740483962, -110.740483627}}, + {"sk4xsg4ufxm", []float64{23.8356931508, 23.8356944919, 14.9782557786, 14.9782571197}}, + {"r3m2ker83", []float64{-37.906908989, -37.9068660736, 153.840909004, 153.84095192}}, + {"p7bj", 
[]float64{-68.02734375, -67.8515625, 146.25, 146.6015625}}, + {"4wws597u", []float64{-52.7268218994, -52.726650238, -58.2004165649, -58.2000732422}}, + {"u9rk", []float64{52.734375, 52.91015625, 32.6953125, 33.046875}}, + {"2mv856bp9uj4", []float64{-12.6398345456, -12.6398343779, -160.872720927, -160.872720592}}, + {"57th4543xnbs", []float64{-69.5926011354, -69.5926009677, -26.6274683923, -26.627468057}}, + {"8pct", []float64{44.47265625, 44.6484375, -177.890625, -177.5390625}}, + {"me0skjqbk8", []float64{-27.3490476608, -27.3490422964, 68.3883690834, 68.3883798122}}, + {"30s74", []float64{-41.66015625, -41.6162109375, -128.935546875, -128.891601562}}, + {"9r0h9fedn", []float64{40.1800918579, 40.1801347733, -123.668031693, -123.667988777}}, + {"9v6gvte7r", []float64{30.2211999893, 30.2212429047, -97.136349678, -97.1363067627}}, + {"j5d", []float64{-70.3125, -68.90625, 47.8125, 49.21875}}, + {"hf8ge", []float64{-75.322265625, -75.2783203125, 34.9365234375, 34.98046875}}, + {"hf2hmz", []float64{-76.5582275391, -76.552734375, 34.0026855469, 34.013671875}}, + {"5wc405", []float64{-51.6632080078, -51.6577148438, -21.09375, -21.0827636719}}, + {"x2dk49y7p8", []float64{3.52575302124, 3.52575838566, 149.532830715, 149.532841444}}, + {"350jjfux7dbk", []float64{-27.2297275811, -27.2297274135, -134.740984105, -134.740983769}}, + {"04wd", []float64{-75.5859375, -75.41015625, -170.859375, -170.5078125}}, + {"1zxmhstk", []float64{-46.9081878662, -46.9080162048, -90.8497238159, -90.8493804932}}, + {"b2sj26ddce3", []float64{48.7495739758, 48.7495753169, -163.11051473, -163.110513389}}, + {"vznd8mz363", []float64{84.8462587595, 84.8462641239, 87.9116642475, 87.9116749763}}, + {"f", []float64{45.0, 90.0, -90.0, -45.0}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"kw", []float64{-11.25, -5.625, 22.5, 33.75}}, + {"1s0nq579", []float64{-66.3833427429, -66.3831710815, -112.231521606, -112.231178284}}, + {"mzky2scd", []float64{-3.09368133545, -3.09350967407, 85.4537200928, 
85.4540634155}}, + {"kctpzwx2rxg", []float64{-35.1644052565, -35.1644039154, 41.121122092, 41.1211234331}}, + {"19", []float64{-84.375, -78.75, -112.5, -101.25}}, + {"pmg4wc8pn", []float64{-57.2073554993, -57.2073125839, 150.765638351, 150.765681267}}, + {"sxcn0yd0jck", []float64{44.6841497719, 44.684151113, 23.9422076941, 23.9422090352}}, + {"000dsj4g", []float64{-89.5325660706, -89.5323944092, -179.1173172, -179.116973877}}, + {"pgv0", []float64{-68.90625, -68.73046875, 175.78125, 176.1328125}}, + {"dhw0935d8", []float64{25.4063129425, 25.4063558578, -81.5027618408, -81.5027189255}}, + {"4gvz08kbk9", []float64{-67.6743596792, -67.6743543148, -48.1353735924, -48.1353628635}}, + {"djz4g1m", []float64{32.8340148926, 32.8353881836, -80.0175476074, -80.0161743164}}, + {"x4yhn1h5m1z", []float64{16.1779354513, 16.1779367924, 143.706889004, 143.706890345}}, + {"9t", []float64{28.125, 33.75, -112.5, -101.25}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"4fgytpvp2v", []float64{-73.3448284864, -73.344823122, -50.7499372959, -50.7499265671}}, + {"jggxx", []float64{-67.587890625, -67.5439453125, 83.9794921875, 84.0234375}}, + {"mye8t1", []float64{-8.34411621094, -8.33862304688, 83.8916015625, 83.9025878906}}, + {"bssun7dstj", []float64{71.0356503725, 71.0356557369, -150.542006493, -150.541995764}}, + {"ehhv", []float64{23.37890625, 23.5546875, -38.3203125, -37.96875}}, + {"484tc", []float64{-88.9892578125, -88.9453125, -63.9404296875, -63.896484375}}, + {"d4tjtv", []float64{15.0567626953, 15.0622558594, -82.7160644531, -82.705078125}}, + {"2z", []float64{-5.625, 0.0, -146.25, -135.0}}, + {"t5ey32", []float64{20.7861328125, 20.7916259766, 50.3283691406, 50.3393554688}}, + {"um", []float64{73.125, 78.75, 11.25, 22.5}}, + {"pe", []float64{-73.125, -67.5, 157.5, 168.75}}, + {"2x05zw6z", []float64{-4.93028640747, -4.93011474609, -157.166633606, -157.166290283}}, + {"67", []float64{-28.125, -22.5, -78.75, -67.5}}, + {"dxfr3yndexbg", []float64{44.9015942775, 44.9015944451, 
-64.249955602, -64.2499552667}}, + {"y87", []float64{46.40625, 47.8125, 116.71875, 118.125}}, + {"2h29w", []float64{-20.830078125, -20.7861328125, -179.033203125, -178.989257812}}, + {"cv", []float64{73.125, 78.75, -101.25, -90.0}}, + {"jcx2m1mjy9e", []float64{-81.5106931329, -81.5106917918, 89.1721884906, 89.1721898317}}, + {"ryj9w", []float64{-10.986328125, -10.9423828125, 176.748046875, 176.791992188}}, + {"g5sftxrhf40", []float64{65.1676046848, 65.1676060259, -38.0689144135, -38.0689130723}}, + {"nhs95z98z", []float64{-64.4703912735, -64.4703483582, 96.4952802658, 96.4953231812}}, + {"s7n00se8u3n", []float64{16.8998533487, 16.8998546898, 19.7144696116, 19.7144709527}}, + {"4310r6m", []float64{-84.3186950684, -84.3173217773, -77.0182800293, -77.0169067383}}, + {"7kkmp", []float64{-20.21484375, -20.1708984375, -27.4658203125, -27.421875}}, + {"3dvnpf13665", []float64{-28.4653508663, -28.4653495252, -105.126356632, -105.12635529}}, + {"1cv2hegqd1s7", []float64{-80.1345262863, -80.1345261186, -93.6648788676, -93.6648785323}}, + {"uy0yttxwk65", []float64{79.9238741398, 79.9238754809, 35.0568728149, 35.056874156}}, + {"166cs2etxffy", []float64{-77.0763716474, -77.0763714798, -119.690902121, -119.690901786}}, + {"bm7n7", []float64{75.6298828125, 75.673828125, -164.399414062, -164.35546875}}, + {"5r7q", []float64{-48.1640625, -47.98828125, -29.1796875, -28.828125}}, + {"ymjqjv6", []float64{74.2085266113, 74.2098999023, 108.888244629, 108.88961792}}, + {"jx95kpvmv9", []float64{-47.1976464987, -47.1976411343, 69.0894770622, 69.0894877911}}, + {"f1m4m9", []float64{52.4322509766, 52.4377441406, -82.7270507812, -82.7160644531}}, + {"0cdr1wfep", []float64{-80.2944374084, -80.2943944931, -143.016285896, -143.016242981}}, + {"fe4q2v7", []float64{63.0024719238, 63.0038452148, -64.2988586426, -64.2974853516}}, + {"9pxfyb9p", []float64{42.6748466492, 42.6750183105, -123.80355835, -123.803215027}}, + {"8w088r", []float64{33.8763427734, 33.8818359375, -156.785888672, 
-156.774902344}}, + {"u569fj", []float64{63.6163330078, 63.6218261719, 3.603515625, 3.61450195312}}, + {"smvm0syj0kst", []float64{33.2496320643, 33.2496322319, 18.6630416662, 18.6630420014}}, + {"vx", []float64{84.375, 90.0, 67.5, 78.75}}, + {"zj", []float64{73.125, 78.75, 135.0, 146.25}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"69n85w0", []float64{-39.3420410156, -39.3406677246, -58.2055664062, -58.2041931152}}, + {"ywmj", []float64{81.03515625, 81.2109375, 119.53125, 119.8828125}}, + {"2717c52t", []float64{-27.4471092224, -27.446937561, -166.947555542, -166.947212219}}, + {"t1h", []float64{5.625, 7.03125, 50.625, 52.03125}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"5xphd2", []float64{-49.833984375, -49.8284912109, -12.5573730469, -12.5463867188}}, + {"xy8", []float64{36.5625, 37.96875, 168.75, 170.15625}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"3pet007v7qb", []float64{-1.93128302693, -1.93128168583, -130.072835684, -130.072834343}}, + {"dyuz", []float64{39.19921875, 39.375, -49.5703125, -49.21875}}, + {"6r", []float64{-5.625, 0.0, -78.75, -67.5}}, + {"y8v06s5", []float64{49.2846679688, 49.2860412598, 119.645233154, 119.646606445}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"mfuy2m", []float64{-28.4051513672, -28.3996582031, 85.4406738281, 85.4516601562}}, + {"dnp2u", []float64{33.8818359375, 33.92578125, -79.62890625, -79.5849609375}}, + {"g4u3ehx6", []float64{60.757484436, 60.7576560974, -38.8816452026, -38.8813018799}}, + {"q14h9tg88tx", []float64{-38.5522833467, -38.5522820055, 92.8832553327, 92.8832566738}}, + {"d4m6rn", []float64{13.0847167969, 13.0902099609, -82.3095703125, -82.2985839844}}, + {"64", []float64{-33.75, -28.125, -90.0, -78.75}}, + {"y2nzb", []float64{46.3623046875, 46.40625, 110.7421875, 110.786132812}}, + {"yhs5m8ytcgxb", []float64{70.8889147639, 70.8889149316, 95.8757111058, 95.875711441}}, + {"ytp9j8f0dv8", []float64{73.305016458, 73.3050177991, 
123.291438818, 123.291440159}}, + {"pxcpm2h", []float64{-45.1318359375, -45.1304626465, 159.142456055, 159.143829346}}, + {"5zyf9", []float64{-45.966796875, -45.9228515625, -1.7138671875, -1.669921875}}, + {"wmz7v2", []float64{33.0029296875, 33.0084228516, 111.676025391, 111.687011719}}, + {"3fb7hkc8wus", []float64{-28.9777037501, -28.977702409, -100.709314942, -100.709313601}}, + {"ssstmsrv", []float64{26.2595558167, 26.259727478, 29.0804672241, 29.0808105469}}, + {"21", []float64{-39.375, -33.75, -180.0, -168.75}}, + {"w0du7r8257", []float64{3.60078513622, 3.60079050064, 94.0104925632, 94.0105032921}}, + {"fhx", []float64{70.3125, 71.71875, -80.15625, -78.75}}, + {"1xd", []float64{-47.8125, -46.40625, -109.6875, -108.28125}}, + {"s040p9v3shb2", []float64{0.00989601016045, 0.00989617779851, 3.14947161824, 3.14947195351}}, + {"gq", []float64{78.75, 84.375, -33.75, -22.5}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"17b4wsgdq0q", []float64{-68.4403167665, -68.4403154254, -123.459283412, -123.45928207}}, + {"x3qs", []float64{7.734375, 7.91015625, 155.390625, 155.7421875}}, + {"rc", []float64{-39.375, -33.75, 168.75, 180.0}}, + {"sfxjv06b9", []float64{15.0747013092, 15.0747442245, 43.8172960281, 43.8173389435}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"ngveyg", []float64{-68.2305908203, -68.2250976562, 131.781005859, 131.791992188}}, + {"pmxd5bd39qp", []float64{-58.7079012394, -58.7078998983, 156.964822859, 156.9648242}}, + {"xhw6", []float64{25.6640625, 25.83984375, 143.7890625, 144.140625}}, + {"bc6vx6", []float64{53.0090332031, 53.0145263672, -142.064208984, -142.053222656}}, + {"uuxy5sbu", []float64{71.3939666748, 71.3941383362, 44.803276062, 44.8036193848}}, + {"yu", []float64{67.5, 73.125, 123.75, 135.0}}, + {"610pf9c3ebh", []float64{-38.0028247833, -38.0028234422, -89.888253808, -89.8882524669}}, + {"9d", []float64{11.25, 16.875, -112.5, -101.25}}, + {"1s0tk4yp", []float64{-66.5608406067, -66.5606689453, -111.612854004, -111.612510681}}, + 
{"ss7yjqhs5su", []float64{24.9946086109, 24.994609952, 28.0104857683, 28.0104871094}}, + {"krqww1d0kp", []float64{-3.06785166264, -3.06784629822, 20.6572151184, 20.6572258472}}, + {"4yzg18c07qnm", []float64{-51.4997104369, -51.4997102693, -45.2841233835, -45.2841230482}}, + {"5wf241", []float64{-52.0257568359, -52.0202636719, -19.248046875, -19.2370605469}}, + {"gxrh60ce", []float64{86.5329551697, 86.5331268311, -12.5662994385, -12.5659561157}}, + {"5rzrgr9qf", []float64{-45.0015878677, -45.0015449524, -23.4100627899, -23.4100198746}}, + {"u3", []float64{50.625, 56.25, 11.25, 22.5}}, + {"8tm4uz7c", []float64{30.0546455383, 30.0548171997, -150.254859924, -150.254516602}}, + {"wz5f3", []float64{39.7705078125, 39.814453125, 129.067382812, 129.111328125}}, + {"ckz", []float64{71.71875, 73.125, -113.90625, -112.5}}, + {"h4s31", []float64{-75.76171875, -75.7177734375, 6.0205078125, 6.064453125}}, + {"hguje", []float64{-67.939453125, -67.8955078125, 39.5068359375, 39.55078125}}, + {"4rnfer", []float64{-50.1470947266, -50.1416015625, -69.1149902344, -69.1040039062}}, + {"gj", []float64{73.125, 78.75, -45.0, -33.75}}, + {"04", []float64{-78.75, -73.125, -180.0, -168.75}}, + {"kvj", []float64{-16.875, -15.46875, 40.78125, 42.1875}}, + {"7c3p", []float64{-36.73828125, -36.5625, -9.84375, -9.4921875}}, + {"rdw55", []float64{-30.41015625, -30.3662109375, 166.069335938, 166.11328125}}, + {"7spe18wk094b", []float64{-21.969217658, -21.9692174904, -11.8785988167, -11.8785984814}}, + {"uxumm", []float64{89.5166015625, 89.560546875, 28.6962890625, 28.740234375}}, + {"1n0sh0", []float64{-55.546875, -55.5413818359, -134.12109375, -134.110107422}}, + {"cphmy", []float64{85.3857421875, 85.4296875, -128.759765625, -128.715820312}}, + {"sd", []float64{11.25, 16.875, 22.5, 33.75}}, + {"h6jbb2gxyd0", []float64{-78.6127030849, -78.6127017438, 19.3520092964, 19.3520106375}}, + {"x3", []float64{5.625, 11.25, 146.25, 157.5}}, + {"yv289c0nbs", []float64{74.625813961, 74.6258193254, 124.530050755, 
124.530061483}}, + {"g1fxbp5", []float64{56.2445068359, 56.245880127, -41.480255127, -41.4788818359}}, + {"bqdh", []float64{82.265625, 82.44140625, -165.9375, -165.5859375}}, + {"w558neznn3", []float64{16.8966346979, 16.8966400623, 95.2174007893, 95.2174115181}}, + {"up", []float64{84.375, 90.0, 0.0, 11.25}}, + {"pmy73pm2r", []float64{-57.0450925827, -57.0450496674, 155.090517998, 155.090560913}}, + {"jzerkufsjnh", []float64{-46.5112745762, -46.5112732351, 83.5327059031, 83.5327072442}}, + {"8eks", []float64{18.984375, 19.16015625, -151.171875, -150.8203125}}, + {"3rryyvfxc", []float64{-2.99931049347, -2.99926757812, -112.551455498, -112.551412582}}, + {"vn0cz", []float64{79.0576171875, 79.1015625, 46.3623046875, 46.40625}}, + {"skb5cnez", []float64{27.4148368835, 27.4150085449, 11.2990951538, 11.2994384766}}, + {"3p5cu0yju", []float64{-5.31227588654, -5.31223297119, -129.542369843, -129.542326927}}, + {"8yrwss1y2s", []float64{36.3218951225, 36.3219004869, -135.502946377, -135.502935648}}, + {"7x9kys4ydp", []float64{-1.95441305637, -1.95440769196, -20.4526805878, -20.4526698589}}, + {"u7gscks3", []float64{66.9536018372, 66.9537734985, 16.2326431274, 16.2329864502}}, + {"30c8pz7", []float64{-40.7414245605, -40.7400512695, -132.545928955, -132.544555664}}, + {"umertwj6wxuc", []float64{77.2892892547, 77.2892894223, 16.0695068166, 16.0695071518}}, + {"n6", []float64{-78.75, -73.125, 101.25, 112.5}}, + {"br8dqrhg058f", []float64{87.6219940558, 87.6219942234, -167.765692659, -167.765692323}}, + {"tp", []float64{39.375, 45.0, 45.0, 56.25}}, + {"33uy", []float64{-34.1015625, -33.92578125, -117.0703125, -116.71875}}, + {"u3ps1jnxg", []float64{51.356921196, 51.3569641113, 21.8498754501, 21.8499183655}}, + {"nqwmqymbyz", []float64{-52.4801498652, -52.4801445007, 110.343879461, 110.34389019}}, + {"wfugcgd", []float64{16.1471557617, 16.1485290527, 130.509338379, 130.51071167}}, + {"vp7g2eg", []float64{86.3731384277, 86.3745117188, 50.2995300293, 50.3009033203}}, + {"zz", 
[]float64{84.375, 90.0, 168.75, 180.0}}, + {"859t0xmm6gwk", []float64{20.6071523577, 20.6071525253, -177.861316167, -177.861315832}}, + {"bvjnf49wp1hu", []float64{74.3262923509, 74.3262925185, -139.128492661, -139.128492326}}, + {"08ybs", []float64{-85.693359375, -85.6494140625, -147.83203125, -147.788085938}}, + {"dr908p7", []float64{42.3152160645, 42.3165893555, -77.339630127, -77.3382568359}}, + {"gufpbsu", []float64{73.1071472168, 73.1085205078, -8.41003417969, -8.40866088867}}, + {"623c388eg2t", []float64{-43.3706304431, -43.370629102, -76.2223117054, -76.2223103642}}, + {"8rc", []float64{43.59375, 45.0, -167.34375, -165.9375}}, + {"h4", []float64{-78.75, -73.125, 0.0, 11.25}}, + {"x84", []float64{0.0, 1.40625, 160.3125, 161.71875}}, + {"bbxgcx6nyzk", []float64{48.5127027333, 48.5127040744, -135.282602906, -135.282601565}}, + {"tqg06239nrht", []float64{38.014278654, 38.0142788216, 60.5699611455, 60.5699614808}}, + {"05fd", []float64{-68.5546875, -68.37890625, -176.484375, -176.1328125}}, + {"wuc", []float64{26.71875, 28.125, 125.15625, 126.5625}}, + {"m3hh", []float64{-38.671875, -38.49609375, 61.875, 62.2265625}}, + {"m2w4ru", []float64{-41.7700195312, -41.7645263672, 65.0280761719, 65.0390625}}, + {"4cz3k", []float64{-79.9365234375, -79.892578125, -45.87890625, -45.8349609375}}, + {"jqefz9v", []float64{-52.9444885254, -52.9431152344, 61.8598937988, 61.8612670898}}, + {"qqcnf60wjf", []float64{-5.83269953728, -5.83269417286, 102.756060362, 102.756071091}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"n8qen549x9vr", []float64{-88.0496587045, -88.0496585369, 121.908059008, 121.908059344}}, + {"g01nwm2wyj", []float64{46.1726027727, 46.1726081371, -43.3181476593, -43.3181369305}}, + {"6xc", []float64{-1.40625, 0.0, -66.09375, -64.6875}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"nmjjnhz67", []float64{-60.9696149826, -60.9695720673, 108.555006981, 108.555049896}}, + {"2gzwv1zggqg", []float64{-22.7094335854, -22.7094322443, -135.472611934, 
-135.472610593}}, + {"dt", []float64{28.125, 33.75, -67.5, -56.25}}, + {"v33ysf2y0s", []float64{53.1872391701, 53.1872445345, 58.9207291603, 58.9207398891}}, + {"fwy6tccjp3", []float64{83.4186798334, 83.4186851978, -58.4565675259, -58.456556797}}, + {"qug6fjx4ue8k", []float64{-17.7671476454, -17.7671474777, 128.418009616, 128.418009952}}, + {"8y59duq21", []float64{34.0370178223, 34.0370607376, -141.198649406, -141.198606491}}, + {"pxgfrpkhdnk", []float64{-45.9701107442, -45.9701094031, 163.086639047, 163.086640388}}, + {"91b", []float64{9.84375, 11.25, -135.0, -133.59375}}, + {"v83mnvugmh", []float64{47.3173213005, 47.3173266649, 69.5611810684, 69.5611917973}}, + {"k76cem9", []float64{-26.4248657227, -26.4234924316, 15.2613830566, 15.2627563477}}, + {"ydjq0mk", []float64{57.3335266113, 57.3348999023, 119.899291992, 119.900665283}}, + {"cdsuf", []float64{59.8974609375, 59.94140625, -105.732421875, -105.688476562}}, + {"tuyz3", []float64{27.9931640625, 28.037109375, 88.2861328125, 88.330078125}}, + {"r7r2skwfj00", []float64{-26.605796814, -26.6057954729, 156.641564369, 156.64156571}}, + {"8hx507p9f5x", []float64{25.8566424251, 25.8566437662, -170.134868771, -170.13486743}}, + {"cc", []float64{50.625, 56.25, -101.25, -90.0}}, + {"pkuuvxz1z3", []float64{-62.4034112692, -62.4034059048, 153.181310892, 153.181321621}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"7vvz9gbf", []float64{-11.316947937, -11.3167762756, -3.08612823486, -3.08578491211}}, + {"54r", []float64{-77.34375, -75.9375, -35.15625, -33.75}}, + {"5r0mm0pwj1j", []float64{-49.7011131048, -49.7011117637, -33.1681899726, -33.1681886315}}, + {"mx", []float64{-5.625, 0.0, 67.5, 78.75}}, + {"rm39jwc", []float64{-15.2558898926, -15.2545166016, 148.60244751, 148.603820801}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"4gqrkyvqc", []float64{-70.4060983658, -70.4060554504, -47.2449445724, -47.2449016571}}, + {"9s36btfr9", []float64{24.4225215912, 24.4225645065, 
-110.717082024, -110.717039108}}, + {"dxt5", []float64{42.71484375, 42.890625, -60.46875, -60.1171875}}, + {"21x8tven9", []float64{-36.4432811737, -36.4432382584, -169.196276665, -169.196233749}}, + {"7hh", []float64{-22.5, -21.09375, -39.375, -37.96875}}, + {"v7nm8q7s", []float64{62.8768157959, 62.8769874573, 65.0548553467, 65.0551986694}}, + {"e9qxh", []float64{8.26171875, 8.3056640625, -13.18359375, -13.1396484375}}, + {"273", []float64{-26.71875, -25.3125, -167.34375, -165.9375}}, + {"p99qpkcvb", []float64{-80.4807329178, -80.4806900024, 159.578819275, 159.57886219}}, + {"q2062f9csybw", []float64{-44.5904645696, -44.590464402, 101.637129262, 101.637129597}}, + {"btsq32", []float64{77.0361328125, 77.0416259766, -151.468505859, -151.457519531}}, + {"1cj5", []float64{-83.84765625, -83.671875, -94.21875, -93.8671875}}, + {"j71pemzwqxt9", []float64{-71.7739416473, -71.7739414796, 57.8096582741, 57.8096586093}}, + {"qnqcbndu0nf", []float64{-9.49970439076, -9.49970304966, 99.4959667325, 99.4959680736}}, + {"2v7cm6junqb", []float64{-15.237314254, -15.2373129129, -140.737684965, -140.737683624}}, + {"e7", []float64{16.875, 22.5, -33.75, -22.5}}, + {"s91p45", []float64{6.87194824219, 6.87744140625, 23.994140625, 24.0051269531}}, + {"xwk1de36", []float64{35.438117981, 35.4382896423, 163.236579895, 163.236923218}}, + {"gf2", []float64{57.65625, 59.0625, -11.25, -9.84375}}, + {"d1m", []float64{7.03125, 8.4375, -82.96875, -81.5625}}, + {"0b", []float64{-90.0, -84.375, -146.25, -135.0}}, + {"rw4", []float64{-11.25, -9.84375, 160.3125, 161.71875}}, + {"74pk9xsb", []float64{-32.9177856445, -32.9176139832, -34.7322463989, -34.7319030762}}, + {"4ghe0cn8s", []float64{-72.5920772552, -72.5920343399, -49.8798179626, -49.8797750473}}, + {"tw4", []float64{33.75, 35.15625, 70.3125, 71.71875}}, + {"gevx3", []float64{67.3681640625, 67.412109375, -14.7216796875, -14.677734375}}, + {"kw8w6yqc1ks", []float64{-7.30433911085, -7.30433776975, 23.3333033323, 23.3333046734}}, + {"vbdjwzpg7", 
[]float64{48.8183069229, 48.8183498383, 81.8699026108, 81.8699455261}}, + {"tp", []float64{39.375, 45.0, 45.0, 56.25}}, + {"751q", []float64{-27.0703125, -26.89453125, -43.2421875, -42.890625}}, + {"yeurzu", []float64{67.4780273438, 67.4835205078, 118.817138672, 118.828125}}, + {"svx", []float64{30.9375, 32.34375, 43.59375, 45.0}}, + {"4yrg1w", []float64{-54.2834472656, -54.2779541016, -45.2856445312, -45.2746582031}}, + {"h6xmxenmzkc", []float64{-74.9532110989, -74.9532097578, 21.7837978899, 21.7837992311}}, + {"j5uzmqughwj1", []float64{-67.5942097418, -67.5942095742, 51.9171233475, 51.9171236828}}, + {"26trngxu", []float64{-29.6871185303, -29.6869468689, -161.059913635, -161.059570312}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"h58jxs7g", []float64{-69.3218421936, -69.3216705322, 0.334739685059, 0.335083007812}}, + {"tg2kgbk3", []float64{19.1177558899, 19.1179275513, 79.2721939087, 79.2725372314}}, + {"x34hfu", []float64{6.48193359375, 6.48742675781, 149.183349609, 149.194335938}}, + {"774j", []float64{-27.24609375, -27.0703125, -30.9375, -30.5859375}}, + {"yf", []float64{56.25, 61.875, 123.75, 135.0}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"e273k4gje9s", []float64{1.64203494787, 1.64203628898, -28.9996308088, -28.9996294677}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + {"hx", []float64{-50.625, -45.0, 22.5, 33.75}}, + {"wz", []float64{39.375, 45.0, 123.75, 135.0}}, + {"w7p94b", []float64{17.05078125, 17.0562744141, 111.917724609, 111.928710938}}, + {"69sgzyb46pbs", []float64{-35.8658129722, -35.8658128045, -60.4796498269, -60.4796494916}}, + {"7f7m8k2u5", []float64{-31.3529205322, -31.3528776169, -6.66754245758, -6.66749954224}}, + {"gq4n7m", []float64{79.8760986328, 79.8815917969, -30.7946777344, -30.7836914062}}, + {"c1srg1pz8kt", []float64{54.8066094518, 54.8066107929, -128.880941123, -128.880939782}}, + {"b2h1dy", []float64{45.2966308594, 45.3021240234, -163.004150391, -162.993164062}}, + {"7170zy", []float64{-37.8039550781, 
-37.7984619141, -40.4406738281, -40.4296875}}, + {"p09", []float64{-87.1875, -85.78125, 136.40625, 137.8125}}, + {"sw970ux6", []float64{37.114906311, 37.1150779724, 24.3007278442, 24.301071167}}, + {"rc8hsdptqn", []float64{-35.7595646381, -35.7595592737, 168.958311081, 168.95832181}}, + {"qp3jujqc1", []float64{-3.17899703979, -3.17895412445, 91.5913438797, 91.591386795}}, + {"dn7n9krj2tgg", []float64{36.3231066428, 36.3231068105, -85.7166788355, -85.7166785002}}, + {"23e7vwt", []float64{-35.8676147461, -35.8662414551, -163.931121826, -163.929748535}}, + {"um", []float64{73.125, 78.75, 11.25, 22.5}}, + {"p8ws1", []float64{-86.484375, -86.4404296875, 166.684570312, 166.728515625}}, + {"vqt63tm6xx", []float64{81.9873136282, 81.9873189926, 63.7062621117, 63.7062728405}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"bm20d", []float64{74.619140625, 74.6630859375, -168.662109375, -168.618164062}}, + {"p3t4k9c0", []float64{-81.1573791504, -81.157207489, 153.480377197, 153.48072052}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"8rsnr3k0", []float64{43.2929992676, 43.293170929, -162.80090332, -162.800559998}}, + {"b739vykqw", []float64{63.6243152618, 63.6243581772, -166.381845474, -166.381802559}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"13zrrq", []float64{-78.8488769531, -78.8433837891, -113.236083984, -113.225097656}}, + {"6yk9jt", []float64{-9.64050292969, -9.63500976562, -49.6801757812, -49.6691894531}}, + {"zmn0", []float64{73.125, 73.30078125, 154.6875, 155.0390625}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"ps151cxkk8", []float64{-66.9636869431, -66.9636815786, 158.993303776, 158.993314505}}, + {"tn", []float64{33.75, 39.375, 45.0, 56.25}}, + {"u7", []float64{61.875, 67.5, 11.25, 22.5}}, + {"55yuz", []float64{-68.0712890625, -68.02734375, -35.2001953125, -35.15625}}, + {"x0p3grv8k", []float64{0.350232124329, 0.350275039673, 145.345859528, 145.345902443}}, + {"y5c6gjhshg", []float64{66.6053169966, 
66.605322361, 91.896032095, 91.8960428238}}, + {"6fd2j", []float64{-30.9375, -30.8935546875, -52.8662109375, -52.822265625}}, + {"gegbfkt19cj6", []float64{66.2505683675, 66.2505685352, -17.1207369491, -17.1207366139}}, + {"k7", []float64{-28.125, -22.5, 11.25, 22.5}}, + {"y1ydbr", []float64{55.3656005859, 55.37109375, 99.1516113281, 99.1625976562}}, + {"byqsd", []float64{80.947265625, 80.9912109375, -137.021484375, -136.977539062}}, + {"mfcbxhf", []float64{-29.4172668457, -29.4158935547, 81.5213012695, 81.5226745605}}, + {"yxwd5901", []float64{87.5447273254, 87.5448989868, 121.794433594, 121.794776917}}, + {"24k7s7y0gqgj", []float64{-31.7077504657, -31.7077502981, -173.828286678, -173.828286342}}, + {"9031fbu", []float64{1.71798706055, 1.71936035156, -133.467407227, -133.466033936}}, + {"xqusuduvmc", []float64{38.8197237253, 38.8197290897, 152.782648802, 152.782659531}}, + {"5bxw", []float64{-86.1328125, -85.95703125, -0.703125, -0.3515625}}, + {"xw593h52f", []float64{33.9918279648, 33.9918708801, 162.470369339, 162.470412254}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"9xxm2s", []float64{43.1323242188, 43.1378173828, -102.282714844, -102.271728516}}, + {"byf7t", []float64{83.583984375, 83.6279296875, -142.866210938, -142.822265625}}, + {"v6", []float64{56.25, 61.875, 56.25, 67.5}}, + {"yh", []float64{67.5, 73.125, 90.0, 101.25}}, + {"d6k43xp", []float64{13.0902099609, 13.091583252, -73.0494689941, -73.0480957031}}, + {"k4m5h", []float64{-31.81640625, -31.7724609375, 7.20703125, 7.2509765625}}, + {"r1t7", []float64{-36.03515625, -35.859375, 142.3828125, 142.734375}}, + {"cs4kvrjdkm7", []float64{68.3738274872, 68.3738288283, -109.097485095, -109.097483754}}, + {"unu", []float64{82.96875, 84.375, 5.625, 7.03125}}, + {"59xp", []float64{-80.33203125, -80.15625, -12.65625, -12.3046875}}, + {"542p4hbm", []float64{-76.0863304138, -76.0861587524, -44.9117660522, -44.9114227295}}, + {"5j", []float64{-61.875, -56.25, -45.0, -33.75}}, + {"v3", []float64{50.625, 
56.25, 56.25, 67.5}}, + {"mstr13c0", []float64{-18.4474182129, -18.4472465515, 74.9391174316, 74.9394607544}}, + {"wcvtdg61swv8", []float64{10.8286933601, 10.8286935277, 131.608171687, 131.608172022}}, + {"3cm40m0w91z3", []float64{-37.5885963254, -37.5885961577, -94.207024388, -94.2070240527}}, + {"m9fup0", []float64{-34.453125, -34.4476318359, 71.6748046875, 71.6857910156}}, + {"jx", []float64{-50.625, -45.0, 67.5, 78.75}}, + {"7myut8cg", []float64{-11.8605995178, -11.8604278564, -24.013710022, -24.0133666992}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"5dgu11uuf12g", []float64{-73.8176893629, -73.8176891953, -17.1760072187, -17.1760068834}}, + {"qp1ens1zqg", []float64{-5.07442295551, -5.07441759109, 92.3977124691, 92.3977231979}}, + {"vusxu8ewwbrz", []float64{71.6786695831, 71.6786697507, 85.2809854969, 85.2809858322}}, + {"8qjtgd1q", []float64{34.7727584839, 34.7729301453, -160.860099792, -160.85975647}}, + {"60dppvw", []float64{-40.9268188477, -40.9254455566, -86.838684082, -86.837310791}}, + {"tygxz83s2", []float64{39.3331575394, 39.3332004547, 84.0035247803, 84.0035676956}}, + {"e0qwrc", []float64{2.51037597656, 2.51586914062, -35.5187988281, -35.5078125}}, + {"5wyh2qbz", []float64{-51.2458992004, -51.2457275391, -14.0504837036, -14.0501403809}}, + {"0zqs", []float64{-48.515625, -48.33984375, -137.109375, -136.7578125}}, + {"n5ss", []float64{-69.609375, -69.43359375, 96.328125, 96.6796875}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"v9q84x9n8ktx", []float64{52.0735898428, 52.0735900104, 76.7518796772, 76.7518800125}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"2eh8wf4ktz", []float64{-28.0253130198, -28.0253076553, -150.871907473, -150.871896744}}, + {"cr96dg281x", []float64{87.6448434591, 87.6448488235, -121.870586872, -121.870576143}}, + {"56u", []float64{-74.53125, -73.125, -28.125, -26.71875}}, + {"628vt", []float64{-41.220703125, -41.1767578125, -77.4755859375, -77.431640625}}, + 
{"0heb", []float64{-64.6875, -64.51171875, -174.7265625, -174.375}}, + {"8q5skg4mk", []float64{34.5144510269, 34.5144939423, -163.616123199, -163.616080284}}, + {"7x0pw97495hw", []float64{-4.2993279174, -4.29932774976, -22.2101866454, -22.2101863101}}, + {"1j4vjyz1dqx", []float64{-60.9587225318, -60.9587211907, -130.870407969, -130.870406628}}, + {"u4q1szh", []float64{57.9583740234, 57.9597473145, 8.65173339844, 8.65310668945}}, + {"3v3dbm2uv2", []float64{-14.9556970596, -14.9556916952, -99.1283833981, -99.1283726692}}, + {"te3jtfwjnq", []float64{19.2626702785, 19.262675643, 69.1674435139, 69.1674542427}}, + {"sgvgx3ey6qe", []float64{21.7183318734, 21.7183332145, 42.1597914398, 42.1597927809}}, + {"2sw", []float64{-19.6875, -18.28125, -149.0625, -147.65625}}, + {"wzy2pqu8e", []float64{43.6309146881, 43.6309576035, 132.863974571, 132.864017487}}, + {"dk2849", []float64{23.9117431641, 23.9172363281, -77.9370117188, -77.9260253906}}, + {"0d65", []float64{-76.81640625, -76.640625, -154.6875, -154.3359375}}, + {"84", []float64{11.25, 16.875, -180.0, -168.75}}, + {"q17", []float64{-37.96875, -36.5625, 94.21875, 95.625}}, + {"x9wzer", []float64{9.79431152344, 9.7998046875, 167.135009766, 167.145996094}}, + {"7xk2s7425n6", []float64{-4.1143463552, -4.1143450141, -16.3334485888, -16.3334472477}}, + {"jv", []float64{-61.875, -56.25, 78.75, 90.0}}, + {"u5juvw", []float64{62.7429199219, 62.7484130859, 8.32763671875, 8.33862304688}}, + {"cnuczt0c", []float64{83.3040046692, 83.3041763306, -127.989692688, -127.989349365}}, + {"7f3jv330zuh", []float64{-31.3259911537, -31.3259898126, -9.61132586002, -9.61132451892}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"70jqgb8uv", []float64{-43.8099145889, -43.8098716736, -37.4511480331, -37.4511051178}}, + {"u5t5d155", []float64{65.3087425232, 65.3089141846, 7.12326049805, 7.1236038208}}, + {"r9hy1gz8m4p", []float64{-38.2996594906, -38.2996581495, 164.267115444, 164.267116785}}, + {"qwmurb9920", []float64{-9.09371852875, 
-9.09371316433, 120.928573608, 120.928584337}}, + {"d2pt693fw4", []float64{0.930157899857, 0.930163264275, -68.0906009674, -68.0905902386}}, + {"zfgbx", []float64{60.556640625, 60.6005859375, 174.331054688, 174.375}}, + {"c7mtdqkfuuc", []float64{64.2828767002, 64.2828780413, -115.910019726, -115.910018384}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"w89eqd", []float64{3.39477539062, 3.40026855469, 114.895019531, 114.906005859}}, + {"75mtem68vx", []float64{-25.7229477167, -25.7229423523, -37.1191334724, -37.1191227436}}, + {"12e9fwr", []float64{-86.8455505371, -86.8441772461, -118.708648682, -118.707275391}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"nq", []float64{-56.25, -50.625, 101.25, 112.5}}, + {"svnx", []float64{29.35546875, 29.53125, 42.890625, 43.2421875}}, + {"0e1kd5pxdgt", []float64{-72.316198647, -72.3161973059, -155.64387247, -155.643871129}}, + {"pgk", []float64{-71.71875, -70.3125, 174.375, 175.78125}}, + {"vry9q", []float64{88.8134765625, 88.857421875, 65.654296875, 65.6982421875}}, + {"6f2x91u", []float64{-31.0157775879, -31.0144042969, -55.4974365234, -55.4960632324}}, + {"3hefw", []float64{-19.248046875, -19.2041015625, -129.462890625, -129.418945312}}, + {"g", []float64{45.0, 90.0, -45.0, 0.0}}, + {"m66erp", []float64{-31.7340087891, -31.728515625, 60.0732421875, 60.0842285156}}, + {"5nwny", []float64{-52.2509765625, -52.20703125, -36.298828125, -36.2548828125}}, + {"d5", []float64{16.875, 22.5, -90.0, -78.75}}, + {"wuphy8", []float64{23.3349609375, 23.3404541016, 133.879394531, 133.890380859}}, + {"b8u631kf", []float64{49.6214675903, 49.6216392517, -151.472969055, -151.472625732}}, + {"34d8j", []float64{-30.9375, -30.8935546875, -131.264648438, -131.220703125}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"he", []float64{-73.125, -67.5, 22.5, 33.75}}, + {"yec1yepjd", []float64{66.4187908173, 66.4188337326, 114.201593399, 114.201636314}}, + {"h0p58hpz", []float64{-89.3615913391, -89.3614196777, 9.85439300537, 9.85473632812}}, + 
{"8u", []float64{22.5, 28.125, -146.25, -135.0}}, + {"hg9sxf0mu", []float64{-69.509510994, -69.5094680786, 36.200466156, 36.2005090714}}, + {"7zdg4c9868", []float64{-2.27687358856, -2.27686822414, -7.25979566574, -7.2597849369}}, + {"1h", []float64{-67.5, -61.875, -135.0, -123.75}}, + {"k7h239zr4097", []float64{-28.0702368356, -28.070236668, 17.3025243357, 17.302524671}}, + {"9h", []float64{22.5, 28.125, -135.0, -123.75}}, + {"fx", []float64{84.375, 90.0, -67.5, -56.25}}, + {"sf66h05", []float64{13.0078125, 13.009185791, 37.093963623, 37.0953369141}}, + {"4ge39", []float64{-70.048828125, -70.0048828125, -51.6357421875, -51.591796875}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"ypd", []float64{87.1875, 88.59375, 92.8125, 94.21875}}, + {"gbzh2r", []float64{50.0042724609, 50.009765625, -1.39526367188, -1.38427734375}}, + {"h0", []float64{-90.0, -84.375, 0.0, 11.25}}, + {"nk7wkxwvku2", []float64{-64.952994436, -64.9529930949, 106.379102468, 106.37910381}}, + {"23qu", []float64{-37.265625, -37.08984375, -159.2578125, -158.90625}}, + {"n15b3", []float64{-84.3310546875, -84.287109375, 95.3173828125, 95.361328125}}, + {"8x9c6f1cjkn", []float64{42.4184060097, 42.4184073508, -154.915576279, -154.915574938}}, + {"5dr", []float64{-77.34375, -75.9375, -12.65625, -11.25}}, + {"022q4", []float64{-87.5390625, -87.4951171875, -168.310546875, -168.266601562}}, + {"52", []float64{-90.0, -84.375, -33.75, -22.5}}, + {"s0j8hb", []float64{0.0, 0.0054931640625, 7.94311523438, 7.9541015625}}, + {"58ygg9vn9nj", []float64{-85.1113092899, -85.1113079488, -12.8470878303, -12.8470864892}}, + {"ztzqs1", []float64{78.4918212891, 78.4973144531, 167.87109375, 167.882080078}}, + {"n5x", []float64{-70.3125, -68.90625, 99.84375, 101.25}}, + {"jh593g3fsf", []float64{-67.261980772, -67.2619754076, 50.001386404, 50.0013971329}}, + {"8vjg52364", []float64{28.6540603638, 28.6541032791, -138.01943779, -138.019394875}}, + {"dqeh1", []float64{37.265625, 
37.3095703125, -74.4873046875, -74.443359375}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"m1mjx", []float64{-37.001953125, -36.9580078125, 52.3388671875, 52.3828125}}, + {"w4fr", []float64{16.69921875, 16.875, 93.1640625, 93.515625}}, + {"k2v0kj", []float64{-40.7098388672, -40.7043457031, 18.45703125, 18.4680175781}}, + {"ytmdb30u", []float64{75.0208282471, 75.0209999084, 120.246391296, 120.246734619}}, + {"wzuhv", []float64{44.4287109375, 44.47265625, 129.594726562, 129.638671875}}, + {"84m2ue733hx1", []float64{12.8061776049, 12.8061777726, -172.414918095, -172.41491776}}, + {"02", []float64{-90.0, -84.375, -168.75, -157.5}}, + {"x8j5vd68s", []float64{0.671625137329, 0.671668052673, 164.776554108, 164.776597023}}, + {"7k", []float64{-22.5, -16.875, -33.75, -22.5}}, + {"4xtrffmbtxr", []float64{-46.4377109706, -46.4377096295, -59.9881960452, -59.9881947041}}, + {"k8x", []float64{-42.1875, -40.78125, 32.34375, 33.75}}, + {"yyxsh", []float64{82.265625, 82.3095703125, 134.47265625, 134.516601562}}, + {"f2rqpzxu", []float64{47.502822876, 47.5029945374, -68.2034683228, -68.203125}}, + {"d6h4jg", []float64{11.6180419922, 11.6235351562, -72.8723144531, -72.861328125}}, + {"e7d", []float64{19.6875, 21.09375, -30.9375, -29.53125}}, + {"bbs0tcr5wc9", []float64{47.9078659415, 47.9078672826, -140.362410396, -140.362409055}}, + {"7fuf", []float64{-29.1796875, -29.00390625, -4.5703125, -4.21875}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"tt9p6y", []float64{32.2448730469, 32.2503662109, 69.0270996094, 69.0380859375}}, + {"xypsv42g6pr", []float64{34.5979173481, 34.5979186893, 179.517726749, 179.51772809}}, + {"p4gvh3sr97h", []float64{-73.6428004503, -73.6427991092, 140.466100574, 140.466101915}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"6ecy4rk", []float64{-22.8117370605, -22.8103637695, -64.9346923828, -64.9333190918}}, + {"045xe38uj", []float64{-77.4227142334, -77.4226713181, -174.934058189, -174.934015274}}, + {"xyunrc48", []float64{39.0728759766, 
39.0730476379, 174.719009399, 174.719352722}}, + {"enm03", []float64{35.2001953125, 35.244140625, -37.9248046875, -37.880859375}}, + {"n7r9k6ecbhm", []float64{-71.4849673212, -71.4849659801, 111.988799125, 111.988800466}}, + {"2bpxnf2", []float64{-43.7571716309, -43.7557983398, -135.406494141, -135.40512085}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"vs65p7h9m", []float64{69.4502878189, 69.4503307343, 70.6374979019, 70.6375408173}}, + {"j6vt6yyjmms", []float64{-73.5703888535, -73.5703875124, 64.1136950254, 64.1136963665}}, + {"pp7z8s7", []float64{-47.8770446777, -47.8756713867, 140.299530029, 140.30090332}}, + {"skxsqyzdgn", []float64{26.0971534252, 26.0971587896, 22.103934288, 22.1039450169}}, + {"y1x7mdy", []float64{54.0238952637, 54.0252685547, 100.445251465, 100.446624756}}, + {"ck9n9vptne", []float64{71.4834183455, 71.4834237099, -122.256267071, -122.256256342}}, + {"fq7u12930mgt", []float64{80.862324927, 80.8623250946, -73.4198988229, -73.4198984876}}, + {"zs50j6", []float64{67.5109863281, 67.5164794922, 161.949462891, 161.960449219}}, + {"9jccmq0j", []float64{32.5972938538, 32.5974655151, -132.308349609, -132.308006287}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"hhtn7", []float64{-63.5888671875, -63.544921875, 7.1630859375, 7.20703125}}, + {"480spdnxu", []float64{-89.2845582962, -89.2845153809, -66.4581871033, -66.4581441879}}, + {"ghz87yh", []float64{71.7956542969, 71.7970275879, -34.2828369141, -34.281463623}}, + {"cf", []float64{56.25, 61.875, -101.25, -90.0}}, + {"gd1n5pnc8p7", []float64{57.3434360325, 57.3434373736, -20.9526403248, -20.9526389837}}, + {"hef9dvw3q", []float64{-68.6121511459, -68.6121082306, 26.1453151703, 26.1453580856}}, + {"wbjwhe8rkyf0", []float64{1.07519432902, 1.07519449666, 131.682678759, 131.682679094}}, + {"pndb3wg", []float64{-53.3564758301, -53.3551025391, 138.937225342, 138.938598633}}, + {"bh2k52ewybs", []float64{69.6132829785, 69.6132843196, -179.500513673, -179.500512332}}, + {"t4r", []float64{12.65625, 
14.0625, 54.84375, 56.25}}, + {"s7215", []float64{18.45703125, 18.5009765625, 11.3818359375, 11.42578125}}, + {"u8", []float64{45.0, 50.625, 22.5, 33.75}}, + {"f2", []float64{45.0, 50.625, -78.75, -67.5}}, + {"3hwb5r0etbt", []float64{-19.6484443545, -19.6484430134, -125.36405012, -125.364048779}}, + {"wendsfh63", []float64{17.3258256912, 17.3258686066, 121.855244637, 121.855287552}}, + {"90n", []float64{0.0, 1.40625, -126.5625, -125.15625}}, + {"e5mx13zs4t2", []float64{19.5220465958, 19.5220479369, -37.2002863884, -37.2002850473}}, + {"6kngstqxn", []float64{-21.854724884, -21.8546819687, -69.0508747101, -69.0508317947}}, + {"rgs", []float64{-25.3125, -23.90625, 174.375, 175.78125}}, + {"sbd1n1z90", []float64{2.99806594849, 2.99810886383, 36.8364715576, 36.836514473}}, + {"693c7m26y6", []float64{-37.7197015285, -37.7196961641, -64.8956286907, -64.8956179619}}, + {"ynu", []float64{82.96875, 84.375, 95.625, 97.03125}}, + {"68t74uch2444", []float64{-41.6333230957, -41.6333229281, -59.9949619174, -59.9949615821}}, + {"jcmv2zscc", []float64{-82.0043992996, -82.0043563843, 86.875462532, 86.8755054474}}, + {"3c", []float64{-39.375, -33.75, -101.25, -90.0}}, + {"r76ymc", []float64{-25.6146240234, -25.6091308594, 150.369873047, 150.380859375}}, + {"kj9vj024cd", []float64{-13.1817376614, -13.1817322969, 2.68072843552, 2.68073916435}}, + {"scr", []float64{7.03125, 8.4375, 43.59375, 45.0}}, + {"57g8wvk", []float64{-68.7895202637, -68.7881469727, -28.5260009766, -28.5246276855}}, + {"8qde6", []float64{37.1337890625, 37.177734375, -165.146484375, -165.102539062}}, + {"7ve260", []float64{-14.0185546875, -14.0130615234, -6.591796875, -6.58081054688}}, + {"trcxzhy6", []float64{44.9824905396, 44.9826622009, 58.6755752563, 58.6759185791}}, + {"syqw7fyhx", []float64{36.2707614899, 36.2708044052, 43.0639600754, 43.0640029907}}, + {"tb0", []float64{0.0, 1.40625, 78.75, 80.15625}}, + {"dttugd1xtj5p", []float64{31.7847627215, 31.7847628891, -59.2579753697, -59.2579750344}}, + 
{"v899zsbyh7f2", []float64{48.1472598016, 48.1472599693, 69.9401802197, 69.940180555}}, + {"8e", []float64{16.875, 22.5, -157.5, -146.25}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"ny4017r8x", []float64{-56.2320613861, -56.2320184708, 126.628031731, 126.628074646}}, + {"8ued", []float64{25.6640625, 25.83984375, -141.328125, -140.9765625}}, + {"bqpsbj0h9p1t", []float64{79.6132376231, 79.6132377908, -158.203080073, -158.203079738}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"b5m", []float64{63.28125, 64.6875, -172.96875, -171.5625}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"w722c", []float64{18.4130859375, 18.45703125, 101.645507812, 101.689453125}}, + {"ey", []float64{33.75, 39.375, -11.25, 0.0}}, + {"6ndrutd", []float64{-7.04498291016, -7.04360961914, -86.6354370117, -86.6340637207}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"47jtykh", []float64{-72.0922851562, -72.0909118652, -70.7354736328, -70.7341003418}}, + {"krb5qvkb", []float64{-0.806121826172, -0.805950164795, 11.5531539917, 11.5534973145}}, + {"mtpgnes7uugs", []float64{-16.3277602941, -16.3277601264, 78.6901270598, 78.6901273951}}, + {"6062z", []float64{-43.4619140625, -43.41796875, -86.5283203125, -86.484375}}, + {"241quukp", []float64{-32.5389289856, -32.5387573242, -178.027954102, -178.027610779}}, + {"g4m49", []float64{58.095703125, 58.1396484375, -37.9248046875, -37.880859375}}, + {"wwq", []float64{35.15625, 36.5625, 120.9375, 122.34375}}, + {"beujb1c", []float64{67.1141052246, 67.1154785156, -151.873626709, -151.872253418}}, + {"h4", []float64{-78.75, -73.125, 0.0, 11.25}}, + {"69xq1n9v", []float64{-35.4712486267, -35.4710769653, -57.2583389282, -57.2579956055}}, + {"cjpu", []float64{73.828125, 74.00390625, -124.1015625, -123.75}}, + {"pmks2611hw", []float64{-59.7104895115, -59.7104841471, 152.590677738, 152.590688467}}, + {"zd78cuj300z8", []float64{57.8102342784, 57.8102344461, 162.505999133, 162.505999468}}, + {"g9h460p7719", []float64{51.0210737586, 51.0210750997, 
-16.777022928, -16.7770215869}}, + {"crncwf7g0c", []float64{84.6515518427, 84.6515572071, -113.955999613, -113.955988884}}, + {"mp4", []float64{-5.625, -4.21875, 47.8125, 49.21875}}, + {"3ghgz3gsjgr", []float64{-27.4555031955, -27.4555018544, -94.2466463149, -94.2466449738}}, + {"m2g9", []float64{-40.60546875, -40.4296875, 61.171875, 61.5234375}}, + {"ngt309w", []float64{-70.1284790039, -70.1271057129, 131.163024902, 131.164398193}}, + {"hgn9u3537p", []float64{-72.8116375208, -72.8116321564, 43.08198452, 43.0819952488}}, + {"6spnhu", []float64{-21.4233398438, -21.4178466797, -57.4475097656, -57.4365234375}}, + {"z22v4r0d82", []float64{47.3240375519, 47.3240429163, 147.404261827, 147.404272556}}, + {"ytypn", []float64{78.57421875, 78.6181640625, 121.201171875, 121.245117188}}, + {"qstc", []float64{-19.51171875, -19.3359375, 120.5859375, 120.9375}}, + {"y8r48c5", []float64{46.8511962891, 46.8525695801, 122.380828857, 122.382202148}}, + {"uq2xybqqg", []float64{81.5210866928, 81.5211296082, 12.2584676743, 12.2585105896}}, + {"95xm8dtz5u42", []float64{20.6692528725, 20.6692530401, -124.77465447, -124.774654135}}, + {"x0uqxwj", []float64{5.39428710938, 5.39566040039, 141.313018799, 141.31439209}}, + {"0kve9v", []float64{-62.6385498047, -62.6330566406, -160.938720703, -160.927734375}}, + {"szwv1er87", []float64{43.0843019485, 43.0843448639, 43.3185338974, 43.3185768127}}, + {"88zscymedp", []float64{5.08868157864, 5.08868694305, -146.868581772, -146.868571043}}, + {"pd", []float64{-78.75, -73.125, 157.5, 168.75}}, + {"g6", []float64{56.25, 61.875, -33.75, -22.5}}, + {"3pg6r201vv51", []float64{-1.01041479036, -1.01041462272, -130.110833198, -130.110832863}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"g02", []float64{46.40625, 47.8125, -45.0, -43.59375}}, + {"jx", []float64{-50.625, -45.0, 67.5, 78.75}}, + {"zksxs2nz2j3", []float64{71.6321320832, 71.6321334243, 152.774163634, 152.774164975}}, + {"erqyg8ff4d", []float64{41.9722473621, 41.9722527266, -24.1001200676, 
-24.1001093388}}, + {"n3wm1r3p4jev", []float64{-80.6425363384, -80.6425361708, 110.095458291, 110.095458627}}, + {"sqfbq13pjp9", []float64{38.0208036304, 38.0208049715, 15.3824485838, 15.3824499249}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"y0", []float64{45.0, 50.625, 90.0, 101.25}}, + {"04q3tpcc68bq", []float64{-77.0372864977, -77.03728633, -170.988700055, -170.988699719}}, + {"81", []float64{5.625, 11.25, -180.0, -168.75}}, + {"nj", []float64{-61.875, -56.25, 90.0, 101.25}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"8bxs0rfgp3n7", []float64{3.55871787295, 3.55871804059, -135.688042603, -135.688042268}}, + {"j0rm6", []float64{-87.6708984375, -87.626953125, 55.283203125, 55.3271484375}}, + {"86v9", []float64{15.64453125, 15.8203125, -161.015625, -160.6640625}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"nmcsv", []float64{-56.8212890625, -56.77734375, 103.579101562, 103.623046875}}, + {"cqytkn1d56", []float64{83.9249145985, 83.9249199629, -114.431394339, -114.43138361}}, + {"jfpcxxud86r9", []float64{-78.4433147125, -78.4433145449, 89.9842279404, 89.9842282757}}, + {"zf8cm8fkrg", []float64{59.2870920897, 59.2870974541, 170.049809217, 170.049819946}}, + {"98x", []float64{2.8125, 4.21875, -102.65625, -101.25}}, + {"e5mbzuu", []float64{18.4391784668, 18.4405517578, -36.5679931641, -36.566619873}}, + {"03g5kqkcp", []float64{-79.5504570007, -79.5504140854, -164.337658882, -164.337615967}}, + {"b362q0b79x2", []float64{52.0799548924, 52.0799562335, -165.321857929, -165.321856588}}, + {"m3pq", []float64{-38.3203125, -38.14453125, 66.4453125, 66.796875}}, + {"6564m2w5b0", []float64{-26.3198518753, -26.3198465109, -86.9485473633, -86.9485366344}}, + {"m9puw71wrf", []float64{-38.5664212704, -38.566415906, 78.6754882336, 78.6754989624}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"9t34u", []float64{30.0146484375, 30.05859375, -110.91796875, -110.874023438}}, + {"hewq3", []float64{-69.2138671875, -69.169921875, 31.3330078125, 31.376953125}}, + 
{"2m2uy1tgg0", []float64{-14.6249055862, -14.6249002218, -167.423615456, -167.423604727}}, + {"qu8dxgebd9wz", []float64{-19.22872575, -19.2287255824, 124.798967354, 124.798967689}}, + {"scwmj", []float64{9.31640625, 9.3603515625, 42.7587890625, 42.802734375}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"zm02460g", []float64{73.1365013123, 73.1366729736, 146.701469421, 146.701812744}}, + {"cgtd84ky", []float64{65.1403427124, 65.1405143738, -93.5091018677, -93.5087585449}}, + {"eq", []float64{33.75, 39.375, -33.75, -22.5}}, + {"ht26dn8h", []float64{-59.9929046631, -59.9927330017, 22.939453125, 22.9397964478}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"j70dzsu", []float64{-72.6155090332, -72.6141357422, 57.2882080078, 57.2895812988}}, + {"y241djby0dk", []float64{45.2962996066, 45.2963009477, 104.151447415, 104.151448756}}, + {"un9", []float64{81.5625, 82.96875, 1.40625, 2.8125}}, + {"2kzdj9", []float64{-17.9241943359, -17.9187011719, -157.961425781, -157.950439453}}, + {"m7b3gj70", []float64{-23.5697937012, -23.5696220398, 56.7375183105, 56.7378616333}}, + {"vvpmy3", []float64{74.1412353516, 74.1467285156, 89.2199707031, 89.2309570312}}, + {"ym0n0", []float64{74.1796875, 74.2236328125, 101.25, 101.293945312}}, + {"k6", []float64{-33.75, -28.125, 11.25, 22.5}}, + {"d96nj5sqmjfn", []float64{8.10626830906, 8.10626847669, -64.4617196918, -64.4617193565}}, + {"9yfrdwk", []float64{39.3214416504, 39.3228149414, -97.9705810547, -97.9692077637}}, + {"8vj712jd0kv", []float64{28.6527125537, 28.6527138948, -138.804685324, -138.804683983}}, + {"bk86yhd", []float64{70.8206176758, 70.8219909668, -168.132019043, -168.130645752}}, + {"6e3pb2s87q10", []float64{-25.3536236286, -25.353623461, -66.0764430463, -66.0764427111}}, + {"nxbn2n7yn", []float64{-45.2722549438, -45.2722120285, 112.505407333, 112.505450249}}, + {"cg4n", []float64{62.9296875, 63.10546875, -98.4375, -98.0859375}}, + {"4de6s37sk467", 
[]float64{-75.4904382862, -75.4904381186, -62.7379387245, -62.7379383892}}, + {"pzegg", []float64{-47.1533203125, -47.109375, 174.155273438, 174.19921875}}, + {"xytyx62", []float64{37.7174377441, 37.7188110352, 177.154541016, 177.155914307}}, + {"2x4", []float64{-5.625, -4.21875, -154.6875, -153.28125}}, + {"j11m8j1eywcu", []float64{-83.3800566941, -83.3800565265, 46.7601537332, 46.7601540685}}, + {"k2xw57e", []float64{-41.1135864258, -41.1122131348, 21.9438171387, 21.9451904297}}, + {"2q74", []float64{-9.4921875, -9.31640625, -164.53125, -164.1796875}}, + {"9tp1960t8", []float64{28.4006023407, 28.400645256, -102.600631714, -102.600588799}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"2b2mwb", []float64{-42.626953125, -42.6214599609, -145.601806641, -145.590820312}}, + {"b7ts80s", []float64{65.481262207, 65.482635498, -161.010131836, -161.008758545}}, + {"10x0", []float64{-87.1875, -87.01171875, -125.15625, -124.8046875}}, + {"c5p9s65qqch", []float64{62.1507364511, 62.1507377923, -124.261599183, -124.261597842}}, + {"n1j", []float64{-84.375, -82.96875, 97.03125, 98.4375}}, + {"gjy9e2g7hcvc", []float64{77.6120662875, 77.6120664552, -35.7118779793, -35.7118776441}}, + {"d0nngw26hnc0", []float64{1.22123524547, 1.2212354131, -81.408175081, -81.4081747457}}, + {"sqqh99ewn", []float64{35.9565353394, 35.9565782547, 19.7584819794, 19.7585248947}}, + {"27", []float64{-28.125, -22.5, -168.75, -157.5}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"uv", []float64{73.125, 78.75, 33.75, 45.0}}, + {"nu6", []float64{-66.09375, -64.6875, 126.5625, 127.96875}}, + {"hjg9s97mjbfp", []float64{-57.3848481663, -57.3848479986, 5.12434154749, 5.12434188277}}, + {"cekd", []float64{63.6328125, 63.80859375, -106.171875, -105.8203125}}, + {"fx2s", []float64{86.484375, 86.66015625, -66.796875, -66.4453125}}, + {"zxx5n4wh37", []float64{87.7293223143, 87.7293276787, 167.615715265, 167.615725994}}, + {"k0redc", []float64{-42.9730224609, -42.9675292969, 10.6677246094, 
10.6787109375}}, + {"xhj810f1r7", []float64{22.504350543, 22.5043559074, 142.781378031, 142.78138876}}, + {"9j", []float64{28.125, 33.75, -135.0, -123.75}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"z59900erg", []float64{64.8673582077, 64.867401123, 137.113966942, 137.114009857}}, + {"6dnccesgx8ts", []float64{-33.4225525707, -33.4225524031, -57.9350421578, -57.9350418225}}, + {"jx", []float64{-50.625, -45.0, 67.5, 78.75}}, + {"312y4y56", []float64{-36.8807601929, -36.8805885315, -133.819999695, -133.819656372}}, + {"32vk3g9np", []float64{-40.013923645, -40.0138807297, -116.288609505, -116.288566589}}, + {"kn39", []float64{-9.66796875, -9.4921875, 2.109375, 2.4609375}}, + {"eed4ybdn4mpp", []float64{20.1747029833, 20.174703151, -19.3880166113, -19.3880162761}}, + {"st", []float64{28.125, 33.75, 22.5, 33.75}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"0t8kw4xyhu5x", []float64{-58.2566988654, -58.2566986978, -156.873914078, -156.873913743}}, + {"dcy", []float64{9.84375, 11.25, -47.8125, -46.40625}}, + {"p3r", []float64{-82.96875, -81.5625, 156.09375, 157.5}}, + {"wdc0w9tbd6", []float64{15.5649769306, 15.564982295, 114.199887514, 114.199898243}}, + {"j3jc", []float64{-84.19921875, -84.0234375, 64.3359375, 64.6875}}, + {"k5cw3t", []float64{-22.7801513672, -22.7746582031, 2.17529296875, 2.18627929688}}, + {"pd6m", []float64{-76.46484375, -76.2890625, 160.6640625, 161.015625}}, + {"hnfqkjqrqnh", []float64{-50.9025013447, -50.9025000036, 3.34868967533, 3.34869101644}}, + {"6qkwd", []float64{-8.701171875, -8.6572265625, -72.333984375, -72.2900390625}}, + {"1q2x", []float64{-53.61328125, -53.4375, -123.046875, -122.6953125}}, + {"3nps1et4nde", []float64{-10.527292192, -10.5272908509, -124.380057603, -124.380056262}}, + {"gq2c8qnkx2", []float64{80.4536533356, 80.4536587, -32.6754319668, -32.6754212379}}, + {"q7d", []float64{-25.3125, -23.90625, 104.0625, 105.46875}}, + {"560", []float64{-78.75, -77.34375, -33.75, -32.34375}}, + {"j855ffmuxv4s", 
[]float64{-89.3276607245, -89.3276605569, 71.8478319794, 71.8478323147}}, + {"sumfzt8z4", []float64{24.4210624695, 24.4211053848, 42.1666431427, 42.166686058}}, + {"bje9d2n0", []float64{76.201171875, 76.2013435364, -174.971008301, -174.970664978}}, + {"y943", []float64{50.80078125, 50.9765625, 115.6640625, 116.015625}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"gmn", []float64{73.125, 74.53125, -25.3125, -23.90625}}, + {"1djwe5s", []float64{-77.5881958008, -77.5868225098, -104.628295898, -104.626922607}}, + {"tb5wv3", []float64{1.19201660156, 1.19750976562, 83.9025878906, 83.9135742188}}, + {"58h3qnfv9m", []float64{-89.7422236204, -89.742218256, -16.2559354305, -16.2559247017}}, + {"f0du0sguugkg", []float64{48.5425508581, 48.5425510257, -86.1054797843, -86.105479449}}, + {"h6zyvwk", []float64{-73.3103942871, -73.3090209961, 22.3956298828, 22.3970031738}}, + {"vg2nu", []float64{64.4677734375, 64.51171875, 78.92578125, 78.9697265625}}, + {"j7r7de", []float64{-71.0870361328, -71.0815429688, 66.5551757812, 66.5661621094}}, + {"hjshn", []float64{-58.359375, -58.3154296875, 5.888671875, 5.9326171875}}, + {"46khrs1t", []float64{-76.5738487244, -76.573677063, -72.7933502197, -72.793006897}}, + {"g5p5p", []float64{62.40234375, 62.4462890625, -34.8486328125, -34.8046875}}, + {"7pxgy2sf", []float64{-2.15023040771, -2.15005874634, -33.8203811646, -33.8200378418}}, + {"vrz", []float64{88.59375, 90.0, 66.09375, 67.5}}, + {"bry3ngyuq", []float64{88.7908601761, 88.7909030914, -159.654779434, -159.654736519}}, + {"9cu9t1g3cs", []float64{10.1173567772, 10.1173621416, -94.6976208687, -94.6976101398}}, + {"82", []float64{0.0, 5.625, -168.75, -157.5}}, + {"m9yzmzsmxx", []float64{-33.8396555185, -33.8396501541, 77.2510313988, 77.2510421276}}, + {"4wv4wwxb2dr2", []float64{-51.5560363233, -51.5560361557, -60.1724312827, -60.1724309474}}, + {"j7ufy8", []float64{-68.4228515625, -68.4173583984, 63.2153320312, 63.2263183594}}, + {"hzrjzg9xjj", []float64{-48.1875532866, 
-48.1875479221, 43.9366006851, 43.936611414}}, + {"z5eh3ghtm7", []float64{65.4519671202, 65.4519724846, 139.302059412, 139.302070141}}, + {"6sjhet2cts", []float64{-21.6798663139, -21.6798609495, -60.3136754036, -60.3136646748}}, + {"vjyn6v", []float64{78.4698486328, 78.4753417969, 53.5583496094, 53.5693359375}}, + {"hysxqy94", []float64{-52.1270370483, -52.126865387, 40.3761291504, 40.3764724731}}, + {"8yd4qxm0d", []float64{36.9979190826, 36.997961998, -143.144903183, -143.144860268}}, + {"w43b42f", []float64{12.660369873, 12.6617431641, 92.5625610352, 92.5639343262}}, + {"7suc291x", []float64{-18.0548286438, -18.0546569824, -15.7962799072, -15.7959365845}}, + {"pq", []float64{-56.25, -50.625, 146.25, 157.5}}, + {"frd", []float64{87.1875, 88.59375, -75.9375, -74.53125}}, + {"r2", []float64{-45.0, -39.375, 146.25, 157.5}}, + {"5rs1x50fe5m", []float64{-47.531902045, -47.5319007039, -27.8162173927, -27.8162160516}}, + {"zjv30682s21k", []float64{77.5333506614, 77.533350829, 142.394326217, 142.394326553}}, + {"zbbtmw", []float64{50.1745605469, 50.1800537109, 169.694824219, 169.705810547}}, + {"e56k", []float64{18.984375, 19.16015625, -41.8359375, -41.484375}}, + {"v7dzp", []float64{65.91796875, 65.9619140625, 60.4248046875, 60.46875}}, + {"n2z8qt2hnzss", []float64{-85.707738027, -85.7077378593, 112.082815245, 112.08281558}}, + {"zbp8bt07bb5", []float64{45.159945488, 45.1599468291, 179.319227189, 179.31922853}}, + {"s551t", []float64{17.138671875, 17.1826171875, 4.4384765625, 4.482421875}}, + {"5zp7trjp", []float64{-49.9701118469, -49.9699401855, -0.817108154297, -0.816764831543}}, + {"81n2z7dz", []float64{5.77726364136, 5.77743530273, -170.888557434, -170.888214111}}, + {"dw", []float64{33.75, 39.375, -67.5, -56.25}}, + {"vvw35", []float64{76.11328125, 76.1572265625, 87.6708984375, 87.71484375}}, + {"zhtuc3b8bg5n", []float64{71.1572198197, 71.1572199874, 143.141591996, 143.141592331}}, + {"042w4qgx", []float64{-76.2507820129, -76.2506103516, -179.193191528, 
-179.192848206}}, + {"9sntb7", []float64{23.5272216797, 23.5327148438, -103.348388672, -103.337402344}}, + {"wt6x5nxk", []float64{30.7981109619, 30.7982826233, 116.157417297, 116.15776062}}, + {"spzwehn", []float64{44.7583007812, 44.7596740723, 10.6869506836, 10.6883239746}}, + {"hv70sbbpw64", []float64{-60.3754413128, -60.3754399717, 38.1777611375, 38.1777624786}}, + {"wxt", []float64{42.1875, 43.59375, 119.53125, 120.9375}}, + {"7dvqsskj8", []float64{-28.3643817902, -28.3643388748, -14.9139404297, -14.9138975143}}, + {"wm22", []float64{29.53125, 29.70703125, 101.6015625, 101.953125}}, + {"5t4kf", []float64{-61.0400390625, -60.99609375, -19.248046875, -19.2041015625}}, + {"5sy67z47fz1", []float64{-62.846608758, -62.8466074169, -13.542933315, -13.5429319739}}, + {"c36nw2msv0t", []float64{53.1760194898, 53.1760208309, -120.655067414, -120.655066073}}, + {"311v", []float64{-38.49609375, -38.3203125, -132.5390625, -132.1875}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"xcykeys", []float64{10.6704711914, 10.6718444824, 177.709350586, 177.710723877}}, + {"gtmvbbv", []float64{75.5461120605, 75.5474853516, -14.3742370605, -14.3728637695}}, + {"5n1", []float64{-56.25, -54.84375, -43.59375, -42.1875}}, + {"y08uf", []float64{48.6474609375, 48.69140625, 91.142578125, 91.1865234375}}, + {"ds4", []float64{22.5, 23.90625, -64.6875, -63.28125}}, + {"1t49n9", []float64{-61.6937255859, -61.6882324219, -108.698730469, -108.687744141}}, + {"v81d55x758", []float64{45.3713035583, 45.3713089228, 69.7513175011, 69.7513282299}}, + {"39j80vxmjh", []float64{-39.3439078331, -39.3439024687, -104.722495079, -104.72248435}}, + {"x3vd66k2vj46", []float64{10.251773335, 10.2517735027, 154.089306034, 154.089306369}}, + {"vg", []float64{61.875, 67.5, 78.75, 90.0}}, + {"06q04jnnuyz", []float64{-77.3150892556, -77.3150879145, -160.216156393, -160.216155052}}, + {"cvws", []float64{76.640625, 76.81640625, -92.109375, -91.7578125}}, + {"9d5j84gmgbv", []float64{12.2328941524, 12.2328954935, 
-108.276619166, -108.276617825}}, + {"mjg", []float64{-12.65625, -11.25, 49.21875, 50.625}}, + {"k7gspu", []float64{-23.1811523438, -23.1756591797, 16.5124511719, 16.5234375}}, + {"13nrmggfx", []float64{-83.0795574188, -83.0795145035, -114.702801704, -114.702758789}}, + {"ypj759", []float64{84.9078369141, 84.9133300781, 97.5366210938, 97.5476074219}}, + {"hhy3z0zjn", []float64{-62.9686546326, -62.9686117172, 9.10655021667, 9.10659313202}}, + {"b0xhjv", []float64{48.5430908203, 48.5485839844, -169.903564453, -169.892578125}}, + {"xucn7t7t11", []float64{27.8470855951, 27.8470909595, 170.314908028, 170.314918756}}, + {"f0yqwpw6", []float64{50.4028701782, 50.4030418396, -80.9386825562, -80.9383392334}}, + {"hcud164f7z", []float64{-79.7932773829, -79.7932720184, 40.1369941235, 40.1370048523}}, + {"k7b09t", []float64{-23.7908935547, -23.7854003906, 11.3159179688, 11.3269042969}}, + {"gzttr69", []float64{88.1240844727, 88.1254577637, -3.19564819336, -3.19427490234}}, + {"z1", []float64{50.625, 56.25, 135.0, 146.25}}, + {"mt", []float64{-16.875, -11.25, 67.5, 78.75}}, + {"vgpm3pe", []float64{62.839050293, 62.840423584, 88.9933776855, 88.9947509766}}, + {"xc2xk", []float64{8.3056640625, 8.349609375, 169.62890625, 169.672851562}}, + {"7gpegjrrn", []float64{-27.4357795715, -27.4357366562, -0.561075210571, -0.561032295227}}, + {"5m00s41bg21m", []float64{-61.7759934627, -61.775993295, -33.5716743395, -33.5716740042}}, + {"ybz9un", []float64{49.5593261719, 49.5648193359, 134.47265625, 134.483642578}}, + {"5cxhpu5jy6y", []float64{-80.8364005387, -80.8363991976, -1.06127768755, -1.06127634645}}, + {"0gk7412", []float64{-71.1845397949, -71.1831665039, -140.185546875, -140.184173584}}, + {"zj2", []float64{74.53125, 75.9375, 135.0, 136.40625}}, + {"6jj5vt6zv5", []float64{-16.1856347322, -16.1856293678, -82.7230596542, -82.7230489254}}, + {"mdp7x0cy8x", []float64{-33.1294924021, -33.1294870377, 78.0053544044, 78.0053651333}}, + {"515", []float64{-84.375, -82.96875, -40.78125, 
-39.375}}, + {"rcrpxqxje", []float64{-36.613740921, -36.6136980057, 178.922095299, 178.922138214}}, + {"ydd5n3qr6tu", []float64{59.5979855955, 59.5979869366, 115.595853925, 115.595855266}}, + {"hd", []float64{-78.75, -73.125, 22.5, 33.75}}, + {"rj4uc", []float64{-16.0400390625, -15.99609375, 138.911132812, 138.955078125}}, + {"0r3x", []float64{-47.98828125, -47.8125, -166.640625, -166.2890625}}, + {"490rp8g4y", []float64{-83.1399393082, -83.1398963928, -66.8144702911, -66.8144273758}}, + {"v19nuj", []float64{54.6514892578, 54.6569824219, 46.58203125, 46.5930175781}}, + {"wrg3hx7", []float64{43.8093566895, 43.8107299805, 106.022186279, 106.02355957}}, + {"ntzjry4", []float64{-56.7004394531, -56.6990661621, 122.687072754, 122.688446045}}, + {"3s5c85t2d", []float64{-22.2170162201, -22.2169733047, -107.219266891, -107.219223976}}, + {"wyy", []float64{37.96875, 39.375, 132.1875, 133.59375}}, + {"6wk17", []float64{-9.6240234375, -9.580078125, -61.7431640625, -61.69921875}}, + {"7m5wqccz7meg", []float64{-15.7654795982, -15.7654794306, -28.5289463773, -28.5289460421}}, + {"sk8n5z1qd2", []float64{26.4067554474, 26.4067608118, 11.4166080952, 11.416618824}}, + {"kw8hyjjj384", []float64{-7.57417201996, -7.57417067885, 22.7706053853, 22.7706067264}}, + {"14rw6bt3vx9v", []float64{-76.2420291267, -76.2420289591, -124.324827231, -124.324826896}}, + {"9cdu", []float64{9.140625, 9.31640625, -97.3828125, -97.03125}}, + {"0cptj3e6crgm", []float64{-83.4873395227, -83.4873393551, -135.467890911, -135.467890576}}, + {"0m", []float64{-61.875, -56.25, -168.75, -157.5}}, + {"mx", []float64{-5.625, 0.0, 67.5, 78.75}}, + {"p1sfuc", []float64{-81.0736083984, -81.0681152344, 141.888427734, 141.899414062}}, + {"03nduncm", []float64{-83.8536643982, -83.8534927368, -159.431877136, -159.431533813}}, + {"0m0j", []float64{-60.99609375, -60.8203125, -168.75, -168.3984375}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"fn", []float64{78.75, 84.375, -90.0, -78.75}}, + {"srddrb0", 
[]float64{42.5830078125, 42.5843811035, 15.1062011719, 15.1075744629}}, + {"wpbf1dnj", []float64{43.957157135, 43.9573287964, 91.1288452148, 91.1291885376}}, + {"nv8k1f98", []float64{-58.3456420898, -58.3454704285, 124.180526733, 124.180870056}}, + {"7upj4ct1", []float64{-21.6126823425, -21.6125106812, -1.27853393555, -1.27819061279}}, + {"k312nzxpwm", []float64{-39.3324869871, -39.3324816227, 13.3143246174, 13.3143353462}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"qure08zn2n", []float64{-20.5611813068, -20.5611759424, 134.328460693, 134.328471422}}, + {"m1", []float64{-39.375, -33.75, 45.0, 56.25}}, + {"q7wbtv20e", []float64{-25.195684433, -25.1956415176, 110.995001793, 110.995044708}}, + {"fsqx0ywr3s", []float64{70.1736903191, 70.1736956835, -58.3177685738, -58.3177578449}}, + {"rn9k7k2927vd", []float64{-7.66684871167, -7.66684854403, 136.901339516, 136.901339851}}, + {"8ypg", []float64{34.27734375, 34.453125, -135.3515625, -135.0}}, + {"42ru", []float64{-87.890625, -87.71484375, -67.8515625, -67.5}}, + {"z3efhtnprhw", []float64{53.8177970052, 53.8177983463, 151.729739606, 151.729740947}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"fu", []float64{67.5, 73.125, -56.25, -45.0}}, + {"uz9", []float64{87.1875, 88.59375, 35.15625, 36.5625}}, + {"bqdnsr4y4", []float64{82.7445602417, 82.744603157, -165.746870041, -165.746827126}}, + {"rshpnngvd", []float64{-21.231508255, -21.2314653397, 163.393907547, 163.393950462}}, + {"4egtrvjfe", []float64{-67.9555034637, -67.9554605484, -62.2295236588, -62.2294807434}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"r2m2yc", []float64{-43.4564208984, -43.4509277344, 153.929443359, 153.940429688}}, + {"v9vps2z7", []float64{56.1667442322, 56.1669158936, 74.727973938, 74.7283172607}}, + {"dh", []float64{22.5, 28.125, -90.0, -78.75}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"7", []float64{-45.0, 0.0, -45.0, 0.0}}, + {"rrtp", []float64{-1.58203125, -1.40625, 153.28125, 
153.6328125}}, + {"57wdet", []float64{-69.8455810547, -69.8400878906, -24.4555664062, -24.4445800781}}, + {"sxm3rvsc1x0y", []float64{41.031399183, 41.0313993506, 30.229977183, 30.2299775183}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"fmwp70q5", []float64{77.2138023376, 77.213973999, -70.1724243164, -70.1720809937}}, + {"vq3whey", []float64{81.2315368652, 81.2329101562, 58.5653686523, 58.5667419434}}, + {"vm55829xumn", []float64{73.7443381548, 73.7443394959, 60.4819867015, 60.4819880426}}, + {"pc", []float64{-84.375, -78.75, 168.75, 180.0}}, + {"76j", []float64{-33.75, -32.34375, -26.71875, -25.3125}}, + {"du5md", []float64{23.466796875, 23.5107421875, -51.591796875, -51.5478515625}}, + {"0b800r9", []float64{-87.1463012695, -87.1449279785, -146.237640381, -146.23626709}}, + {"r96suj", []float64{-37.1063232422, -37.1008300781, 161.19140625, 161.202392578}}, + {"dqn86", []float64{33.7939453125, 33.837890625, -69.521484375, -69.4775390625}}, + {"62jjysq7fun", []float64{-43.9652466774, -43.9652453363, -71.4243963361, -71.424394995}}, + {"s623s4p3v", []float64{12.9312086105, 12.9312515259, 11.7875146866, 11.7875576019}}, + {"j9w5", []float64{-81.03515625, -80.859375, 75.9375, 76.2890625}}, + {"cku2qh5ee64", []float64{71.7852795124, 71.7852808535, -117.504816949, -117.504815608}}, + {"ypmy864vvgs", []float64{86.9358202815, 86.9358216226, 98.1009525061, 98.1009538472}}, + {"kwe", []float64{-8.4375, -7.03125, 26.71875, 28.125}}, + {"gmq7083dvewj", []float64{75.0604587235, 75.0604588911, -24.9366608262, -24.9366604909}}, + {"9er", []float64{18.28125, 19.6875, -102.65625, -101.25}}, + {"5p89tmjs9j5", []float64{-47.5205630064, -47.5205616653, -44.0585620701, -44.058560729}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"ewy", []float64{37.96875, 39.375, -14.0625, -12.65625}}, + {"jtgef", []float64{-56.9970703125, -56.953125, 72.509765625, 72.5537109375}}, + {"9yjjw", []float64{34.716796875, 34.7607421875, -93.955078125, -93.9111328125}}, + {"926", []float64{1.40625, 
2.8125, -120.9375, -119.53125}}, + {"bz1", []float64{84.375, 85.78125, -144.84375, -143.4375}}, + {"yjjpq0ecnve", []float64{74.4023618102, 74.4023631513, 97.3003654182, 97.3003667593}}, + {"w5e", []float64{19.6875, 21.09375, 94.21875, 95.625}}, + {"hqcn9wtcr", []float64{-50.8527517319, -50.8527088165, 12.7303647995, 12.7304077148}}, + {"qfh6xphngs", []float64{-33.2709145546, -33.2709091902, 130.039823055, 130.039833784}}, + {"1he586fypp", []float64{-64.0560919046, -64.0560865402, -130.766186714, -130.766175985}}, + {"4cc5sh9n3s", []float64{-79.5152020454, -79.515196681, -54.666531086, -54.6665203571}}, + {"9y5wfm", []float64{34.9639892578, 34.9694824219, -96.2292480469, -96.2182617188}}, + {"c97809", []float64{52.0367431641, 52.0422363281, -107.556152344, -107.545166016}}, + {"k9g2nkbm3j5h", []float64{-35.1292287558, -35.1292285882, 27.3453609645, 27.3453612998}}, + {"thdwugw196t", []float64{26.5185204148, 26.5185217559, 48.7326653302, 48.7326666713}}, + {"34nm41n89c8v", []float64{-32.8655058704, -32.8655057028, -126.114044376, -126.11404404}}, + {"buf7qgu", []float64{72.3106384277, 72.3120117188, -142.783813477, -142.782440186}}, + {"mhvh0u7f4", []float64{-17.55443573, -17.5543928146, 52.0694446564, 52.0694875717}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"f0vdwj1bu", []float64{49.6857976913, 49.6858406067, -81.9993782043, -81.999335289}}, + {"kcke59", []float64{-37.4359130859, -37.4304199219, 40.2319335938, 40.2429199219}}, + {"9rws4p0", []float64{42.9290771484, 42.9304504395, -114.521484375, -114.520111084}}, + {"fhj1u03epu", []float64{67.8095269203, 67.8095322847, -82.7905762196, -82.7905654907}}, + {"13296d9gwq1", []float64{-82.734657526, -82.7346561849, -122.934338897, -122.934337556}}, + {"4j", []float64{-61.875, -56.25, -90.0, -78.75}}, + {"gk5u1y2", []float64{68.2374572754, 68.2388305664, -28.3996582031, -28.3982849121}}, + {"9v6yrwx00", []float64{30.6655883789, 30.6656312943, -97.0436096191, -97.0435667038}}, + {"mc92", []float64{-36.5625, 
-36.38671875, 80.5078125, 80.859375}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"vtzr1we0jh5", []float64{78.6099457741, 78.6099471152, 77.7655689418, 77.7655702829}}, + {"ytmmrjr08p", []float64{75.4830640554, 75.4830694199, 120.200042725, 120.200053453}}, + {"y7q525c0mgkz", []float64{63.8731999509, 63.8732001185, 109.689126424, 109.68912676}}, + {"s5nc", []float64{17.05078125, 17.2265625, 9.4921875, 9.84375}}, + {"wk2", []float64{23.90625, 25.3125, 101.25, 102.65625}}, + {"f4beky4z04y", []float64{61.0742144287, 61.0742157698, -89.0843501687, -89.0843488276}}, + {"ywdu5yj95", []float64{82.2987556458, 82.2987985611, 116.539664268, 116.539707184}}, + {"n3", []float64{-84.375, -78.75, 101.25, 112.5}}, + {"0334vnb6", []float64{-82.4479293823, -82.4477577209, -167.123680115, -167.123336792}}, + {"xg65", []float64{18.80859375, 18.984375, 171.5625, 171.9140625}}, + {"0ebmse71br", []float64{-67.9212623835, -67.921257019, -156.946552992, -156.946542263}}, + {"ycwd9fc", []float64{53.8920593262, 53.8934326172, 132.968902588, 132.970275879}}, + {"0z2gsvd0tfzy", []float64{-48.573201634, -48.5732014664, -144.983568527, -144.983568192}}, + {"e041", []float64{0.17578125, 0.3515625, -42.1875, -41.8359375}}, + {"ntzdfpdcphj7", []float64{-57.1314592101, -57.1314590424, 123.138849624, 123.138849959}}, + {"jx1bfyyhgds1", []float64{-50.4552562349, -50.4552560672, 70.0901824236, 70.0901827589}}, + {"dhzuvhgspbew", []float64{27.5804938003, 27.580493968, -78.8766921312, -78.8766917959}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"teyjc173t", []float64{22.1116161346, 22.11165905, 75.986123085, 75.9861660004}}, + {"bg57uz4rxw5", []float64{62.5739514828, 62.5739528239, -141.467531472, -141.467530131}}, + {"52dtfpdc", []float64{-86.1353874207, -86.1352157593, -30.1427078247, -30.142364502}}, + {"vx1j39e", []float64{85.3060913086, 85.3074645996, 68.9762878418, 68.9776611328}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"psmpz5", []float64{-64.7149658203, -64.7094726562, 
164.838867188, 164.849853516}}, + {"4xr95eeee", []float64{-49.023141861, -49.0230989456, -56.7943811417, -56.7943382263}}, + {"5j", []float64{-61.875, -56.25, -45.0, -33.75}}, + {"kpb", []float64{-1.40625, 0.0, 0.0, 1.40625}}, + {"dsub48epk", []float64{26.722741127, 26.7227840424, -60.7061576843, -60.706114769}}, + {"2urtnwtdw17", []float64{-20.1787023246, -20.1787009835, -135.409665853, -135.409664512}}, + {"e6s30gwjxm", []float64{14.2584782839, 14.2584836483, -27.7319276333, -27.7319169044}}, + {"qtx", []float64{-14.0625, -12.65625, 122.34375, 123.75}}, + {"qj0qvndweq3k", []float64{-15.651620999, -15.6516208313, 90.5748634413, 90.5748637766}}, + {"ffetyh28uyj", []float64{60.0967490673, 60.0967504084, -51.0635559261, -51.063554585}}, + {"z56t8nwqq7", []float64{64.2848414183, 64.2848467827, 138.52447629, 138.524487019}}, + {"7h", []float64{-22.5, -16.875, -45.0, -33.75}}, + {"9tuuw1pkyh", []float64{33.1410956383, 33.1411010027, -105.546426773, -105.546416044}}, + {"2m", []float64{-16.875, -11.25, -168.75, -157.5}}, + {"h7qt", []float64{-70.83984375, -70.6640625, 20.390625, 20.7421875}}, + {"t832ztb6psn", []float64{1.57003641129, 1.57003775239, 69.5880755782, 69.5880769193}}, + {"wk", []float64{22.5, 28.125, 101.25, 112.5}}, + {"ndjbb8w3n", []float64{-78.6152458191, -78.6152029037, 120.616750717, 120.616793633}}, + {"14pf3eqg4zd5", []float64{-78.3360836841, -78.3360835165, -124.026254117, -124.026253782}}, + {"9j", []float64{28.125, 33.75, -135.0, -123.75}}, + {"fr6ng34", []float64{86.9732666016, 86.9746398926, -75.7919311523, -75.7905578613}}, + {"p3ggurx2c", []float64{-79.455742836, -79.4556999207, 151.720204353, 151.720247269}}, + {"1h1pg1myn06", []float64{-66.1297975481, -66.129796207, -133.453757465, -133.453756124}}, + {"cqsue", []float64{82.353515625, 82.3974609375, -116.938476562, -116.89453125}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"s8jkw", []float64{0.791015625, 0.8349609375, 30.146484375, 30.1904296875}}, + {"67", []float64{-28.125, -22.5, 
-78.75, -67.5}}, + {"ywe4mn", []float64{81.9909667969, 81.9964599609, 116.938476562, 116.949462891}}, + {"0f5te71q9g", []float64{-77.7655917406, -77.7655863762, -141.183511019, -141.18350029}}, + {"v9s6tw70swwv", []float64{53.911406938, 53.9114071056, 73.7225837633, 73.7225840986}}, + {"0jbutv", []float64{-56.8377685547, -56.8322753906, -178.692626953, -178.681640625}}, + {"bn271bp", []float64{80.68359375, 80.684967041, -179.561920166, -179.560546875}}, + {"1vvyth", []float64{-56.4916992188, -56.4862060547, -92.9443359375, -92.9333496094}}, + {"7ruk94vup", []float64{-0.59944152832, -0.599398612976, -27.7212953568, -27.7212524414}}, + {"3hf", []float64{-18.28125, -16.875, -132.1875, -130.78125}}, + {"741rwgds3m4k", []float64{-32.4116574973, -32.4116573296, -42.9420667514, -42.9420664161}}, + {"2pye", []float64{-0.87890625, -0.703125, -170.859375, -170.5078125}}, + {"2", []float64{-45.0, 0.0, -180.0, -135.0}}, + {"e7", []float64{16.875, 22.5, -33.75, -22.5}}, + {"f", []float64{45.0, 90.0, -90.0, -45.0}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"4c5p5ke", []float64{-83.1198120117, -83.1184387207, -51.8843078613, -51.8829345703}}, + {"h7q", []float64{-71.71875, -70.3125, 19.6875, 21.09375}}, + {"4fjp8", []float64{-77.431640625, -77.3876953125, -49.21875, -49.1748046875}}, + {"p2cbvvvdt8", []float64{-85.6173992157, -85.6173938513, 148.971412182, 148.971422911}}, + {"xxjtqz46qmm", []float64{40.3367181122, 40.3367194533, 165.534370691, 165.534372032}}, + {"w1e", []float64{8.4375, 9.84375, 94.21875, 95.625}}, + {"fxpg4v3e", []float64{84.9316978455, 84.9318695068, -56.4786529541, -56.4783096313}}, + {"3be6u", []float64{-41.7041015625, -41.66015625, -96.50390625, -96.4599609375}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"seqqvkuphz", []float64{19.4951051474, 19.4951105118, 31.5254724026, 31.5254831314}}, + {"txy2t7xx", []float64{43.7020683289, 43.7022399902, 76.5300750732, 76.530418396}}, + {"s2hc2d2s", []float64{0.232772827148, 0.232944488525, 
17.9523468018, 17.9526901245}}, + {"8zr0n4f62k", []float64{40.7967638969, 40.7967692614, -136.139477491, -136.139466763}}, + {"th1vxpfxnp9", []float64{23.5106107593, 23.5106121004, 47.7722467482, 47.7722480893}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"33", []float64{-39.375, -33.75, -123.75, -112.5}}, + {"gu", []float64{67.5, 73.125, -11.25, 0.0}}, + {"9vq49", []float64{29.970703125, 30.0146484375, -92.7685546875, -92.724609375}}, + {"tm", []float64{28.125, 33.75, 56.25, 67.5}}, + {"dpzw0p", []float64{44.6868896484, 44.6923828125, -79.453125, -79.4421386719}}, + {"gwg12", []float64{83.1884765625, 83.232421875, -18.28125, -18.2373046875}}, + {"b8vphv0m5k", []float64{50.4775643349, 50.4775696993, -150.259526968, -150.259516239}}, + {"pgpffhw1", []float64{-72.6167106628, -72.6165390015, 179.744567871, 179.744911194}}, + {"3r3w", []float64{-3.1640625, -2.98828125, -121.640625, -121.2890625}}, + {"u1d", []float64{53.4375, 54.84375, 2.8125, 4.21875}}, + {"mznb8v5xu6", []float64{-5.50830245018, -5.50829708576, 88.2801353931, 88.280146122}}, + {"8mb57vjrex4", []float64{32.9438298941, 32.9438312352, -168.577842414, -168.577841073}}, + {"zm", []float64{73.125, 78.75, 146.25, 157.5}}, + {"c9ef6tm74sg", []float64{53.8623873889, 53.86238873, -107.109378129, -107.109376788}}, + {"spww", []float64{43.2421875, 43.41796875, 9.140625, 9.4921875}}, + {"snp97n", []float64{34.0026855469, 34.0081787109, 10.6787109375, 10.6896972656}}, + {"zp9r6emsk8xx", []float64{88.4805002622, 88.4805004299, 136.875432059, 136.875432394}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"18zsh9bg", []float64{-85.0679969788, -85.0678253174, -101.754341125, -101.753997803}}, + {"v28", []float64{47.8125, 49.21875, 56.25, 57.65625}}, + {"4e", []float64{-73.125, -67.5, -67.5, -56.25}}, + {"evn0wp56", []float64{28.2516860962, 28.2518577576, -2.5443649292, -2.54402160645}}, + {"uyf9v", []float64{83.2763671875, 83.3203125, 37.4853515625, 37.529296875}}, + {"d7", []float64{16.875, 22.5, -78.75, 
-67.5}}, + {"05", []float64{-73.125, -67.5, -180.0, -168.75}}, + {"ujj8", []float64{73.125, 73.30078125, 7.734375, 8.0859375}}, + {"wcb7n8", []float64{10.37109375, 10.3765869141, 124.387207031, 124.398193359}}, + {"r35s2y4e2", []float64{-38.5944128036, -38.5943698883, 151.208267212, 151.208310127}}, + {"k", []float64{-45.0, 0.0, 0.0, 45.0}}, + {"8tm3h7b1f", []float64{29.7279310226, 29.727973938, -149.930334091, -149.930291176}}, + {"3xecw9gsguw3", []float64{-2.53837538883, -2.53837522119, -106.935942136, -106.9359418}}, + {"hqs10v", []float64{-53.2342529297, -53.2287597656, 16.9079589844, 16.9189453125}}, + {"b21g", []float64{45.52734375, 45.703125, -166.2890625, -165.9375}}, + {"vphhpnjt5b", []float64{85.1119422913, 85.1119476557, 50.9403312206, 50.9403419495}}, + {"kbd", []float64{-42.1875, -40.78125, 36.5625, 37.96875}}, + {"2c", []float64{-39.375, -33.75, -146.25, -135.0}}, + {"07ur", []float64{-67.67578125, -67.5, -162.7734375, -162.421875}}, + {"8e5ky1", []float64{17.7154541016, 17.7209472656, -152.666015625, -152.655029297}}, + {"k2w84t", []float64{-42.1600341797, -42.1545410156, 20.5004882812, 20.5114746094}}, + {"p9t4ncex81m", []float64{-81.2014035881, -81.201402247, 164.832694083, 164.832695425}}, + {"q67rduzsu6uz", []float64{-30.9984667785, -30.9984666109, 105.951650552, 105.951650888}}, + {"udwkypp0v", []float64{59.936041832, 59.9360847473, 31.5625619888, 31.5626049042}}, + {"pjsu1q9qg", []float64{-58.3225107193, -58.322467804, 141.7364645, 141.736507416}}, + {"2kj2w9b021b", []float64{-22.4024440348, -22.4024426937, -161.081542969, -161.081541628}}, + {"5k0", []float64{-67.5, -66.09375, -33.75, -32.34375}}, + {"t626vs8j", []float64{13.1652259827, 13.165397644, 56.8432617188, 56.8436050415}}, + {"hd0z4zr73", []float64{-77.4791479111, -77.4791049957, 23.6855363846, 23.6855792999}}, + {"79gjppfekhhk", []float64{-34.2341917008, -34.2341915332, -17.9700222239, -17.9700218886}}, + {"u9u", []float64{54.84375, 56.25, 28.125, 29.53125}}, + {"5zbfmj3n30", 
[]float64{-45.9808301926, -45.9808248281, -9.97416973114, -9.9741590023}}, + {"1w1nt3g4t9pp", []float64{-55.0973731466, -55.0973729789, -110.858671814, -110.858671479}}, + {"f6bh910940", []float64{61.2654304504, 61.2654358149, -78.7052822113, -78.7052714825}}, + {"r65q38x", []float64{-32.6486206055, -32.6472473145, 150.895843506, 150.897216797}}, + {"xq2", []float64{35.15625, 36.5625, 146.25, 147.65625}}, + {"q87xbntvdv8d", []float64{-42.1947657689, -42.1947656013, 117.429890111, 117.429890446}}, + {"w1zhgmbpw", []float64{10.7115840912, 10.7116270065, 99.9868297577, 99.986872673}}, + {"5n", []float64{-56.25, -50.625, -45.0, -33.75}}, + {"9dz", []float64{15.46875, 16.875, -102.65625, -101.25}}, + {"n8r794hh15gv", []float64{-87.9668216966, -87.966821529, 122.744798921, 122.744799256}}, + {"px78re9", []float64{-49.1555786133, -49.1542053223, 162.752838135, 162.754211426}}, + {"3pps", []float64{-4.921875, -4.74609375, -124.453125, -124.1015625}}, + {"3s6um", []float64{-20.3466796875, -20.302734375, -108.413085938, -108.369140625}}, + {"9dj7zre6t7", []float64{11.9508236647, 11.9508290291, -104.793895483, -104.793884754}}, + {"4v1b", []float64{-61.875, -61.69921875, -53.7890625, -53.4375}}, + {"1k35z", []float64{-65.4345703125, -65.390625, -122.036132812, -121.9921875}}, + {"7z9n57", []float64{-1.74133300781, -1.73583984375, -9.70092773438, -9.68994140625}}, + {"3gzg", []float64{-23.37890625, -23.203125, -90.3515625, -90.0}}, + {"hy", []float64{-56.25, -50.625, 33.75, 45.0}}, + {"2rj6t", []float64{-5.185546875, -5.1416015625, -161.147460938, -161.103515625}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"dp44nc1t", []float64{39.7329139709, 39.7330856323, -86.8888092041, -86.8884658813}}, + {"0x1", []float64{-50.625, -49.21875, -156.09375, -154.6875}}, + {"dmwxxf", []float64{32.2668457031, 32.2723388672, -69.2687988281, -69.2578125}}, + {"khy29", []float64{-18.193359375, -18.1494140625, 8.8330078125, 8.876953125}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"phn4", 
[]float64{-67.1484375, -66.97265625, 143.4375, 143.7890625}}, + {"qzhvp", []float64{-4.74609375, -4.7021484375, 130.737304688, 130.78125}}, + {"3n", []float64{-11.25, -5.625, -135.0, -123.75}}, + {"0nx", []float64{-53.4375, -52.03125, -170.15625, -168.75}}, + {"19uwx04h21", []float64{-79.0129369497, -79.0129315853, -105.86151123, -105.861500502}}, + {"7ur1q", []float64{-20.8740234375, -20.830078125, -1.142578125, -1.0986328125}}, + {"8yn6q9vmm", []float64{34.1560220718, 34.1560649872, -137.167868614, -137.167825699}}, + {"m4zk", []float64{-28.828125, -28.65234375, 55.1953125, 55.546875}}, + {"9bgzpypspd0", []float64{5.48287510872, 5.48287644982, -95.6253647804, -95.6253634393}}, + {"y1s", []float64{53.4375, 54.84375, 95.625, 97.03125}}, + {"qsyp207nvy", []float64{-17.0042717457, -17.0042663813, 120.941866636, 120.941877365}}, + {"rfb", []float64{-29.53125, -28.125, 168.75, 170.15625}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"5exm5p63", []float64{-69.3935966492, -69.3934249878, -12.1697616577, -12.169418335}}, + {"cnv22fdkruw", []float64{83.0271819234, 83.0271832645, -127.58079797, -127.580796629}}, + {"n7vg", []float64{-68.37890625, -68.203125, 109.3359375, 109.6875}}, + {"whvgd2h3sz9", []float64{27.3342821002, 27.3342834413, 98.1908561289, 98.19085747}}, + {"shbfuzk8vr", []float64{27.2421401739, 27.2421455383, 1.2698328495, 1.26984357834}}, + {"44vmk", []float64{-73.6083984375, -73.564453125, -82.44140625, -82.3974609375}}, + {"uhd1mfq", []float64{70.5445861816, 70.5459594727, 3.07342529297, 3.07479858398}}, + {"7bz", []float64{-40.78125, -39.375, -1.40625, 0.0}}, + {"h5b2wkdqpz", []float64{-68.7925726175, -68.7925672531, 0.629643201828, 0.629653930664}}, + {"h1", []float64{-84.375, -78.75, 0.0, 11.25}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"408bm1", []float64{-87.1380615234, -87.1325683594, -88.7255859375, -88.7145996094}}, + {"ggysyy5e2be6", []float64{66.9622308388, 66.9622310065, -1.80790107697, -1.8079007417}}, + {"w4u7dn8m9ndw", 
[]float64{16.1206699535, 16.1206701212, 96.0648427159, 96.0648430511}}, + {"yq", []float64{78.75, 84.375, 101.25, 112.5}}, + {"2nwuht4w", []float64{-7.70587921143, -7.70570755005, -170.306625366, -170.306282043}}, + {"v5gqe", []float64{67.236328125, 67.2802734375, 49.7021484375, 49.74609375}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"cmehghzjm1", []float64{76.7994600534, 76.7994654179, -119.389586449, -119.38957572}}, + {"u207q361d", []float64{45.5784130096, 45.578455925, 11.8790531158, 11.8790960312}}, + {"n4pgq345pp", []float64{-78.1726652384, -78.172659874, 101.176142693, 101.176153421}}, + {"b2mn8", []float64{47.548828125, 47.5927734375, -161.71875, -161.674804688}}, + {"qbe6mp0g8et3", []float64{-41.7529202811, -41.7529201135, 128.541097529, 128.541097865}}, + {"sr04m", []float64{39.7705078125, 39.814453125, 11.4697265625, 11.513671875}}, + {"hfr0y7u988jx", []float64{-77.1910560317, -77.1910558641, 43.8746168464, 43.8746171817}}, + {"jrgxg5qkg7p", []float64{-45.0252610445, -45.0252597034, 61.3124428689, 61.3124442101}}, + {"gryut", []float64{89.384765625, 89.4287109375, -24.0380859375, -23.994140625}}, + {"14d5b766ppxg", []float64{-75.2600834705, -75.2600833029, -132.173112966, -132.173112631}}, + {"0ede2rgwt2", []float64{-69.6975231171, -69.6975177526, -153.968356848, -153.968346119}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"dj", []float64{28.125, 33.75, -90.0, -78.75}}, + {"xf", []float64{11.25, 16.875, 168.75, 180.0}}, + {"szf3p5k9rmx", []float64{43.7876281142, 43.7876294553, 37.228180021, 37.2281813622}}, + {"b9fqkhfem7", []float64{55.9690493345, 55.9690546989, -154.156497717, -154.156486988}}, + {"t7zw8x5c", []float64{22.2749519348, 22.2751235962, 66.8239974976, 66.8243408203}}, + {"f87dmh", []float64{46.8237304688, 46.8292236328, -62.3583984375, -62.3474121094}}, + {"yrd1swq12", []float64{87.4857187271, 87.4857616425, 104.268493652, 104.268536568}}, + {"s2", []float64{0.0, 5.625, 11.25, 22.5}}, + {"q9dhkgwy4kum", 
[]float64{-35.7951473258, -35.7951471582, 115.530612208, 115.530612543}}, + {"7dr", []float64{-32.34375, -30.9375, -12.65625, -11.25}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"2rpk2ey43dw", []float64{-4.85693067312, -4.85692933202, -158.524402678, -158.524401337}}, + {"3wfeg", []float64{-6.3720703125, -6.328125, -108.852539062, -108.80859375}}, + {"ke5k4j", []float64{-27.3944091797, -27.3889160156, 27.158203125, 27.1691894531}}, + {"z0xq", []float64{48.8671875, 49.04296875, 145.1953125, 145.546875}}, + {"w1sy", []float64{9.4921875, 9.66796875, 96.6796875, 97.03125}}, + {"eqm14", []float64{35.33203125, 35.3759765625, -26.630859375, -26.5869140625}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"hjp9d9f", []float64{-61.6017150879, -61.6003417969, 10.6594848633, 10.6608581543}}, + {"p92v0", []float64{-82.08984375, -82.0458984375, 158.5546875, 158.598632812}}, + {"36m7g02m", []float64{-31.6823387146, -31.6821670532, -116.23500824, -116.234664917}}, + {"5g70e57zjrf", []float64{-71.6117633879, -71.6117620468, -6.89403623343, -6.89403489232}}, + {"65rkyfq4se", []float64{-25.8709841967, -25.8709788322, -79.4996237755, -79.4996130466}}, + {"eev1", []float64{21.26953125, 21.4453125, -15.46875, -15.1171875}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"m75926t", []float64{-27.8915405273, -27.8901672363, 61.1897277832, 61.1911010742}}, + {"1kjyeb", []float64{-66.357421875, -66.3519287109, -115.499267578, -115.48828125}}, + {"fb8rk2yfwmrp", []float64{49.0914924257, 49.0914925933, -55.7021225989, -55.7021222636}}, + {"y2qhd0j8x", []float64{47.1973514557, 47.197394371, 109.783244133, 109.783287048}}, + {"m2", []float64{-45.0, -39.375, 56.25, 67.5}}, + {"0543np5pgd23", []float64{-72.9094239883, -72.9094238207, -176.567995213, -176.567994878}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"d4h5zdhe5gy", []float64{11.9207011163, 11.9207024574, -84.0390613675, -84.0390600264}}, + {"9rcd", []float64{43.9453125, 44.12109375, -121.640625, -121.2890625}}, + 
{"ne9nrh75tq3", []float64{-69.1898868978, -69.1898855567, 114.218213707, 114.218215048}}, + {"7wk7", []float64{-9.31640625, -9.140625, -16.5234375, -16.171875}}, + {"995f97e", []float64{6.08367919922, 6.08505249023, -107.167510986, -107.166137695}}, + {"60kmung", []float64{-42.5459289551, -42.5445556641, -83.843536377, -83.8421630859}}, + {"845", []float64{11.25, 12.65625, -175.78125, -174.375}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"jehdxn0", []float64{-72.6525878906, -72.6512145996, 74.1357421875, 74.1371154785}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"1d", []float64{-78.75, -73.125, -112.5, -101.25}}, + {"rbjy", []float64{-43.9453125, -43.76953125, 176.8359375, 177.1875}}, + {"r8qgzf4r9uy", []float64{-42.9222710431, -42.922269702, 167.335936725, 167.335938066}}, + {"k5p", []float64{-28.125, -26.71875, 9.84375, 11.25}}, + {"f4z7", []float64{60.99609375, 61.171875, -79.8046875, -79.453125}}, + {"7rp35b", []float64{-5.44921875, -5.44372558594, -23.3898925781, -23.37890625}}, + {"zn71yyn0pbc", []float64{80.4968301952, 80.4968315363, 139.52395454, 139.523955882}}, + {"ppj7", []float64{-50.09765625, -49.921875, 142.3828125, 142.734375}}, + {"mqv3q", []float64{-6.8115234375, -6.767578125, 63.896484375, 63.9404296875}}, + {"tsdtmfq", []float64{26.2477111816, 26.2490844727, 71.276550293, 71.277923584}}, + {"72ey8b14uynx", []float64{-41.0444164462, -41.0444162786, -28.4420176595, -28.4420173243}}, + {"7qrgb", []float64{-9.1845703125, -9.140625, -22.8515625, -22.8076171875}}, + {"w7zmkdpezcm", []float64{22.0282383263, 22.0282396674, 111.653705388, 111.653706729}}, + {"kqwr1dh9jdbc", []float64{-7.19585834071, -7.19585817307, 20.1113973185, 20.1113976538}}, + {"kv9jx", []float64{-13.095703125, -13.0517578125, 35.4638671875, 35.5078125}}, + {"09", []float64{-84.375, -78.75, -157.5, -146.25}}, + {"f8ztmmp0", []float64{50.1690673828, 50.1692390442, -56.7127990723, -56.7124557495}}, + {"k5dj8cuwbxjg", []float64{-24.3348933198, -24.3348931521, 
2.85166796297, 2.85166829824}}, + {"xd72j5qndwhn", []float64{12.6752517745, 12.6752519421, 162.298391461, 162.298391797}}, + {"esp42d", []float64{22.9064941406, 22.9119873047, -12.6342773438, -12.6232910156}}, + {"5sbfys", []float64{-62.7758789062, -62.7703857422, -21.1596679688, -21.1486816406}}, + {"8wsz02n", []float64{37.79296875, 37.794342041, -150.801086426, -150.799713135}}, + {"zeghw8", []float64{66.884765625, 66.8902587891, 162.004394531, 162.015380859}}, + {"u0xg7ug", []float64{48.4098815918, 48.4112548828, 11.0673522949, 11.0687255859}}, + {"0jb11", []float64{-57.48046875, -57.4365234375, -179.956054688, -179.912109375}}, + {"xv8cwtybm", []float64{31.2328004837, 31.232843399, 170.099816322, 170.099859238}}, + {"ef0cwqt7", []float64{11.5498924255, 11.5500640869, -9.91344451904, -9.91310119629}}, + {"hrh5k", []float64{-50.0537109375, -50.009765625, 17.05078125, 17.0947265625}}, + {"pnpdsx4eb", []float64{-55.7714509964, -55.7714080811, 145.748062134, 145.748105049}}, + {"8g2sx4gn", []float64{19.0884017944, 19.0885734558, -145.235137939, -145.234794617}}, + {"tsue3yr4z", []float64{27.3248434067, 27.324886322, 73.9149427414, 73.9149856567}}, + {"k4vq", []float64{-28.4765625, -28.30078125, 7.3828125, 7.734375}}, + {"mr1f1d430h", []float64{-5.26225805283, -5.26225268841, 58.7799453735, 58.7799561024}}, + {"dtuqkjybm", []float64{33.4740114212, 33.4740543365, -61.3381719589, -61.3381290436}}, + {"p00zpfbj5350", []float64{-88.7535613775, -88.7535612099, 136.39540717, 136.395407505}}, + {"n16jy8wrg38", []float64{-81.9539228082, -81.9539214671, 93.106867075, 93.1068684161}}, + {"3ckf9t6", []float64{-37.5004577637, -37.4990844727, -94.5016479492, -94.5002746582}}, + {"vvch78h7q7", []float64{78.0913943052, 78.0913996696, 80.3161633015, 80.3161740303}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"qkr8dj44", []float64{-20.9780502319, -20.9778785706, 111.887512207, 111.88785553}}, + {"s5dw7s", []float64{20.8081054688, 20.8135986328, 3.66943359375, 3.68041992188}}, + 
{"tpt", []float64{42.1875, 43.59375, 52.03125, 53.4375}}, + {"6vqn07ep", []float64{-14.3936347961, -14.3934631348, -47.7973937988, -47.7970504761}}, + {"7zbup2", []float64{-0.703125, -0.697631835938, -9.87670898438, -9.86572265625}}, + {"xd0j0wrn39f8", []float64{12.1643207967, 12.1643209644, 157.531653419, 157.531653754}}, + {"254kywz4", []float64{-27.2526168823, -27.2524452209, -176.540679932, -176.540336609}}, + {"6pkmr1875rp", []float64{-3.28710615635, -3.28710481524, -83.7153281271, -83.715326786}}, + {"69bmhmbw0de", []float64{-34.2447146773, -34.2447133362, -66.9609577954, -66.9609564543}}, + {"47jd", []float64{-72.7734375, -72.59765625, -71.015625, -70.6640625}}, + {"mw3ngtnj", []float64{-8.6289024353, -8.62873077393, 69.0682983398, 69.0686416626}}, + {"v", []float64{45.0, 90.0, 45.0, 90.0}}, + {"4uyq1", []float64{-62.2265625, -62.1826171875, -47.4169921875, -47.373046875}}, + {"9748v3e", []float64{17.0150756836, 17.0164489746, -119.999542236, -119.998168945}}, + {"sjy7", []float64{32.87109375, 33.046875, 8.7890625, 9.140625}}, + {"nc1jb2kb", []float64{-83.3628845215, -83.3627128601, 125.17375946, 125.174102783}}, + {"ffryw", []float64{58.798828125, 58.8427734375, -45.087890625, -45.0439453125}}, + {"3qfr7scg5s", []float64{-5.7302069664, -5.73020160198, -120.429575443, -120.429564714}}, + {"1x", []float64{-50.625, -45.0, -112.5, -101.25}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"n4wk", []float64{-75.234375, -75.05859375, 98.7890625, 99.140625}}, + {"bw2d2", []float64{80.5517578125, 80.595703125, -156.796875, -156.752929688}}, + {"ztgehr9mpt", []float64{77.9131776094, 77.9131829739, 162.610681057, 162.610691786}}, + {"bnkb", []float64{80.15625, 80.33203125, -173.3203125, -172.96875}}, + {"q0fmcn", []float64{-39.7375488281, -39.7320556641, 93.2080078125, 93.2189941406}}, + {"e0e1sxt9gwm", []float64{3.11770454049, 3.1177058816, -40.5757860839, -40.5757847428}}, + {"9qc", []float64{37.96875, 39.375, 
-122.34375, -120.9375}}, + {"0cybm9snr", []float64{-80.1029920578, -80.1029491425, -136.51031971, -136.510276794}}, + {"fp", []float64{84.375, 90.0, -90.0, -78.75}}, + {"7u69k7", []float64{-20.8575439453, -20.8520507812, -7.54760742188, -7.53662109375}}, + {"guh3mbvnwv7y", []float64{67.7249914035, 67.7249915712, -5.01359079033, -5.01359045506}}, + {"vgw4wgnrd58e", []float64{65.1447393559, 65.1447395235, 87.4928004295, 87.4928007647}}, + {"rzk732w", []float64{-3.64471435547, -3.64334106445, 174.789733887, 174.791107178}}, + {"kf", []float64{-33.75, -28.125, 33.75, 45.0}}, + {"rcfr28t0", []float64{-33.8790893555, -33.8789176941, 171.942901611, 171.943244934}}, + {"5bqnms", []float64{-87.4731445312, -87.4676513672, -2.57080078125, -2.55981445312}}, + {"fs84w", []float64{70.751953125, 70.7958984375, -67.236328125, -67.1923828125}}, + {"mcjrsmx", []float64{-38.0264282227, -38.0250549316, 86.3291931152, 86.3305664062}}, + {"u84", []float64{45.0, 46.40625, 25.3125, 26.71875}}, + {"gkv4g14m", []float64{72.2084999084, 72.2086715698, -26.5838241577, -26.583480835}}, + {"27dhxu", []float64{-24.4995117188, -24.4940185547, -165.596923828, -165.5859375}}, + {"0v", []float64{-61.875, -56.25, -146.25, -135.0}}, + {"bpurn", []float64{89.82421875, 89.8681640625, -173.759765625, -173.715820312}}, + {"p5", []float64{-73.125, -67.5, 135.0, 146.25}}, + {"f3ffsuh", []float64{55.3051757812, 55.3065490723, -74.6685791016, -74.6672058105}}, + {"j0zbr0tb", []float64{-85.7345581055, -85.7343864441, 56.2139511108, 56.2142944336}}, + {"vyz", []float64{82.96875, 84.375, 88.59375, 90.0}}, + {"082p96b5ey", []float64{-87.2596514225, -87.2596460581, -157.444907427, -157.444896698}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"g030qs", []float64{46.4721679688, 46.4776611328, -43.3081054688, -43.2971191406}}, + {"54", []float64{-78.75, -73.125, -45.0, -33.75}}, + {"fp5rcptn2gc", []float64{85.7795964181, 85.7795977592, -85.3788422048, -85.3788408637}}, + {"dk8z85s6516h", []float64{26.650436148, 
26.6504363157, -77.6893445849, -77.6893442497}}, + {"3v1ebh18qujh", []float64{-16.1937826127, -16.193782445, -99.1382686794, -99.1382683441}}, + {"un50j3xf9", []float64{78.7586688995, 78.7587118149, 4.46014881134, 4.46019172668}}, + {"4b8y3gh", []float64{-86.0723876953, -86.0710144043, -55.1129150391, -55.111541748}}, + {"efdgh", []float64{14.58984375, 14.6337890625, -7.20703125, -7.1630859375}}, + {"1xxuk2", []float64{-47.0654296875, -47.0599365234, -101.414794922, -101.403808594}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"1z6", []float64{-49.21875, -47.8125, -98.4375, -97.03125}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"mxkjr1jdu5ru", []float64{-3.28991509974, -3.2899149321, 73.440352343, 73.4403526783}}, + {"x", []float64{0.0, 45.0, 135.0, 180.0}}, + {"3kpcn3sm", []float64{-22.315120697, -22.3149490356, -112.57106781, -112.570724487}}, + {"buk3t0ctrmt0", []float64{69.1749724746, 69.1749726422, -140.051333159, -140.051332824}}, + {"pp", []float64{-50.625, -45.0, 135.0, 146.25}}, + {"4h", []float64{-67.5, -61.875, -90.0, -78.75}}, + {"fjw1kcg4", []float64{76.1671829224, 76.1673545837, -81.3496398926, -81.3492965698}}, + {"877wsvjfz5", []float64{19.4517821074, 19.4517874718, -163.611187935, -163.611177206}}, + {"ru3", []float64{-21.09375, -19.6875, 170.15625, 171.5625}}, + {"yr", []float64{84.375, 90.0, 101.25, 112.5}}, + {"cu5x6cxq", []float64{68.7836837769, 68.7838554382, -96.1973190308, -96.196975708}}, + {"w04vuf4bdzjm", []float64{1.02185273543, 1.02185290307, 94.0798293427, 94.0798296779}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"4zdcmmp2p4", []float64{-47.5652968884, -47.5652915239, -52.1418428421, -52.1418321133}}, + {"eft02s1hu", []float64{14.1292333603, 14.1292762756, -4.19523239136, -4.19518947601}}, + {"zk4v9qdeg5q1", []float64{68.5031637736, 68.5031639412, 150.175689161, 150.175689496}}, + {"8xr", []float64{40.78125, 42.1875, -147.65625, -146.25}}, + {"3pxyrt", []float64{-1.68640136719, -1.68090820312, -123.771972656, 
-123.760986328}}, + {"cmh39xszs8", []float64{73.4311580658, 73.4311634302, -117.70080328, -117.700792551}}, + {"xrm9d0wb48", []float64{41.047668457, 41.0476738214, 154.081642628, 154.081653357}}, + {"d4bh0k", []float64{16.1938476562, 16.1993408203, -89.9890136719, -89.9780273438}}, + {"hk8", []float64{-64.6875, -63.28125, 11.25, 12.65625}}, + {"9hxqk54m0wn", []float64{26.4285027981, 26.4285041392, -124.625786841, -124.6257855}}, + {"mygnv0", []float64{-5.8447265625, -5.83923339844, 83.1884765625, 83.1994628906}}, + {"yrjmvs", []float64{85.4077148438, 85.4132080078, 108.874511719, 108.885498047}}, + {"52csyemvf12", []float64{-84.9274425209, -84.9274411798, -31.3469982147, -31.3469968736}}, + {"4jrvjj", []float64{-59.5623779297, -59.5568847656, -78.8818359375, -78.8708496094}}, + {"ys1", []float64{67.5, 68.90625, 113.90625, 115.3125}}, + {"unf91", []float64{83.14453125, 83.1884765625, 3.5595703125, 3.603515625}}, + {"h5che0vnt", []float64{-68.109998703, -68.1099557877, 1.5451669693, 1.54520988464}}, + {"ugrk3", []float64{64.0283203125, 64.072265625, 43.9892578125, 44.033203125}}, + {"9ush8c", []float64{26.1090087891, 26.1145019531, -95.5920410156, -95.5810546875}}, + {"q92pzb", []float64{-36.6064453125, -36.6009521484, 112.840576172, 112.8515625}}, + {"0e", []float64{-73.125, -67.5, -157.5, -146.25}}, + {"dbt1mchu", []float64{3.03840637207, 3.03857803345, -48.9595413208, -48.959197998}}, + {"98xv2m", []float64{3.76281738281, 3.76831054688, -101.590576172, -101.579589844}}, + {"rqd8u195kgu", []float64{-8.29684630036, -8.29684495926, 149.942988753, 149.942990094}}, + {"504wk58ccv", []float64{-88.8818138838, -88.8818085194, -41.3074886799, -41.307477951}}, + {"0dzjhbn", []float64{-73.65234375, -73.650970459, -147.43927002, -147.437896729}}, + {"sgcn", []float64{22.1484375, 22.32421875, 35.15625, 35.5078125}}, + {"46k78jw0x65w", []float64{-76.6982056573, -76.6982054897, -72.7648819238, -72.7648815885}}, + {"6w2cbxx3nf9", []float64{-9.49474900961, -9.4947476685, 
-66.4130924642, -66.4130911231}}, + {"zxmf4", []float64{86.1328125, 86.1767578125, 165.673828125, 165.717773438}}, + {"unf", []float64{82.96875, 84.375, 2.8125, 4.21875}}, + {"m4p", []float64{-33.75, -32.34375, 54.84375, 56.25}}, + {"dsc1rqss2w", []float64{26.9749438763, 26.9749492407, -65.7689452171, -65.7689344883}}, + {"cxp", []float64{84.375, 85.78125, -102.65625, -101.25}}, + {"zmh", []float64{73.125, 74.53125, 151.875, 153.28125}}, + {"tynvnjc8hdb", []float64{34.6605066955, 34.6605080366, 88.5081124306, 88.5081137717}}, + {"uk8hb", []float64{71.1474609375, 71.19140625, 11.25, 11.2939453125}}, + {"34d", []float64{-30.9375, -29.53125, -132.1875, -130.78125}}, + {"ts39vet4rzw5", []float64{24.2335202359, 24.2335204035, 69.8582813144, 69.8582816496}}, + {"3rt1fx5", []float64{-2.46643066406, -2.46505737305, -116.604766846, -116.603393555}}, + {"ujn8yhfpg", []float64{73.2842588425, 73.2843017578, 9.40717220306, 9.40721511841}}, + {"pdbvhzj", []float64{-73.6138916016, -73.6125183105, 158.770294189, 158.77166748}}, + {"q35", []float64{-39.375, -37.96875, 105.46875, 106.875}}, + {"szh5424hc", []float64{39.9031591415, 39.9032020569, 39.4766664505, 39.4767093658}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"tt1wjkr44e", []float64{29.2033928633, 29.2033982277, 69.8498082161, 69.8498189449}}, + {"1u3hdkn", []float64{-65.2807617188, -65.2793884277, -99.7366333008, -99.7352600098}}, + {"jc9", []float64{-81.5625, -80.15625, 80.15625, 81.5625}}, + {"627pp", []float64{-42.36328125, -42.3193359375, -74.2236328125, -74.1796875}}, + {"g46wqb4z", []float64{58.7560844421, 58.7562561035, -41.1839675903, -41.1836242676}}, + {"2407674", []float64{-33.1622314453, -33.1608581543, -179.546813965, -179.545440674}}, + {"3vbsrcxu", []float64{-11.9002532959, -11.9000816345, -100.195655823, -100.1953125}}, + {"u0mr9fpy", []float64{47.7366256714, 47.7367973328, 7.47035980225, 7.470703125}}, + {"p1s1", []float64{-81.38671875, -81.2109375, 140.625, 140.9765625}}, + {"ce7y6s1ugjpu", 
[]float64{64.4026983529, 64.4026985206, -107.11415682, -107.114156485}}, + {"tujn", []float64{23.5546875, 23.73046875, 85.78125, 86.1328125}}, + {"fes", []float64{64.6875, 66.09375, -61.875, -60.46875}}, + {"28te871t29y", []float64{-41.5548755229, -41.5548741817, -149.752549231, -149.75254789}}, + {"2z9j0591", []float64{-1.9141960144, -1.91402435303, -144.842376709, -144.842033386}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + {"90", []float64{0.0, 5.625, -135.0, -123.75}}, + {"jbfm12r", []float64{-84.900970459, -84.899597168, 81.9786071777, 81.9799804688}}, + {"y0ws", []float64{48.515625, 48.69140625, 99.140625, 99.4921875}}, + {"m2", []float64{-45.0, -39.375, 56.25, 67.5}}, + {"gpspv95sz", []float64{88.5561132431, 88.5561561584, -39.1281938553, -39.1281509399}}, + {"7k8u95cyjdx6", []float64{-18.8748412952, -18.8748411275, -32.6487181708, -32.6487178355}}, + {"c1fe0r", []float64{55.4095458984, 55.4150390625, -131.473388672, -131.462402344}}, + {"668wjecj2d", []float64{-29.8613011837, -29.8612958193, -77.8037810326, -77.8037703037}}, + {"dnq3", []float64{35.33203125, 35.5078125, -81.2109375, -80.859375}}, + {"m3sxdxnvmrr", []float64{-35.2047483623, -35.2047470212, 62.6974926889, 62.69749403}}, + {"zz3qpfvqzu", []float64{86.8522238731, 86.8522292376, 170.855931044, 170.855941772}}, + {"98mjjx8bu", []float64{2.3264837265, 2.32652664185, -105.225849152, -105.225806236}}, + {"pkmusy0e4j35", []float64{-65.2692317404, -65.2692315727, 154.545451552, 154.545451887}}, + {"j3f9dtm5r5n", []float64{-79.8631650209, -79.8631636798, 59.8826631904, 59.8826645315}}, + {"67up3c0uh9jn", []float64{-22.6256497577, -22.62564959, -73.0468659103, -73.046865575}}, + {"6q0fd9wn2", []float64{-10.8012342453, -10.80119133, -77.5772094727, -77.5771665573}}, + {"t82e5zrs", []float64{1.97410583496, 1.97427749634, 68.3782196045, 68.3785629272}}, + {"0hstxh", []float64{-63.6987304688, -63.6932373047, -173.364257812, -173.353271484}}, + {"qe1egcuetqe", []float64{-27.4555715919, -27.4555702507, 
114.78057906, 114.780580401}}, + {"yhp25wc4v", []float64{67.5375509262, 67.5375938416, 100.350708961, 100.350751877}}, + {"z6uvby2nrt4k", []float64{61.5149248391, 61.5149250068, 152.962971367, 152.962971702}}, + {"29sd0863cx", []float64{-36.2092262506, -36.2092208862, -151.146748066, -151.146737337}}, + {"kvnx614", []float64{-15.5950927734, -15.5937194824, 42.981262207, 42.982635498}}, + {"mu1srk07", []float64{-21.7304420471, -21.7302703857, 81.1783218384, 81.1786651611}}, + {"5bz5bmq", []float64{-85.0932312012, -85.0918579102, -1.38702392578, -1.38565063477}}, + {"fu4yx9fr8gtk", []float64{68.6534980685, 68.6534982361, -52.0500935242, -52.0500931889}}, + {"3hyhj92rn", []float64{-17.5700569153, -17.5700139999, -126.320199966, -126.320157051}}, + {"345nw", []float64{-32.607421875, -32.5634765625, -130.517578125, -130.473632812}}, + {"q5f2p327mhy", []float64{-23.8988001645, -23.8987988234, 93.4832319617, 93.4832333028}}, + {"0wmufb9", []float64{-54.0060424805, -54.0046691895, -149.2918396, -149.290466309}}, + {"r", []float64{-45.0, 0.0, 135.0, 180.0}}, + {"07d2sde", []float64{-70.2108764648, -70.2095031738, -165.384063721, -165.38269043}}, + {"d0r2", []float64{1.40625, 1.58203125, -79.8046875, -79.453125}}, + {"znegsexfs23h", []float64{82.1973916143, 82.197391782, 140.482018143, 140.482018478}}, + {"sfr69qxg", []float64{13.1319236755, 13.1320953369, 44.010887146, 44.0112304688}}, + {"tr44b8brc", []float64{39.8638486862, 39.8638916016, 59.0848588943, 59.0849018097}}, + {"tbnqctecsf", []float64{1.21700406075, 1.21700942516, 87.6103341579, 87.6103448868}}, + {"jpfy538qu", []float64{-45.3421640396, -45.3421211243, 49.0105247498, 49.0105676651}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"gskrg0z5e", []float64{70.2732753754, 70.2733182907, -16.3818597794, -16.381816864}}, + {"6cz", []float64{-35.15625, -33.75, -46.40625, -45.0}}, + {"u67hm7b47423", []float64{58.4243181534, 58.424318321, 15.6995919719, 15.6995923072}}, + {"j154zhnnkyt3", []float64{-83.8685209863, 
-83.8685208187, 49.5348178223, 49.5348181576}}, + {"muqdpev4smv", []float64{-20.7211281359, -20.7211267948, 88.2272703946, 88.2272717357}}, + {"47h3upynmsru", []float64{-72.7737144381, -72.7737142704, -72.589170076, -72.5891697407}}, + {"g6j200", []float64{56.25, 56.2554931641, -26.3671875, -26.3562011719}}, + {"tw", []float64{33.75, 39.375, 67.5, 78.75}}, + {"c0pjhr520q", []float64{45.9173905849, 45.9173959494, -124.965008497, -124.964997768}}, + {"8nx", []float64{36.5625, 37.96875, -170.15625, -168.75}}, + {"47b2wvtns", []float64{-68.7870311737, -68.7869882584, -78.0947685242, -78.0947256088}}, + {"vrsbq", []float64{87.2314453125, 87.275390625, 63.193359375, 63.2373046875}}, + {"sz", []float64{39.375, 45.0, 33.75, 45.0}}, + {"xe61b0bnw", []float64{18.5941028595, 18.5941457748, 160.312757492, 160.312800407}}, + {"dky6qedz3w", []float64{27.1347606182, 27.1347659826, -69.6714520454, -69.6714413166}}, + {"vmvkqx2hb", []float64{78.1314611435, 78.1315040588, 63.9184570312, 63.9184999466}}, + {"t96m49xgr1y", []float64{7.9189632833, 7.9189646244, 70.7848772407, 70.7848785818}}, + {"brw2urrqcfex", []float64{87.3603346758, 87.3603348434, -159.764133766, -159.764133431}}, + {"z7m8", []float64{63.28125, 63.45703125, 153.984375, 154.3359375}}, + {"wm6w7f38", []float64{30.6422424316, 30.642414093, 104.932479858, 104.932823181}}, + {"rxj23rtt4y", []float64{-5.53896546364, -5.53896009922, 164.945415258, 164.945425987}}, + {"sfr9xsyzfn", []float64{12.9473769665, 12.9473823309, 44.6358203888, 44.6358311176}}, + {"9ubf9uq02e", []float64{27.1816080809, 27.1816134453, -100.110146999, -100.110136271}}, + {"kj25zp1gb6j", []float64{-14.7704637051, -14.770462364, 0.310037881136, 0.31003922224}}, + {"x4f", []float64{15.46875, 16.875, 137.8125, 139.21875}}, + {"xnn27kkf4c", []float64{33.8176399469, 33.8176453114, 143.938525915, 143.938536644}}, + {"61bhs9byn4", []float64{-34.3545806408, -34.3545752764, -89.8009586334, -89.8009479046}}, + {"rv2sve92mngr", []float64{-14.6144826896, 
-14.614482522, 169.696759768, 169.696760103}}, + {"zkvq2w", []float64{72.8503417969, 72.8558349609, 153.654785156, 153.665771484}}, + {"qprmp68h7kcd", []float64{-3.32535546273, -3.32535529509, 100.514057502, 100.514057837}}, + {"77pmzubu", []float64{-27.0874786377, -27.0873069763, -23.2130813599, -23.2127380371}}, + {"q73t2sumh3b", []float64{-25.7689382136, -25.7689368725, 103.387366533, 103.387367874}}, + {"3kxch9c", []float64{-19.5021057129, -19.5007324219, -112.652435303, -112.651062012}}, + {"t", []float64{0.0, 45.0, 45.0, 90.0}}, + {"3um1y618chw", []float64{-20.7749935985, -20.7749922574, -93.9419808984, -93.9419795573}}, + {"45nj7sxww", []float64{-72.1763134003, -72.1762704849, -81.3981342316, -81.3980913162}}, + {"rnkyjdv404", []float64{-8.77360224724, -8.77359688282, 141.928253174, 141.928263903}}, + {"p3", []float64{-84.375, -78.75, 146.25, 157.5}}, + {"sxbz", []float64{44.82421875, 45.0, 23.5546875, 23.90625}}, + {"xuj2k", []float64{22.5439453125, 22.587890625, 176.30859375, 176.352539062}}, + {"yhp9", []float64{67.67578125, 67.8515625, 100.546875, 100.8984375}}, + {"1yq4", []float64{-54.4921875, -54.31640625, -92.8125, -92.4609375}}, + {"u4m2jkw", []float64{57.6809692383, 57.6823425293, 7.62176513672, 7.62313842773}}, + {"xb9", []float64{2.8125, 4.21875, 170.15625, 171.5625}}, + {"ebf4e478jp", []float64{4.67060029507, 4.67060565948, -8.30064296722, -8.30063223839}}, + {"y7venx9", []float64{66.6622924805, 66.6636657715, 109.271392822, 109.272766113}}, + {"8qu", []float64{37.96875, 39.375, -163.125, -161.71875}}, + {"jw2jbzms66", []float64{-53.7924420834, -53.7924367189, 67.5406086445, 67.5406193733}}, + {"n", []float64{-90.0, -45.0, 90.0, 135.0}}, + {"jbx", []float64{-87.1875, -85.78125, 88.59375, 90.0}}, + {"3v4n", []float64{-15.8203125, -15.64453125, -98.4375, -98.0859375}}, + {"0z1theg", []float64{-49.7254943848, -49.7241210938, -143.938751221, -143.93737793}}, + {"zbz00jf21m", []float64{49.2503625154, 49.2503678799, 178.596893549, 178.596904278}}, + 
{"dfpq2eg2", []float64{12.3692321777, 12.3694038391, -46.0282516479, -46.0279083252}}, + {"z2j5bc1ph562", []float64{45.6658919156, 45.6658920832, 153.315756954, 153.31575729}}, + {"3p3g", []float64{-3.69140625, -3.515625, -132.5390625, -132.1875}}, + {"4rfgeu3", []float64{-45.7676696777, -45.7662963867, -74.7166442871, -74.7152709961}}, + {"nykq", []float64{-53.7890625, -53.61328125, 129.7265625, 130.078125}}, + {"h", []float64{-90.0, -45.0, 0.0, 45.0}}, + {"85", []float64{16.875, 22.5, -180.0, -168.75}}, + {"bdsdxr", []float64{59.5404052734, 59.5458984375, -150.853271484, -150.842285156}}, + {"wsyt3duqg2", []float64{27.657866478, 27.6578718424, 121.71251893, 121.712529659}}, + {"90", []float64{0.0, 5.625, -135.0, -123.75}}, + {"butw", []float64{71.3671875, 71.54296875, -138.515625, -138.1640625}}, + {"ddhpjv6b7tqh", []float64{12.5093796104, 12.5093797781, -61.6183796525, -61.6183793172}}, + {"18ueqgd", []float64{-85.1907348633, -85.1893615723, -105.872497559, -105.871124268}}, + {"v2g8jh1", []float64{49.2407226562, 49.2420959473, 61.3929748535, 61.3943481445}}, + {"84umeh3gmupk", []float64{16.45947285, 16.4594730176, -173.888941817, -173.888941482}}, + {"s4g900", []float64{15.64453125, 15.6500244141, 4.921875, 4.93286132812}}, + {"0b313fz2", []float64{-88.3589172363, -88.358745575, -144.756889343, -144.756546021}}, + {"4q", []float64{-56.25, -50.625, -78.75, -67.5}}, + {"d61", []float64{11.25, 12.65625, -77.34375, -75.9375}}, + {"w5q5298pq", []float64{18.8620233536, 18.8620662689, 98.4597301483, 98.4597730637}}, + {"ushgx399", []float64{68.1236457825, 68.1238174438, 29.5003509521, 29.5006942749}}, + {"73ngt", []float64{-38.759765625, -38.7158203125, -24.0380859375, -23.994140625}}, + {"2f4smcem", []float64{-32.9938316345, -32.9936599731, -142.477226257, -142.476882935}}, + {"0", []float64{-90.0, -45.0, -180.0, -135.0}}, + {"8", []float64{0.0, 45.0, -180.0, -135.0}}, + {"5u14weqgz", []float64{-67.0420503616, -67.0420074463, -9.54853534698, -9.54849243164}}, + 
{"xxhuu8y4xb", []float64{40.214509964, 40.2145153284, 164.386013746, 164.386024475}}, + {"272xeqmj", []float64{-25.3652000427, -25.3650283813, -167.897186279, -167.896842957}}, + {"2trrhunrd1", []float64{-14.215015769, -14.2150104046, -147.087278366, -147.087267637}}, + {"e4", []float64{11.25, 16.875, -45.0, -33.75}}, + {"p5duz8tp", []float64{-69.4735908508, -69.4734191895, 139.203643799, 139.203987122}}, + {"5qprz78e", []float64{-54.8679542542, -54.8677825928, -23.2353973389, -23.2350540161}}, + {"ch5yq40qtu3", []float64{68.6107577384, 68.6107590795, -129.462299198, -129.462297857}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"7qmv9c5", []float64{-8.87145996094, -8.87008666992, -25.5830383301, -25.5816650391}}, + {"c", []float64{45.0, 90.0, -135.0, -90.0}}, + {"hp8s7", []float64{-47.0654296875, -47.021484375, 0.8349609375, 0.87890625}}, + {"9e4y04d17", []float64{17.9436349869, 17.9436779022, -108.629937172, -108.629894257}}, + {"39nh", []float64{-38.671875, -38.49609375, -104.0625, -103.7109375}}, + {"6", []float64{-45.0, 0.0, -90.0, -45.0}}, + {"pjpxe1pvyzhm", []float64{-60.5501220189, -60.5501218513, 145.689649321, 145.689649656}}, + {"drx", []float64{42.1875, 43.59375, -68.90625, -67.5}}, + {"zu1c5qg0s", []float64{67.7129459381, 67.7129888535, 171.3580513, 171.358094215}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"t8d", []float64{2.8125, 4.21875, 70.3125, 71.71875}}, + {"d47w70rwe23h", []float64{13.7573739141, 13.7573740818, -84.9358485639, -84.9358482286}}, + {"3t617", []float64{-15.2490234375, -15.205078125, -109.555664062, -109.51171875}}, + {"qnkq1pz", []float64{-8.74649047852, -8.7451171875, 96.0301208496, 96.0314941406}}, + {"fu", []float64{67.5, 73.125, -56.25, -45.0}}, + {"7vs", []float64{-14.0625, -12.65625, -5.625, -4.21875}}, + {"bztqz0h", []float64{88.3740234375, 88.3753967285, -138.554077148, -138.552703857}}, + {"b8j", []float64{45.0, 46.40625, -150.46875, -149.0625}}, + {"cetkxmq73", []float64{65.5079126358, 65.5079555511, -104.789958, 
-104.789915085}}, + {"p91", []float64{-84.375, -82.96875, 158.90625, 160.3125}}, + {"z4g7bn4w38", []float64{61.1619615555, 61.1619669199, 139.573810101, 139.573820829}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"g2eej64jbwj", []float64{48.3518493176, 48.3518506587, -28.5946373641, -28.594636023}}, + {"fwshzb30mj", []float64{82.398903966, 82.3989093304, -61.5328359604, -61.5328252316}}, + {"fv2mqt", []float64{75.4815673828, 75.4870605469, -55.6127929688, -55.6018066406}}, + {"bzr6m3zdun5", []float64{86.1868751049, 86.186876446, -135.813499242, -135.813497901}}, + {"et5rq77j8b", []float64{29.4182109833, 29.4182163477, -17.6508772373, -17.6508665085}}, + {"1c", []float64{-84.375, -78.75, -101.25, -90.0}}, + {"y1hyumh10jq", []float64{51.8391890824, 51.8391904235, 96.8719562888, 96.8719576299}}, + {"qd42djnqxq", []float64{-33.6334955692, -33.6334902048, 115.76084733, 115.760858059}}, + {"hsd9s", []float64{-64.423828125, -64.3798828125, 26.19140625, 26.2353515625}}, + {"8289gq947", []float64{3.156208992, 3.15625190735, -167.902550697, -167.902507782}}, + {"em37sw72zq4", []float64{30.1809775829, 30.180978924, -31.7896565795, -31.7896552384}}, + {"zms25", []float64{75.9375, 75.9814453125, 152.358398438, 152.40234375}}, + {"h25d54", []float64{-89.6374511719, -89.6319580078, 16.3037109375, 16.3146972656}}, + {"6qc7y2t4bb", []float64{-6.36885166168, -6.36884629726, -76.7106306553, -76.7106199265}}, + {"06vt5z8j", []float64{-73.6102867126, -73.6101150513, -160.850830078, -160.850486755}}, + {"37q3", []float64{-26.54296875, -26.3671875, -114.9609375, -114.609375}}, + {"sey9wu", []float64{21.3793945312, 21.3848876953, 31.9372558594, 31.9482421875}}, + {"qk0jrj", []float64{-21.5496826172, -21.5441894531, 101.557617188, 101.568603516}}, + {"8x6jjpm0", []float64{41.6999816895, 41.7001533508, -154.460906982, -154.46056366}}, + {"5j1etu", []float64{-61.2377929688, -61.2322998047, -42.6379394531, -42.626953125}}, + {"r6b", []float64{-29.53125, -28.125, 146.25, 147.65625}}, + 
{"ddu3vyj07", []float64{15.8093690872, 15.8094120026, -61.263756752, -61.2637138367}}, + {"m9fm5q2d91", []float64{-34.2425769567, -34.2425715923, 70.8076143265, 70.8076250553}}, + {"0pxdx", []float64{-47.373046875, -47.3291015625, -169.145507812, -169.1015625}}, + {"w", []float64{0.0, 45.0, 90.0, 135.0}}, + {"q1e", []float64{-36.5625, -35.15625, 94.21875, 95.625}}, + {"h3vxhm8tu", []float64{-78.8945817947, -78.8945388794, 19.172000885, 19.1720438004}}, + {"bcxsz", []float64{54.2724609375, 54.31640625, -135.395507812, -135.3515625}}, + {"crjh", []float64{85.078125, 85.25390625, -116.71875, -116.3671875}}, + {"bdqejqqgwj", []float64{58.2185536623, 58.2185590267, -148.119134903, -148.119124174}}, + {"x7zhc480u7", []float64{21.9425886869, 21.9425940514, 156.137877703, 156.137888432}}, + {"xhr7c9nd6js9", []float64{24.5713387616, 24.5713389292, 145.270248726, 145.270249061}}, + {"f25r3r", []float64{46.3128662109, 46.318359375, -74.1247558594, -74.1137695312}}, + {"b4v1e8zek", []float64{60.7370996475, 60.7371425629, -172.804470062, -172.804427147}}, + {"95cwh1k", []float64{22.1553039551, 22.1566772461, -132.709350586, -132.707977295}}, + {"kh1r", []float64{-21.26953125, -21.09375, 1.7578125, 2.109375}}, + {"7p", []float64{-5.625, 0.0, -45.0, -33.75}}, + {"mgsvj", []float64{-24.43359375, -24.3896484375, 85.6494140625, 85.693359375}}, + {"k70", []float64{-28.125, -26.71875, 11.25, 12.65625}}, + {"pxjr5g", []float64{-49.3780517578, -49.3725585938, 165.047607422, 165.05859375}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"81pv", []float64{6.50390625, 6.6796875, -169.1015625, -168.75}}, + {"jjg", []float64{-57.65625, -56.25, 49.21875, 50.625}}, + {"732kjtvw", []float64{-37.2330093384, -37.232837677, -33.1491851807, -33.1488418579}}, + {"kuc2", []float64{-18.28125, -18.10546875, 35.5078125, 35.859375}}, + {"wn91fmw18yp", []float64{36.9006192684, 36.9006206095, 91.5134082735, 91.5134096146}}, + {"5wdnzyz5r", []float64{-52.2133398056, -52.2132968903, -19.3370103836, 
-19.3369674683}}, + {"m682wkeu80", []float64{-30.8241176605, -30.8241122961, 56.8813705444, 56.8813812733}}, + {"r18jv9k", []float64{-35.5448913574, -35.5435180664, 135.247192383, 135.248565674}}, + {"zr079yhvttr", []float64{85.0241656601, 85.0241670012, 146.685235351, 146.685236692}}, + {"r4umz4vhvm", []float64{-28.5045593977, -28.5045540333, 141.291271448, 141.291282177}}, + {"58gdwpzc3zs", []float64{-85.2989700437, -85.2989687026, -17.3037296534, -17.3037283123}}, + {"64frgqpt0yj", []float64{-28.1350958347, -28.1350944936, -86.6827766597, -86.6827753186}}, + {"8n18eckkw5", []float64{33.8455456495, 33.8455510139, -177.719736099, -177.71972537}}, + {"mz326c81b1", []float64{-4.16625916958, -4.16625380516, 80.6286621094, 80.6286728382}}, + {"hx", []float64{-50.625, -45.0, 22.5, 33.75}}, + {"ush2juq", []float64{67.5233459473, 67.5247192383, 28.737487793, 28.738861084}}, + {"bp6h7m8fe", []float64{86.5589618683, 86.5590047836, -177.04351902, -177.043476105}}, + {"111", []float64{-84.375, -82.96875, -133.59375, -132.1875}}, + {"m9hwzz", []float64{-38.1500244141, -38.14453125, 74.1687011719, 74.1796875}}, + {"100u6e92zuk", []float64{-89.2335520685, -89.2335507274, -133.833394647, -133.833393306}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"9wbmdmhu", []float64{38.9636993408, 38.9638710022, -112.043037415, -112.042694092}}, + {"9p1jg8k", []float64{40.3871154785, 40.3884887695, -133.434448242, -133.433074951}}, + {"vqf6kw", []float64{83.3972167969, 83.4027099609, 59.6118164062, 59.6228027344}}, + {"gw", []float64{78.75, 84.375, -22.5, -11.25}}, + {"h49v9", []float64{-74.970703125, -74.9267578125, 2.5048828125, 2.548828125}}, + {"23cmz", []float64{-34.1455078125, -34.1015625, -166.684570312, -166.640625}}, + {"71", []float64{-39.375, -33.75, -45.0, -33.75}}, + {"5x2kvmbu", []float64{-48.3515167236, -48.3513450623, -21.9166946411, -21.9163513184}}, + {"1nywfjs8e", []float64{-50.8144283295, -50.8143854141, -125.765175819, -125.765132904}}, + {"7u4vm3b9qy", 
[]float64{-21.5672886372, -21.5672832727, -7.15112328529, -7.15111255646}}, + {"rx4n750gn", []float64{-4.50937271118, -4.50932979584, 160.445623398, 160.445666313}}, + {"9", []float64{0.0, 45.0, -135.0, -90.0}}, + {"nxfyqng3", []float64{-45.2703666687, -45.2701950073, 116.635322571, 116.635665894}}, + {"tgnt", []float64{17.75390625, 17.9296875, 87.890625, 88.2421875}}, + {"qe2k5jtbm2c", []float64{-25.985365659, -25.9853643179, 112.991521508, 112.991522849}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"1jhq", []float64{-60.8203125, -60.64453125, -129.0234375, -128.671875}}, + {"p874n4ubjm", []float64{-88.2270544767, -88.2270491123, 161.989170313, 161.989181042}}, + {"91h2qc", []float64{5.67443847656, 5.67993164062, -128.726806641, -128.715820312}}, + {"mzp8s0p", []float64{-5.537109375, -5.53573608398, 89.4822692871, 89.4836425781}}, + {"ptp0rem", []float64{-61.8132019043, -61.8118286133, 167.680206299, 167.68157959}}, + {"14", []float64{-78.75, -73.125, -135.0, -123.75}}, + {"s4dq", []float64{15.1171875, 15.29296875, 3.1640625, 3.515625}}, + {"uvs7", []float64{76.46484375, 76.640625, 39.7265625, 40.078125}}, + {"wh9xq3mqh", []float64{26.5948104858, 26.5948534012, 92.3914146423, 92.3914575577}}, + {"kz", []float64{-5.625, 0.0, 33.75, 45.0}}, + {"s8t4hkb3", []float64{3.19032669067, 3.19049835205, 29.7183609009, 29.7187042236}}, + {"3ry9w", []float64{-1.142578125, -1.0986328125, -114.345703125, -114.301757812}}, + {"mf1wt5", []float64{-32.5909423828, -32.5854492188, 81.0791015625, 81.0900878906}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + {"mh", []float64{-22.5, -16.875, 45.0, 56.25}}, + {"75y3665k", []float64{-23.6748504639, -23.6746788025, -36.1075973511, -36.1072540283}}, + {"sts", []float64{30.9375, 32.34375, 28.125, 29.53125}}, + {"6fdmxb", []float64{-29.970703125, -29.9652099609, -52.7453613281, -52.734375}}, + {"xcvf9qbx13jx", []float64{10.3214901499, 10.3214903176, 176.891616806, 176.891617142}}, + {"n1r6pkxu86t", []float64{-82.5916823745, 
-82.5916810334, 100.524576455, 100.524577796}}, + {"1m2p", []float64{-59.23828125, -59.0625, -123.75, -123.3984375}}, + {"fz", []float64{84.375, 90.0, -56.25, -45.0}}, + {"hgw", []float64{-70.3125, -68.90625, 42.1875, 43.59375}}, + {"ssktp8e5hsub", []float64{24.7884432971, 24.7884434648, 29.1620342061, 29.1620345414}}, + {"8wbw4", []float64{39.0234375, 39.0673828125, -156.708984375, -156.665039062}}, + {"wbcdsqtpnkh9", []float64{4.69513194636, 4.69513211399, 126.053283289, 126.053283624}}, + {"f0md", []float64{46.7578125, 46.93359375, -82.265625, -81.9140625}}, + {"hngnmbt2pe4", []float64{-50.9298545122, -50.9298531711, 4.478969872, 4.4789712131}}, + {"gbkn8cgjewxc", []float64{47.559420336, 47.5594205037, -5.58776054531, -5.58776021004}}, + {"u", []float64{45.0, 90.0, 0.0, 45.0}}, + {"b5", []float64{61.875, 67.5, -180.0, -168.75}}, + {"w1r042nrqyk", []float64{7.0325280726, 7.0325294137, 99.951505065, 99.9515064061}}, + {"tv6r1uuh1h", []float64{30.7885193825, 30.7885247469, 81.9965028763, 81.9965136051}}, + {"r1hw8pqpr3", []float64{-38.1913465261, -38.1913411617, 141.336675882, 141.336686611}}, + {"j2bcyrxdgj", []float64{-85.4319351912, -85.4319298267, 57.5897741318, 57.5897848606}}, + {"m", []float64{-45.0, 0.0, 45.0, 90.0}}, + {"qtxvgfvmrbf", []float64{-13.0357463658, -13.0357450247, 123.570777476, 123.570778817}}, + {"jp", []float64{-50.625, -45.0, 45.0, 56.25}}, + {"f76", []float64{63.28125, 64.6875, -75.9375, -74.53125}}, + {"vz9", []float64{87.1875, 88.59375, 80.15625, 81.5625}}, + {"wm", []float64{28.125, 33.75, 101.25, 112.5}}, + {"c0wn5", []float64{48.8671875, 48.9111328125, -126.430664062, -126.38671875}}, + {"7pn7whg", []float64{-4.9836730957, -4.98229980469, -35.943145752, -35.9417724609}}, + {"s", []float64{0.0, 45.0, 0.0, 45.0}}, + {"txwr3", []float64{43.4619140625, 43.505859375, 76.3330078125, 76.376953125}}, + {"zc0", []float64{50.625, 52.03125, 168.75, 170.15625}}, + {"sq7pru6gr", []float64{36.4545679092, 36.4546108246, 15.8134031296, 
15.8134460449}}, + {"nu", []float64{-67.5, -61.875, 123.75, 135.0}}, + {"7dkt6vr", []float64{-31.3920593262, -31.3906860352, -16.0414123535, -16.0400390625}}, + {"xm2uwefdyf1", []float64{30.3433477879, 30.343349129, 147.594056278, 147.59405762}}, + {"mgmnc0", []float64{-25.5322265625, -25.5267333984, 85.8251953125, 85.8361816406}}, + {"jj1shq", []float64{-61.1389160156, -61.1334228516, 47.2961425781, 47.3071289062}}, + {"3", []float64{-45.0, 0.0, -135.0, -90.0}}, + {"0p4y3p8tgr", []float64{-49.4841438532, -49.4841384888, -176.088041067, -176.088030338}}, + {"gu", []float64{67.5, 73.125, -11.25, 0.0}}, + {"e94", []float64{5.625, 7.03125, -19.6875, -18.28125}}, + {"u7khr", []float64{64.0283203125, 64.072265625, 17.1826171875, 17.2265625}}, + {"k1k", []float64{-37.96875, -36.5625, 5.625, 7.03125}}, + {"wks48m7f", []float64{25.7811355591, 25.7813072205, 106.891136169, 106.891479492}}, + {"z91w3", []float64{51.7236328125, 51.767578125, 159.653320312, 159.697265625}}, + {"c2d6xmbp1", []float64{48.284740448, 48.2847833633, -120.267291069, -120.267248154}}, + {"s9yur", []float64{10.5908203125, 10.634765625, 32.2998046875, 32.34375}}, + {"7u09b46", []float64{-22.1800231934, -22.1786499023, -10.544128418, -10.542755127}}, + {"8sndxb", []float64{22.939453125, 22.9449462891, -148.018798828, -148.0078125}}, + {"j2g761bhsex", []float64{-85.1995566487, -85.1995553076, 60.9084056318, 60.9084069729}}, + {"5wg", []float64{-52.03125, -50.625, -18.28125, -16.875}}, + {"fzn", []float64{84.375, 85.78125, -47.8125, -46.40625}}, + {"ugdpz8p4mu", []float64{66.0502123833, 66.0502177477, 36.9019496441, 36.9019603729}}, + {"nx", []float64{-50.625, -45.0, 112.5, 123.75}}, + {"d", []float64{0.0, 45.0, -90.0, -45.0}}, + {"crr9e8mz59se", []float64{86.0475053452, 86.0475055128, -113.041263744, -113.041263409}}, + {"dgmpsg", []float64{19.6160888672, 19.6215820312, -49.0100097656, -48.9990234375}}, + {"jcfk52m03", []float64{-79.4517087936, -79.4516658783, 82.063794136, 82.0638370514}}, + 
{"d8trpccuk", []float64{4.05331134796, 4.05335426331, -59.7740364075, -59.7739934921}}, + {"93g9665", []float64{10.0744628906, 10.0758361816, -118.725128174, -118.723754883}}, + {"sqt7cuf8d0f", []float64{37.2478620708, 37.2478634119, 18.7132385373, 18.7132398784}}, + {"f9", []float64{50.625, 56.25, -67.5, -56.25}}, + {"k90", []float64{-39.375, -37.96875, 22.5, 23.90625}}, + {"k8xdhcv", []float64{-41.8263244629, -41.8249511719, 33.2624816895, 33.2638549805}}, + {"4989w4r926t7", []float64{-81.2862400152, -81.2862398475, -66.5228856727, -66.5228853375}}, + {"c3", []float64{50.625, 56.25, -123.75, -112.5}}, + {"bd908pg0", []float64{59.1929626465, 59.1931343079, -156.089630127, -156.089286804}}, + {"bq", []float64{78.75, 84.375, -168.75, -157.5}}, + {"chcdt", []float64{72.158203125, 72.2021484375, -132.670898438, -132.626953125}}, + {"hff8vsrzhy39", []float64{-74.3748327903, -74.3748326227, 37.5181730837, 37.5181734189}}, + {"9gef7g6ezj", []float64{20.101531148, 20.1015365124, -95.8080339432, -95.8080232143}}, + {"yc0u2dp", []float64{51.3830566406, 51.3844299316, 124.836273193, 124.837646484}}, + {"w0b41f7", []float64{4.58267211914, 4.58404541016, 90.0810241699, 90.0823974609}}, + {"8cdmwjc", []float64{9.43588256836, 9.43725585938, -142.820892334, -142.819519043}}, + {"p4ngqtjm2", []float64{-78.150343895, -78.1503009796, 144.785041809, 144.785084724}}, + {"5", []float64{-90.0, -45.0, -45.0, 0.0}}, + {"3qtzqdknx", []float64{-7.14961051941, -7.14956760406, -115.372624397, -115.372581482}}, + {"gzjhuv9xmkg5", []float64{85.2414438687, 85.2414440364, -4.00772050023, -4.00772016495}}, + {"8g8t7eh4y0fh", []float64{20.6273078173, 20.627307985, -145.387313068, -145.387312733}}, + {"39w", []float64{-36.5625, -35.15625, -104.0625, -102.65625}}, + {"z34rj8", []float64{51.85546875, 51.8609619141, 149.655761719, 149.666748047}}, + {"c9p0zv2", []float64{50.7856750488, 50.7870483398, -102.315673828, -102.314300537}}, + {"vh871y", []float64{70.8728027344, 70.8782958984, 45.4284667969, 
45.439453125}}, + {"7ggt8b4yfw", []float64{-22.9382622242, -22.9382568598, -6.29128217697, -6.29127144814}}, + {"3qz3d324z", []float64{-6.76023960114, -6.76019668579, -113.455510139, -113.455467224}}, + {"4sm2463w0w", []float64{-66.0803282261, -66.0803228617, -60.0162291527, -60.0162184238}}, + {"26ewbu0gw3", []float64{-29.728397727, -29.7283923626, -163.793867826, -163.793857098}}, + {"bre7js2w9z", []float64{87.7393430471, 87.7393484116, -163.937226534, -163.937215805}}, + {"sy5ug08bn", []float64{34.5877075195, 34.5877504349, 39.1565608978, 39.1566038132}}, + {"p4r", []float64{-77.34375, -75.9375, 144.84375, 146.25}}, + {"qb", []float64{-45.0, -39.375, 123.75, 135.0}}, + {"f4hj", []float64{57.12890625, 57.3046875, -84.375, -84.0234375}}, + {"5r0f5t7d5", []float64{-50.2442550659, -50.2442121506, -32.5365686417, -32.5365257263}}, + {"2j7n4r6r", []float64{-14.3730354309, -14.3728637695, -175.679283142, -175.678939819}}, + {"wu92egv2wsy", []float64{25.4211013019, 25.421102643, 125.680104196, 125.680105537}}, + {"vgtwey347bg", []float64{65.8648006618, 65.8648020029, 86.6507081687, 86.6507095098}}, + {"q2meny7pbd18", []float64{-43.0307328701, -43.0307327025, 109.285149202, 109.285149537}}, + {"rpe", []float64{-2.8125, -1.40625, 139.21875, 140.625}}, + {"m69", []float64{-30.9375, -29.53125, 57.65625, 59.0625}}, + {"w1zwd8", []float64{10.986328125, 10.9918212891, 100.656738281, 100.667724609}}, + {"fzpf", []float64{84.7265625, 84.90234375, -45.3515625, -45.0}}, + {"t3w", []float64{8.4375, 9.84375, 64.6875, 66.09375}}, + {"zb11", []float64{45.17578125, 45.3515625, 170.15625, 170.5078125}}, + {"r2prkmxpgtww", []float64{-43.6940126494, -43.6940124817, 156.641852036, 156.641852371}}, + {"zr34g1zcj", []float64{86.274433136, 86.2744760513, 147.79894352, 147.798986435}}, + {"19mgdk8", []float64{-82.3287963867, -82.3274230957, -104.315185547, -104.313812256}}, + {"mkp", []float64{-22.5, -21.09375, 66.09375, 67.5}}, + {"934qy86ssc", []float64{6.81367456913, 6.81367993355, 
-120.296655893, -120.296645164}}, + {"byydj4mrm8", []float64{83.3339166641, 83.3339220285, -136.882202625, -136.882191896}}, + {"j", []float64{-90.0, -45.0, 45.0, 90.0}}, + {"9cjzqv5n6", []float64{6.92795276642, 6.92799568176, -92.8632259369, -92.8631830215}}, + {"vkg1wg6s", []float64{72.0009613037, 72.0011329651, 60.7688140869, 60.7691574097}}, + {"ynp42e9x", []float64{79.1659355164, 79.1661071777, 99.8677825928, 99.8681259155}}, + {"uv9zwddtwn", []float64{77.2705686092, 77.2705739737, 36.5002727509, 36.5002834797}}, + {"t17zkzszpm", []float64{8.3480912447, 8.34809660912, 50.4890120029, 50.4890227318}}, + {"tuw779c04ukm", []float64{25.8934257366, 25.8934259042, 87.6943681017, 87.6943684369}}, + {"37rm598", []float64{-25.8316040039, -25.8302307129, -113.400878906, -113.399505615}}, + {"ymf18", []float64{77.607421875, 77.6513671875, 104.0625, 104.106445312}}, + {"gd", []float64{56.25, 61.875, -22.5, -11.25}}, + {"smz", []float64{32.34375, 33.75, 21.09375, 22.5}}, + {"p", []float64{-90.0, -45.0, 135.0, 180.0}}, + {"muzkh95w2u42", []float64{-17.5715374947, -17.571537327, 89.1479081288, 89.1479084641}}, + {"hh53c48eg", []float64{-67.1780061722, -67.1779632568, 4.61507320404, 4.61511611938}}, + {"739ewv", []float64{-35.9197998047, -35.9143066406, -31.3439941406, -31.3330078125}}, + {"cw883", []float64{81.6064453125, 81.650390625, -111.752929688, -111.708984375}}, + {"41xu1w37yf", []float64{-80.8243882656, -80.8243829012, -79.0336382389, -79.0336275101}}, + {"0y750v2k", []float64{-54.2868804932, -54.2867088318, -141.997947693, -141.99760437}}, + {"gqbgw", []float64{83.583984375, 83.6279296875, -32.431640625, -32.3876953125}}, + {"pej", []float64{-73.125, -71.71875, 164.53125, 165.9375}}, + {"r05t", []float64{-44.12109375, -43.9453125, 139.921875, 140.2734375}}, + {"qfuew7wk4f", []float64{-28.8960921764, -28.896086812, 130.361484289, 130.361495018}}, + {"nu357yhp72y", []float64{-65.4882533848, -65.4882520437, 125.326685607, 125.326686949}}, + {"8qt7rnggz", 
[]float64{37.1715116501, 37.1715545654, -161.054120064, -161.054077148}}, + {"dhjq2nz", []float64{23.6357116699, 23.6370849609, -82.6075744629, -82.6062011719}}, + {"5s4", []float64{-67.5, -66.09375, -19.6875, -18.28125}}, + {"ge8nq842", []float64{65.7861328125, 65.7863044739, -22.211265564, -22.2109222412}}, + {"71", []float64{-39.375, -33.75, -45.0, -33.75}}, + {"sz59kteks87q", []float64{39.625713788, 39.6257139556, 38.8742895797, 38.874289915}}, + {"ur2bh", []float64{85.78125, 85.8251953125, 12.48046875, 12.5244140625}}, + {"w140u2f", []float64{5.76095581055, 5.76232910156, 93.0020141602, 93.0033874512}}, + {"fpd3zkt6whz4", []float64{87.5202913955, 87.5202915631, -86.5098573267, -86.5098569915}}, + {"zmej764h8kb8", []float64{76.8721358478, 76.8721360154, 150.614330247, 150.614330582}}, + {"k4p6z", []float64{-33.2666015625, -33.22265625, 10.5029296875, 10.546875}}, + {"f8", []float64{45.0, 50.625, -67.5, -56.25}}, + {"utsy6pv17m", []float64{77.0789462328, 77.0789515972, 29.2745840549, 29.2745947838}}, + {"6z5", []float64{-5.625, -4.21875, -52.03125, -50.625}}, + {"mjdc1", []float64{-13.88671875, -13.8427734375, 48.9111328125, 48.955078125}}, + {"gjks4c2", []float64{75.2412414551, 75.2426147461, -38.5510253906, -38.5496520996}}, + {"fvkvvrh42ju", []float64{75.5808614194, 75.5808627605, -49.3341010809, -49.3340997398}}, + {"yp63x30p7u7", []float64{86.0516823828, 86.0516837239, 93.4828309715, 93.4828323126}}, + {"6rw", []float64{-2.8125, -1.40625, -70.3125, -68.90625}}, + {"28vsqm8sb6", []float64{-40.0031411648, -40.0031358004, -149.490269423, -149.490258694}}, + {"be72g2zcek5", []float64{63.4174847603, 63.4174861014, -152.776078731, -152.77607739}}, + {"xry6fn699f", []float64{44.1117489338, 44.1117542982, 155.130461454, 155.130472183}}, + {"3sf7bw01y4", []float64{-17.5888001919, -17.5887948275, -109.313707352, -109.313696623}}, + {"k729yrqr77", []float64{-26.3700467348, -26.3700413704, 12.2365057468, 12.2365164757}}, + {"e", []float64{0.0, 45.0, -45.0, 0.0}}, + 
{"63x6dd", []float64{-36.1120605469, -36.1065673828, -68.4448242188, -68.4338378906}}, + {"z", []float64{45.0, 90.0, 135.0, 180.0}}, + {"mzyj", []float64{-0.52734375, -0.3515625, 87.1875, 87.5390625}}, + {"3j6r", []float64{-14.23828125, -14.0625, -131.8359375, -131.484375}}, + {"u3q", []float64{52.03125, 53.4375, 19.6875, 21.09375}}, + {"nueu7mbnbn4", []float64{-63.9076530933, -63.9076517522, 129.166262448, 129.166263789}}, + {"cyq2pkr", []float64{80.1795959473, 80.1809692383, -92.1327209473, -92.1313476562}}, + {"gptzzke9hvh", []float64{88.5747224092, 88.5747237504, -36.5904432535, -36.5904419124}}, + {"khd", []float64{-19.6875, -18.28125, 2.8125, 4.21875}}, + {"ghm92hg6p5", []float64{69.1524285078, 69.1524338722, -37.2608613968, -37.260850668}}, + {"n9e20w0", []float64{-81.5295410156, -81.5281677246, 117.092285156, 117.093658447}}, + {"826dzs0k", []float64{1.91230773926, 1.91247940063, -164.904441833, -164.904098511}}, + {"0d5f2", []float64{-78.3544921875, -78.310546875, -152.2265625, -152.182617188}}, + {"70zsyg3", []float64{-39.9284362793, -39.9270629883, -34.1551208496, -34.1537475586}}, + {"zykh8gwy", []float64{80.9675216675, 80.9676933289, 174.417228699, 174.417572021}}, + {"4spd3s", []float64{-67.0825195312, -67.0770263672, -56.8872070312, -56.8762207031}}, + {"r9p6f2t", []float64{-38.8888549805, -38.8874816895, 167.801055908, 167.802429199}}, + {"6q3merq7w", []float64{-8.83652687073, -8.83648395538, -76.8405246735, -76.8404817581}}, + {"qx1zr32", []float64{-4.34371948242, -4.34234619141, 115.279541016, 115.280914307}}, + {"3zfnk", []float64{-0.3076171875, -0.263671875, -98.26171875, -98.2177734375}}, + {"kd2e3", []float64{-31.7724609375, -31.728515625, 23.2470703125, 23.291015625}}, + {"stkhr6", []float64{30.2893066406, 30.2947998047, 28.4436035156, 28.4545898438}}, + {"nh4pzbm", []float64{-66.1363220215, -66.1349487305, 93.159942627, 93.161315918}}, + {"zt8tf", []float64{76.9482421875, 76.9921875, 158.291015625, 158.334960938}}, + {"gd37", 
[]float64{58.18359375, 58.359375, -20.7421875, -20.390625}}, + {"gnx5b45dd", []float64{82.2330951691, 82.2331380844, -35.1513576508, -35.1513147354}}, + {"2qm7", []float64{-9.31640625, -9.140625, -161.3671875, -161.015625}}, + {"4g0zf0kq9cpp", []float64{-71.7601996846, -71.760199517, -55.1015008986, -55.1015005633}}, + {"977tgt", []float64{19.3194580078, 19.3249511719, -118.674316406, -118.663330078}}, + {"md0k", []float64{-33.046875, -32.87109375, 67.8515625, 68.203125}}, + {"v1q3nvfb3h", []float64{52.2386813164, 52.2386866808, 54.089512825, 54.0895235538}}, + {"z96pt6u96w", []float64{53.3649623394, 53.3649677038, 160.549499989, 160.549510717}}, + {"pu", []float64{-67.5, -61.875, 168.75, 180.0}}, + {"6uydh", []float64{-17.9296875, -17.8857421875, -46.93359375, -46.8896484375}}, + {"nx5mt4sk", []float64{-49.6437835693, -49.643611908, 117.295875549, 117.296218872}}, + {"nk8jt", []float64{-63.720703125, -63.6767578125, 101.469726562, 101.513671875}}, + {"kec1015b", []float64{-23.7249755859, -23.7248039246, 23.9113998413, 23.9117431641}}, + {"fk388b", []float64{68.994140625, 68.9996337891, -76.6076660156, -76.5966796875}}, + {"nsb", []float64{-63.28125, -61.875, 112.5, 113.90625}}, + {"ndbws", []float64{-73.388671875, -73.3447265625, 113.37890625, 113.422851562}}, + {"fs5", []float64{67.5, 68.90625, -63.28125, -61.875}}, + {"h6x0kbec6p15", []float64{-75.8905554749, -75.8905553073, 21.3077272475, 21.3077275828}}, + {"hy78", []float64{-54.84375, -54.66796875, 38.671875, 39.0234375}}, + {"vpun9j4ce9", []float64{89.7640568018, 89.7640621662, 50.6728720665, 50.6728827953}}, + {"6tyevvqrj665", []float64{-11.9670169987, -11.967016831, -58.0978783965, -58.0978780612}}, + {"d3ktekr48n", []float64{8.02185416222, 8.02185952663, -72.2694396973, -72.2694289684}}, + {"heyfm2up5", []float64{-68.5054206848, -68.5053777695, 32.2285223007, 32.2285652161}}, + {"mn3f7qjr22v", []float64{-9.41403463483, -9.41403329372, 47.6109869778, 47.6109883189}}, + {"1wngqx9b", []float64{-55.637512207, 
-55.6373405457, -102.719764709, -102.719421387}}, + {"xc9jv49jej", []float64{9.46294605732, 9.46295142174, 170.3774786, 170.377489328}}, + {"27", []float64{-28.125, -22.5, -168.75, -157.5}}, + {"6yhqnw", []float64{-10.1623535156, -10.1568603516, -49.9877929688, -49.9768066406}}, + {"rmhhu3qd", []float64{-16.0328292847, -16.0326576233, 152.07069397, 152.071037292}}, + {"y00b8mhby", []float64{45.1154851913, 45.1155281067, 91.0724544525, 91.0724973679}}, + {"yq5tr", []float64{79.6728515625, 79.716796875, 106.479492188, 106.5234375}}, + {"cuvxw", []float64{73.037109375, 73.0810546875, -93.251953125, -93.2080078125}}, + {"exvb4bcn", []float64{43.5988998413, 43.5990715027, -14.2918395996, -14.2914962769}}, + {"uhdpsyx75n0", []float64{71.667112112, 71.6671134531, 3.03132534027, 3.03132668138}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"qvwbzgw7q", []float64{-13.9108800888, -13.9108371735, 133.591604233, 133.591647148}}, + {"u0rqp0bres", []float64{47.466366291, 47.4663716555, 10.503423214, 10.5034339428}}, + {"43k2u0vbnrbd", []float64{-82.8327522799, -82.8327521123, -72.5894909352, -72.5894905999}}, + {"7r7kwz7xxr", []float64{-3.38658392429, -3.38657855988, -28.8779389858, -28.877928257}}, + {"fu2f", []float64{69.2578125, 69.43359375, -55.1953125, -54.84375}}, + {"9tkjsvzxw6", []float64{30.5309307575, 30.5309361219, -106.655691862, -106.655681133}}, + {"y9x0z", []float64{53.5693359375, 53.61328125, 122.651367188, 122.6953125}}, + {"y4rk4b25g", []float64{58.3613920212, 58.3614349365, 100.316290855, 100.316333771}}, + {"sxmdmu9xcn", []float64{41.202839613, 41.2028449774, 30.4891633987, 30.4891741276}}, + {"fb0", []float64{45.0, 46.40625, -56.25, -54.84375}}, + {"7ffkxt5", []float64{-28.7127685547, -28.7113952637, -7.7522277832, -7.75085449219}}, + {"n1x5jn4dr", []float64{-81.0018110275, -81.0017681122, 100.067210197, 100.067253113}}, + {"uxcdctgmj", []float64{89.1095924377, 89.1096353531, 24.6799707413, 24.6800136566}}, + {"h7", []float64{-73.125, -67.5, 11.25, 
22.5}}, + {"b", []float64{45.0, 90.0, -180.0, -135.0}}, + {"us1n4udk", []float64{68.5800933838, 68.5802650452, 24.0301895142, 24.0305328369}}, + {"zjtptsj2vn", []float64{77.2779929638, 77.2779983282, 142.280373573, 142.280384302}}, + {"x6utsqz", []float64{16.4726257324, 16.4739990234, 152.774505615, 152.775878906}}, + {"cs9dn901rqn", []float64{70.6698024273, 70.6698037684, -110.104661286, -110.104659945}}, + {"sjbn", []float64{33.3984375, 33.57421875, 0.0, 0.3515625}}, + {"0fwxjyc3g9q", []float64{-74.6696452796, -74.6696439385, -136.854814589, -136.854813248}}, + {"fk", []float64{67.5, 73.125, -78.75, -67.5}}, + {"75hq9jh", []float64{-26.9549560547, -26.9535827637, -38.9739990234, -38.9726257324}}, + {"kr3hg1q1", []float64{-3.37675094604, -3.37657928467, 12.7963256836, 12.7966690063}}, + {"hfq4d2wu", []float64{-76.9008636475, -76.9006919861, 42.2956466675, 42.2959899902}}, + {"rg6ygh", []float64{-25.5102539062, -25.5047607422, 172.749023438, 172.760009766}}, + {"995pvrg", []float64{7.02987670898, 7.03125, -108.046417236, -108.045043945}}, + {"s5ys", []float64{21.796875, 21.97265625, 9.140625, 9.4921875}}, + {"289ucubzj6", []float64{-41.3252341747, -41.3252288103, -154.960902929, -154.9608922}}, + {"4", []float64{-90.0, -45.0, -90.0, -45.0}}, + {"7g0e3gp7cz", []float64{-27.5365501642, -27.5365447998, -10.4599392414, -10.4599285126}}, + {"9suuudg", []float64{27.5688171387, 27.5701904297, -105.618438721, -105.61706543}}, + {"8vdt3j8zb0", []float64{31.8918943405, 31.8918997049, -142.689399719, -142.68938899}}, + {"cf", []float64{56.25, 61.875, -101.25, -90.0}}, + {"jnp33f5pr9", []float64{-56.0180372, -56.0180318356, 55.276658535, 55.2766692638}}, + {"czgmgyb", []float64{89.6415710449, 89.6429443359, -96.5148925781, -96.5135192871}}, + {"c1kk", []float64{52.734375, 52.91015625, -129.0234375, -128.671875}}, + {"kfm4hfe8cp4s", []float64{-31.9782876223, -31.9782874547, 40.994843021, 40.9948433563}}, + {"9mnws4hc8h", []float64{29.2788434029, 29.2788487673, -114.427070618, 
-114.427059889}}, + {"t0j7chwg", []float64{0.684413909912, 0.684585571289, 52.4360275269, 52.4363708496}}, + {"y", []float64{45.0, 90.0, 90.0, 135.0}}, + {"suj", []float64{22.5, 23.90625, 40.78125, 42.1875}}, + } + + for _, test := range tests { + lat, lon := DecodeGeoHash(test.hash) + + if !compareLatitude(test.box, lat) { + t.Errorf("expected lat %f, got %f, hash %s", (test.box[0]+test.box[1])/2, lat, test.hash) + } + if !compareLogitude(test.box, lon) { + t.Errorf("expected lon %f, got %f, hash %s", (test.box[2]+test.box[3])/2, lon, test.hash) + } + } +} + +func compareLatitude(box []float64, v float64) bool { + avg := (box[0] + box[1]) / 2 + if compareGeo(avg, v) != 0 { + return false + } + return true +} + +func compareLogitude(box []float64, v float64) bool { + avg := (box[2] + box[3]) / 2 + if compareGeo(avg, v) != 0 { + return false + } + return true +} From 83347b91127f3d89a6d37918a9a273bb704ce0b1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 10 May 2019 09:49:31 +0530 Subject: [PATCH 598/728] cleaning the unused GeoHashDecode method --- geo/benchmark_geohash_test.go | 24 ----- geo/geohash.go | 164 +++------------------------------- geo/geohash_test.go | 25 ------ geo/versus_test.go | 2 +- 4 files changed, 15 insertions(+), 200 deletions(-) diff --git a/geo/benchmark_geohash_test.go b/geo/benchmark_geohash_test.go index 5ed5d6b76..2cdeb1d47 100644 --- a/geo/benchmark_geohash_test.go +++ b/geo/benchmark_geohash_test.go @@ -18,14 +18,6 @@ import ( "testing" ) -func BenchmarkGeoHashLen5Decode(b *testing.B) { - b.ResetTimer() - hash := "d3hn3" - for i := 0; i < b.N; i++ { - _, _ = GeoHashDecode(hash) - } -} - func BenchmarkGeoHashLen5NewDecode(b *testing.B) { b.ResetTimer() hash := "d3hn3" @@ -34,14 +26,6 @@ func BenchmarkGeoHashLen5NewDecode(b *testing.B) { } } -func BenchmarkGeoHashLen6Decode(b *testing.B) { - b.ResetTimer() - hash := "u4pruy" - for i := 0; i < b.N; i++ { - _, _ = GeoHashDecode(hash) - } -} - func 
BenchmarkGeoHashLen6NewDecode(b *testing.B) { b.ResetTimer() hash := "u4pruy" @@ -50,14 +34,6 @@ func BenchmarkGeoHashLen6NewDecode(b *testing.B) { } } -func BenchmarkGeoHashLen7Decode(b *testing.B) { - b.ResetTimer() - hash := "u4pruyd" - for i := 0; i < b.N; i++ { - _, _ = GeoHashDecode(hash) - } -} - func BenchmarkGeoHashLen7NewDecode(b *testing.B) { b.ResetTimer() hash := "u4pruyd" diff --git a/geo/geohash.go b/geo/geohash.go index 4fac3a877..598a63cba 100644 --- a/geo/geohash.go +++ b/geo/geohash.go @@ -1,32 +1,19 @@ -// The code here was obtained from: -// https://github.com/mmcloughlin/geohash - -// The MIT License (MIT) -// Copyright (c) 2015 Michael McLoughlin -// Permission is hereby granted, free of charge, to any person obtaining a copy -// of this software and associated documentation files (the "Software"), to deal -// in the Software without restriction, including without limitation the rights -// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -// copies of the Software, and to permit persons to whom the Software is -// furnished to do so, subject to the following conditions: - -// The above copyright notice and this permission notice shall be included in all -// copies or substantial portions of the Software. - -// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -// SOFTWARE. +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. package geo -import ( - "math" -) - // encoding encapsulates an encoding defined by a given base32 alphabet. type encoding struct { enc string @@ -47,132 +34,9 @@ func newEncoding(encoder string) *encoding { return e } -// Decode string into bits of a 64-bit word. The string s may be at most 12 -// characters. -func (e *encoding) decode(s string) uint64 { - x := uint64(0) - for i := 0; i < len(s); i++ { - x = (x << 5) | uint64(e.dec[s[i]]) - } - return x -} - -// Encode bits of 64-bit word into a string. -func (e *encoding) encode(x uint64) string { - b := [12]byte{} - for i := 0; i < 12; i++ { - b[11-i] = e.enc[x&0x1f] - x >>= 5 - } - return string(b[:]) -} - -// Base32Encoding with the Geohash alphabet. +// base32encoding with the Geohash alphabet. var base32encoding = newEncoding("0123456789bcdefghjkmnpqrstuvwxyz") -// BoundingBox returns the region encoded by the given string geohash. -func geoBoundingBox(hash string) geoBox { - bits := uint(5 * len(hash)) - inthash := base32encoding.decode(hash) - return geoBoundingBoxIntWithPrecision(inthash, bits) -} - -// Box represents a rectangle in latitude/longitude space. -type geoBox struct { - minLat float64 - maxLat float64 - minLng float64 - maxLng float64 -} - -// Round returns a point inside the box, making an effort to round to minimal -// precision. 
-func (b geoBox) round() (lat, lng float64) { - x := maxDecimalPower(b.maxLat - b.minLat) - lat = math.Ceil(b.minLat/x) * x - x = maxDecimalPower(b.maxLng - b.minLng) - lng = math.Ceil(b.minLng/x) * x - return -} - -// precalculated for performance -var exp232 = math.Exp2(32) - -// errorWithPrecision returns the error range in latitude and longitude for in -// integer geohash with bits of precision. -func errorWithPrecision(bits uint) (latErr, lngErr float64) { - b := int(bits) - latBits := b / 2 - lngBits := b - latBits - latErr = math.Ldexp(180.0, -latBits) - lngErr = math.Ldexp(360.0, -lngBits) - return -} - -// minDecimalPlaces returns the minimum number of decimal places such that -// there must exist an number with that many places within any range of width -// r. This is intended for returning minimal precision coordinates inside a -// box. -func maxDecimalPower(r float64) float64 { - m := int(math.Floor(math.Log10(r))) - return math.Pow10(m) -} - -// Encode the position of x within the range -r to +r as a 32-bit integer. -func encodeRange(x, r float64) uint32 { - p := (x + r) / (2 * r) - return uint32(p * exp232) -} - -// Decode the 32-bit range encoding X back to a value in the range -r to +r. -func decodeRange(X uint32, r float64) float64 { - p := float64(X) / exp232 - x := 2*r*p - r - return x -} - -// Squash the even bitlevels of X into a 32-bit word. Odd bitlevels of X are -// ignored, and may take any value. -func squash(X uint64) uint32 { - X &= 0x5555555555555555 - X = (X | (X >> 1)) & 0x3333333333333333 - X = (X | (X >> 2)) & 0x0f0f0f0f0f0f0f0f - X = (X | (X >> 4)) & 0x00ff00ff00ff00ff - X = (X | (X >> 8)) & 0x0000ffff0000ffff - X = (X | (X >> 16)) & 0x00000000ffffffff - return uint32(X) -} - -// Deinterleave the bits of X into 32-bit words containing the even and odd -// bitlevels of X, respectively. 
-func deinterleave(X uint64) (uint32, uint32) { - return squash(X), squash(X >> 1) -} - -// BoundingBoxIntWithPrecision returns the region encoded by the integer -// geohash with the specified precision. -func geoBoundingBoxIntWithPrecision(hash uint64, bits uint) geoBox { - fullHash := hash << (64 - bits) - latInt, lngInt := deinterleave(fullHash) - lat := decodeRange(latInt, 90) - lng := decodeRange(lngInt, 180) - latErr, lngErr := errorWithPrecision(bits) - return geoBox{ - minLat: lat, - maxLat: lat + latErr, - minLng: lng, - maxLng: lng + lngErr, - } -} - -// ---------------------------------------------------------------------- - -// Decode the string geohash to a (lat, lng) point. -func GeoHashDecode(hash string) (lat, lng float64) { - box := geoBoundingBox(hash) - return box.round() -} - var masks = []uint64{16, 8, 4, 2, 1} // DecodeGeoHash decodes the string geohash faster with diff --git a/geo/geohash_test.go b/geo/geohash_test.go index d0bec329d..11b920ded 100644 --- a/geo/geohash_test.go +++ b/geo/geohash_test.go @@ -19,31 +19,6 @@ import ( "testing" ) -func TestGeoHashDecode(t *testing.T) { - tests := []struct { - hash string - lon float64 - lat float64 - }{ - {"d3hn3", -73.080000, 6.730000}, // -73.05908203, 6.74560547 as per http://geohash.co/ - {"u4pru", 10.380000, 57.620000}, // 10.39306641, 57.63427734 - {"u4pruy", 10.410000, 57.646000}, // 10.40954590, 57.64801025 - {"u4pruyd", 10.407000, 57.649000}, // 10.40748596, 57.64869690 - {"u4pruydqqvj", 10.40744, 57.64911}, // 10.40743969, 57.64911063 - } - - for _, test := range tests { - lat, lon := GeoHashDecode(test.hash) - - if compareGeo(test.lon, lon) != 0 { - t.Errorf("expected lon %f, got %f, hash %s", test.lon, lon, test.hash) - } - if compareGeo(test.lat, lat) != 0 { - t.Errorf("expected lat %f, got %f, hash %s", test.lat, lat, test.hash) - } - } -} - func TestDecodeGeoHash(t *testing.T) { tests := []struct { hash string diff --git a/geo/versus_test.go b/geo/versus_test.go index 
0a067dae7..cadf96208 100644 --- a/geo/versus_test.go +++ b/geo/versus_test.go @@ -20,7 +20,7 @@ import ( // This test basically confirms the dimensions of the // bounded box computed between the DecodeGeoHash method -// and the existing original implementation from +// and the popular implementation from // https://github.com/mmcloughlin/geohash. // DecodeGeoHash method returns the centre of the rectangle // than returning the box dimensions. From 2f920ab09a5a94515984d64b6a2821fb1eda2f7c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 10 May 2019 19:58:20 +0530 Subject: [PATCH 599/728] updating copyright header adding reference to source --- geo/geohash.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/geo/geohash.go b/geo/geohash.go index 598a63cba..d3d4dfa8b 100644 --- a/geo/geohash.go +++ b/geo/geohash.go @@ -11,6 +11,8 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. +// This implementation is inspired from the geohash-js +// ref: https://github.com/davetroy/geohash-js package geo From 10662df9cf02d544913847ccc827e345085c662d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 28 May 2019 15:38:37 +0530 Subject: [PATCH 600/728] MB-33364 - perf wrinkles with geo queries Geo queries are reported to have high memory consumption and higher latencies. One plasubile root cause for this could be, -Very high number of candidate terms computed during the bounded box queries. (as unlike other queries, looks like these terms are just mathematically created without actual existence in the bleve index) -Too many candidate terms keeps more TermFieldDictionaries live and managed over the Disjunction heap as well as the term searcher objects. 
This fix tries to filter out the onBoundary/notOnBoundary terms early in the computation cycle so that it creates lesser garbage from TFDs, term byte searchers and lighter disjunction heap management. Fix have shown good improvement with numbers in local/mac environment against the functional tests with 1000 documents and num_queries=100 Average FTS latency in ms for 100 queries 63.20892059(new) Vs 189.9222353(existing) ~3X improvment in latency is observed. And the latency spikes got better normalised. eg: New mills Vs Old mills 80.392502 427.297942 306.575176 1555.539272 435.612816 2431.383857 307.829132 1402.138768 148.749478 936.793944 293.825909 944.349282 390.313823 469.521313 620.221195 2414.491882 --- search/searcher/search_geoboundingbox.go | 31 +++++++++++++++---- search/searcher/search_geoboundingbox_test.go | 4 +-- .../searcher/search_geopointdistance_test.go | 2 +- 3 files changed, 28 insertions(+), 9 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index ed5424f1f..ad20d1a0f 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -40,7 +40,7 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, // do math to produce list of terms needed for this search onBoundaryTerms, notOnBoundaryTerms, err := ComputeGeoRange(0, GeoBitsShift1Minus1, - minLon, minLat, maxLon, maxLat, checkBoundaries) + minLon, minLat, maxLon, maxLat, checkBoundaries, indexReader, field) if err != nil { return nil, err } @@ -100,7 +100,8 @@ var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 func ComputeGeoRange(term uint64, shift uint, - sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool) ( + sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, + indexReader index.IndexReader, field string) ( onBoundary [][]byte, notOnBoundary [][]byte, err error) { preallocBytesLen := 
32 preallocBytes := make([]byte, preallocBytesLen) @@ -117,6 +118,21 @@ func ComputeGeoRange(term uint64, shift uint, return rv } + isIndexed := func(term []byte) bool { + if indexReader != nil { + reader, err := indexReader.TermFieldReader(term, field, false, false, false) + if err != nil || reader == nil { + return false + } + if reader.Count() == 0 { + _ = reader.Close() + return false + } + _ = reader.Close() + } + return true + } + var computeGeoRange func(term uint64, shift uint) // declare for recursion relateAndRecurse := func(start, end uint64, res, level uint) { @@ -131,10 +147,13 @@ func ComputeGeoRange(term uint64, shift uint, if within || (level == geoDetailLevel && geo.RectIntersects(minLon, minLat, maxLon, maxLat, sminLon, sminLat, smaxLon, smaxLat)) { - if !within && checkBoundaries { - onBoundary = append(onBoundary, makePrefixCoded(int64(start), res)) - } else { - notOnBoundary = append(notOnBoundary, makePrefixCoded(int64(start), res)) + codedTerm := makePrefixCoded(int64(start), res) + if isIndexed(codedTerm) { + if !within && checkBoundaries { + onBoundary = append(onBoundary, makePrefixCoded(int64(start), res)) + } else { + notOnBoundary = append(notOnBoundary, makePrefixCoded(int64(start), res)) + } } } else if level < geoDetailLevel && geo.RectIntersects(minLon, minLat, maxLon, maxLat, diff --git a/search/searcher/search_geoboundingbox_test.go b/search/searcher/search_geoboundingbox_test.go index cae803412..6b3072868 100644 --- a/search/searcher/search_geoboundingbox_test.go +++ b/search/searcher/search_geoboundingbox_test.go @@ -215,7 +215,7 @@ func TestComputeGeoRange(t *testing.T) { for testi, test := range tests { onBoundaryRes, offBoundaryRes, err := ComputeGeoRange(0, GeoBitsShift1Minus1, - -1.0*test.degs, -1.0*test.degs, test.degs, test.degs, true) + -1.0*test.degs, -1.0*test.degs, test.degs, test.degs, true, nil, "") if (err != nil) != (test.err != "") { t.Errorf("test: %+v, err: %v", test, err) } @@ -275,7 +275,7 @@ func 
benchmarkComputeGeoRange(b *testing.B, for i := 0; i < b.N; i++ { onBoundaryRes, offBoundaryRes, err := - ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries) + ComputeGeoRange(0, GeoBitsShift1Minus1, minLon, minLat, maxLon, maxLat, checkBoundaries, nil, "") if err != nil { b.Fatalf("expected no err") } diff --git a/search/searcher/search_geopointdistance_test.go b/search/searcher/search_geopointdistance_test.go index eaf8216a3..8b7f1c968 100644 --- a/search/searcher/search_geopointdistance_test.go +++ b/search/searcher/search_geopointdistance_test.go @@ -113,7 +113,7 @@ func TestGeoPointDistanceCompare(t *testing.T) { minLon, minLat, maxLon, maxLat float64, checkBoundaries bool) { // do math to produce list of terms needed for this search onBoundaryRes, offBoundaryRes, err := ComputeGeoRange(0, GeoBitsShift1Minus1, - minLon, minLat, maxLon, maxLat, checkBoundaries) + minLon, minLat, maxLon, maxLat, checkBoundaries, nil, "") if err != nil { t.Fatal(err) } From 7ad4bbc2adec301f007f328101f0bcf19f371e8e Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 29 May 2019 15:25:24 -0400 Subject: [PATCH 601/728] support search after style pagination (#1182) * support search after style pagination Typically deep pagination becomes quite expensive when using the size+from mechanism on distributed indexes, due to each child index having to build large result sets, and return them over the network for merging at the coordinating node. Alternatively, the search after approach allows for the client to maintain state, in this case, the sort key of the last result for the current page. By running the same search again, but this time providing the last hit's sort key as this new requests search after key, we allow for significantly more efficient pagination. TotalHits and Facets are unaffected by using this parameter, because all hits are still seen by the collector. 
* fix incorrectly named method * add support for SearchBefore Adds a capability to go one page backwards in deep pagination of search results, similar to going forwards using SearchAfter. * reset request back to original * refactor integration tests to allow for testing aliases added integration tests for search_before and search_after which go through index aliases * fix typo * fix incorrect type for method receiver --- index_alias_impl.go | 46 ++++---- index_impl.go | 64 +++++++++-- search.go | 35 ++++++ search/collector/topn.go | 29 +++++ search/sort.go | 29 +++++ test/integration_test.go | 147 ++++++++++++++++-------- test/tests/alias/datasets/shard0/a.json | 3 + test/tests/alias/datasets/shard0/c.json | 3 + test/tests/alias/datasets/shard1/b.json | 3 + test/tests/alias/datasets/shard1/d.json | 3 + test/tests/alias/mapping.json | 3 + test/tests/alias/searches.json | 76 ++++++++++++ test/tests/sort/searches.json | 54 ++++++++- 13 files changed, 415 insertions(+), 80 deletions(-) create mode 100644 test/tests/alias/datasets/shard0/a.json create mode 100644 test/tests/alias/datasets/shard0/c.json create mode 100644 test/tests/alias/datasets/shard1/b.json create mode 100644 test/tests/alias/datasets/shard1/d.json create mode 100644 test/tests/alias/mapping.json create mode 100644 test/tests/alias/searches.json diff --git a/index_alias_impl.go b/index_alias_impl.go index 335fcade2..4366fc795 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -434,6 +434,8 @@ func createChildSearchRequest(req *SearchRequest) *SearchRequest { Sort: req.Sort.Copy(), IncludeLocations: req.IncludeLocations, Score: req.Score, + SearchAfter: req.SearchAfter, + SearchBefore: req.SearchBefore, } return &rv } @@ -451,6 +453,14 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se searchStart := time.Now() asyncResults := make(chan *asyncSearchResult, len(indexes)) + var reverseQueryExecution bool + if req.SearchBefore != nil { + reverseQueryExecution = 
true + req.Sort.Reverse() + req.SearchAfter = req.SearchBefore + req.SearchBefore = nil + } + // run search on each index in separate go routine var waitGroup sync.WaitGroup @@ -503,7 +513,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se // sort all hits with the requested order if len(req.Sort) > 0 { - sorter := newMultiSearchHitSorter(req.Sort, sr.Hits) + sorter := newSearchHitSorter(req.Sort, sr.Hits) sort.Sort(sorter) } @@ -524,6 +534,17 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se sr.Facets.Fixup(name, fr.Size) } + if reverseQueryExecution { + // reverse the sort back to the original + req.Sort.Reverse() + // resort using the original order + mhs := newSearchHitSorter(req.Sort, sr.Hits) + sort.Sort(mhs) + // reset request + req.SearchBefore = req.SearchAfter + req.SearchAfter = nil + } + // fix up original request sr.Request = req searchDuration := time.Since(searchStart) @@ -581,26 +602,3 @@ func (f *indexAliasImplFieldDict) Close() error { defer f.index.mutex.RUnlock() return f.fieldDict.Close() } - -type multiSearchHitSorter struct { - hits search.DocumentMatchCollection - sort search.SortOrder - cachedScoring []bool - cachedDesc []bool -} - -func newMultiSearchHitSorter(sort search.SortOrder, hits search.DocumentMatchCollection) *multiSearchHitSorter { - return &multiSearchHitSorter{ - sort: sort, - hits: hits, - cachedScoring: sort.CacheIsScore(), - cachedDesc: sort.CacheDescending(), - } -} - -func (m *multiSearchHitSorter) Len() int { return len(m.hits) } -func (m *multiSearchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] } -func (m *multiSearchHitSorter) Less(i, j int) bool { - c := m.sort.Compare(m.cachedScoring, m.cachedDesc, m.hits[i], m.hits[j]) - return c < 0 -} diff --git a/index_impl.go b/index_impl.go index 63fe39ccb..6324d960e 100644 --- a/index_impl.go +++ b/index_impl.go @@ -19,6 +19,7 @@ import ( "encoding/json" "fmt" "os" + "sort" "sync" 
"sync/atomic" "time" @@ -442,7 +443,20 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr return nil, ErrorIndexClosed } - collector := collector.NewTopNCollector(req.Size, req.From, req.Sort) + var reverseQueryExecution bool + if req.SearchBefore != nil { + reverseQueryExecution = true + req.Sort.Reverse() + req.SearchAfter = req.SearchBefore + req.SearchBefore = nil + } + + var coll *collector.TopNCollector + if req.SearchAfter != nil { + coll = collector.NewTopNCollectorAfter(req.Size, req.Sort, req.SearchAfter) + } else { + coll = collector.NewTopNCollector(req.Size, req.From, req.Sort) + } // open a reader for this search indexReader, err := i.i.Reader() @@ -494,10 +508,10 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr facetsBuilder.Add(facetName, facetBuilder) } } - collector.SetFacetsBuilder(facetsBuilder) + coll.SetFacetsBuilder(facetsBuilder) } - memNeeded := memNeededForSearch(req, searcher, collector) + memNeeded := memNeededForSearch(req, searcher, coll) if cb := ctx.Value(SearchQueryStartCallbackKey); cb != nil { if cbF, ok := cb.(SearchQueryStartCallbackFn); ok { err = cbF(memNeeded) @@ -515,12 +529,12 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr } } - err = collector.Collect(ctx, searcher, indexReader) + err = coll.Collect(ctx, searcher, indexReader) if err != nil { return nil, err } - hits := collector.Results() + hits := coll.Results() var highlighter highlight.Highlighter @@ -560,6 +574,17 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr logger.Printf("slow search took %s - %v", searchDuration, req) } + if reverseQueryExecution { + // reverse the sort back to the original + req.Sort.Reverse() + // resort using the original order + mhs := newSearchHitSorter(req.Sort, hits) + sort.Sort(mhs) + // reset request + req.SearchBefore = req.SearchAfter + req.SearchAfter = nil + } + return &SearchResult{ Status: 
&SearchStatus{ Total: 1, @@ -567,10 +592,10 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr }, Request: req, Hits: hits, - Total: collector.Total(), - MaxScore: collector.MaxScore(), + Total: coll.Total(), + MaxScore: coll.MaxScore(), Took: searchDuration, - Facets: collector.FacetResults(), + Facets: coll.FacetResults(), }, nil } @@ -865,3 +890,26 @@ func deDuplicate(fields []string) []string { } return ret } + +type searchHitSorter struct { + hits search.DocumentMatchCollection + sort search.SortOrder + cachedScoring []bool + cachedDesc []bool +} + +func newSearchHitSorter(sort search.SortOrder, hits search.DocumentMatchCollection) *searchHitSorter { + return &searchHitSorter{ + sort: sort, + hits: hits, + cachedScoring: sort.CacheIsScore(), + cachedDesc: sort.CacheDescending(), + } +} + +func (m *searchHitSorter) Len() int { return len(m.hits) } +func (m *searchHitSorter) Swap(i, j int) { m.hits[i], m.hits[j] = m.hits[j], m.hits[i] } +func (m *searchHitSorter) Less(i, j int) bool { + c := m.sort.Compare(m.cachedScoring, m.cachedDesc, m.hits[i], m.hits[j]) + return c < 0 +} diff --git a/search.go b/search.go index ebd69971e..e8d66f80a 100644 --- a/search.go +++ b/search.go @@ -262,6 +262,8 @@ func (h *HighlightRequest) AddField(field string) { // result score explanations. // Sort describes the desired order for the results to be returned. // Score controls the kind of scoring performed +// SearchAfter supports deep paging by providing a minimum sort key +// SearchBefore supports deep paging by providing a maximum sort key // // A special field named "*" can be used to return all fields. 
type SearchRequest struct { @@ -275,6 +277,8 @@ type SearchRequest struct { Sort search.SortOrder `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score,omitempty"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` } func (r *SearchRequest) Validate() error { @@ -285,6 +289,27 @@ func (r *SearchRequest) Validate() error { } } + if r.SearchAfter != nil && r.SearchBefore != nil { + return fmt.Errorf("cannot use search after and search before together") + } + + if r.SearchAfter != nil { + if r.From != 0 { + return fmt.Errorf("cannot use search after with from !=0") + } + if len(r.SearchAfter) != len(r.Sort) { + return fmt.Errorf("search after must have same size as sort order") + } + } + if r.SearchBefore != nil { + if r.From != 0 { + return fmt.Errorf("cannot use search before with from !=0") + } + if len(r.SearchBefore) != len(r.Sort) { + return fmt.Errorf("search before must have same size as sort order") + } + } + return r.Facets.Validate() } @@ -311,6 +336,12 @@ func (r *SearchRequest) SortByCustom(order search.SortOrder) { r.Sort = order } +// SetSearchAfter sets the request to skip over hits with a sort +// value less than the provided sort after key +func (r *SearchRequest) SetSearchAfter(after []string) { + r.SearchAfter = after +} + // UnmarshalJSON deserializes a JSON representation of // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { @@ -325,6 +356,8 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { Sort []json.RawMessage `json:"sort"` IncludeLocations bool `json:"includeLocations"` Score string `json:"score"` + SearchAfter []string `json:"search_after"` + SearchBefore []string `json:"search_before"` } err := json.Unmarshal(input, &temp) @@ -352,6 +385,8 @@ func (r *SearchRequest) UnmarshalJSON(input []byte) error { r.Facets = temp.Facets r.IncludeLocations = temp.IncludeLocations r.Score = temp.Score + r.SearchAfter = temp.SearchAfter + 
r.SearchBefore = temp.SearchBefore r.Query, err = query.ParseQuery(temp.Q) if err != nil { return err diff --git a/search/collector/topn.go b/search/collector/topn.go index b30bd6ecf..a027a12c2 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -69,6 +69,7 @@ type TopNCollector struct { lowestMatchOutsideResults *search.DocumentMatch updateFieldVisitor index.DocumentFieldTermVisitor dvReader index.DocValueReader + searchAfter *search.DocumentMatch } // CheckDoneEvery controls how frequently we check the context deadline @@ -78,6 +79,21 @@ const CheckDoneEvery = uint64(1024) // skipping over the first 'skip' hits // ordering hits by the provided sort order func NewTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { + return newTopNCollector(size, skip, sort) +} + +// NewTopNCollector builds a collector to find the top 'size' hits +// skipping over the first 'skip' hits +// ordering hits by the provided sort order +func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *TopNCollector { + rv := newTopNCollector(size, 0, sort) + rv.searchAfter = &search.DocumentMatch{ + Sort: after, + } + return rv +} + +func newTopNCollector(size int, skip int, sort search.SortOrder) *TopNCollector { hc := &TopNCollector{size: size, skip: skip, sort: sort} // pre-allocate space on the store to avoid reslicing @@ -266,6 +282,19 @@ func MakeTopNDocumentMatchHandler( if d == nil { return nil } + + // support search after based pagination, + // if this hit is <= the search after sort key + // we should skip it + if hc.searchAfter != nil { + // exact sort order matches use hit number to break tie + // but we want to allow for exact match, so we pretend + hc.searchAfter.HitNumber = d.HitNumber + if hc.sort.Compare(hc.cachedScoring, hc.cachedDesc, d, hc.searchAfter) <= 0 { + return nil + } + } + // optimization, we track lowest sorting hit already removed from heap // with this one comparison, we can avoid all heap operations if 
// this hit would have been added and then immediately removed diff --git a/search/sort.go b/search/sort.go index e17f70787..6e4ed80fa 100644 --- a/search/sort.go +++ b/search/sort.go @@ -38,6 +38,8 @@ type SearchSort interface { RequiresScoring() bool RequiresFields() []string + Reverse() + Copy() SearchSort } @@ -293,6 +295,12 @@ func (so SortOrder) CacheDescending() []bool { return rv } +func (so SortOrder) Reverse() { + for _, soi := range so { + soi.Reverse() + } +} + // SortFieldType lets you control some internal sort behavior // normally leaving this to the zero-value of SortFieldAuto is fine type SortFieldType int @@ -492,6 +500,15 @@ func (s *SortField) Copy() SearchSort { return &rv } +func (s *SortField) Reverse() { + s.Desc = !s.Desc + if s.Missing == SortFieldMissingFirst { + s.Missing = SortFieldMissingLast + } else { + s.Missing = SortFieldMissingFirst + } +} + // SortDocID will sort results by the document identifier type SortDocID struct { Desc bool @@ -533,6 +550,10 @@ func (s *SortDocID) Copy() SearchSort { return &rv } +func (s *SortDocID) Reverse() { + s.Desc = !s.Desc +} + // SortScore will sort results by the document match score type SortScore struct { Desc bool @@ -574,6 +595,10 @@ func (s *SortScore) Copy() SearchSort { return &rv } +func (s *SortScore) Reverse() { + s.Desc = !s.Desc +} + var maxDistance = string(numeric.MustNewPrefixCodedInt64(math.MaxInt64, 0)) // NewSortGeoDistance creates SearchSort instance for sorting documents by @@ -705,6 +730,10 @@ func (s *SortGeoDistance) Copy() SearchSort { return &rv } +func (s *SortGeoDistance) Reverse() { + s.Desc = !s.Desc +} + type BytesSlice [][]byte func (p BytesSlice) Len() int { return len(p) } diff --git a/test/integration_test.go b/test/integration_test.go index 8cbac6f30..ed59323ff 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -17,6 +17,7 @@ package test import ( "encoding/json" "flag" + "fmt" "io/ioutil" "math" "os" @@ -87,53 +88,29 @@ func runTestDir(t 
*testing.T, dir, datasetName string) { return } - // open new index - if !*keepIndex { - defer func() { - err := os.RemoveAll("test.bleve") - if err != nil { - t.Fatal(err) - } - }() - } - index, err := bleve.New("test.bleve", &mapping) - if err != nil { - t.Errorf("error creating new index: %v", err) - return - } - // set a custom index name - index.SetName(datasetName) - defer func() { - err := index.Close() - if err != nil { - t.Fatal(err) - } - }() + var index bleve.Index + var cleanup func() - // index data - fis, err := ioutil.ReadDir(dir + string(filepath.Separator) + "data") - if err != nil { - t.Errorf("error reading data dir: %v", err) - return - } - for _, fi := range fis { - fileBytes, err := ioutil.ReadFile(dir + string(filepath.Separator) + "data" + string(filepath.Separator) + fi.Name()) + // if there is a dir named 'data' open single index + _, err = os.Stat(dir + string(filepath.Separator) + "data") + if !os.IsNotExist(err) { + + index, cleanup, err = loadDataSet(t, datasetName, mapping, dir+string(filepath.Separator)+"data") if err != nil { - t.Errorf("error reading data file: %v", err) + t.Errorf("error loading dataset: %v", err) return } - var fileDoc interface{} - err = json.Unmarshal(fileBytes, &fileDoc) - if err != nil { - t.Errorf("error parsing data file as json: %v", err) - } - filename := fi.Name() - ext := filepath.Ext(filename) - id := filename[0 : len(filename)-len(ext)] - err = index.Index(id, fileDoc) - if err != nil { - t.Errorf("error indexing data: %v", err) - return + defer cleanup() + } else { + // if there is a dir named 'datasets' build alias over each index + _, err = os.Stat(dir + string(filepath.Separator) + "datasets") + if !os.IsNotExist(err) { + index, cleanup, err = loadDataSets(t, datasetName, mapping, dir+string(filepath.Separator)+"datasets") + if err != nil { + t.Errorf("error loading dataset: %v", err) + return + } + defer cleanup() } } @@ -165,6 +142,7 @@ func runTestDir(t *testing.T, dir, datasetName string) { if 
len(res.Hits) != len(search.Result.Hits) { t.Errorf("test error - %s", search.Comment) t.Errorf("test %d - expected hits len: %d got %d", testNum, len(search.Result.Hits), len(res.Hits)) + t.Errorf("got hits: %v", res.Hits) continue } for hi, hit := range search.Result.Hits { @@ -202,12 +180,87 @@ func runTestDir(t *testing.T, dir, datasetName string) { t.Errorf("test %d - expected facets: %#v got %#v", testNum, search.Result.Facets, res.Facets) } } - // check that custom index name is in results - for _, hit := range res.Hits { - if hit.Index != datasetName { - t.Fatalf("expected name: %s, got: %s", datasetName, hit.Index) + if _, ok := index.(bleve.IndexAlias); !ok { + // check that custom index name is in results + for _, hit := range res.Hits { + if hit.Index != datasetName { + t.Fatalf("expected name: %s, got: %s", datasetName, hit.Index) + } } } } } } + +func loadDataSet(t *testing.T, datasetName string, mapping mapping.IndexMappingImpl, path string) (bleve.Index, func(), error) { + idxPath := fmt.Sprintf("test-%s.bleve", datasetName) + index, err := bleve.New(idxPath, &mapping) + if err != nil { + return nil, nil, fmt.Errorf("error creating new index: %v", err) + } + // set a custom index name + index.SetName(datasetName) + + // index data + fis, err := ioutil.ReadDir(path) + if err != nil { + return nil, nil, fmt.Errorf("error reading data dir: %v", err) + } + for _, fi := range fis { + fileBytes, err := ioutil.ReadFile(path + string(filepath.Separator) + fi.Name()) + if err != nil { + return nil, nil, fmt.Errorf("error reading data file: %v", err) + } + var fileDoc interface{} + err = json.Unmarshal(fileBytes, &fileDoc) + if err != nil { + return nil, nil, fmt.Errorf("error parsing data file as json: %v", err) + } + filename := fi.Name() + ext := filepath.Ext(filename) + id := filename[0 : len(filename)-len(ext)] + err = index.Index(id, fileDoc) + if err != nil { + return nil, nil, fmt.Errorf("error indexing data: %v", err) + } + } + cleanup := func() { + 
err := index.Close() + if err != nil { + t.Fatalf("error closing index: %v", err) + } + if !*keepIndex { + err := os.RemoveAll(idxPath) + if err != nil { + t.Fatalf("error removing index: %v", err) + } + } + } + return index, cleanup, nil +} + +func loadDataSets(t *testing.T, datasetName string, mapping mapping.IndexMappingImpl, path string) (bleve.Index, func(), error) { + fis, err := ioutil.ReadDir(path) + if err != nil { + return nil, nil, fmt.Errorf("error reading datasets dir: %v", err) + } + var cleanups []func() + alias := bleve.NewIndexAlias() + for _, fi := range fis { + idx, idxCleanup, err := loadDataSet(t, fi.Name(), mapping, path+string(filepath.Separator)+fi.Name()) + if err != nil { + return nil, nil, fmt.Errorf("error loading dataset: %v", err) + } + cleanups = append(cleanups, idxCleanup) + alias.Add(idx) + } + alias.SetName(datasetName) + + cleanupAll := func() { + for _, cleanup := range cleanups { + cleanup() + } + } + + return alias, cleanupAll, nil +} diff --git a/test/tests/alias/datasets/shard0/a.json b/test/tests/alias/datasets/shard0/a.json new file mode 100644 index 000000000..1aeaf2c81 --- /dev/null +++ b/test/tests/alias/datasets/shard0/a.json @@ -0,0 +1,3 @@ +{ + "name": "a" +} \ No newline at end of file diff --git a/test/tests/alias/datasets/shard0/c.json b/test/tests/alias/datasets/shard0/c.json new file mode 100644 index 000000000..a584a5486 --- /dev/null +++ b/test/tests/alias/datasets/shard0/c.json @@ -0,0 +1,3 @@ +{ + "name": "c" +} \ No newline at end of file diff --git a/test/tests/alias/datasets/shard1/b.json b/test/tests/alias/datasets/shard1/b.json new file mode 100644 index 000000000..1c67e0d00 --- /dev/null +++ b/test/tests/alias/datasets/shard1/b.json @@ -0,0 +1,3 @@ +{ + "name": "b" +} \ No newline at end of file diff --git a/test/tests/alias/datasets/shard1/d.json b/test/tests/alias/datasets/shard1/d.json new file mode 100644 index 000000000..0a478db4d --- /dev/null +++ b/test/tests/alias/datasets/shard1/d.json @@ -0,0 
+1,3 @@ +{ + "name": "d" +} \ No newline at end of file diff --git a/test/tests/alias/mapping.json b/test/tests/alias/mapping.json new file mode 100644 index 000000000..7f9bae748 --- /dev/null +++ b/test/tests/alias/mapping.json @@ -0,0 +1,3 @@ +{ + "default_analyzer": "keyword" +} diff --git a/test/tests/alias/searches.json b/test/tests/alias/searches.json new file mode 100644 index 000000000..fb8dd06d4 --- /dev/null +++ b/test/tests/alias/searches.json @@ -0,0 +1,76 @@ +[ + { + "comment": "match all across shards", + "search": { + "from": 0, + "size": 10, + "sort": ["-_score", "_id"], + "query": { + "match_all": {} + } + }, + "result": { + "total_hits": 4, + "hits": [ + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "c" + }, + { + "id": "d" + } + ] + } + }, + { + "comment": "search after b (page 2 when size=2)", + "search": { + "from": 0, + "size": 2, + "sort": ["name"], + "search_after": ["b"], + "query": { + "match_all": {} + } + }, + "result": { + "total_hits": 4, + "hits": [ + { + "id": "c" + }, + { + "id": "d" + } + ] + } + }, + { + "comment": "search before c (page 1 when size=2)", + "search": { + "from": 0, + "size": 2, + "sort": ["name"], + "search_before": ["c"], + "query": { + "match_all": {} + } + }, + "result": { + "total_hits": 4, + "hits": [ + { + "id": "a" + }, + { + "id": "b" + } + ] + } + } +] \ No newline at end of file diff --git a/test/tests/sort/searches.json b/test/tests/sort/searches.json index bb51720a8..8b32df5d0 100644 --- a/test/tests/sort/searches.json +++ b/test/tests/sort/searches.json @@ -406,5 +406,57 @@ } ] } - } + }, + { + "comment": "sort by name, ascending, after marty", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["name"], + "search_after": ["marty"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "e" + }, + { + "id": "b" + }, + { + "id": "d" + } + ] + } + }, + { + "comment": "sort by name, ascending, before nancy", + "search": { + "from": 0, + "size": 10, + "query": 
{ + "match_all":{} + }, + "sort": ["name"], + "search_before": ["nancy"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "c" + }, + { + "id": "f" + }, + { + "id": "a" + } + ] + } + } ] From 6225a102a716e2cacf9e6b1e5faf692b7dcc790f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 30 May 2019 12:08:42 +0530 Subject: [PATCH 602/728] fix to avoid double prefix coding --- search/searcher/search_geoboundingbox.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index ad20d1a0f..6ec5e72dd 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -150,9 +150,9 @@ func ComputeGeoRange(term uint64, shift uint, codedTerm := makePrefixCoded(int64(start), res) if isIndexed(codedTerm) { if !within && checkBoundaries { - onBoundary = append(onBoundary, makePrefixCoded(int64(start), res)) + onBoundary = append(onBoundary, codedTerm) } else { - notOnBoundary = append(notOnBoundary, makePrefixCoded(int64(start), res)) + notOnBoundary = append(notOnBoundary, codedTerm) } } } else if level < geoDetailLevel && From 4e0f481955fd1cf003ee0baf796be2e230c18ff7 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 30 May 2019 13:04:39 -0700 Subject: [PATCH 603/728] MB-33617: Supporting the "reverse" token filter This token filter will simply reverse each token. --- analysis/token/reverse/reverse.go | 66 ++++++++++ analysis/token/reverse/reverse_test.go | 163 +++++++++++++++++++++++++ config/config.go | 1 + 3 files changed, 230 insertions(+) create mode 100644 analysis/token/reverse/reverse.go create mode 100644 analysis/token/reverse/reverse_test.go diff --git a/analysis/token/reverse/reverse.go b/analysis/token/reverse/reverse.go new file mode 100644 index 000000000..2dd16a531 --- /dev/null +++ b/analysis/token/reverse/reverse.go @@ -0,0 +1,66 @@ +// Copyright (c) 2019 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package reverse + +import ( + "unicode/utf8" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +// Name is the name used to register ReverseFilter in the bleve registry +const Name = "reverse" + +type ReverseFilter struct { +} + +func NewReverseFilter() *ReverseFilter { + return &ReverseFilter{} +} + +func (f *ReverseFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + token.Term = reverse(token.Term) + } + return input +} + +func ReverseFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewReverseFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(Name, ReverseFilterConstructor) +} + +// reverse(..) will generate a reversed version of the provided +// utf-8 encoded byte array and return it back to its caller. +func reverse(s []byte) []byte { + j := len(s) + rv := make([]byte, len(s)) + for i := 0; i < len(s); { + wid := 1 + r := rune(s[i]) + if r >= utf8.RuneSelf { + r, wid = utf8.DecodeRune(s[i:]) + } + + copy(rv[j-wid:j], s[i:i+wid]) + i += wid + j -= wid + } + return rv +} diff --git a/analysis/token/reverse/reverse_test.go b/analysis/token/reverse/reverse_test.go new file mode 100644 index 000000000..6437ecdd3 --- /dev/null +++ b/analysis/token/reverse/reverse_test.go @@ -0,0 +1,163 @@ +// Copyright (c) 2019 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package reverse + +import ( + "bytes" + "testing" + + "github.com/blevesearch/bleve/analysis" +) + +func TestReverseFilter(t *testing.T) { + inputTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("one"), + }, + &analysis.Token{ + Term: []byte("TWo"), + }, + &analysis.Token{ + Term: []byte("thRee"), + }, + &analysis.Token{ + Term: []byte("four's"), + }, + &analysis.Token{ + Term: []byte("what's this in reverse"), + }, + &analysis.Token{ + Term: []byte("œ∑´®†"), + }, + &analysis.Token{ + Term: []byte("İȺȾCAT÷≥≤µ123"), + }, + &analysis.Token{ + Term: []byte("!@#$%^&*()"), + }, + &analysis.Token{}, + } + + expectedTokenStream := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("eno"), + }, + &analysis.Token{ + Term: []byte("oWT"), + }, + &analysis.Token{ + Term: []byte("eeRht"), + }, + &analysis.Token{ + Term: []byte("s'ruof"), + }, + &analysis.Token{ + Term: []byte("esrever ni siht s'tahw"), + }, + &analysis.Token{ + Term: []byte("†®´∑œ"), + }, + &analysis.Token{ + Term: []byte("321µ≤≥÷TACȾȺİ"), + }, + &analysis.Token{ + Term: []byte(")(*&^%$#@!"), + }, + &analysis.Token{}, + } + + filter := NewReverseFilter() + outputTokenStream := filter.Filter(inputTokenStream) + for i := 0; i < len(expectedTokenStream); i++ { + if !bytes.Equal(outputTokenStream[i].Term, expectedTokenStream[i].Term) { + t.Errorf("[%d] expected %s got %s", + i+1, expectedTokenStream[i].Term, 
outputTokenStream[i].Term) + } + } +} + +func BenchmarkReverseFilter(b *testing.B) { + input := analysis.TokenStream{ + &analysis.Token{ + Term: []byte("A"), + }, + &analysis.Token{ + Term: []byte("boiling"), + }, + &analysis.Token{ + Term: []byte("liquid"), + }, + &analysis.Token{ + Term: []byte("expanding"), + }, + &analysis.Token{ + Term: []byte("vapor"), + }, + &analysis.Token{ + Term: []byte("explosion"), + }, + &analysis.Token{ + Term: []byte("caused"), + }, + &analysis.Token{ + Term: []byte("by"), + }, + &analysis.Token{ + Term: []byte("the"), + }, + &analysis.Token{ + Term: []byte("rupture"), + }, + &analysis.Token{ + Term: []byte("of"), + }, + &analysis.Token{ + Term: []byte("a"), + }, + &analysis.Token{ + Term: []byte("vessel"), + }, + &analysis.Token{ + Term: []byte("containing"), + }, + &analysis.Token{ + Term: []byte("pressurized"), + }, + &analysis.Token{ + Term: []byte("liquid"), + }, + &analysis.Token{ + Term: []byte("above"), + }, + &analysis.Token{ + Term: []byte("its"), + }, + &analysis.Token{ + Term: []byte("boiling"), + }, + &analysis.Token{ + Term: []byte("point"), + }, + &analysis.Token{ + Term: []byte("İȺȾCAT"), + }, + } + filter := NewReverseFilter() + + b.ResetTimer() + for i := 0; i < b.N; i++ { + filter.Filter(input) + } +} diff --git a/config/config.go b/config/config.go index cf2827a5c..5550d130b 100644 --- a/config/config.go +++ b/config/config.go @@ -52,6 +52,7 @@ import ( _ "github.com/blevesearch/bleve/analysis/token/length" _ "github.com/blevesearch/bleve/analysis/token/lowercase" _ "github.com/blevesearch/bleve/analysis/token/ngram" + _ "github.com/blevesearch/bleve/analysis/token/reverse" _ "github.com/blevesearch/bleve/analysis/token/shingle" _ "github.com/blevesearch/bleve/analysis/token/stop" _ "github.com/blevesearch/bleve/analysis/token/truncate" From 99bcef21a044bf98f1f8ab07042bf4b25236f210 Mon Sep 17 00:00:00 2001 From: Darren Hoo Date: Sat, 1 Jun 2019 00:30:26 +0800 Subject: [PATCH 604/728] makes Scorch and cachedDocs 
aligned so that scorch run properly on arm Update: https://github.com/blevesearch/bleve/issues/1231 --- index/scorch/scorch.go | 8 ++++---- index/scorch/snapshot_segment.go | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 2641b3b19..522feca19 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -41,12 +41,14 @@ const Version uint8 = 2 var ErrClosed = fmt.Errorf("scorch closed") type Scorch struct { + nextSegmentID uint64 + stats Stats + iStats internalStats + readOnly bool version uint8 config map[string]interface{} analysisQueue *index.AnalysisQueue - stats Stats - nextSegmentID uint64 path string unsafeBatch bool @@ -73,8 +75,6 @@ type Scorch struct { onEvent func(event Event) onAsyncError func(err error) - iStats internalStats - pauseLock sync.RWMutex pauseCount uint64 diff --git a/index/scorch/snapshot_segment.go b/index/scorch/snapshot_segment.go index f3a2c56a9..96742b4f9 100644 --- a/index/scorch/snapshot_segment.go +++ b/index/scorch/snapshot_segment.go @@ -183,9 +183,9 @@ func (cfd *cachedFieldDocs) prepareField(field string, ss *SegmentSnapshot) { } type cachedDocs struct { + size uint64 m sync.Mutex // As the cache is asynchronously prepared, need a lock cache map[string]*cachedFieldDocs // Keyed by field - size uint64 } func (c *cachedDocs) prepareFields(wantedFields []string, ss *SegmentSnapshot) error { From 2d7b31a60fcb28149a6bf3c426cea06f349f3ca1 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 29 May 2019 18:03:01 +0530 Subject: [PATCH 605/728] dictionary random Lookup api draft version of having a new Exists iterator api implementation for the FieldDictionary --- index/index.go | 10 +++++++ index/scorch/segment/empty.go | 14 ++++++++++ index/scorch/segment/segment.go | 6 ++++ index/scorch/segment/zap/dict.go | 31 +++++++++++++++++++++ index/scorch/snapshot_index.go | 43 +++++++++++++++++++++++++++++ index/scorch/snapshot_index_dict.go | 15 
++++++++++ 6 files changed, 119 insertions(+) diff --git a/index/index.go b/index/index.go index 6aa444cfd..aadd8f5e2 100644 --- a/index/index.go +++ b/index/index.go @@ -121,6 +121,11 @@ type IndexReaderOnly interface { FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) } +// better naming everywhere +type IndexReaderRandom interface { + FieldDictRandom(field string) (AdvFieldDict, error) +} + // FieldTerms contains the terms used by a document, keyed by field type FieldTerms map[string][]string @@ -230,6 +235,11 @@ type FieldDict interface { Close() error } +type AdvFieldDict interface { + FieldDict + Exists(key []byte) (error, bool) +} + // DocIDReader is the interface exposing enumeration of documents identifiers. // Close the reader to release associated resources. type DocIDReader interface { diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 165a01bc1..6163eb577 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -91,12 +91,26 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, return &EmptyDictionaryIterator{} } +func (e *EmptyDictionary) ExistsIterator() AdvDictionaryIterator { + return &EmptyDictionaryLookupIterator{} +} + type EmptyDictionaryIterator struct{} func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } +type EmptyDictionaryLookupIterator struct{} + +func (e *EmptyDictionaryLookupIterator) Next() (*index.DictEntry, error) { + return nil, nil +} + +func (e *EmptyDictionaryLookupIterator) Exists(key []byte) (error, bool) { + return nil, false +} + func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { return nil, nil } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index b94d6f979..87f608a69 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -59,12 +59,18 @@ type TermDictionary interface { AutomatonIterator(a vellum.Automaton, 
startKeyInclusive, endKeyExclusive []byte) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator + ExistsIterator() AdvDictionaryIterator } type DictionaryIterator interface { Next() (*index.DictEntry, error) } +type AdvDictionaryIterator interface { + DictionaryIterator + Exists(key []byte) (error, bool) +} + type PostingsList interface { Iterator(includeFreq, includeNorm, includeLocations bool, prealloc PostingsIterator) PostingsIterator diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 190265d6e..08f89739f 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -228,6 +228,24 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, return rv } +// ExistsIterator returns an exists iterator for this dictionary +func (d *Dictionary) ExistsIterator() segment.AdvDictionaryIterator { + rv := &DictionaryIterator{ + d: d, + } + + if d.fst != nil { + itr, err := d.fst.Iterator(nil, nil) + if err == nil { + rv.itr = itr + } else if err != vellum.ErrIteratorDone { + rv.err = err + } + } + + return rv +} + // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary @@ -257,3 +275,16 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { i.err = i.itr.Next() return &i.entry, nil } + +func (i *DictionaryIterator) Exists(key []byte) (error, bool) { + if i.err != nil && i.err != vellum.ErrIteratorDone { + return i.err, false + } else if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, false + } + if advItr, ok := i.itr.(vellum.AdvIterator); ok { + return advItr.Exists(key) + } + + return fmt.Errorf("no implementation found"), false +} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 8babb31fa..094227f2d 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -172,6 +172,43 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, 
makeItr func(i s return rv, nil } +func (i *IndexSnapshot) newIndexSnapshotFieldDictRandom(field string, makeItr func(i segment.TermDictionary) segment.AdvDictionaryIterator) (*IndexSnapshotFieldDict, error) { + results := make(chan *asynchSegmentResult) + for index, segment := range i.segment { + go func(index int, segment *SegmentSnapshot) { + dict, err := segment.segment.Dictionary(field) + if err != nil { + results <- &asynchSegmentResult{err: err} + } else { + results <- &asynchSegmentResult{dictItr: makeItr(dict)} + } + }(index, segment) + } + + var err error + rv := &IndexSnapshotFieldDict{ + snapshot: i, + cursors: make([]*segmentDictCursor, 0, len(i.segment)), + } + for count := 0; count < len(i.segment); count++ { + asr := <-results + if asr.err != nil && err == nil { + err = asr.err + } else { + rv.cursors = append(rv.cursors, &segmentDictCursor{ + itr: asr.dictItr, + }) + } + + } + // after ensuring we've read all items on channel + if err != nil { + return nil, err + } + + return rv, nil +} + func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.Iterator() @@ -242,6 +279,12 @@ func (i *IndexSnapshot) FieldDictOnly(field string, }) } +func (i *IndexSnapshot) FieldDictRandom(field string) (index.AdvFieldDict, error) { + return i.newIndexSnapshotFieldDictRandom(field, func(i segment.TermDictionary) segment.AdvDictionaryIterator { + return i.ExistsIterator() + }) +} + func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index abd3bde8c..33e13fc8a 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -91,3 +91,18 @@ func (i *IndexSnapshotFieldDict) Next() (*index.DictEntry, error) { func (i 
*IndexSnapshotFieldDict) Close() error { return nil } + +func (i *IndexSnapshotFieldDict) Exists(key []byte) (error, bool) { + if len(i.cursors) == 0 { + return nil, false + } + + for _, cursor := range i.cursors { + if itr, ok := cursor.itr.(segment.AdvDictionaryIterator); ok { + if _, found := itr.Exists(key); found { + return nil, true + } + } + } + return nil, false +} From 7ca4ea2cd8bb55fa4578b89a0731f83f83b27f68 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 3 Jun 2019 12:00:40 -0700 Subject: [PATCH 606/728] MB-33617: Support unicode strings with combining characters Context: reverse-token-filter --- analysis/token/reverse/reverse.go | 35 ++++++++++++++++---------- analysis/token/reverse/reverse_test.go | 25 ++++++++++++++++-- 2 files changed, 45 insertions(+), 15 deletions(-) diff --git a/analysis/token/reverse/reverse.go b/analysis/token/reverse/reverse.go index 2dd16a531..671ed8992 100644 --- a/analysis/token/reverse/reverse.go +++ b/analysis/token/reverse/reverse.go @@ -15,6 +15,7 @@ package reverse import ( + "unicode" "unicode/utf8" "github.com/blevesearch/bleve/analysis" @@ -47,20 +48,28 @@ func init() { } // reverse(..) will generate a reversed version of the provided -// utf-8 encoded byte array and return it back to its caller. +// unicode array and return it back to its caller. 
func reverse(s []byte) []byte { - j := len(s) - rv := make([]byte, len(s)) - for i := 0; i < len(s); { - wid := 1 - r := rune(s[i]) - if r >= utf8.RuneSelf { - r, wid = utf8.DecodeRune(s[i:]) + cursorIn := 0 + inputRunes := []rune(string(s)) + cursorOut := len(s) + output := make([]byte, len(s)) + for i := 0; i < len(inputRunes); { + wid := utf8.RuneLen(inputRunes[i]) + i++ + for i < len(inputRunes) { + r := inputRunes[i] + if unicode.Is(unicode.Mn, r) || unicode.Is(unicode.Me, r) || unicode.Is(unicode.Mc, r) { + wid += utf8.RuneLen(r) + i++ + } else { + break + } } - - copy(rv[j-wid:j], s[i:i+wid]) - i += wid - j -= wid + copy(output[cursorOut-wid:cursorOut], s[cursorIn:cursorIn+wid]) + cursorIn += wid + cursorOut -= wid } - return rv + + return output } diff --git a/analysis/token/reverse/reverse_test.go b/analysis/token/reverse/reverse_test.go index 6437ecdd3..8e84ab983 100644 --- a/analysis/token/reverse/reverse_test.go +++ b/analysis/token/reverse/reverse_test.go @@ -23,6 +23,7 @@ import ( func TestReverseFilter(t *testing.T) { inputTokenStream := analysis.TokenStream{ + &analysis.Token{}, &analysis.Token{ Term: []byte("one"), }, @@ -47,10 +48,19 @@ func TestReverseFilter(t *testing.T) { &analysis.Token{ Term: []byte("!@#$%^&*()"), }, - &analysis.Token{}, + &analysis.Token{ + Term: []byte("cafés"), + }, + &analysis.Token{ + Term: []byte("¿Dónde estás?"), + }, + &analysis.Token{ + Term: []byte("Me gustaría una cerveza."), + }, } expectedTokenStream := analysis.TokenStream{ + &analysis.Token{}, &analysis.Token{ Term: []byte("eno"), }, @@ -75,7 +85,15 @@ func TestReverseFilter(t *testing.T) { &analysis.Token{ Term: []byte(")(*&^%$#@!"), }, - &analysis.Token{}, + &analysis.Token{ + Term: []byte("séfac"), + }, + &analysis.Token{ + Term: []byte("?sátse ednóD¿"), + }, + &analysis.Token{ + Term: []byte(".azevrec anu aíratsug eM"), + }, } filter := NewReverseFilter() @@ -153,6 +171,9 @@ func BenchmarkReverseFilter(b *testing.B) { &analysis.Token{ Term: 
[]byte("İȺȾCAT"), }, + &analysis.Token{ + Term: []byte("Me gustaría una cerveza."), + }, } filter := NewReverseFilter() From 0d9d5dee5b5c82639402eb3fc1ef957178d5a78f Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 3 Jun 2019 11:31:13 +0530 Subject: [PATCH 607/728] refactoring to handle errs, save castings etc --- index/index.go | 1 - index/scorch/segment/empty.go | 12 +--- index/scorch/segment/segment.go | 6 +- index/scorch/segment/zap/dict.go | 14 ++--- index/scorch/snapshot_index.go | 80 ++++++++---------------- index/scorch/snapshot_index_dict.go | 7 +-- search/searcher/search_geoboundingbox.go | 31 ++++++--- 7 files changed, 61 insertions(+), 90 deletions(-) diff --git a/index/index.go b/index/index.go index aadd8f5e2..5c604a133 100644 --- a/index/index.go +++ b/index/index.go @@ -121,7 +121,6 @@ type IndexReaderOnly interface { FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) } -// better naming everywhere type IndexReaderRandom interface { FieldDictRandom(field string) (AdvFieldDict, error) } diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 6163eb577..c9f003311 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -91,8 +91,8 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) ExistsIterator() AdvDictionaryIterator { - return &EmptyDictionaryLookupIterator{} +func (e *EmptyDictionary) ExistsIterator() DictionaryIterator { + return &EmptyDictionaryIterator{} } type EmptyDictionaryIterator struct{} @@ -101,13 +101,7 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } -type EmptyDictionaryLookupIterator struct{} - -func (e *EmptyDictionaryLookupIterator) Next() (*index.DictEntry, error) { - return nil, nil -} - -func (e *EmptyDictionaryLookupIterator) Exists(key []byte) (error, bool) { +func (e *EmptyDictionaryIterator) Exists(key []byte) 
(error, bool) { return nil, false } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 87f608a69..f8b6e3edb 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -59,15 +59,11 @@ type TermDictionary interface { AutomatonIterator(a vellum.Automaton, startKeyInclusive, endKeyExclusive []byte) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator - ExistsIterator() AdvDictionaryIterator + ExistsIterator() DictionaryIterator } type DictionaryIterator interface { Next() (*index.DictEntry, error) -} - -type AdvDictionaryIterator interface { - DictionaryIterator Exists(key []byte) (error, bool) } diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 08f89739f..9a9ee6b86 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -229,9 +229,10 @@ func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, } // ExistsIterator returns an exists iterator for this dictionary -func (d *Dictionary) ExistsIterator() segment.AdvDictionaryIterator { +func (d *Dictionary) ExistsIterator() segment.DictionaryIterator { rv := &DictionaryIterator{ - d: d, + d: d, + omitCount: true, } if d.fst != nil { @@ -279,12 +280,9 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { func (i *DictionaryIterator) Exists(key []byte) (error, bool) { if i.err != nil && i.err != vellum.ErrIteratorDone { return i.err, false - } else if i.itr == nil || i.err == vellum.ErrIteratorDone { - return nil, false } - if advItr, ok := i.itr.(vellum.AdvIterator); ok { - return advItr.Exists(key) + if i.itr == nil || i.err == vellum.ErrIteratorDone { + return nil, false } - - return fmt.Errorf("no implementation found"), false + return i.itr.Exists(key) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 094227f2d..6da417881 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -126,7 
+126,9 @@ func (i *IndexSnapshot) updateSize() { } } -func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i segment.TermDictionary) segment.DictionaryIterator) (*IndexSnapshotFieldDict, error) { +func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, + makeItr func(i segment.TermDictionary) segment.DictionaryIterator, + randomLookup bool) (*IndexSnapshotFieldDict, error) { results := make(chan *asynchSegmentResult) for index, segment := range i.segment { @@ -150,14 +152,20 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s if asr.err != nil && err == nil { err = asr.err } else { - next, err2 := asr.dictItr.Next() - if err2 != nil && err == nil { - err = err2 - } - if next != nil { + if !randomLookup { + next, err2 := asr.dictItr.Next() + if err2 != nil && err == nil { + err = err2 + } + if next != nil { + rv.cursors = append(rv.cursors, &segmentDictCursor{ + itr: asr.dictItr, + curr: *next, + }) + } + } else { rv.cursors = append(rv.cursors, &segmentDictCursor{ - itr: asr.dictItr, - curr: *next, + itr: asr.dictItr, }) } } @@ -166,44 +174,10 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, makeItr func(i s if err != nil { return nil, err } - // prepare heap - heap.Init(rv) - - return rv, nil -} - -func (i *IndexSnapshot) newIndexSnapshotFieldDictRandom(field string, makeItr func(i segment.TermDictionary) segment.AdvDictionaryIterator) (*IndexSnapshotFieldDict, error) { - results := make(chan *asynchSegmentResult) - for index, segment := range i.segment { - go func(index int, segment *SegmentSnapshot) { - dict, err := segment.segment.Dictionary(field) - if err != nil { - results <- &asynchSegmentResult{err: err} - } else { - results <- &asynchSegmentResult{dictItr: makeItr(dict)} - } - }(index, segment) - } - - var err error - rv := &IndexSnapshotFieldDict{ - snapshot: i, - cursors: make([]*segmentDictCursor, 0, len(i.segment)), - } - for count := 0; count < len(i.segment); count++ { 
- asr := <-results - if asr.err != nil && err == nil { - err = asr.err - } else { - rv.cursors = append(rv.cursors, &segmentDictCursor{ - itr: asr.dictItr, - }) - } - } - // after ensuring we've read all items on channel - if err != nil { - return nil, err + if !randomLookup { + // prepare heap + heap.Init(rv) } return rv, nil @@ -212,21 +186,21 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDictRandom(field string, makeItr fu func (i *IndexSnapshot) FieldDict(field string) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.Iterator() - }) + }, false) } func (i *IndexSnapshot) FieldDictRange(field string, startTerm []byte, endTerm []byte) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.RangeIterator(string(startTerm), string(endTerm)) - }) + }, false) } func (i *IndexSnapshot) FieldDictPrefix(field string, termPrefix []byte) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.PrefixIterator(string(termPrefix)) - }) + }, false) } func (i *IndexSnapshot) FieldDictRegexp(field string, @@ -241,7 +215,7 @@ func (i *IndexSnapshot) FieldDictRegexp(field string, return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.AutomatonIterator(a, prefixBeg, prefixEnd) - }) + }, false) } func (i *IndexSnapshot) getLevAutomaton(term string, @@ -269,20 +243,20 @@ func (i *IndexSnapshot) FieldDictFuzzy(field string, return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.AutomatonIterator(a, prefixBeg, prefixEnd) - }) + }, false) } func (i *IndexSnapshot) FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (index.FieldDict, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) 
segment.DictionaryIterator { return i.OnlyIterator(onlyTerms, includeCount) - }) + }, false) } func (i *IndexSnapshot) FieldDictRandom(field string) (index.AdvFieldDict, error) { - return i.newIndexSnapshotFieldDictRandom(field, func(i segment.TermDictionary) segment.AdvDictionaryIterator { + return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.ExistsIterator() - }) + }, true) } func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 33e13fc8a..2d1614d46 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -98,11 +98,10 @@ func (i *IndexSnapshotFieldDict) Exists(key []byte) (error, bool) { } for _, cursor := range i.cursors { - if itr, ok := cursor.itr.(segment.AdvDictionaryIterator); ok { - if _, found := itr.Exists(key); found { - return nil, true - } + if _, found := cursor.itr.Exists(key); found { + return nil, true } } + return nil, false } diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index 6ec5e72dd..ee9c3e061 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -118,19 +118,30 @@ func ComputeGeoRange(term uint64, shift uint, return rv } - isIndexed := func(term []byte) bool { - if indexReader != nil { - reader, err := indexReader.TermFieldReader(term, field, false, false, false) - if err != nil || reader == nil { - return false + var fieldDict index.AdvFieldDict + if irr, ok := indexReader.(index.IndexReaderRandom); ok { + fieldDict, err = irr.FieldDictRandom(field) + if err != nil { + return nil, nil, err + } + } + + defer func() { + if fieldDict != nil { + cerr := fieldDict.Close() + if cerr != nil { + err = cerr } - if reader.Count() == 0 { - _ = reader.Close() - return false + } + }() + + isIndexed := func(term []byte) bool { + if fieldDict != nil { + 
if err, found := fieldDict.Exists(term); found && err == nil { + return true } - _ = reader.Close() } - return true + return false } var computeGeoRange func(term uint64, shift uint) // declare for recursion From fc8e1392425eebbfe6143fb3d7313bee789b759c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 6 Jun 2019 10:30:34 +0530 Subject: [PATCH 608/728] vellum SHA bump - new Exists API --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 729d8a64f..f4be538bb 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "28880ab96d9361ab5a74f0e12000f8fe0cd20712", + "revision": "5f4edc22838b1433cdb9793bcac21c9376bb4753", "branch": "master", "notests": true } From eb619881fe5c7bbf95ce30059ef5cc5989e5cbe4 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 6 Jun 2019 13:58:58 +0530 Subject: [PATCH 609/728] EnumeratorTests adopting Exists method -implementing the new Exists method from vellum Iterator interface --- index/scorch/segment/zap/enumerator_test.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/index/scorch/segment/zap/enumerator_test.go b/index/scorch/segment/zap/enumerator_test.go index b27788923..4ba4aa4c0 100644 --- a/index/scorch/segment/zap/enumerator_test.go +++ b/index/scorch/segment/zap/enumerator_test.go @@ -231,3 +231,7 @@ func (m *testIterator) Close() error { m.curr = 654321 return nil } + +func (m *testIterator) Exists(key []byte) (bool, error) { + return false, fmt.Errorf("not implemented for enumerator unit tests") +} From 72eacd7c7687d1c6ba34a15bbf82585d7e6b2632 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 6 Jun 2019 14:14:06 +0530 Subject: [PATCH 610/728] support for upside_down indexType too e Please enter the commit message for your changes. 
Lines starting --- index/index.go | 9 +++--- index/scorch/segment/empty.go | 4 +-- index/scorch/segment/segment.go | 2 +- index/scorch/segment/zap/dict.go | 6 ++-- index/scorch/snapshot_index.go | 2 +- index/scorch/snapshot_index_dict.go | 10 +++--- search/searcher/search_geoboundingbox.go | 40 +++++++++++++++++------- 7 files changed, 45 insertions(+), 28 deletions(-) diff --git a/index/index.go b/index/index.go index 5c604a133..a470f5669 100644 --- a/index/index.go +++ b/index/index.go @@ -121,8 +121,8 @@ type IndexReaderOnly interface { FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) } -type IndexReaderRandom interface { - FieldDictRandom(field string) (AdvFieldDict, error) +type IndexReaderExists interface { + FieldDictExists(field string) (FieldDictExists, error) } // FieldTerms contains the terms used by a document, keyed by field @@ -234,9 +234,8 @@ type FieldDict interface { Close() error } -type AdvFieldDict interface { - FieldDict - Exists(key []byte) (error, bool) +type FieldDictExists interface { + Exists(key []byte) (bool, error) } // DocIDReader is the interface exposing enumeration of documents identifiers. 
diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index c9f003311..020790ca1 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -101,8 +101,8 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } -func (e *EmptyDictionaryIterator) Exists(key []byte) (error, bool) { - return nil, false +func (e *EmptyDictionaryIterator) Exists(key []byte) (bool, error) { + return false, nil } func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index f8b6e3edb..9261c2282 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -64,7 +64,7 @@ type TermDictionary interface { type DictionaryIterator interface { Next() (*index.DictEntry, error) - Exists(key []byte) (error, bool) + Exists(key []byte) (bool, error) } type PostingsList interface { diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index 9a9ee6b86..cea0fe307 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -277,12 +277,12 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { return &i.entry, nil } -func (i *DictionaryIterator) Exists(key []byte) (error, bool) { +func (i *DictionaryIterator) Exists(key []byte) (bool, error) { if i.err != nil && i.err != vellum.ErrIteratorDone { - return i.err, false + return false, i.err } if i.itr == nil || i.err == vellum.ErrIteratorDone { - return nil, false + return false, nil } return i.itr.Exists(key) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 6da417881..434d94ea4 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -253,7 +253,7 @@ func (i *IndexSnapshot) FieldDictOnly(field string, }, false) } -func (i *IndexSnapshot) FieldDictRandom(field string) (index.AdvFieldDict, error) { +func (i *IndexSnapshot) 
FieldDictExists(field string) (index.FieldDictExists, error) { return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { return i.ExistsIterator() }, true) diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 2d1614d46..028c401e4 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -92,16 +92,16 @@ func (i *IndexSnapshotFieldDict) Close() error { return nil } -func (i *IndexSnapshotFieldDict) Exists(key []byte) (error, bool) { +func (i *IndexSnapshotFieldDict) Exists(key []byte) (bool, error) { if len(i.cursors) == 0 { - return nil, false + return false, nil } for _, cursor := range i.cursors { - if _, found := cursor.itr.Exists(key); found { - return nil, true + if found, _ := cursor.itr.Exists(key); found { + return true, nil } } - return nil, false + return false, nil } diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index ee9c3e061..c81cdd9d8 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -22,6 +22,8 @@ import ( "github.com/blevesearch/bleve/search" ) +type filterFunc func(key []byte) bool + var GeoBitsShift1 = (geo.GeoBits << 1) var GeoBitsShift1Minus1 = GeoBitsShift1 - 1 @@ -118,30 +120,46 @@ func ComputeGeoRange(term uint64, shift uint, return rv } - var fieldDict index.AdvFieldDict - if irr, ok := indexReader.(index.IndexReaderRandom); ok { - fieldDict, err = irr.FieldDictRandom(field) + var fieldDict index.FieldDictExists + var isIndexed filterFunc + if irr, ok := indexReader.(index.IndexReaderExists); ok { + fieldDict, err = irr.FieldDictExists(field) if err != nil { return nil, nil, err } + + isIndexed = func(term []byte) bool { + found, err := fieldDict.Exists(term) + return err == nil && found + } } defer func() { if fieldDict != nil { - cerr := fieldDict.Close() - if cerr != nil { - err = cerr + if fd, ok := 
fieldDict.(index.FieldDict); ok { + cerr := fd.Close() + if cerr != nil { + err = cerr + } } } }() - isIndexed := func(term []byte) bool { - if fieldDict != nil { - if err, found := fieldDict.Exists(term); found && err == nil { - return true + if isIndexed == nil { + isIndexed = func(term []byte) bool { + if indexReader != nil { + reader, err := indexReader.TermFieldReader(term, field, false, false, false) + if err != nil || reader == nil { + return false + } + if reader.Count() == 0 { + _ = reader.Close() + return false + } + _ = reader.Close() } + return true } - return false } var computeGeoRange func(term uint64, shift uint) // declare for recursion From 7e5b7c200faf569915e2417393916bc36126a6e7 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 6 Jun 2019 14:42:54 +0530 Subject: [PATCH 611/728] Numeric range query adopts Exists Iterator Numeric range query to use the new Exists iterator from Vellum for filtering the candidate terms for scorch index types. Empirical evidence shows that this brought about 44% throughput improvements for qps. 
--- search/searcher/search_numeric_range.go | 53 ++++++++++++++++---- search/searcher/search_numeric_range_test.go | 2 +- 2 files changed, 45 insertions(+), 10 deletions(-) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index e52ef9a82..0bc1f6e4a 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -53,20 +53,49 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, if !*inclusiveMax && maxInt64 != math.MinInt64 { maxInt64-- } + + var fieldDict index.FieldDictExists + var isIndexed filterFunc + var err error + if irr, ok := indexReader.(index.IndexReaderExists); ok { + fieldDict, err = irr.FieldDictExists(field) + if err != nil { + return nil, err + } + + isIndexed = func(term []byte) bool { + found, err := fieldDict.Exists(term) + return err == nil && found + } + } + // FIXME hard-coded precision, should match field declaration termRanges := splitInt64Range(minInt64, maxInt64, 4) - terms := termRanges.Enumerate() + terms := termRanges.Enumerate(isIndexed) + if fieldDict != nil { + if fd, ok := fieldDict.(index.FieldDict); ok { + cerr := fd.Close() + if cerr != nil { + err = cerr + } + } + } + if len(terms) < 1 { // cannot return MatchNoneSearcher because of interaction with // commit f391b991c20f02681bacd197afc6d8aed444e132 return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options, true) } - var err error - terms, err = filterCandidateTerms(indexReader, terms, field) - if err != nil { - return nil, err + + // for upside_down + if isIndexed == nil { + terms, err = filterCandidateTerms(indexReader, terms, field) + if err != nil { + return nil, err + } } + if tooManyClauses(len(terms)) { return nil, tooManyClausesErr(len(terms)) } @@ -125,11 +154,17 @@ type termRange struct { endTerm []byte } -func (t *termRange) Enumerate() [][]byte { +func (t *termRange) Enumerate(filter filterFunc) [][]byte { var rv [][]byte next := t.startTerm for 
bytes.Compare(next, t.endTerm) <= 0 { - rv = append(rv, next) + if filter != nil { + if filter(next) { + rv = append(rv, next) + } + } else { + rv = append(rv, next) + } next = incrementBytes(next) } return rv @@ -150,10 +185,10 @@ func incrementBytes(in []byte) []byte { type termRanges []*termRange -func (tr termRanges) Enumerate() [][]byte { +func (tr termRanges) Enumerate(filter filterFunc) [][]byte { var rv [][]byte for _, tri := range tr { - trie := tri.Enumerate() + trie := tri.Enumerate(filter) rv = append(rv, trie...) } return rv diff --git a/search/searcher/search_numeric_range_test.go b/search/searcher/search_numeric_range_test.go index 9a04fd62e..b19e1b24a 100644 --- a/search/searcher/search_numeric_range_test.go +++ b/search/searcher/search_numeric_range_test.go @@ -25,7 +25,7 @@ func TestSplitRange(t *testing.T) { min := numeric.Float64ToInt64(1.0) max := numeric.Float64ToInt64(5.0) ranges := splitInt64Range(min, max, 4) - enumerated := ranges.Enumerate() + enumerated := ranges.Enumerate(nil) if len(enumerated) != 135 { t.Errorf("expected 135 terms, got %d", len(enumerated)) } From 75cdaca51a2ad20227868a152dfb599981b93623 Mon Sep 17 00:00:00 2001 From: Tyler Kovacs Date: Mon, 10 Jun 2019 11:04:07 -0700 Subject: [PATCH 612/728] add missing setter method for SetSearchBefore (#1237) --- search.go | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/search.go b/search.go index e8d66f80a..b337edc9e 100644 --- a/search.go +++ b/search.go @@ -342,6 +342,12 @@ func (r *SearchRequest) SetSearchAfter(after []string) { r.SearchAfter = after } +// SetSearchBefore sets the request to skip over hits with a sort +// value greater than the provided sort before key +func (r *SearchRequest) SetSearchBefore(before []string) { + r.SearchBefore = before +} + // UnmarshalJSON deserializes a JSON representation of // a SearchRequest func (r *SearchRequest) UnmarshalJSON(input []byte) error { From aaf4e4546fe6c312d9e26438c4e40f0cdd4cac09 Mon Sep 17 00:00:00 2001 From: 
Marty Schoch Date: Mon, 10 Jun 2019 16:16:44 -0400 Subject: [PATCH 613/728] bump to latest vellum --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index f4be538bb..2bba989a2 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "5f4edc22838b1433cdb9793bcac21c9376bb4753", + "revision": "ec7b775d247f63c2ba2aee9dc9c4872a1b30058b", "branch": "master", "notests": true } From bfa42c077826f889a57797f7927c704ffbdfa9ad Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 11 Jun 2019 07:21:36 -0400 Subject: [PATCH 614/728] fix scorch regex literal prefix for case insensitive (#1245) the previous version incorrectly returned a literal prefix when case-insensitive matching was enabled --- index/scorch/segment/regexp.go | 2 +- index/scorch/segment/regexp_test.go | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/segment/regexp.go b/index/scorch/segment/regexp.go index 3aa151d64..3a31f4149 100644 --- a/index/scorch/segment/regexp.go +++ b/index/scorch/segment/regexp.go @@ -55,7 +55,7 @@ func LiteralPrefix(s *syntax.Regexp) string { s = s.Sub[0] } - if s.Op == syntax.OpLiteral { + if s.Op == syntax.OpLiteral && (s.Flags&syntax.FoldCase == 0) { return string(s.Rune) } diff --git a/index/scorch/segment/regexp_test.go b/index/scorch/segment/regexp_test.go index b4731d6b8..887476b28 100644 --- a/index/scorch/segment/regexp_test.go +++ b/index/scorch/segment/regexp_test.go @@ -40,6 +40,7 @@ func TestLiteralPrefix(t *testing.T) { {`^hello`, ""}, {`^`, ""}, {`$`, ""}, + {`(?i)mArTy`, ""}, } for i, test := range tests { From 0fc6e671a53546934073d041903bfbe52a61b4de Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Wed, 12 Jun 2019 13:49:39 -0700 Subject: [PATCH 615/728] Disable errcheck on only go1.10 See.. 
https://github.com/blevesearch/bleve/issues/1183 --- .travis.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.travis.yml b/.travis.yml index 22612c966..e00e7b994 100644 --- a/.travis.yml +++ b/.travis.yml @@ -16,10 +16,10 @@ script: - go test -race -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) - go test ./test -v -indexType scorch - - if [[ ${TRAVIS_GO_VERSION} =~ ^1\.11 ]]; then - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/); - else + - if [[ ${TRAVIS_GO_VERSION} =~ ^1\.10 ]]; then echo "errcheck skipped for go version" $TRAVIS_GO_VERSION; + else + errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/); fi - docs/project-code-coverage.sh - docs/build_children.sh From 152c60bf035def2ba8327c17fc17bbb97476fc08 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 18 Jun 2019 17:40:44 +0530 Subject: [PATCH 616/728] Adding Contains method to TermDictionary Dictionary's Contains api implmentation to use the FST's Contains method directly, and there is no need for a ContainsIterator for TD. 
--- index/index.go | 8 +++--- index/scorch/segment/empty.go | 6 ++--- index/scorch/segment/segment.go | 4 +-- index/scorch/segment/zap/dict.go | 33 +++--------------------- index/scorch/snapshot_index.go | 15 ++++++----- index/scorch/snapshot_index_dict.go | 5 ++-- search/searcher/search_geoboundingbox.go | 8 +++--- search/searcher/search_numeric_range.go | 8 +++--- 8 files changed, 33 insertions(+), 54 deletions(-) diff --git a/index/index.go b/index/index.go index a470f5669..3e866f3aa 100644 --- a/index/index.go +++ b/index/index.go @@ -121,8 +121,8 @@ type IndexReaderOnly interface { FieldDictOnly(field string, onlyTerms [][]byte, includeCount bool) (FieldDict, error) } -type IndexReaderExists interface { - FieldDictExists(field string) (FieldDictExists, error) +type IndexReaderContains interface { + FieldDictContains(field string) (FieldDictContains, error) } // FieldTerms contains the terms used by a document, keyed by field @@ -234,8 +234,8 @@ type FieldDict interface { Close() error } -type FieldDictExists interface { - Exists(key []byte) (bool, error) +type FieldDictContains interface { + Contains(key []byte) (bool, error) } // DocIDReader is the interface exposing enumeration of documents identifiers. 
diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index 020790ca1..fdc407a74 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -91,8 +91,8 @@ func (e *EmptyDictionary) OnlyIterator(onlyTerms [][]byte, return &EmptyDictionaryIterator{} } -func (e *EmptyDictionary) ExistsIterator() DictionaryIterator { - return &EmptyDictionaryIterator{} +func (e *EmptyDictionary) Contains(key []byte) (bool, error) { + return false, nil } type EmptyDictionaryIterator struct{} @@ -101,7 +101,7 @@ func (e *EmptyDictionaryIterator) Next() (*index.DictEntry, error) { return nil, nil } -func (e *EmptyDictionaryIterator) Exists(key []byte) (bool, error) { +func (e *EmptyDictionaryIterator) Contains(key []byte) (bool, error) { return false, nil } diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 9261c2282..34c2bc204 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -59,12 +59,12 @@ type TermDictionary interface { AutomatonIterator(a vellum.Automaton, startKeyInclusive, endKeyExclusive []byte) DictionaryIterator OnlyIterator(onlyTerms [][]byte, includeCount bool) DictionaryIterator - ExistsIterator() DictionaryIterator + + Contains(key []byte) (bool, error) } type DictionaryIterator interface { Next() (*index.DictEntry, error) - Exists(key []byte) (bool, error) } type PostingsList interface { diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go index cea0fe307..ad4a8f8dc 100644 --- a/index/scorch/segment/zap/dict.go +++ b/index/scorch/segment/zap/dict.go @@ -95,6 +95,10 @@ func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) return rv } +func (d *Dictionary) Contains(key []byte) (bool, error) { + return d.fst.Contains(key) +} + // Iterator returns an iterator for this dictionary func (d *Dictionary) Iterator() segment.DictionaryIterator { rv := &DictionaryIterator{ @@ -228,25 +232,6 @@ func (d *Dictionary) 
OnlyIterator(onlyTerms [][]byte, return rv } -// ExistsIterator returns an exists iterator for this dictionary -func (d *Dictionary) ExistsIterator() segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - omitCount: true, - } - - if d.fst != nil { - itr, err := d.fst.Iterator(nil, nil) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - } - - return rv -} - // DictionaryIterator is an iterator for term dictionary type DictionaryIterator struct { d *Dictionary @@ -276,13 +261,3 @@ func (i *DictionaryIterator) Next() (*index.DictEntry, error) { i.err = i.itr.Next() return &i.entry, nil } - -func (i *DictionaryIterator) Exists(key []byte) (bool, error) { - if i.err != nil && i.err != vellum.ErrIteratorDone { - return false, i.err - } - if i.itr == nil || i.err == vellum.ErrIteratorDone { - return false, nil - } - return i.itr.Exists(key) -} diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 434d94ea4..dada6b76c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -35,6 +35,7 @@ import ( var lb1, lb2 *lev2.LevenshteinAutomatonBuilder type asynchSegmentResult struct { + dict segment.TermDictionary dictItr segment.DictionaryIterator index int @@ -137,7 +138,11 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, if err != nil { results <- &asynchSegmentResult{err: err} } else { - results <- &asynchSegmentResult{dictItr: makeItr(dict)} + if randomLookup { + results <- &asynchSegmentResult{dict: dict} + } else { + results <- &asynchSegmentResult{dictItr: makeItr(dict)} + } } }(index, segment) } @@ -165,7 +170,7 @@ func (i *IndexSnapshot) newIndexSnapshotFieldDict(field string, } } else { rv.cursors = append(rv.cursors, &segmentDictCursor{ - itr: asr.dictItr, + dict: asr.dict, }) } } @@ -253,10 +258,8 @@ func (i *IndexSnapshot) FieldDictOnly(field string, }, false) } -func (i *IndexSnapshot) FieldDictExists(field string) 
(index.FieldDictExists, error) { - return i.newIndexSnapshotFieldDict(field, func(i segment.TermDictionary) segment.DictionaryIterator { - return i.ExistsIterator() - }, true) +func (i *IndexSnapshot) FieldDictContains(field string) (index.FieldDictContains, error) { + return i.newIndexSnapshotFieldDict(field, nil, true) } func (i *IndexSnapshot) DocIDReaderAll() (index.DocIDReader, error) { diff --git a/index/scorch/snapshot_index_dict.go b/index/scorch/snapshot_index_dict.go index 028c401e4..47486c255 100644 --- a/index/scorch/snapshot_index_dict.go +++ b/index/scorch/snapshot_index_dict.go @@ -22,6 +22,7 @@ import ( ) type segmentDictCursor struct { + dict segment.TermDictionary itr segment.DictionaryIterator curr index.DictEntry } @@ -92,13 +93,13 @@ func (i *IndexSnapshotFieldDict) Close() error { return nil } -func (i *IndexSnapshotFieldDict) Exists(key []byte) (bool, error) { +func (i *IndexSnapshotFieldDict) Contains(key []byte) (bool, error) { if len(i.cursors) == 0 { return false, nil } for _, cursor := range i.cursors { - if found, _ := cursor.itr.Exists(key); found { + if found, _ := cursor.dict.Contains(key); found { return true, nil } } diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index c81cdd9d8..38cb6467f 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -120,16 +120,16 @@ func ComputeGeoRange(term uint64, shift uint, return rv } - var fieldDict index.FieldDictExists + var fieldDict index.FieldDictContains var isIndexed filterFunc - if irr, ok := indexReader.(index.IndexReaderExists); ok { - fieldDict, err = irr.FieldDictExists(field) + if irr, ok := indexReader.(index.IndexReaderContains); ok { + fieldDict, err = irr.FieldDictContains(field) if err != nil { return nil, nil, err } isIndexed = func(term []byte) bool { - found, err := fieldDict.Exists(term) + found, err := fieldDict.Contains(term) return err == nil && found } } diff --git 
a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index 0bc1f6e4a..83107f020 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -54,17 +54,17 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, maxInt64-- } - var fieldDict index.FieldDictExists + var fieldDict index.FieldDictContains var isIndexed filterFunc var err error - if irr, ok := indexReader.(index.IndexReaderExists); ok { - fieldDict, err = irr.FieldDictExists(field) + if irr, ok := indexReader.(index.IndexReaderContains); ok { + fieldDict, err = irr.FieldDictContains(field) if err != nil { return nil, err } isIndexed = func(term []byte) bool { - found, err := fieldDict.Exists(term) + found, err := fieldDict.Contains(term) return err == nil && found } } From 0f78b62e3372d9e8b49f18d5787c6498f1de1ecf Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 26 Jun 2019 21:03:19 +0530 Subject: [PATCH 617/728] bump to latest vellum SHA --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 2bba989a2..20788833f 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "ec7b775d247f63c2ba2aee9dc9c4872a1b30058b", + "revision": "41f2deade2cfab59facd263e918d7c05f656c2e9", "branch": "master", "notests": true } From a74500143a531ba1b2a7518fea0611863848a08a Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 1 Jul 2019 15:15:27 -0700 Subject: [PATCH 618/728] Preserve persistedCallbacks of an unpersisted snapshot until needed The persistedCallbacks of a snapshot that failed persistence need to be preserved until one of the two events has occurred: - a new snapshot was introduced - the retry-attempt of the exact same snapshot succeeded --- index/scorch/persister.go | 21 +++++++++++++++++++++ 1 file changed, 
21 insertions(+) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index e15fa2ab6..7c3e179fc 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -90,6 +90,10 @@ func (s *Scorch) persisterLoop() { var persistWatchers []*epochWatcher var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher + + var lastUnpersistedEpoch uint64 + var unpersistedCallbacks []index.BatchCallback + po, err := s.parsePersisterOptions() if err != nil { s.fireAsyncError(fmt.Errorf("persisterOptions json parsing err: %v", err)) @@ -149,11 +153,28 @@ OUTER: _ = ourSnapshot.DecRef() break OUTER } + + // save this current snapshot's epoch and persistedCallbacks, for + // a possible retry attempt in persisting the same snapshot again. + lastUnpersistedEpoch = ourSnapshot.epoch + unpersistedCallbacks = ourPersistedCallbacks + s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() atomic.AddUint64(&s.stats.TotPersistLoopErr, 1) continue OUTER } + + if ourPersistedCallbacks == nil && + lastUnpersistedEpoch == ourSnapshot.epoch { + // in the event of this being a retry attempt for persisting a snapshot + // that had earlier failed, retrieve the persistedCallbacks associated + // with that snapshot. + ourPersistedCallbacks = unpersistedCallbacks + lastUnpersistedEpoch = 0 + unpersistedCallbacks = nil + } + for i := range ourPersistedCallbacks { ourPersistedCallbacks[i](err) } From 732150df0a0890fbcb96ca9f8f4162502f31c82c Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Tue, 2 Jul 2019 14:24:34 -0700 Subject: [PATCH 619/728] Prepend earlier segment's persisted callbacks to latest This is so that no persistedCallbacks of older segments are dropped when persisting a snapshot of segments finally does succeed. 
--- index/scorch/persister.go | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 7c3e179fc..5af6ad532 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -91,7 +91,6 @@ func (s *Scorch) persisterLoop() { var lastPersistedEpoch, lastMergedEpoch uint64 var ew *epochWatcher - var lastUnpersistedEpoch uint64 var unpersistedCallbacks []index.BatchCallback po, err := s.parsePersisterOptions() @@ -154,10 +153,9 @@ OUTER: break OUTER } - // save this current snapshot's epoch and persistedCallbacks, for - // a possible retry attempt in persisting the same snapshot again. - lastUnpersistedEpoch = ourSnapshot.epoch - unpersistedCallbacks = ourPersistedCallbacks + // save this current snapshot's persistedCallbacks, to invoke during + // the retry attempt + unpersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...) s.fireAsyncError(fmt.Errorf("got err persisting snapshot: %v", err)) _ = ourSnapshot.DecRef() @@ -165,13 +163,11 @@ OUTER: continue OUTER } - if ourPersistedCallbacks == nil && - lastUnpersistedEpoch == ourSnapshot.epoch { + if unpersistedCallbacks != nil { // in the event of this being a retry attempt for persisting a snapshot - // that had earlier failed, retrieve the persistedCallbacks associated - // with that snapshot. - ourPersistedCallbacks = unpersistedCallbacks - lastUnpersistedEpoch = 0 + // that had earlier failed, prepend the persistedCallbacks associated + // with earlier segment(s) to the latest persistedCallbacks + ourPersistedCallbacks = append(unpersistedCallbacks, ourPersistedCallbacks...) 
unpersistedCallbacks = nil } From 41fd936002da36ce8d9589dac39cea3526784507 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 1 Jul 2019 17:20:39 -0700 Subject: [PATCH 620/728] Include missing token filters + camelCase + unique --- config/config.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/config/config.go b/config/config.go index 5550d130b..5ed2d9c3f 100644 --- a/config/config.go +++ b/config/config.go @@ -45,6 +45,7 @@ import ( // token filters _ "github.com/blevesearch/bleve/analysis/token/apostrophe" + _ "github.com/blevesearch/bleve/analysis/token/camelcase" _ "github.com/blevesearch/bleve/analysis/token/compound" _ "github.com/blevesearch/bleve/analysis/token/edgengram" _ "github.com/blevesearch/bleve/analysis/token/elision" @@ -57,6 +58,7 @@ import ( _ "github.com/blevesearch/bleve/analysis/token/stop" _ "github.com/blevesearch/bleve/analysis/token/truncate" _ "github.com/blevesearch/bleve/analysis/token/unicodenorm" + _ "github.com/blevesearch/bleve/analysis/token/unique" // tokenizers _ "github.com/blevesearch/bleve/analysis/tokenizer/exception" From e5efb60815dfa171f6daacc120fc1c10494ff447 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 31 Jul 2019 16:04:30 +0530 Subject: [PATCH 621/728] Tightening the scorch stale file clean ups Fix for #1269. It's observed in an under-resourced situation that the merger was treading ahead of the persister and the scorch files were piling up. It was reaching up to 70K with an update-heavy workload. The intention of this fix is to unflip any ineligible files which are genuinely eligible for clean up. Also, adding stats to track this for future. 
--- index/scorch/merge.go | 16 ++++++++++++++++ index/scorch/persister.go | 9 ++++----- index/scorch/scorch.go | 3 +++ index/scorch/stats.go | 3 +++ 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 83f98aab0..d7144772f 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -18,6 +18,7 @@ import ( "encoding/json" "fmt" "os" + "strings" "sync/atomic" "time" @@ -157,6 +158,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // process tasks in serial for now var notifications []chan *IndexSnapshot + var filenames []string for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) @@ -181,6 +183,12 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, segmentsToMerge = append(segmentsToMerge, zapSeg) docsToDrop = append(docsToDrop, segSnapshot.deleted) } + // track the files getting merged for unsetting the + // removal ineligibility. This helps to unflip files + // even with fast merger, slow persister work flows. 
+ path := zapSeg.Path() + filenames = append(filenames, + strings.TrimPrefix(path, s.path+string(os.PathSeparator))) } } } @@ -222,6 +230,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } err = zap.ValidateMerge(segmentsToMerge, nil, docsToDrop, seg.(*zap.Segment)) if err != nil { + s.unmarkIneligibleForRemoval(filename) return fmt.Errorf("merge validation failed: %v", err) } oldNewDocNums = make(map[uint64][]uint64) @@ -266,6 +275,13 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, } } + // once all the newly merged segment introductions are done, + // its safe to unflip the removal ineligibility for the replaced + // older segments + for _, f := range filenames { + s.unmarkIneligibleForRemoval(f) + } + return nil } diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 5af6ad532..b38ebcdec 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -764,12 +764,11 @@ func (s *Scorch) removeOldData() { if err != nil { s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err)) } + atomic.AddUint64(&s.stats.TotSnapshotsRemovedFromMetaStore, uint64(removed)) - if removed > 0 { - err = s.removeOldZapFiles() - if err != nil { - s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err)) - } + err = s.removeOldZapFiles() + if err != nil { + s.fireAsyncError(fmt.Errorf("got err removing old zap files: %v", err)) } } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 522feca19..44a97d1ea 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -490,6 +490,9 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["CurOnDiskBytes"] = numBytesUsedDisk m["CurOnDiskFiles"] = numFilesOnDisk + s.rootLock.RLock() + m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval)) + s.rootLock.RUnlock() // TODO: consider one day removing these backwards compatible // names for apps using the old names m["updates"] = m["TotUpdates"] diff --git 
a/index/scorch/stats.go b/index/scorch/stats.go index 2eb832f2c..6549fddf5 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -107,6 +107,9 @@ type Stats struct { TotFileMergeIntroductionsDone uint64 TotFileMergeIntroductionsSkipped uint64 + CurFilesIneligibleForRemoval uint64 + TotSnapshotsRemovedFromMetaStore uint64 + TotMemMergeBeg uint64 TotMemMergeErr uint64 TotMemMergeDone uint64 From 7965de170e4905501df6178cbfeb2eafda3aec2e Mon Sep 17 00:00:00 2001 From: Alexander Petrov Date: Mon, 5 Aug 2019 09:05:33 +0100 Subject: [PATCH 622/728] Replace brute force for-loop that calls HasNext/Next() with roaring.IntPeekable --- index/scorch/segment/zap/posting.go | 25 +++++++++++-------------- vendor/manifest | 2 +- 2 files changed, 12 insertions(+), 15 deletions(-) diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go index 417e89b4d..4c43fdb9b 100644 --- a/index/scorch/segment/zap/posting.go +++ b/index/scorch/segment/zap/posting.go @@ -324,8 +324,8 @@ func (rv *PostingsList) init1Hit(fstVal uint64) error { // PostingsIterator provides a way to iterate through the postings list type PostingsIterator struct { postings *PostingsList - all roaring.IntIterable - Actual roaring.IntIterable + all roaring.IntPeekable + Actual roaring.IntPeekable ActualBM *roaring.Bitmap currChunk uint32 @@ -662,14 +662,14 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, return i.nextDocNumAtOrAfterClean(atOrAfter) } - n := i.Actual.Next() - for uint64(n) < atOrAfter && i.Actual.HasNext() { - n = i.Actual.Next() - } - if uint64(n) < atOrAfter { + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.Actual.HasNext() { // couldn't find anything return 0, false, nil } + + n := i.Actual.Next() allN := i.all.Next() nChunk := n / i.postings.sb.chunkFactor @@ -706,23 +706,20 @@ func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, // no deletions) where the all bitmap is the same as the actual 
bitmap func (i *PostingsIterator) nextDocNumAtOrAfterClean( atOrAfter uint64) (uint64, bool, error) { - n := i.Actual.Next() if !i.includeFreqNorm { - for uint64(n) < atOrAfter && i.Actual.HasNext() { - n = i.Actual.Next() - } + i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) - if uint64(n) < atOrAfter { + if !i.Actual.HasNext() { return 0, false, nil // couldn't find anything } - return uint64(n), true, nil + return uint64(i.Actual.Next()), true, nil } // freq-norm's needed, so maintain freq-norm chunk reader sameChunkNexts := 0 // # of times we called Next() in the same chunk - + n := i.Actual.Next() nChunk := n / i.postings.sb.chunkFactor for uint64(n) < atOrAfter && i.Actual.HasNext() { diff --git a/vendor/manifest b/vendor/manifest index 20788833f..c600bde8a 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -78,7 +78,7 @@ "importpath": "github.com/RoaringBitmap/roaring", "repository": "https://github.com/RoaringBitmap/roaring", "vcs": "", - "revision": "01d244c43a7e8d1191a4f369f5908ea9eb9bc9ac", + "revision": "d0ce1763c3526f65703c395da50da7a7fb2138d5", "branch": "master", "notests": true }, From ee6b126383160d11cf538a751a3a62e066a40daa Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 31 Jul 2019 15:46:09 +0530 Subject: [PATCH 623/728] MB-35333 - Potential removeOldData call starve issue If the persister is busy doing the catch up with the ongoing mutations then there is a chance for the older snapshot clean up call/removeOldData to wait for a longer time to get invoked. This can result in unbounded growth in the number of scorch files. This change attempt to invoke removeOldData whenever the number of files grow beyond the thresholds. 
--- index/scorch/persister.go | 26 ++++++++++++++++++-------- 1 file changed, 18 insertions(+), 8 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 5af6ad532..c05e1331a 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -245,20 +245,19 @@ func notifyMergeWatchers(lastPersistedEpoch uint64, return watchersNext } -func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastMergedEpoch uint64, - persistWatchers []*epochWatcher, po *persisterOptions) (uint64, []*epochWatcher) { +func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, + lastMergedEpoch uint64, persistWatchers []*epochWatcher, + po *persisterOptions) (uint64, []*epochWatcher) { - // first, let the watchers proceed if they lag behind + // First, let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - // check the merger lag by counting the segment files on disk, + // Check the merger lag by counting the segment files on disk, + numFilesOnDisk, _ := s.diskFileStats() + // On finding fewer files on disk, persister takes a short pause // for sufficient in-memory segments to pile up for the next // memory merge cum persist loop. - // On finding too many files on disk, persister pause until the merger - // catches up to reduce the segment file count under the threshold. - // But if there is memory pressure, then skip this sleep maneuvers. - numFilesOnDisk, _ := s.diskFileStats() if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && po.PersisterNapTimeMSec > 0 && s.paused() == 0 { select { @@ -276,6 +275,17 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, lastM return lastMergedEpoch, persistWatchers } + // Finding too many files on disk could be due to two reasons. + // 1. Too many older snapshots awaiting the clean up. + // 2. The merger could be lagging behind on merging the disk files. 
+ if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { + s.removeOldData() + numFilesOnDisk, _ = s.diskFileStats() + } + + // Persister pause until the merger catches up to reduce the segment + // file count under the threshold. + // But if there is memory pressure, then skip this sleep maneuvers. OUTER: for po.PersisterNapUnderNumFiles > 0 && numFilesOnDisk >= uint64(po.PersisterNapUnderNumFiles) && From c997533a776fe2f236621c653512f0678e88db1f Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 12 Aug 2019 13:43:55 -0400 Subject: [PATCH 624/728] add method to access analyzer of a text field (#1276) this change allows applications to see the analyzer set for a text field, useful if applications want to modify the document after mapping, but prior to indexing. --- document/field_text.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/document/field_text.go b/document/field_text.go index c8e871c9d..6bd74c712 100644 --- a/document/field_text.go +++ b/document/field_text.go @@ -86,6 +86,10 @@ func (t *TextField) Analyze() (int, analysis.TokenFrequencies) { return fieldLength, tokenFreqs } +func (t *TextField) Analyzer() *analysis.Analyzer { + return t.analyzer +} + func (t *TextField) Value() []byte { return t.value } From f1c9cb5659531140daeb291a3d9406b42821880d Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 23 Aug 2019 15:46:01 +0530 Subject: [PATCH 625/728] bumping the vellum SHA adopting the levenshtein package changes in vellum --- index/scorch/segment/zap/dict_test.go | 20 ++++++++++++++++---- index/scorch/snapshot_index.go | 8 ++++---- vendor/manifest | 2 +- 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go index 378abd854..c632917bc 100644 --- a/index/scorch/segment/zap/dict_test.go +++ b/index/scorch/segment/zap/dict_test.go @@ -181,6 +181,14 @@ func TestDictionary(t *testing.T) { } func TestDictionaryError(t *testing.T) { + hash := 
make(map[uint8]levenshtein.LevenshteinAutomatonBuilder, 4) + for i := 1; i <= 3; i++ { + lb, err := levenshtein.NewLevenshteinAutomatonBuilder(uint8(i), false) + if err != nil { + t.Errorf("NewLevenshteinAutomatonBuilder(%d, false) failed, err: %v", i, err) + } + hash[uint8(i)] = *lb + } _ = os.RemoveAll("/tmp/scorch.zap") @@ -206,7 +214,8 @@ func TestDictionaryError(t *testing.T) { t.Fatal(err) } - a, err := levenshtein.New("summer", 2) + lb := hash[uint8(2)] + a, err := lb.BuildDfa("summer", 2) if err != nil { t.Fatal(err) } @@ -222,7 +231,8 @@ func TestDictionaryError(t *testing.T) { t.Fatalf("expected nil error from iterator, got: %v", err) } - a, err = levenshtein.New("cat", 1) // cat & bat + lb = hash[uint8(1)] + a, err = lb.BuildDfa("cat", 1) // cat & bat if err != nil { t.Fatal(err) } @@ -241,7 +251,8 @@ func TestDictionaryError(t *testing.T) { t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) } - a, err = levenshtein.New("cat", 2) // cat & bat + lb = hash[uint8(2)] + a, err = lb.BuildDfa("cat", 2) // cat & bat if err != nil { t.Fatal(err) } @@ -260,7 +271,8 @@ func TestDictionaryError(t *testing.T) { t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) } - a, err = levenshtein.New("cat", 3) + lb = hash[uint8(3)] + a, err = lb.BuildDfa("cat", 3) if err != nil { t.Fatal(err) } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index dada6b76c..47cc809b2 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -28,11 +28,11 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/couchbase/vellum" - lev2 "github.com/couchbase/vellum/levenshtein2" + lev "github.com/couchbase/vellum/levenshtein" ) // re usable, threadsafe levenshtein builders -var lb1, lb2 *lev2.LevenshteinAutomatonBuilder +var lb1, lb2 *lev.LevenshteinAutomatonBuilder type asynchSegmentResult struct { dict segment.TermDictionary @@ -52,11 +52,11 @@ func init() 
{ var is interface{} = IndexSnapshot{} reflectStaticSizeIndexSnapshot = int(reflect.TypeOf(is).Size()) var err error - lb1, err = lev2.NewLevenshteinAutomatonBuilder(1, true) + lb1, err = lev.NewLevenshteinAutomatonBuilder(1, true) if err != nil { panic(fmt.Errorf("Levenshtein automaton ed1 builder err: %v", err)) } - lb2, err = lev2.NewLevenshteinAutomatonBuilder(2, true) + lb2, err = lev.NewLevenshteinAutomatonBuilder(2, true) if err != nil { panic(fmt.Errorf("Levenshtein automaton ed2 builder err: %v", err)) } diff --git a/vendor/manifest b/vendor/manifest index c600bde8a..e6f985136 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -137,7 +137,7 @@ "importpath": "github.com/couchbase/vellum", "repository": "https://github.com/couchbase/vellum", "vcs": "git", - "revision": "41f2deade2cfab59facd263e918d7c05f656c2e9", + "revision": "dc222902e86f298bfae0b3dec6ba8b9d874ad5f8", "branch": "master", "notests": true } From ad3191238ac2f57961ba55130eef292bcb3570a9 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 16 Sep 2019 14:34:18 +0530 Subject: [PATCH 626/728] =?UTF-8?q?Support=20for=20bounded=20polygon=20que?= =?UTF-8?q?ry=E2=80=A6?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Attempt to introduce GeoBoundingPolygonQuery to enable the bounded polygon queries. Polygon searcher internally finds the bounded rectangle for the given polygon points and apply the filter which performs the point-in-polygon checks. In this iteration, as simplicity is preferred over precision(on boundary points), the point-in-polygon check is based on ray-casting algorithm. 
--- geo/geo.go | 38 +++ search/query/geo_boundingpolygon.go | 94 +++++++ search/query/query.go | 9 + search/searcher/search_geopointdistance.go | 11 +- search/searcher/search_geopolygon.go | 110 ++++++++ search/searcher/search_geopolygon_test.go | 291 +++++++++++++++++++++ 6 files changed, 548 insertions(+), 5 deletions(-) create mode 100644 search/query/geo_boundingpolygon.go create mode 100644 search/searcher/search_geopolygon.go create mode 100644 search/searcher/search_geopolygon_test.go diff --git a/geo/geo.go b/geo/geo.go index 86861b4f3..583451e30 100644 --- a/geo/geo.go +++ b/geo/geo.go @@ -37,6 +37,12 @@ var geoTolerance = 1E-6 var lonScale = float64((uint64(0x1)< lat) != (polygon[i-1].Lat > lat) && + lon < (polygon[i-1].Lon-polygon[i].Lon)*(lat-polygon[i].Lat)/ + (polygon[i-1].Lat-polygon[i].Lat)+polygon[i].Lon { + inside = !inside + } + } + return inside + + } + return false + } +} diff --git a/search/searcher/search_geopolygon_test.go b/search/searcher/search_geopolygon_test.go new file mode 100644 index 000000000..015d82611 --- /dev/null +++ b/search/searcher/search_geopolygon_test.go @@ -0,0 +1,291 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package searcher + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/geo" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/store/gtreap" + "github.com/blevesearch/bleve/index/upsidedown" + "github.com/blevesearch/bleve/search" +) + +func TestSimpleGeoPolygons(t *testing.T) { + + tests := []struct { + polygon []geo.Point + field string + want []string + }{ + // test points inside a triangle & on vertices + // r, s - inside and t,u - on vertices. + {[]geo.Point{{Lon: 1.0, Lat: 1.0}, {Lon: 2.0, Lat: 1.9}, {Lon: 2.0, Lat: 1.0}}, "loc", []string{"r", "s", "t", "u"}}, + // non overlapping polygon for the indexed documents + {[]geo.Point{{Lon: 3.0, Lat: 1.0}, {Lon: 4.0, Lat: 2.5}, {Lon: 3.0, Lat: 2}}, "loc", nil}, + } + i := setupGeoPolygonPoints(t) + indexReader, err := i.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + for _, test := range tests { + got, err := testGeoPolygonSearch(indexReader, test.polygon, test.field) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("expected %v, got %v for polygon: %+v", test.want, got, test.polygon) + } + } +} + +func TestRealGeoPolygons(t *testing.T) { + + tests := []struct { + polygon []geo.Point + field string + want []string + }{ + {[]geo.Point{{Lon: -80.881, Lat: 35.282}, {Lon: -80.858, Lat: 35.281}, + {Lon: -80.864, Lat: 35.270}}, "loc", []string{"k", "l"}}, + {[]geo.Point{{Lon: -82.467, Lat: 36.356}, {Lon: -78.127, Lat: 36.321}, {Lon: -80.555, Lat: 32.932}, + {Lon: -84.807, Lat: 33.111}}, "loc", []string{"k", "l", "m"}}, + // same polygon vertices + {[]geo.Point{{Lon: -82.467, Lat: 36.356}, {Lon: -82.467, Lat: 36.356}, {Lon: -82.467, Lat: 36.356}, {Lon: -82.467, Lat: 36.356}}, "loc", nil}, + // non-overlaping polygon + {[]geo.Point{{Lon: -89.113, Lat: 36.400}, {Lon: -93.947, Lat: 36.471}, {Lon: -93.947, Lat: 
34.031}}, "loc", nil}, + // concave polygon with a document `n` residing inside the hands, but outside the polygon + {[]geo.Point{{Lon: -71.65, Lat: 42.446}, {Lon: -71.649, Lat: 42.428}, {Lon: -71.640, Lat: 42.445}, {Lon: -71.649, Lat: 42.435}}, "loc", nil}, + // V like concave polygon with a document 'p' residing inside the bottom corner + {[]geo.Point{{Lon: -80.304, Lat: 40.740}, {Lon: -80.038, Lat: 40.239}, {Lon: -79.562, Lat: 40.786}, {Lon: -80.018, Lat: 40.328}}, "loc", []string{"p"}}, + {[]geo.Point{{Lon: -111.918, Lat: 33.515}, {Lon: -111.938, Lat: 33.494}, {Lon: -111.944, Lat: 33.481}, {Lon: -111.886, Lat: 33.517}, + {Lon: -111.919, Lat: 33.468}, {Lon: -111.929, Lat: 33.508}}, "loc", []string{"q"}}, + } + + i := setupGeoPolygonPoints(t) + indexReader, err := i.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + for _, test := range tests { + got, err := testGeoPolygonSearch(indexReader, test.polygon, test.field) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("expected %v, got %v for polygon: %+v", test.want, got, test.polygon) + } + } +} + +func TestGeoRectanglePolygon(t *testing.T) { + + tests := []struct { + polygon []geo.Point + field string + want []string + }{ + {[]geo.Point{{Lon: 0.001, Lat: 0.001}, {Lon: 85.002, Lat: 38.002}}, "loc", + []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}}, + } + + i := setupGeo(t) + indexReader, err := i.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + for _, test := range tests { + got, err := testGeoPolygonSearch(indexReader, test.polygon, test.field) + if err != nil { + t.Fatal(err) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("expected %v, got %v for polygon: %+v", test.want, got, test.polygon) + } + } +} + +func testGeoPolygonSearch(i index.IndexReader, polygon []geo.Point, field 
string) ([]string, error) { + var rv []string + gbs, err := NewGeoBoundedPolygonSearcher(i, polygon, field, 1.0, search.SearcherOptions{}) + if err != nil { + return nil, err + } + ctx := &search.SearchContext{ + DocumentMatchPool: search.NewDocumentMatchPool(gbs.DocumentMatchPoolSize(), 0), + } + docMatch, err := gbs.Next(ctx) + for docMatch != nil && err == nil { + rv = append(rv, string(docMatch.IndexInternalID)) + docMatch, err = gbs.Next(ctx) + } + if err != nil { + return nil, err + } + return rv, nil +} + +func setupGeoPolygonPoints(t *testing.T) index.Index { + analysisQueue := index.NewAnalysisQueue(1) + i, err := upsidedown.NewUpsideDownCouch( + gtreap.Name, + map[string]interface{}{ + "path": "", + }, + analysisQueue) + if err != nil { + t.Fatal(err) + } + err = i.Open() + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "k", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -80.86469327, 35.2782), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "l", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -80.8713, 35.28138), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "m", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -84.25, 33.153), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "n", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -89.992, 35.063), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "o", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -71.648, 42.437), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "p", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -80.016, 40.314), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = 
i.Update(&document.Document{ + ID: "q", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, -111.919, 33.494), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "r", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, 1.5, 1.1), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "s", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, 2, 1.5), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "t", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, 2.0, 1.9), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "u", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, 2.0, 1.0), + }, + }) + if err != nil { + t.Fatal(err) + } + return i +} From 39c1fb6b4bcdc34ec4910d138ac3c120fb8752e2 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 17 Sep 2019 19:15:11 +0530 Subject: [PATCH 627/728] updating integration tests for geo polygon queries --- search/query/geo_boundingpolygon.go | 4 +- search/query/query.go | 2 +- search/searcher/search_geopolygon_test.go | 25 +++++ test/tests/geo/data/amoeba_brewery.json | 1 + test/tests/geo/data/communiti_brewery.json | 1 + test/tests/geo/data/social_brewery.json | 1 + test/tests/geo/searches.json | 102 +++++++++++++++++++++ 7 files changed, 133 insertions(+), 3 deletions(-) create mode 100644 test/tests/geo/data/amoeba_brewery.json create mode 100644 test/tests/geo/data/communiti_brewery.json create mode 100644 test/tests/geo/data/social_brewery.json diff --git a/search/query/geo_boundingpolygon.go b/search/query/geo_boundingpolygon.go index c227f8cda..41c7f7f3a 100644 --- a/search/query/geo_boundingpolygon.go +++ b/search/query/geo_boundingpolygon.go @@ -26,7 +26,7 @@ import ( ) type GeoBoundingPolygonQuery struct { - Points []geo.Point 
`json:"points"` + Points []geo.Point `json:"polygon_points"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` } @@ -69,7 +69,7 @@ func (q *GeoBoundingPolygonQuery) Validate() error { func (q *GeoBoundingPolygonQuery) UnmarshalJSON(data []byte) error { tmp := struct { - Points []interface{} `json:"points"` + Points []interface{} `json:"polygon_points"` FieldVal string `json:"field,omitempty"` BoostVal *Boost `json:"boost,omitempty"` }{} diff --git a/search/query/query.go b/search/query/query.go index 0c926508e..18aca228d 100644 --- a/search/query/query.go +++ b/search/query/query.go @@ -273,7 +273,7 @@ func ParseQuery(input []byte) (Query, error) { } return &rv, nil } - _, hasPoints := tmp["points"] + _, hasPoints := tmp["polygon_points"] if hasPoints { var rv GeoBoundingPolygonQuery err := json.Unmarshal(input, &rv) diff --git a/search/searcher/search_geopolygon_test.go b/search/searcher/search_geopolygon_test.go index 015d82611..8a6ef8f5a 100644 --- a/search/searcher/search_geopolygon_test.go +++ b/search/searcher/search_geopolygon_test.go @@ -83,6 +83,10 @@ func TestRealGeoPolygons(t *testing.T) { {[]geo.Point{{Lon: -80.304, Lat: 40.740}, {Lon: -80.038, Lat: 40.239}, {Lon: -79.562, Lat: 40.786}, {Lon: -80.018, Lat: 40.328}}, "loc", []string{"p"}}, {[]geo.Point{{Lon: -111.918, Lat: 33.515}, {Lon: -111.938, Lat: 33.494}, {Lon: -111.944, Lat: 33.481}, {Lon: -111.886, Lat: 33.517}, {Lon: -111.919, Lat: 33.468}, {Lon: -111.929, Lat: 33.508}}, "loc", []string{"q"}}, + // real points near cb bangalore + {[]geo.Point{{Lat: 12.974872, Lon: 77.607749}, {Lat: 12.971725, Lon: 77.610110}, + {Lat: 12.972530, Lon: 77.606912}, {Lat: 12.975112, Lon: 77.603780}, + }, "loc", []string{"amoeba", "communiti"}}, } i := setupGeoPolygonPoints(t) @@ -287,5 +291,26 @@ func setupGeoPolygonPoints(t *testing.T) index.Index { if err != nil { t.Fatal(err) } + + err = i.Update(&document.Document{ + ID: "amoeba", + Fields: []document.Field{ + 
document.NewGeoPointField("loc", []uint64{}, 77.60490, 12.97467), + }, + }) + if err != nil { + t.Fatal(err) + } + + err = i.Update(&document.Document{ + ID: "communiti", + Fields: []document.Field{ + document.NewGeoPointField("loc", []uint64{}, 77.608237, 12.97237), + }, + }) + if err != nil { + t.Fatal(err) + } + return i } diff --git a/test/tests/geo/data/amoeba_brewery.json b/test/tests/geo/data/amoeba_brewery.json new file mode 100644 index 000000000..8997a2415 --- /dev/null +++ b/test/tests/geo/data/amoeba_brewery.json @@ -0,0 +1 @@ +{"name":"amoeba brewery","city":"bangalore","state":"KAR","code":"","country":"India","phone":"","website":"","type":"brewery","updated":"2019-09-17 20:00:20","description":"brewery near cb office","address":[],"geo":{"accuracy":"APPROXIMATE","lat":12.97467,"lon":77.60490}} \ No newline at end of file diff --git a/test/tests/geo/data/communiti_brewery.json b/test/tests/geo/data/communiti_brewery.json new file mode 100644 index 000000000..832ae9147 --- /dev/null +++ b/test/tests/geo/data/communiti_brewery.json @@ -0,0 +1 @@ +{"name":"communiti brewery","city":"bangalore","state":"KAR","code":"","country":"India","phone":"","website":"","type":"brewery","updated":"2019-09-17 20:00:20","description":"brewery near cb office","address":[],"geo":{"accuracy":"APPROXIMATE","lat":12.97237,"lon":77.608237}} \ No newline at end of file diff --git a/test/tests/geo/data/social_brewery.json b/test/tests/geo/data/social_brewery.json new file mode 100644 index 000000000..ae636ad81 --- /dev/null +++ b/test/tests/geo/data/social_brewery.json @@ -0,0 +1 @@ +{"name":"social brewery","city":"bangalore","state":"KAR","code":"","country":"India","phone":"","website":"","type":"brewery","updated":"2019-09-17 20:00:20","description":"brewery near cb office, but outside the polygon","address":[],"geo":{"accuracy":"APPROXIMATE","lat":12.9736946,"lon":77.6042133}} \ No newline at end of file diff --git a/test/tests/geo/searches.json 
b/test/tests/geo/searches.json index 1570cc0bd..20646b44f 100644 --- a/test/tests/geo/searches.json +++ b/test/tests/geo/searches.json @@ -220,5 +220,107 @@ } ] } + }, + { + "comment": "polygon around cb office area, using GeoJSON lat/lon as array", + "search": { + "from": 0, + "size": 10, + "query": { + "polygon_points": [[77.607749,12.974872],[77.6101101,12.971725],[77.606912,12.972530],[77.603780,12.975112]], + "field": "geo" + }, + "sort": [ + "name" + ] + }, + "result": { + "total_hits": 2, + "hits": [ + { + "id": "amoeba_brewery" + }, + { + "id": "communiti_brewery" + } + ] + } + }, + { + "comment": "polygon around cb office area, using GeoJSON lat/lon as string", + "search": { + "from": 0, + "size": 10, + "query": { + "polygon_points": ["12.974872, 77.607749","12.971725, 77.6101101","12.972530, 77.606912","12.975112, 77.603780"], + "field": "geo" + }, + "sort": [ + "name" + ] + }, + "result": { + "total_hits": 2, + "hits": [ + { + "id": "amoeba_brewery" + }, + { + "id": "communiti_brewery" + } + ] + } + }, + { + "comment": "polygon around cb office area", + "search": { + "from": 0, + "size": 10, + "query": { + "polygon_points": [{"lat":12.974872, "lon":77.607749}, {"lat":12.971725, "lon":77.6101101}, + {"lat":12.972530, "lon":77.606912}, {"lat":12.975112, "lon":77.603780}], + "field": "geo" + }, + "sort": [ + "name" + ] + }, + "result": { + "total_hits": 2, + "hits": [ + { + "id": "amoeba_brewery" + }, + { + "id": "communiti_brewery" + } + ] + } + }, + { + "comment": "polygon around cb office area as geohash", + "search": { + "from": 0, + "size": 10, + "query": { + "polygon_points": ["tdr1y40", "tdr1y13", "tdr1vcx", "tdr1vfj"], + "field": "geo" + }, + "sort": [ + "name" + ] + }, + "result": { + "total_hits": 2, + "hits": [ + { + "id": "amoeba_brewery" + }, + { + "id": "communiti_brewery" + } + ] + } } ] + From f84273e7fe5c015a56d9b52ab7b7cd654000aa75 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 19 Sep 2019 16:13:28 +0530 Subject: [PATCH 
628/728] compute any score only when its really needed --- search/scorer/scorer_term.go | 87 +++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 42 deletions(-) diff --git a/search/scorer/scorer_term.go b/search/scorer/scorer_term.go index a0e02e438..718de2ea5 100644 --- a/search/scorer/scorer_term.go +++ b/search/scorer/scorer_term.go @@ -115,58 +115,61 @@ func (s *TermQueryScorer) SetQueryNorm(qnorm float64) { } func (s *TermQueryScorer) Score(ctx *search.SearchContext, termMatch *index.TermFieldDoc) *search.DocumentMatch { - var scoreExplanation *search.Explanation - - // need to compute score - var tf float64 - if termMatch.Freq < MaxSqrtCache { - tf = SqrtCache[int(termMatch.Freq)] - } else { - tf = math.Sqrt(float64(termMatch.Freq)) - } - score := tf * termMatch.Norm * s.idf - - if s.options.Explain { - childrenExplanations := make([]*search.Explanation, 3) - childrenExplanations[0] = &search.Explanation{ - Value: tf, - Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), - } - childrenExplanations[1] = &search.Explanation{ - Value: termMatch.Norm, - Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID), - } - childrenExplanations[2] = s.idfExplanation - scoreExplanation = &search.Explanation{ - Value: score, - Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), - Children: childrenExplanations, + rv := ctx.DocumentMatchPool.Get() + // perform any score computations only when needed + if s.includeScore || s.options.Explain { + var scoreExplanation *search.Explanation + var tf float64 + if termMatch.Freq < MaxSqrtCache { + tf = SqrtCache[int(termMatch.Freq)] + } else { + tf = math.Sqrt(float64(termMatch.Freq)) } - } + score := tf * termMatch.Norm * s.idf - // if the query weight isn't 1, multiply - if s.queryWeight != 1.0 { - score = score * s.queryWeight if s.options.Explain { - childExplanations := make([]*search.Explanation, 
2) - childExplanations[0] = s.queryWeightExplanation - childExplanations[1] = scoreExplanation + childrenExplanations := make([]*search.Explanation, 3) + childrenExplanations[0] = &search.Explanation{ + Value: tf, + Message: fmt.Sprintf("tf(termFreq(%s:%s)=%d", s.queryField, s.queryTerm, termMatch.Freq), + } + childrenExplanations[1] = &search.Explanation{ + Value: termMatch.Norm, + Message: fmt.Sprintf("fieldNorm(field=%s, doc=%s)", s.queryField, termMatch.ID), + } + childrenExplanations[2] = s.idfExplanation scoreExplanation = &search.Explanation{ Value: score, - Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), - Children: childExplanations, + Message: fmt.Sprintf("fieldWeight(%s:%s in %s), product of:", s.queryField, s.queryTerm, termMatch.ID), + Children: childrenExplanations, } } + + // if the query weight isn't 1, multiply + if s.queryWeight != 1.0 { + score = score * s.queryWeight + if s.options.Explain { + childExplanations := make([]*search.Explanation, 2) + childExplanations[0] = s.queryWeightExplanation + childExplanations[1] = scoreExplanation + scoreExplanation = &search.Explanation{ + Value: score, + Message: fmt.Sprintf("weight(%s:%s^%f in %s), product of:", s.queryField, s.queryTerm, s.queryBoost, termMatch.ID), + Children: childExplanations, + } + } + } + + if s.includeScore { + rv.Score = score + } + + if s.options.Explain { + rv.Expl = scoreExplanation + } } - rv := ctx.DocumentMatchPool.Get() rv.IndexInternalID = append(rv.IndexInternalID, termMatch.ID...) 
- if s.includeScore { - rv.Score = score - } - if s.options.Explain { - rv.Expl = scoreExplanation - } if len(termMatch.Vectors) > 0 { if cap(rv.FieldTermLocations) < len(termMatch.Vectors) { From 7f04e653c572c81ffd94b743a3f33b989647d39b Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Thu, 10 Oct 2019 16:23:15 -0700 Subject: [PATCH 629/728] Update .travis to test on the last 3 releases of golang 1.13.x, 1.12.x, 1.11.x --- .travis.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.travis.yml b/.travis.yml index e00e7b994..8f3340510 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,9 +3,9 @@ sudo: false language: go go: - - "1.10.x" - "1.11.x" - "1.12.x" + - "1.13.x" script: - go get golang.org/x/tools/cmd/cover From bba4f5a488bedac60776d61526249c3cc8ede979 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 4 Oct 2019 18:36:37 +0530 Subject: [PATCH 630/728] adding a disk size stats for the root index The new "num_bytes_used_disk_by_root" stats shows the disk size used by the latest root index. The idea is that this should help in suggesting a sizing recommendation for RAM quota driven by an index size resident ratio. 
--- index/scorch/persister.go | 6 ++--- index/scorch/scorch.go | 46 +++++++++++++++++++++++++++++++-------- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 064e9e6a8..b141cfda6 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -253,7 +253,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) // Check the merger lag by counting the segment files on disk, - numFilesOnDisk, _ := s.diskFileStats() + numFilesOnDisk, _, _ := s.diskFileStats(nil) // On finding fewer files on disk, persister takes a short pause // for sufficient in-memory segments to pile up for the next @@ -280,7 +280,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, // 2. The merger could be lagging behind on merging the disk files. if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { s.removeOldData() - numFilesOnDisk, _ = s.diskFileStats() + numFilesOnDisk, _, _ = s.diskFileStats(nil) } // Persister pause until the merger catches up to reduce the segment @@ -305,7 +305,7 @@ OUTER: // let the watchers proceed if they lag behind persistWatchers = notifyMergeWatchers(lastPersistedEpoch, persistWatchers) - numFilesOnDisk, _ = s.diskFileStats() + numFilesOnDisk, _, _ = s.diskFileStats(nil) } return lastMergedEpoch, persistWatchers diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 44a97d1ea..7a1046fc5 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -466,8 +466,9 @@ func (s *Scorch) Stats() json.Marshaler { return &s.stats } -func (s *Scorch) diskFileStats() (uint64, uint64) { - var numFilesOnDisk, numBytesUsedDisk uint64 +func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64, + uint64, uint64) { + var numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot uint64 if s.path != "" { finfos, err := ioutil.ReadDir(s.path) 
if err == nil { @@ -475,24 +476,48 @@ func (s *Scorch) diskFileStats() (uint64, uint64) { if !finfo.IsDir() { numBytesUsedDisk += uint64(finfo.Size()) numFilesOnDisk++ + if rootSegmentPaths != nil { + fname := s.path + string(os.PathSeparator) + finfo.Name() + if _, fileAtRoot := rootSegmentPaths[fname]; fileAtRoot { + numBytesOnDiskByRoot += uint64(finfo.Size()) + } + } } } } } - return numFilesOnDisk, numBytesUsedDisk + // if no root files path given, then consider all disk files. + if rootSegmentPaths == nil { + return numFilesOnDisk, numBytesUsedDisk, numBytesUsedDisk + } + + return numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot +} + +func (s *Scorch) rootDiskSegmentsPaths() map[string]struct{} { + rv := make(map[string]struct{}, len(s.root.segment)) + for _, segmentSnapshot := range s.root.segment { + switch seg := segmentSnapshot.segment.(type) { + case *zap.Segment: + rv[seg.Path()] = struct{}{} + } + } + return rv } func (s *Scorch) StatsMap() map[string]interface{} { m := s.stats.ToMap() - numFilesOnDisk, numBytesUsedDisk := s.diskFileStats() + s.rootLock.RLock() + rootSegPaths := s.rootDiskSegmentsPaths() + m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval)) + s.rootLock.RUnlock() + + numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot := s.diskFileStats(rootSegPaths) m["CurOnDiskBytes"] = numBytesUsedDisk m["CurOnDiskFiles"] = numFilesOnDisk - s.rootLock.RLock() - m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval)) - s.rootLock.RUnlock() // TODO: consider one day removing these backwards compatible // names for apps using the old names m["updates"] = m["TotUpdates"] @@ -507,8 +532,11 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_items_introduced"] = m["TotIntroducedItems"] m["num_items_persisted"] = m["TotPersistedItems"] m["num_recs_to_persist"] = m["TotItemsToPersist"] - m["num_bytes_used_disk"] = m["CurOnDiskBytes"] - m["num_files_on_disk"] = m["CurOnDiskFiles"] + // total disk bytes 
found in index directory inclusive of older snapshots + m["num_bytes_used_disk"] = numBytesUsedDisk + // total disk bytes by the latest root index, exclusive of older snapshots + m["num_bytes_used_disk_by_root"] = numBytesOnDiskByRoot + m["num_files_on_disk"] = numFilesOnDisk m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] m["num_persister_nap_pause_completed"] = m["TotPersisterNapPauseCompleted"] From e46f498c353272ae060b9f1bcd77750e069ffd1a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 25 Oct 2019 06:16:40 +0530 Subject: [PATCH 631/728] Unit tests for a single segment merge policy Its been observed that using an aggressive merge policy, we are able to create a single segment scorch index. --- index/scorch/mergeplan/merge_plan_test.go | 39 +++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/index/scorch/mergeplan/merge_plan_test.go b/index/scorch/mergeplan/merge_plan_test.go index 3adc1f4b8..2791d09c5 100644 --- a/index/scorch/mergeplan/merge_plan_test.go +++ b/index/scorch/mergeplan/merge_plan_test.go @@ -232,6 +232,45 @@ func TestCalcBudget(t *testing.T) { } } +func TestCalcBudgetForSingleSegmentMergePolicy(t *testing.T) { + mpolicy := MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1 << 30, // ~ 1 Billion + SegmentsPerMergeTask: 10, + FloorSegmentSize: 1 << 30, + } + + tests := []struct { + totalSize int64 + firstTierSize int64 + o MergePlanOptions + expect int + }{ + {0, mpolicy.RaiseToFloorSegmentSize(0), mpolicy, 0}, + {1, mpolicy.RaiseToFloorSegmentSize(1), mpolicy, 1}, + {9, mpolicy.RaiseToFloorSegmentSize(0), mpolicy, 1}, + {1, mpolicy.RaiseToFloorSegmentSize(1), mpolicy, 1}, + {21, mpolicy.RaiseToFloorSegmentSize(21), mpolicy, 1}, + {21, mpolicy.RaiseToFloorSegmentSize(21), mpolicy, 1}, + {1000, mpolicy.RaiseToFloorSegmentSize(2000), mpolicy, 1}, + {5000, mpolicy.RaiseToFloorSegmentSize(5000), mpolicy, 1}, + {10000, 
mpolicy.RaiseToFloorSegmentSize(10000), mpolicy, 1}, + {30000, mpolicy.RaiseToFloorSegmentSize(30000), mpolicy, 1}, + {1000000, mpolicy.RaiseToFloorSegmentSize(1000000), mpolicy, 1}, + {1000000000, 1 << 30, mpolicy, 1}, + {1013423541, 1 << 30, mpolicy, 1}, + {98765442, 1 << 30, mpolicy, 1}, + } + + for testi, test := range tests { + res := CalcBudget(test.totalSize, test.firstTierSize, &test.o) + if res != test.expect { + t.Errorf("testi: %d, test: %#v, res: %v", + testi, test, res) + } + } +} + // ---------------------------------------- func TestInsert1SameSizedSegmentBetweenMerges(t *testing.T) { From 919079be42ccaa600ff03d61abbc1c0236bea7e5 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 28 Oct 2019 15:34:38 -0700 Subject: [PATCH 632/728] [GEO] Search all numeric type terms to match geo points If a field (for example _all) were to index multiple numeric terms (of type numeric and geopoint morton hash), verify all numeric matches rather than just the last encountered entry to determine if the entry morton unhashes into a geo point match. 
Fixes: https://github.com/blevesearch/bleve/issues/1301 --- search/searcher/search_geoboundingbox.go | 15 +++++--- search/searcher/search_geopointdistance.go | 15 +++++--- search_test.go | 45 ++++++++++++++++++++++ 3 files changed, 64 insertions(+), 11 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index 38cb6467f..c4b8af927 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -224,7 +224,8 @@ func ComputeGeoRange(term uint64, shift uint, func buildRectFilter(dvReader index.DocValueReader, field string, minLon, minLat, maxLon, maxLat float64) FilterFunc { return func(d *search.DocumentMatch) bool { - var lon, lat float64 + // check geo matches against all numeric type terms indexed + var lons, lats []float64 var found bool err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { // only consider the values which are shifted 0 @@ -234,15 +235,19 @@ func buildRectFilter(dvReader index.DocValueReader, field string, var i64 int64 i64, err = prefixCoded.Int64() if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) found = true } } }) if err == nil && found { - return geo.BoundingBoxContains(lon, lat, - minLon, minLat, maxLon, maxLat) + for i := range lons { + if geo.BoundingBoxContains(lons[i], lats[i], + minLon, minLat, maxLon, maxLat) { + return true + } + } } return false } diff --git a/search/searcher/search_geopointdistance.go b/search/searcher/search_geopointdistance.go index b01ae6a0a..2444a8423 100644 --- a/search/searcher/search_geopointdistance.go +++ b/search/searcher/search_geopointdistance.go @@ -96,7 +96,8 @@ func boxSearcher(indexReader index.IndexReader, func buildDistFilter(dvReader index.DocValueReader, field string, centerLon, centerLat, maxDist float64) 
FilterFunc { return func(d *search.DocumentMatch) bool { - var lon, lat float64 + // check geo matches against all numeric tpe terms indexed + var lons, lats []float64 var found bool err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { @@ -106,16 +107,18 @@ func buildDistFilter(dvReader index.DocValueReader, field string, if err == nil && shift == 0 { i64, err := prefixCoded.Int64() if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) found = true } } }) if err == nil && found { - dist := geo.Haversin(lon, lat, centerLon, centerLat) - if dist <= maxDist/1000 { - return true + for i := range lons { + dist := geo.Haversin(lons[i], lats[i], centerLon, centerLat) + if dist <= maxDist/1000 { + return true + } } } return false diff --git a/search_test.go b/search_test.go index 8c78d641f..5eede44c4 100644 --- a/search_test.go +++ b/search_test.go @@ -1594,3 +1594,48 @@ func TestSearchScoreNone(t *testing.T) { t.Fatal("unexpected score for the hit") } } + +func TestGeoDistanceIssue1301(t *testing.T) { + shopMapping := NewDocumentMapping() + shopMapping.AddFieldMappingsAt("GEO", NewGeoPointFieldMapping()) + shopIndexMapping := NewIndexMapping() + shopIndexMapping.DefaultMapping = shopMapping + + idx, err := NewUsing("testidx", shopIndexMapping, scorch.Name, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + for i, g := range []string{"wecpkbeddsmf", "wecpk8tne453", "wecpkb80s09t"} { + if err = idx.Index(strconv.Itoa(i), map[string]interface{}{ + "ID": i, + "GEO": g, + }); err != nil { + t.Fatal(err) + } + } + + // Not setting "Field" for the following query is returning inconsistent + // results, when there's another field indexed which is numeric and both + // these fields are 
included within _all. + // As reported in: https://github.com/blevesearch/bleve/issues/1301 + lat, lon := 22.371154, 114.112603 + q := NewGeoDistanceQuery(lon, lat, "1km") + + req := NewSearchRequest(q) + sr, err := idx.Search(req) + if err != nil { + t.Fatal(err) + } + + if sr.Total != 3 { + t.Fatalf("Size expected: 3, actual %d\n", sr.Total) + } +} From c0dcea943e3f6e31be3481c3c7788737626ba688 Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 28 Oct 2019 15:49:36 -0700 Subject: [PATCH 633/728] Fix up geo point matches for within a geo-polygon as well --- search/searcher/search_geopolygon.go | 42 +++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/search/searcher/search_geopolygon.go b/search/searcher/search_geopolygon.go index 3bb47519d..b0d6d552a 100644 --- a/search/searcher/search_geopolygon.go +++ b/search/searcher/search_geopolygon.go @@ -63,7 +63,8 @@ func almostEqual(a, b float64) bool { func buildPolygonFilter(dvReader index.DocValueReader, field string, polygon []geo.Point) FilterFunc { return func(d *search.DocumentMatch) bool { - var lon, lat float64 + // check geo matches against all numeric type terms indexed + var lons, lats []float64 var found bool err := dvReader.VisitDocValues(d.IndexInternalID, func(field string, term []byte) { @@ -73,8 +74,8 @@ func buildPolygonFilter(dvReader index.DocValueReader, field string, if err == nil && shift == 0 { i64, err := prefixCoded.Int64() if err == nil { - lon = geo.MortonUnhashLon(uint64(i64)) - lat = geo.MortonUnhashLat(uint64(i64)) + lons = append(lons, geo.MortonUnhashLon(uint64(i64))) + lats = append(lats, geo.MortonUnhashLat(uint64(i64))) found = true } } @@ -84,26 +85,29 @@ func buildPolygonFilter(dvReader index.DocValueReader, field string, // the polygon. ie it might fail for certain points on the polygon boundaries. 
if err == nil && found { nVertices := len(polygon) - var inside bool - // check for a direct vertex match - if almostEqual(polygon[0].Lat, lat) && - almostEqual(polygon[0].Lon, lon) { - return true - } - - for i := 1; i < nVertices; i++ { - if almostEqual(polygon[i].Lat, lat) && - almostEqual(polygon[i].Lon, lon) { + for i := range lons { + var inside bool + // check for a direct vertex match + if almostEqual(polygon[0].Lat, lats[i]) && + almostEqual(polygon[0].Lon, lons[i]) { return true } - if (polygon[i].Lat > lat) != (polygon[i-1].Lat > lat) && - lon < (polygon[i-1].Lon-polygon[i].Lon)*(lat-polygon[i].Lat)/ - (polygon[i-1].Lat-polygon[i].Lat)+polygon[i].Lon { - inside = !inside + + for j := 1; j < nVertices; j++ { + if almostEqual(polygon[j].Lat, lats[i]) && + almostEqual(polygon[j].Lon, lons[i]) { + return true + } + if (polygon[j].Lat > lats[i]) != (polygon[j-1].Lat > lats[i]) && + lons[i] < (polygon[j-1].Lon-polygon[j].Lon)*(lats[i]-polygon[j].Lat)/ + (polygon[j-1].Lat-polygon[j].Lat)+polygon[j].Lon { + inside = !inside + } + } + if inside { + return true } } - return inside - } return false } From 1e56045b0dfbbbdf20f34b1fd4ca761b6035c8ec Mon Sep 17 00:00:00 2001 From: abhinavdangeti Date: Mon, 28 Oct 2019 15:54:35 -0700 Subject: [PATCH 634/728] Fix typos/commentary --- search/searcher/search_geopointdistance.go | 2 +- search_test.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/search/searcher/search_geopointdistance.go b/search/searcher/search_geopointdistance.go index 2444a8423..5be781fd6 100644 --- a/search/searcher/search_geopointdistance.go +++ b/search/searcher/search_geopointdistance.go @@ -96,7 +96,7 @@ func boxSearcher(indexReader index.IndexReader, func buildDistFilter(dvReader index.DocValueReader, field string, centerLon, centerLat, maxDist float64) FilterFunc { return func(d *search.DocumentMatch) bool { - // check geo matches against all numeric tpe terms indexed + // check geo matches against all numeric type terms 
indexed var lons, lats []float64 var found bool diff --git a/search_test.go b/search_test.go index 5eede44c4..7cfd44737 100644 --- a/search_test.go +++ b/search_test.go @@ -1622,9 +1622,9 @@ func TestGeoDistanceIssue1301(t *testing.T) { } } - // Not setting "Field" for the following query is returning inconsistent - // results, when there's another field indexed which is numeric and both - // these fields are included within _all. + // Not setting "Field" for the following query, targets it against the _all + // field and this is returning inconsistent results, when there's another + // field indexed along with the geopoint which is numeric. // As reported in: https://github.com/blevesearch/bleve/issues/1301 lat, lon := 22.371154, 114.112603 q := NewGeoDistanceQuery(lon, lat, "1km") From a8dd7894f6394e168fc7e307aca5db3f1e295c4c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 30 Oct 2019 12:33:39 +0530 Subject: [PATCH 635/728] fix typo in comments --- search/searcher/search_geopointdistance.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/search/searcher/search_geopointdistance.go b/search/searcher/search_geopointdistance.go index b01ae6a0a..7409196a5 100644 --- a/search/searcher/search_geopointdistance.go +++ b/search/searcher/search_geopointdistance.go @@ -83,7 +83,7 @@ func boxSearcher(indexReader index.IndexReader, return boxSearcher, nil } - // build geoboundinggox searcher for that bounding box + // build geoboundingbox searcher for that bounding box boxSearcher, err := NewGeoBoundingBoxSearcher(indexReader, topLeftLon, bottomRightLat, bottomRightLon, topLeftLat, field, boost, options, checkBoundaries) From 13483401f70a7b9a453acc54f0ead439f72330fa Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 3 Jan 2020 12:46:31 +0530 Subject: [PATCH 636/728] Adding json tags for the geo point struct -adding UTs for complex self-intersecting polygons --- geo/geo.go | 4 +- search/searcher/search_geopolygon_test.go | 90 
+++++++++++++++++++++++ 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/geo/geo.go b/geo/geo.go index 583451e30..b3d6909d9 100644 --- a/geo/geo.go +++ b/geo/geo.go @@ -39,8 +39,8 @@ var latScale = float64((uint64(0x1)< Date: Fri, 3 Jan 2020 13:40:08 -0800 Subject: [PATCH 637/728] Update .travis.yml to run for go versions 1.12.x and 1.13.x (latest) >> etcd-io/bbolt complains with go version 1.11.x --- .travis.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index 8f3340510..dffe4b8fd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -3,7 +3,6 @@ sudo: false language: go go: - - "1.11.x" - "1.12.x" - "1.13.x" @@ -16,11 +15,7 @@ script: - go test -race -v $(go list ./... | grep -v vendor/) - go vet $(go list ./... | grep -v vendor/) - go test ./test -v -indexType scorch - - if [[ ${TRAVIS_GO_VERSION} =~ ^1\.10 ]]; then - echo "errcheck skipped for go version" $TRAVIS_GO_VERSION; - else - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/); - fi + - errcheck -ignorepkg fmt $(go list ./... | grep -v vendor/); - docs/project-code-coverage.sh - docs/build_children.sh From e7d28f663caeffefc07e4b04d6391522a1584df0 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 6 Jan 2020 08:29:38 +0530 Subject: [PATCH 638/728] MB-37392 - incorrect results with closed polygons Polygon query returns less results after adding initial point as last point. This fix aims to address that by extending the ray casting checks. --- search/searcher/search_geopolygon.go | 15 +++++++++++---- search/searcher/search_geopolygon_test.go | 5 +++-- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/search/searcher/search_geopolygon.go b/search/searcher/search_geopolygon.go index b0d6d552a..35f3ba2a1 100644 --- a/search/searcher/search_geopolygon.go +++ b/search/searcher/search_geopolygon.go @@ -85,8 +85,17 @@ func buildPolygonFilter(dvReader index.DocValueReader, field string, // the polygon. 
ie it might fail for certain points on the polygon boundaries. if err == nil && found { nVertices := len(polygon) + if len(polygon) < 3 { + return false + } + rayIntersectsSegment := func(point, a, b geo.Point) bool { + return (a.Lat > point.Lat) != (b.Lat > point.Lat) && + point.Lon < (b.Lon-a.Lon)*(point.Lat-a.Lat)/(b.Lat-a.Lat)+a.Lon + } + for i := range lons { - var inside bool + pt := geo.Point{Lon: lons[i], Lat: lats[i]} + inside := rayIntersectsSegment(pt, polygon[len(polygon)-1], polygon[0]) // check for a direct vertex match if almostEqual(polygon[0].Lat, lats[i]) && almostEqual(polygon[0].Lon, lons[i]) { @@ -98,9 +107,7 @@ func buildPolygonFilter(dvReader index.DocValueReader, field string, almostEqual(polygon[j].Lon, lons[i]) { return true } - if (polygon[j].Lat > lats[i]) != (polygon[j-1].Lat > lats[i]) && - lons[i] < (polygon[j-1].Lon-polygon[j].Lon)*(lats[i]-polygon[j].Lat)/ - (polygon[j-1].Lat-polygon[j].Lat)+polygon[j].Lon { + if rayIntersectsSegment(pt, polygon[j-1], polygon[j]) { inside = !inside } } diff --git a/search/searcher/search_geopolygon_test.go b/search/searcher/search_geopolygon_test.go index 8a6ef8f5a..e5e5c3815 100644 --- a/search/searcher/search_geopolygon_test.go +++ b/search/searcher/search_geopolygon_test.go @@ -119,8 +119,9 @@ func TestGeoRectanglePolygon(t *testing.T) { field string want []string }{ - {[]geo.Point{{Lon: 0.001, Lat: 0.001}, {Lon: 85.002, Lat: 38.002}}, "loc", - []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}}, + {[]geo.Point{{Lon: 0, Lat: 0}, {Lon: 0, Lat: 50}, {Lon: 50, Lat: 50}, {Lon: 50, Lat: 0}, {Lon: 0, Lat: 0}}, "loc", + []string{"a", "b", "c", "d", "e", "f", "g", "h", "i", "j"}, + }, } i := setupGeo(t) From 8606b51b5d2d57f0bba8a1896b308739d0261abb Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Sat, 11 Jan 2020 10:31:31 +0530 Subject: [PATCH 639/728] MB-37471 - Incomplete polygon points results emtpy response Adding input validation checks for polygon points --- 
search/searcher/search_geopolygon.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/search/searcher/search_geopolygon.go b/search/searcher/search_geopolygon.go index 35f3ba2a1..5f16aa8d2 100644 --- a/search/searcher/search_geopolygon.go +++ b/search/searcher/search_geopolygon.go @@ -15,6 +15,7 @@ package searcher import ( + "fmt" "github.com/blevesearch/bleve/geo" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/numeric" @@ -26,6 +27,10 @@ func NewGeoBoundedPolygonSearcher(indexReader index.IndexReader, polygon []geo.Point, field string, boost float64, options search.SearcherOptions) (search.Searcher, error) { + if len(polygon) < 3 { + return nil, fmt.Errorf("Too few points specified for the polygon boundary") + } + // compute the bounding box enclosing the polygon topLeftLon, topLeftLat, bottomRightLon, bottomRightLat, err := geo.BoundingRectangleForPolygon(polygon) From f99438d3d4c760ae2f79967b2663b063dd3b5a3e Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 14 Jan 2020 16:39:16 +0530 Subject: [PATCH 640/728] MB-37500 - Polygon query with invalid geohash gives results Adding geohash length validation checks --- geo/geo.go | 2 ++ geo/parse.go | 8 +++++--- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/geo/geo.go b/geo/geo.go index 583451e30..2f11bd8dd 100644 --- a/geo/geo.go +++ b/geo/geo.go @@ -37,6 +37,8 @@ var geoTolerance = 1E-6 var lonScale = float64((uint64(0x1)< Date: Thu, 16 Jan 2020 11:50:05 +0200 Subject: [PATCH 641/728] Fix concurrent read write on analysis This is the stack trace we got in our tests ``` fatal error: concurrent map read and map write goroutine 7 [running]: runtime.throw(0x17b1b17, 0x21) /usr/local/go/src/runtime/panic.go:774 +0x72 fp=0xc00c97f8b8 sp=0xc00c97f888 pc=0x431422 runtime.mapaccess2_faststr(0x15f4040, 0xc011202f90, 0x257c531, 0x1, 0xc00c97fa70, 0xcb5919) /usr/local/go/src/runtime/map_faststr.go:116 +0x48f fp=0xc00c97f928 sp=0xc00c97f8b8 pc=0x4150ff 
github.com/blevesearch/bleve/analysis.TokenFrequencies.MergeAll(0xc011202f90, 0x14488dd, 0x2, 0xc01121cc00) /home/circleci/.go_workspace/pkg/mod/github.com/blevesearch/bleve@v0.0.0-20190812174355-c997533a776f/ +0x100 fp=0xc00c97fa38 sp=0xc00c97f928 pc=0x9d6770 github.com/blevesearch/bleve/document.(*CompositeField).Compose(0xc00e499d80, 0x14488dd, 0x2, 0x1, 0xc01121cc00) /home/circleci/.go_workspace/pkg/mod/github.com/blevesearch/bleve@v0.0.0-20190812174355-c997533a776f/document/field_composite.go:122 +0xf2 fp=0xc00c97fa80 sp=0xc00c97fa38 pc=0x9dd902 github.com/blevesearch/bleve/index/upsidedown.(*UpsideDownCouch).Analyze(0xc001dbda80, 0xc00e499d00, 0x2) /home/circleci/.go_workspace/pkg/mod/github.com/blevesearch/bleve@v0.0.0-20190812174355-c997533a776f/index/upsidedown/analysis.go:77 +0xc10 fp=0xc00c97ff28 sp=0xc00c97fa80 pc=0xc9b630 github.com/blevesearch/bleve/index.AnalysisWorker(0xc0004f1e00, 0xc0004f1e60) /home/circleci/.go_workspace/pkg/mod/github.com/blevesearch/bleve@v0.0.0-20190812174355-c997533a776f/index/analysis.go:106 +0x55 fp=0xc00c97ffd0 sp=0xc00c97ff28 pc=0x9e12b5 runtime.goexit() /usr/local/go/src/runtime/asm_amd64.s:1357 +0x1 fp=0xc00c97ffd8 sp=0xc00c97ffd0 pc=0x460741 created by github.com/blevesearch/bleve/index.NewAnalysisQueue /home/circleci/.go_workspace/pkg/mod/github.com/blevesearch/bleve@v0.0.0-20190812174355-c997533a776f/index/analysis.go:94 +0xc8 ``` I think that the root cause is that iterating on the map, pointers change under the hood, which will lead to analysing the same document twice --- index/upsidedown/upsidedown.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/upsidedown/upsidedown.go b/index/upsidedown/upsidedown.go index 24f5aae94..8e915c6ad 100644 --- a/index/upsidedown/upsidedown.go +++ b/index/upsidedown/upsidedown.go @@ -820,7 +820,8 @@ func (udc *UpsideDownCouch) Batch(batch *index.Batch) (err error) { if numUpdates > 0 { go func() { - for _, doc := range batch.IndexOps { + for k := range 
batch.IndexOps { + doc := batch.IndexOps[k] if doc != nil { aw := index.NewAnalysisWork(udc, doc, resultChan) // put the work on the queue From 86d3ceeecdb3da770084a5e087a9b64f05d2b5a8 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 5 Feb 2020 16:42:00 +0530 Subject: [PATCH 642/728] Enabling score/id based search_after/before paginations This change attempts to enable users provide the last fetched document's score or ID values as keys in the search after/before fields to make the deep pagination works for the default _score or _id based sort orders. --- search/collector/topn.go | 13 +++++ test/tests/sort/data/d.json | 2 +- test/tests/sort/data/e.json | 2 +- test/tests/sort/data/f.json | 2 +- test/tests/sort/searches.json | 92 +++++++++++++++++++++++++++++++++++ 5 files changed, 108 insertions(+), 3 deletions(-) diff --git a/search/collector/topn.go b/search/collector/topn.go index a027a12c2..8d4afb63a 100644 --- a/search/collector/topn.go +++ b/search/collector/topn.go @@ -17,6 +17,7 @@ package collector import ( "context" "reflect" + "strconv" "time" "github.com/blevesearch/bleve/index" @@ -90,6 +91,18 @@ func NewTopNCollectorAfter(size int, sort search.SortOrder, after []string) *Top rv.searchAfter = &search.DocumentMatch{ Sort: after, } + + for pos, ss := range sort { + if ss.RequiresDocID() { + rv.searchAfter.ID = after[pos] + } + if ss.RequiresScoring() { + if score, err := strconv.ParseFloat(after[pos], 64); err == nil { + rv.searchAfter.Score = score + } + } + } + return rv } diff --git a/test/tests/sort/data/d.json b/test/tests/sort/data/d.json index 926869a87..febddcafa 100644 --- a/test/tests/sort/data/d.json +++ b/test/tests/sort/data/d.json @@ -2,6 +2,6 @@ "id": "d", "age": 65, "born": "1978-12-02", - "title": "agent", + "title": "agent d is desperately trying out to be successful rapster!", "tags": ["cats"] } diff --git a/test/tests/sort/data/e.json b/test/tests/sort/data/e.json index 436f010bd..9f1c4f9b2 100644 --- 
a/test/tests/sort/data/e.json +++ b/test/tests/sort/data/e.json @@ -2,6 +2,6 @@ "id": "e", "name": "nancy", "born": "1954-10-22", - "title": "rapstar", + "title": "rapstar nancy rapster", "tags": ["pain"] } diff --git a/test/tests/sort/data/f.json b/test/tests/sort/data/f.json index 14f0921a6..37618bc1a 100644 --- a/test/tests/sort/data/f.json +++ b/test/tests/sort/data/f.json @@ -2,6 +2,6 @@ "id": "f", "name": "frank", "age": 1, - "title": "taxman", + "title": "frank the taxman of cb, Rapster!", "tags": ["vitamin","purple"] } diff --git a/test/tests/sort/searches.json b/test/tests/sort/searches.json index 8b32df5d0..6d34b25c0 100644 --- a/test/tests/sort/searches.json +++ b/test/tests/sort/searches.json @@ -458,5 +458,97 @@ } ] } + }, + { + "comment": "sort by ID, after doc d", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["_id"], + "search_after": ["d"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "e" + }, + { + "id": "f" + } + ] + } + }, + { + "comment": "sort by ID, before doc d", + "search": { + "from": 0, + "size": 10, + "query": { + "match_all":{} + }, + "sort": ["_id"], + "search_before": ["d"] + }, + "result": { + "total_hits": 6, + "hits": [ + { + "id": "a" + }, + { + "id": "b" + }, + { + "id": "c" + } + ] + } + }, + { + "comment": "sort by score, after score 0.286889[ e(299646) > f(286889) > d(222224)]", + "search": { + "from": 0, + "size": 10, + "query": { + "query":"rapster" + }, + "sort": ["_score"], + "search_after": ["0.286889"] + }, + "result": { + "total_hits": 3, + "hits": [ + { + "id": "f" + }, + { + "id": "e" + } + ] + } + }, + { + "comment": "sort by score, before score f/0.286889[ e(299646) > f(286889) > d(222224)]", + "search": { + "from": 0, + "size": 10, + "query": { + "query":"rapster" + }, + "sort": ["_score"], + "search_before": ["0.286889"] + }, + "result": { + "total_hits": 3, + "hits": [ + { + "id": "d" + } + ] + } } ] From 5d3ddd2760afdbff717be25440d94c5200fe623b Mon Sep 17 
00:00:00 2001 From: Marty Schoch Date: Mon, 17 Feb 2020 14:39:53 -0500 Subject: [PATCH 643/728] update manifest to fix issue with vellum (#1338) fixes #1336 --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index e6f985136..3d906dd66 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -78,7 +78,7 @@ "importpath": "github.com/RoaringBitmap/roaring", "repository": "https://github.com/RoaringBitmap/roaring", "vcs": "", - "revision": "d0ce1763c3526f65703c395da50da7a7fb2138d5", + "revision": "4208ad825dda03a6a3d2197df8ec57948aebcc12", "branch": "master", "notests": true }, From 78a985bede29cc9e5707a73bc9b94d75a4ce2717 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 27 Feb 2020 14:39:09 +0530 Subject: [PATCH 644/728] Adding snowball stemmers for en,fr,it,es,de --- analysis/lang/de/stemmer_de_snowball.go | 49 +++++++++++ analysis/lang/de/stemmer_de_test.go | 91 ++++++++++++++++++++ analysis/lang/en/stemmer_en_snowball.go | 49 +++++++++++ analysis/lang/en/stemmer_en_test.go | 79 +++++++++++++++++ analysis/lang/es/stemmer_es_snowball.go | 49 +++++++++++ analysis/lang/es/stemmer_es_snowball_test.go | 79 +++++++++++++++++ analysis/lang/fr/stemmer_fr_snowball.go | 49 +++++++++++ analysis/lang/fr/stemmer_fr_snowball_test.go | 79 +++++++++++++++++ analysis/lang/it/stemmer_it_snowball.go | 49 +++++++++++ analysis/lang/it/stemmer_it_snowball_test.go | 79 +++++++++++++++++ 10 files changed, 652 insertions(+) create mode 100644 analysis/lang/de/stemmer_de_snowball.go create mode 100644 analysis/lang/de/stemmer_de_test.go create mode 100644 analysis/lang/en/stemmer_en_snowball.go create mode 100644 analysis/lang/en/stemmer_en_test.go create mode 100644 analysis/lang/es/stemmer_es_snowball.go create mode 100644 analysis/lang/es/stemmer_es_snowball_test.go create mode 100644 analysis/lang/fr/stemmer_fr_snowball.go create mode 100644 analysis/lang/fr/stemmer_fr_snowball_test.go create mode 100644 
analysis/lang/it/stemmer_it_snowball.go create mode 100644 analysis/lang/it/stemmer_it_snowball_test.go diff --git a/analysis/lang/de/stemmer_de_snowball.go b/analysis/lang/de/stemmer_de_snowball.go new file mode 100644 index 000000000..efda0660b --- /dev/null +++ b/analysis/lang/de/stemmer_de_snowball.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/german" +) + +const SnowballStemmerName = "stemmer_de_snowball" + +type GermanStemmerFilter struct { +} + +func NewGermanStemmerFilter() *GermanStemmerFilter { + return &GermanStemmerFilter{} +} + +func (s *GermanStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + german.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func GermanStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewGermanStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, GermanStemmerFilterConstructor) +} diff --git a/analysis/lang/de/stemmer_de_test.go b/analysis/lang/de/stemmer_de_test.go new file mode 100644 index 000000000..a319f7273 --- /dev/null +++ 
b/analysis/lang/de/stemmer_de_test.go @@ -0,0 +1,91 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package de + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballGermanStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abzuschrecken"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abzuschreck"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abzuwarten"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("abzuwart"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("zwirnfabrik"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("zwirnfabr"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("zyniker"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("zynik"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, 
actual[0].Term) + } + } +} diff --git a/analysis/lang/en/stemmer_en_snowball.go b/analysis/lang/en/stemmer_en_snowball.go new file mode 100644 index 000000000..225bb0664 --- /dev/null +++ b/analysis/lang/en/stemmer_en_snowball.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package en + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/english" +) + +const SnowballStemmerName = "stemmer_en_snowball" + +type EnglishStemmerFilter struct { +} + +func NewEnglishStemmerFilter() *EnglishStemmerFilter { + return &EnglishStemmerFilter{} +} + +func (s *EnglishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + english.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func EnglishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewEnglishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, EnglishStemmerFilterConstructor) +} diff --git a/analysis/lang/en/stemmer_en_test.go b/analysis/lang/en/stemmer_en_test.go new file mode 100644 index 000000000..bc5016d07 --- /dev/null +++ b/analysis/lang/en/stemmer_en_test.go @@ -0,0 +1,79 @@ +// Copyright (c) 2020 
Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package en + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballEnglishStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoy"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoy"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoyed"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoy"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoyable"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("enjoy"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/es/stemmer_es_snowball.go b/analysis/lang/es/stemmer_es_snowball.go new file mode 100644 index 000000000..9ee768a9d --- /dev/null +++ b/analysis/lang/es/stemmer_es_snowball.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package es + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/spanish" +) + +const SnowballStemmerName = "stemmer_es_snowball" + +type SpanishStemmerFilter struct { +} + +func NewSpanishStemmerFilter() *SpanishStemmerFilter { + return &SpanishStemmerFilter{} +} + +func (s *SpanishStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + spanish.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func SpanishStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewSpanishStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, SpanishStemmerFilterConstructor) +} diff --git a/analysis/lang/es/stemmer_es_snowball_test.go b/analysis/lang/es/stemmer_es_snowball_test.go new file mode 100644 index 000000000..d976fc821 --- /dev/null +++ b/analysis/lang/es/stemmer_es_snowball_test.go @@ -0,0 +1,79 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package es + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballSpanishStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agresivos"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agres"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agresivamente"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agres"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agresividad"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("agres"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/fr/stemmer_fr_snowball.go b/analysis/lang/fr/stemmer_fr_snowball.go new file mode 100644 index 000000000..e137ce211 --- /dev/null +++ b/analysis/lang/fr/stemmer_fr_snowball.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fr + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/french" +) + +const SnowballStemmerName = "stemmer_fr_snowball" + +type FrenchStemmerFilter struct { +} + +func NewFrenchStemmerFilter() *FrenchStemmerFilter { + return &FrenchStemmerFilter{} +} + +func (s *FrenchStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + french.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func FrenchStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewFrenchStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, FrenchStemmerFilterConstructor) +} diff --git a/analysis/lang/fr/stemmer_fr_snowball_test.go b/analysis/lang/fr/stemmer_fr_snowball_test.go new file mode 100644 index 000000000..aeafa4317 --- /dev/null +++ b/analysis/lang/fr/stemmer_fr_snowball_test.go @@ -0,0 +1,79 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package fr + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballFrenchStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("antagoniste"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("antagon"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barbouillait"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("barbouill"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("calculateur"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("calcul"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} diff --git a/analysis/lang/it/stemmer_it_snowball.go b/analysis/lang/it/stemmer_it_snowball.go new file mode 100644 index 000000000..04c6bd701 --- /dev/null +++ b/analysis/lang/it/stemmer_it_snowball.go @@ -0,0 +1,49 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package it + +import ( + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" + + "github.com/blevesearch/snowballstem" + "github.com/blevesearch/snowballstem/italian" +) + +const SnowballStemmerName = "stemmer_it_snowball" + +type ItalianStemmerFilter struct { +} + +func NewItalianStemmerFilter() *ItalianStemmerFilter { + return &ItalianStemmerFilter{} +} + +func (s *ItalianStemmerFilter) Filter(input analysis.TokenStream) analysis.TokenStream { + for _, token := range input { + env := snowballstem.NewEnv(string(token.Term)) + italian.Stem(env) + token.Term = []byte(env.Current()) + } + return input +} + +func ItalianStemmerFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.TokenFilter, error) { + return NewItalianStemmerFilter(), nil +} + +func init() { + registry.RegisterTokenFilter(SnowballStemmerName, ItalianStemmerFilterConstructor) +} diff --git a/analysis/lang/it/stemmer_it_snowball_test.go b/analysis/lang/it/stemmer_it_snowball_test.go new file mode 100644 index 000000000..844f2f543 --- /dev/null +++ b/analysis/lang/it/stemmer_it_snowball_test.go @@ -0,0 +1,79 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package it + +import ( + "reflect" + "testing" + + "github.com/blevesearch/bleve/analysis" + "github.com/blevesearch/bleve/registry" +) + +func TestSnowballItalianStemmer(t *testing.T) { + tests := []struct { + input analysis.TokenStream + output analysis.TokenStream + }{ + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizzata"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizz"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizzargli"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizz"), + }, + }, + }, + { + input: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizzasse"), + }, + }, + output: analysis.TokenStream{ + &analysis.Token{ + Term: []byte("aizz"), + }, + }, + }, + } + + cache := registry.NewCache() + filter, err := cache.TokenFilterNamed(SnowballStemmerName) + if err != nil { + t.Fatal(err) + } + for _, test := range tests { + actual := filter.Filter(test.input) + if !reflect.DeepEqual(actual, test.output) { + t.Errorf("expected %s, got %s", test.output[0].Term, actual[0].Term) + } + } +} From c12c3e7b3130d86f7fd7ee9311225997616bf8fc Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 16 Mar 2020 15:09:55 +0530 Subject: [PATCH 645/728] Initialise the root snapshot's DecRef errs with Close() During the scorch's Close call, the root snapshot's DecRef return errs were ignored. Fixing that to bubble up the errors so that the higher levels may listen and act on it. 
--- index/scorch/scorch.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 7a1046fc5..a88486e27 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -263,7 +263,7 @@ func (s *Scorch) Close() (err error) { err = s.rootBolt.Close() s.rootLock.Lock() if s.root != nil { - _ = s.root.DecRef() + err = s.root.DecRef() } s.root = nil s.rootLock.Unlock() From 1b6b97982689cabf814a151a6f4827db67b1d9c4 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 17 Mar 2020 08:35:34 +0530 Subject: [PATCH 646/728] Fixing the err override for bolt's Close --- index/scorch/scorch.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index a88486e27..796713148 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -263,7 +263,10 @@ func (s *Scorch) Close() (err error) { err = s.rootBolt.Close() s.rootLock.Lock() if s.root != nil { - err = s.root.DecRef() + err2 := s.root.DecRef() + if err == nil { + err = err2 + } } s.root = nil s.rootLock.Unlock() From dce4ebbcd24dbf56bb2211188ca368dc572189ec Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Tue, 17 Mar 2020 19:56:05 +0530 Subject: [PATCH 647/728] Fix for issue #1352 Resetting the closeCh in Open api so that scorch index's main routine trio remains in their active work loops. 
--- index/scorch/scorch.go | 1 + index/scorch/scorch_test.go | 110 ++++++++++++++++++++++++++++++++++++ 2 files changed, 111 insertions(+) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 796713148..8db3fa966 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -223,6 +223,7 @@ func (s *Scorch) openBolt() error { s.introducerNotifier = make(chan *epochWatcher, 1) s.revertToSnapshots = make(chan *snapshotReversion) s.persisterNotifier = make(chan *epochWatcher, 1) + s.closeCh = make(chan struct{}) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 550a584ae..302a86a5e 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -163,6 +163,116 @@ func TestIndexOpenReopen(t *testing.T) { } } +func TestIndexOpenReopenWithInsert(t *testing.T) { + cfg := CreateConfig("TestIndexOpenReopen") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + var expectedCount uint64 + reader, err := idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err := reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // insert a doc + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } 
+ docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + // try to open the index and insert data + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + + // insert a doc + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2"))) + err = idx.Update(doc) + if err != nil { + t.Errorf("Error updating index: %v", err) + } + expectedCount++ + + // check the doc count again after reopening it + reader, err = idx.Reader() + if err != nil { + t.Fatal(err) + } + docCount, err = reader.DocCount() + if err != nil { + t.Error(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = reader.Close() + if err != nil { + t.Fatal(err) + } + + // now close it + err = idx.Close() + if err != nil { + t.Fatal(err) + } +} + func TestIndexInsert(t *testing.T) { cfg := CreateConfig("TestIndexInsert") err := InitTest(cfg) From bac2f33954b316485be0f796e2964a9e773ca204 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 16 Mar 2020 13:11:49 +0530 Subject: [PATCH 648/728] MB-38303 - merger leaks index snapshot refCounts on index closure Upon an index closure event, the file merger skips the reference count decrement operations at line no: https://github.com/blevesearch/ bleve/blob/05d86ea8f6e30456949f612cf68cf4a27ce8c9c5/index/scorch/merge.go#L261 leading to the leaking of index snapshots. This will further lead to the segment file leaking as those segment files will never get ceremoniously un-mapped or handle closed in the event of an index closure. 
details ref - https://issues.couchbase.com/browse/MB-38303 --- index/scorch/merge.go | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index d7144772f..c39e3117d 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -159,6 +159,21 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // process tasks in serial for now var notifications []chan *IndexSnapshot var filenames []string + // clean up any pending notifications from introducer on exit + // from an index closure. + defer func() { + for _, notification := range notifications { + select { + case newSnapshot := <-notification: + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) + if newSnapshot != nil { + _ = newSnapshot.DecRef() + } + default: + } + } + }() + for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) @@ -217,6 +232,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) if err == segment.ErrClosed { + // handle any pending index snapshot introduction notifications on exit return err } return fmt.Errorf("merging failed: %v", err) @@ -253,6 +269,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // give it to the introducer select { case <-s.closeCh: + // handle any pending index snapshot introduction notifications on exit _ = seg.Close() return segment.ErrClosed case s.merges <- sm: @@ -265,6 +282,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, for _, notification := range notifications { select { case <-s.closeCh: + // handle any pending index snapshot introduction notifications on exit atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) return segment.ErrClosed case newSnapshot := <-notification: From 0b8c270ec5ef44df759e8c9a33fe348f7ca3fde8 Mon Sep 17 
00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 18 Mar 2020 17:15:42 +0530 Subject: [PATCH 649/728] Offline RollbackPoints/Rollback apis for scorch index This change comprises of enabling the rollback related scorch APIs to work only with closed/offline indexes. Both the API signatures have modified to accept an index path pointing to the scorch index location to accommodate this change. The two API approach(RollbackPoints api for fetching the rollback points and the Rollback api for performing the actual rollback) is retained in favour of modularity and simplicity of usage. The Rollback api would make the meta/bolt store with the given rollback point as the most recent epoch to be loaded. During the process of restoring the index state to the relevant rollback point, the new Rollback implementation cleans up the irrelevant epoch data from the bolt store. This is done to favour a lean version of persisted epochs in root bolt anytime to further lead a more efficient disk clean up/utilisation. Also, this helps to better safe guard against any potential recursive rollback request as we aren't introducing any newer data into the system. --- index/scorch/snapshot_rollback.go | 135 ++++++++++++++++--------- index/scorch/snapshot_rollback_test.go | 74 +++++++++----- 2 files changed, 138 insertions(+), 71 deletions(-) diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index 470868d0e..d9f89f46f 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -17,6 +17,7 @@ package scorch import ( "fmt" "log" + "os" "github.com/blevesearch/bleve/index/scorch/segment" bolt "github.com/etcd-io/bbolt" @@ -34,13 +35,22 @@ func (r *RollbackPoint) GetInternal(key []byte) []byte { // RollbackPoints returns an array of rollback points available for // the application to rollback to, with more recent rollback points // (higher epochs) coming first. 
-func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { - if s.rootBolt == nil { - return nil, fmt.Errorf("RollbackPoints: root is nil") +func RollbackPoints(path string) ([]*RollbackPoint, error) { + if len(path) == 0 { + return nil, fmt.Errorf("RollbackPoints: invalid path") + } + + rootBoltPath := path + string(os.PathSeparator) + "root.bolt" + rootBoltOpt := &bolt.Options{ + ReadOnly: true, + } + rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt) + if err != nil || rootBolt == nil { + return nil, err } // start a read-only bolt transaction - tx, err := s.rootBolt.Begin(false) + tx, err := rootBolt.Begin(false) if err != nil { return nil, fmt.Errorf("RollbackPoints: failed to start" + " read-only transaction") @@ -49,6 +59,7 @@ func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { // read-only bolt transactions to be rolled back defer func() { _ = tx.Rollback() + _ = rootBolt.Close() }() snapshots := tx.Bucket(boltSnapshotsBucket) @@ -105,69 +116,97 @@ func (s *Scorch) RollbackPoints() ([]*RollbackPoint, error) { return rollbackPoints, nil } -// Rollback atomically and durably (if unsafeBatch is unset) brings -// the store back to the point in time as represented by the -// RollbackPoint. Rollback() should only be passed a RollbackPoint -// that came from the same store using the RollbackPoints() API. -func (s *Scorch) Rollback(to *RollbackPoint) error { +// Rollback atomically and durably brings the store back to the point +// in time as represented by the RollbackPoint. +// Rollback() should only be passed a RollbackPoint that came from the +// same store using the RollbackPoints() API along with the index path. 
+func Rollback(path string, to *RollbackPoint) error { if to == nil { return fmt.Errorf("Rollback: RollbackPoint is nil") } - - if s.rootBolt == nil { - return fmt.Errorf("Rollback: root is nil") + if len(path) == 0 { + return fmt.Errorf("Rollback: index path is empty") } - revert := &snapshotReversion{} - - s.rootLock.Lock() + rootBoltPath := path + string(os.PathSeparator) + "root.bolt" + rootBoltOpt := &bolt.Options{ + ReadOnly: false, + } + rootBolt, err := bolt.Open(rootBoltPath, 0600, rootBoltOpt) + if err != nil || rootBolt == nil { + return err + } + defer func() { + err1 := rootBolt.Close() + if err1 != nil && err == nil { + err = err1 + } + }() - err := s.rootBolt.View(func(tx *bolt.Tx) error { + // pick all the persisted epochs in bolt store + var found bool + var persistedEpochs []uint64 + err = rootBolt.View(func(tx *bolt.Tx) error { snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { - return fmt.Errorf("Rollback: no snapshots available") - } - - pos := segment.EncodeUvarintAscending(nil, to.epoch) - - snapshot := snapshots.Bucket(pos) - if snapshot == nil { - return fmt.Errorf("Rollback: snapshot not found") + return nil } - - indexSnapshot, err := s.loadSnapshot(snapshot) - if err != nil { - return fmt.Errorf("Rollback: unable to load snapshot: %v", err) - } - - // add segments referenced by loaded index snapshot to the - // ineligibleForRemoval map - for _, segSnap := range indexSnapshot.segment { - filename := zapFileName(segSnap.id) - s.ineligibleForRemoval[filename] = true + sc := snapshots.Cursor() + for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() { + _, snapshotEpoch, err := segment.DecodeUvarintAscending(sk) + if err != nil { + continue + } + if snapshotEpoch == to.epoch { + found = true + } + persistedEpochs = append(persistedEpochs, snapshotEpoch) } - - revert.snapshot = indexSnapshot - revert.applied = make(chan error) - revert.persisted = make(chan error) - return nil }) - s.rootLock.Unlock() + if len(persistedEpochs) 
== 0 { + return fmt.Errorf("Rollback: no persisted epochs found in bolt") + } + if !found { + return fmt.Errorf("Rollback: target epoch %d not found in bolt", to.epoch) + } + // start a write transaction + tx, err := rootBolt.Begin(true) if err != nil { return err } - // introduce the reversion - s.revertToSnapshots <- revert + defer func() { + if err == nil { + err = tx.Commit() + } else { + _ = tx.Rollback() + } + if err == nil { + err = rootBolt.Sync() + } + }() - // block until this snapshot is applied - err = <-revert.applied - if err != nil { - return fmt.Errorf("Rollback: failed with err: %v", err) + snapshots := tx.Bucket(boltSnapshotsBucket) + if snapshots == nil { + return nil + } + for _, epoch := range persistedEpochs { + k := segment.EncodeUvarintAscending(nil, epoch) + if err != nil { + continue + } + if epoch == to.epoch { + // return here as it already processed until the given epoch + return nil + } + err = snapshots.DeleteBucket(k) + if err == bolt.ErrBucketNotFound { + err = nil + } } - return <-revert.persisted + return err } diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/snapshot_rollback_test.go index 73523a0ba..e137e0f09 100644 --- a/index/scorch/snapshot_rollback_test.go +++ b/index/scorch/snapshot_rollback_test.go @@ -44,38 +44,26 @@ func TestIndexRollback(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) - } - }() - sh, ok := idx.(*Scorch) + _, ok := idx.(*Scorch) if !ok { t.Fatalf("Not a scorch index?") } - err = sh.openBolt() - if err != nil { - t.Fatalf("error opening index: %v", err) - } - - // start background goroutines except for the merger, which - // simulates a super slow merger - sh.asyncTasks.Add(2) - go sh.mainLoop() - go sh.persisterLoop() - + indexPath, _ := cfg["path"].(string) // should have no rollback points initially - rollbackPoints, err := sh.RollbackPoints() - if err != nil { + rollbackPoints, err := RollbackPoints(indexPath) + if 
err == nil { t.Fatalf("expected no err, got: %v, %d", err, len(rollbackPoints)) } if len(rollbackPoints) != 0 { t.Fatalf("expected no rollbackPoints, got %d", len(rollbackPoints)) } + err = idx.Open() + if err != nil { + t.Fatal(err) + } // create a batch, insert 2 new documents batch := index.NewBatch() doc := document.NewDocument("1") @@ -98,8 +86,13 @@ func TestIndexRollback(t *testing.T) { _ = readerSlow.Close() }() + err = idx.Close() + if err != nil { + t.Fatal(err) + } + // fetch rollback points after first batch - rollbackPoints, err = sh.RollbackPoints() + rollbackPoints, err = RollbackPoints(indexPath) if err != nil { t.Fatalf("expected no err, got: %v, %d", err, len(rollbackPoints)) } @@ -110,6 +103,10 @@ func TestIndexRollback(t *testing.T) { // set this as a rollback point for the future rollbackPoint := rollbackPoints[0] + err = idx.Open() + if err != nil { + t.Fatal(err) + } // create another batch, insert 2 new documents, and delete an existing one batch = index.NewBatch() doc = document.NewDocument("3") @@ -125,7 +122,12 @@ func TestIndexRollback(t *testing.T) { t.Fatal(err) } - rollbackPointsB, err := sh.RollbackPoints() + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + rollbackPointsB, err := RollbackPoints(indexPath) if err != nil || len(rollbackPointsB) <= len(rollbackPoints) { t.Fatalf("expected no err, got: %v, %d", err, len(rollbackPointsB)) } @@ -140,6 +142,11 @@ func TestIndexRollback(t *testing.T) { t.Fatalf("expected rollbackPoint epoch to still be available") } + err = idx.Open() + if err != nil { + t.Fatal(err) + } + reader, err := idx.Reader() if err != nil { t.Fatal(err) @@ -176,8 +183,24 @@ func TestIndexRollback(t *testing.T) { t.Fatal(err) } + err = idx.Close() + if err != nil { + t.Fatal(err) + } + + // rollback to a non existing rollback point + err = Rollback(indexPath, &RollbackPoint{epoch: 100}) + if err == nil { + t.Fatalf("expected err: Rollback: target epoch 100 not found in bolt") + } + // rollback to the 
selected rollback point - err = sh.Rollback(rollbackPoint) + err = Rollback(indexPath, rollbackPoint) + if err != nil { + t.Fatal(err) + } + + err = idx.Open() if err != nil { t.Fatal(err) } @@ -217,4 +240,9 @@ func TestIndexRollback(t *testing.T) { if err != nil { t.Fatal(err) } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } } From a9895fdf9c72cfaa202128a963697d9a98765369 Mon Sep 17 00:00:00 2001 From: Mohsen Samiei <34813843+mohsensamiei@users.noreply.github.com> Date: Wed, 25 Mar 2020 06:31:57 +0430 Subject: [PATCH 650/728] chore: fixes bbolt url (#1357) --- index/scorch/persister.go | 2 +- index/scorch/scorch.go | 2 +- index/scorch/snapshot_rollback.go | 2 +- index/store/boltdb/iterator.go | 2 +- index/store/boltdb/reader.go | 2 +- index/store/boltdb/store.go | 2 +- index/store/boltdb/store_test.go | 2 +- vendor/manifest | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index b141cfda6..d7c335047 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -33,7 +33,7 @@ import ( "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" "github.com/blevesearch/bleve/index/scorch/segment/zap" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) var DefaultChunkFactor uint32 = 1024 diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 8db3fa966..67154544b 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -31,7 +31,7 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) const Name = "scorch" diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index 470868d0e..4e13aea9a 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -19,7 +19,7 @@ import ( "log" 
"github.com/blevesearch/bleve/index/scorch/segment" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) type RollbackPoint struct { diff --git a/index/store/boltdb/iterator.go b/index/store/boltdb/iterator.go index 4b5019f1f..cf4da87c3 100644 --- a/index/store/boltdb/iterator.go +++ b/index/store/boltdb/iterator.go @@ -17,7 +17,7 @@ package boltdb import ( "bytes" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) type Iterator struct { diff --git a/index/store/boltdb/reader.go b/index/store/boltdb/reader.go index 4cd94183c..7977ebbe5 100644 --- a/index/store/boltdb/reader.go +++ b/index/store/boltdb/reader.go @@ -16,7 +16,7 @@ package boltdb import ( "github.com/blevesearch/bleve/index/store" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) type Reader struct { diff --git a/index/store/boltdb/store.go b/index/store/boltdb/store.go index 56613d531..3c749693c 100644 --- a/index/store/boltdb/store.go +++ b/index/store/boltdb/store.go @@ -30,7 +30,7 @@ import ( "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) const ( diff --git a/index/store/boltdb/store_test.go b/index/store/boltdb/store_test.go index 6411c239c..ae4cca3e0 100644 --- a/index/store/boltdb/store_test.go +++ b/index/store/boltdb/store_test.go @@ -20,7 +20,7 @@ import ( "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/index/store/test" - bolt "github.com/etcd-io/bbolt" + bolt "go.etcd.io/bbolt" ) func open(t *testing.T, mo store.MergeOperator) store.KVStore { diff --git a/vendor/manifest b/vendor/manifest index 3d906dd66..431363eef 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -26,7 +26,7 @@ "notests": true }, { - "importpath": "github.com/etcd-io/bbolt", + "importpath": "go.etcd.io/bbolt", "repository": "https://github.com/etcd-io/bbolt", "vcs": "", "revision": "7ee3ded59d4835e10f3e7d0f7603c42aa5e83820", From 
7da5cb5eb65ac807e3e87b86251622362baa6686 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 23 Mar 2020 14:23:08 +0530 Subject: [PATCH 651/728] Improving ownership handling during merge introductions As a part of notifying the merger during the merge introductions, the introducer bumps the newer snapshot's reference count. But in situations where the index is getting closed, there is a chance that the merge requested merger(file/in memory) routines would have already exited. This would cause the newly introduced, ref count bumped snapshot to leak as there is none to decrement the bumped up reference count. The fix is about tighening the merge introduction notifications handling at the merger side. As the introducer non-preemptively handles the intro notify channel, its safe for the merger to blockingly awaits for each of those merge introduction notifications and decrements the reference counts where applicable. Unlike earlier, merger is processing each of the merge tasks and awaits its introductions to complete before proceeding to the next merge task. We don't expect any serious performance wrinkles here as the introducer ought to be faster with the merge introductions compared to the merge tasks. Added a few introduction related stats to track any performance considerations raising out of this. --- index/scorch/introducer.go | 2 ++ index/scorch/merge.go | 59 +++++++++++++------------------------- index/scorch/stats.go | 10 ++++--- 3 files changed, 28 insertions(+), 43 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ac627796f..d3f2ce086 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -312,6 +312,8 @@ func (s *Scorch) introducePersist(persist *persistIntroduction) { close(persist.applied) } +// The introducer should definitely handle the segmentMerge.notify +// channel before exiting the introduceMerge. 
func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { atomic.AddUint64(&s.stats.TotIntroduceMergeBeg, 1) defer atomic.AddUint64(&s.stats.TotIntroduceMergeEnd, 1) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index c39e3117d..6faad6809 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -157,22 +157,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergePlanTasks, uint64(len(resultMergePlan.Tasks))) // process tasks in serial for now - var notifications []chan *IndexSnapshot var filenames []string - // clean up any pending notifications from introducer on exit - // from an index closure. - defer func() { - for _, notification := range notifications { - select { - case newSnapshot := <-notification: - atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) - if newSnapshot != nil { - _ = newSnapshot.DecRef() - } - default: - } - } - }() for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { @@ -232,7 +217,6 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) if err == segment.ErrClosed { - // handle any pending index snapshot introduction notifications on exit return err } return fmt.Errorf("merging failed: %v", err) @@ -262,35 +246,33 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, old: oldMap, oldNewDocNums: oldNewDocNums, new: seg, - notify: make(chan *IndexSnapshot, 1), + notify: make(chan *IndexSnapshot), } - notifications = append(notifications, sm.notify) // give it to the introducer select { case <-s.closeCh: - // handle any pending index snapshot introduction notifications on exit _ = seg.Close() return segment.ErrClosed case s.merges <- sm: atomic.AddUint64(&s.stats.TotFileMergeIntroductions, 1) } - atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) - } - - for _, notification := range notifications { - select { - case 
<-s.closeCh: - // handle any pending index snapshot introduction notifications on exit - atomic.AddUint64(&s.stats.TotFileMergeIntroductionsSkipped, 1) - return segment.ErrClosed - case newSnapshot := <-notification: - atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) - if newSnapshot != nil { - _ = newSnapshot.DecRef() - } + introStartTime := time.Now() + // it is safe to blockingly wait for the merge introduction + // here as the introducer is bound to handle the notify channel. + newSnapshot := <-sm.notify + introTime := uint64(time.Since(introStartTime)) + atomic.AddUint64(&s.stats.TotFileMergeZapIntroductionTime, introTime) + if atomic.LoadUint64(&s.stats.MaxFileMergeZapIntroductionTime) < introTime { + atomic.StoreUint64(&s.stats.MaxFileMergeZapIntroductionTime, introTime) + } + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) + if newSnapshot != nil { + _ = newSnapshot.DecRef() } + + atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) } // once all the newly merged segment introductions are done, @@ -362,7 +344,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), new: seg, - notify: make(chan *IndexSnapshot, 1), + notify: make(chan *IndexSnapshot), } for i, idx := range sbsIndexes { @@ -378,14 +360,13 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, case s.merges <- sm: } - select { // wait for introduction to complete - case <-s.closeCh: - return nil, 0, segment.ErrClosed - case newSnapshot := <-sm.notify: + // blockingly wait for the introduction to complete + newSnapshot := <-sm.notify + if newSnapshot != nil { atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) - return newSnapshot, newSegmentID, nil } + return newSnapshot, newSegmentID, nil } func (s *Scorch) ReportBytesWritten(bytesWritten uint64) { diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 
6549fddf5..e638362a7 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -98,10 +98,12 @@ type Stats struct { TotFileSegmentsAtRoot uint64 TotFileMergeWrittenBytes uint64 - TotFileMergeZapBeg uint64 - TotFileMergeZapEnd uint64 - TotFileMergeZapTime uint64 - MaxFileMergeZapTime uint64 + TotFileMergeZapBeg uint64 + TotFileMergeZapEnd uint64 + TotFileMergeZapTime uint64 + MaxFileMergeZapTime uint64 + TotFileMergeZapIntroductionTime uint64 + MaxFileMergeZapIntroductionTime uint64 TotFileMergeIntroductions uint64 TotFileMergeIntroductionsDone uint64 From 9c779503146ab4f0080943e4f14c64bbb4aa397b Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 25 Mar 2020 15:23:16 +0530 Subject: [PATCH 652/728] cleaning up the online rollback logic --- index/scorch/introducer.go | 80 -------------------------------------- index/scorch/scorch.go | 2 - 2 files changed, 82 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index ac627796f..9a27de94e 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -77,11 +77,6 @@ OUTER: case persist := <-s.persists: s.introducePersist(persist) - case revertTo := <-s.revertToSnapshots: - err := s.revertToSnapshot(revertTo) - if err != nil { - continue OUTER - } } var epochCurr uint64 @@ -443,81 +438,6 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { close(nextMerge.notify) } -func (s *Scorch) revertToSnapshot(revertTo *snapshotReversion) error { - atomic.AddUint64(&s.stats.TotIntroduceRevertBeg, 1) - defer atomic.AddUint64(&s.stats.TotIntroduceRevertEnd, 1) - - if revertTo.snapshot == nil { - err := fmt.Errorf("Cannot revert to a nil snapshot") - revertTo.applied <- err - return err - } - - // acquire lock - s.rootLock.Lock() - - // prepare a new index snapshot, based on next snapshot - newSnapshot := &IndexSnapshot{ - parent: s, - segment: make([]*SegmentSnapshot, len(revertTo.snapshot.segment)), - offsets: revertTo.snapshot.offsets, - internal: 
revertTo.snapshot.internal, - epoch: s.nextSnapshotEpoch, - refs: 1, - creator: "revertToSnapshot", - } - s.nextSnapshotEpoch++ - - var docsToPersistCount, memSegments, fileSegments uint64 - // iterate through segments - for i, segmentSnapshot := range revertTo.snapshot.segment { - newSnapshot.segment[i] = &SegmentSnapshot{ - id: segmentSnapshot.id, - segment: segmentSnapshot.segment, - deleted: segmentSnapshot.deleted, - cachedDocs: segmentSnapshot.cachedDocs, - creator: segmentSnapshot.creator, - } - newSnapshot.segment[i].segment.AddRef() - - // remove segment from ineligibleForRemoval map - filename := zapFileName(segmentSnapshot.id) - delete(s.ineligibleForRemoval, filename) - - if isMemorySegment(segmentSnapshot) { - docsToPersistCount += segmentSnapshot.Count() - memSegments++ - } else { - fileSegments++ - } - } - - atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) - atomic.StoreUint64(&s.stats.TotMemorySegmentsAtRoot, memSegments) - atomic.StoreUint64(&s.stats.TotFileSegmentsAtRoot, fileSegments) - - if revertTo.persisted != nil { - s.rootPersisted = append(s.rootPersisted, revertTo.persisted) - } - - newSnapshot.updateSize() - // swap in new snapshot - rootPrev := s.root - s.root = newSnapshot - - atomic.StoreUint64(&s.stats.CurRootEpoch, s.root.epoch) - // release lock - s.rootLock.Unlock() - - if rootPrev != nil { - _ = rootPrev.DecRef() - } - - close(revertTo.applied) - - return nil -} - func isMemorySegment(s *SegmentSnapshot) bool { switch s.segment.(type) { case *zap.SegmentBase: diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 7a1046fc5..884c85845 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -67,7 +67,6 @@ type Scorch struct { persists chan *persistIntroduction merges chan *segmentMerge introducerNotifier chan *epochWatcher - revertToSnapshots chan *snapshotReversion persisterNotifier chan *epochWatcher rootBolt *bolt.DB asyncTasks sync.WaitGroup @@ -221,7 +220,6 @@ func (s *Scorch) 
openBolt() error { s.persists = make(chan *persistIntroduction) s.merges = make(chan *segmentMerge) s.introducerNotifier = make(chan *epochWatcher, 1) - s.revertToSnapshots = make(chan *snapshotReversion) s.persisterNotifier = make(chan *epochWatcher, 1) if !s.readOnly && s.path != "" { From d9d505af5115efbe19e4e17f1c89bdfdf655c2d3 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 26 Mar 2020 10:11:56 +0530 Subject: [PATCH 653/728] -pick only younger epochs including the target one while iterating over the persisted epochs. --- index/scorch/snapshot_rollback.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/snapshot_rollback.go index d9f89f46f..c22b112f7 100644 --- a/index/scorch/snapshot_rollback.go +++ b/index/scorch/snapshot_rollback.go @@ -143,16 +143,17 @@ func Rollback(path string, to *RollbackPoint) error { } }() - // pick all the persisted epochs in bolt store + // pick all the younger persisted epochs in bolt store + // including the target one. 
var found bool - var persistedEpochs []uint64 + var eligibleEpochs []uint64 err = rootBolt.View(func(tx *bolt.Tx) error { snapshots := tx.Bucket(boltSnapshotsBucket) if snapshots == nil { return nil } sc := snapshots.Cursor() - for sk, _ := sc.Last(); sk != nil; sk, _ = sc.Prev() { + for sk, _ := sc.Last(); sk != nil && !found; sk, _ = sc.Prev() { _, snapshotEpoch, err := segment.DecodeUvarintAscending(sk) if err != nil { continue @@ -160,12 +161,12 @@ func Rollback(path string, to *RollbackPoint) error { if snapshotEpoch == to.epoch { found = true } - persistedEpochs = append(persistedEpochs, snapshotEpoch) + eligibleEpochs = append(eligibleEpochs, snapshotEpoch) } return nil }) - if len(persistedEpochs) == 0 { + if len(eligibleEpochs) == 0 { return fmt.Errorf("Rollback: no persisted epochs found in bolt") } if !found { @@ -193,7 +194,7 @@ func Rollback(path string, to *RollbackPoint) error { if snapshots == nil { return nil } - for _, epoch := range persistedEpochs { + for _, epoch := range eligibleEpochs { k := segment.EncodeUvarintAscending(nil, epoch) if err != nil { continue From 314af131849d2adcfab4620b73c3c194c486e5e8 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 1 Apr 2020 17:43:29 -0700 Subject: [PATCH 654/728] Update .travis.yml to include testing on go version 1.14.x + Also, go fmt ./.. 
--- .travis.yml | 1 + geo/geo.go | 2 +- geo/geo_dist_test.go | 2 +- geo/geo_test.go | 4 ++-- geo/sloppy_test.go | 4 ++-- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.travis.yml b/.travis.yml index dffe4b8fd..7b7297afe 100644 --- a/.travis.yml +++ b/.travis.yml @@ -5,6 +5,7 @@ language: go go: - "1.12.x" - "1.13.x" + - "1.14.x" script: - go get golang.org/x/tools/cmd/cover diff --git a/geo/geo.go b/geo/geo.go index 077e73bf0..b18ace433 100644 --- a/geo/geo.go +++ b/geo/geo.go @@ -33,7 +33,7 @@ var minLonRad = minLon * degreesToRadian var minLatRad = minLat * degreesToRadian var maxLonRad = maxLon * degreesToRadian var maxLatRad = maxLat * degreesToRadian -var geoTolerance = 1E-6 +var geoTolerance = 1e-6 var lonScale = float64((uint64(0x1)< 1E-2 { + if !math.IsNaN(test.want) && math.Abs(got-test.want) > 1e-2 { t.Errorf("expected %f got %f", test.want, got) } } diff --git a/geo/geo_test.go b/geo/geo_test.go index 3b707d75b..52a38e273 100644 --- a/geo/geo_test.go +++ b/geo/geo_test.go @@ -89,10 +89,10 @@ func TestRectFromPointDistance(t *testing.T) { if err != nil { t.Fatal(err) } - if math.Abs(upperLeftLat-1) > 1E-2 { + if math.Abs(upperLeftLat-1) > 1e-2 { t.Errorf("expected bounding box upper left lat to be almost 1, got %f", upperLeftLat) } - if math.Abs(lowerRightLat+1) > 1E-2 { + if math.Abs(lowerRightLat+1) > 1e-2 { t.Errorf("expected bounding box lower right lat to be almost -1, got %f", lowerRightLat) } } diff --git a/geo/sloppy_test.go b/geo/sloppy_test.go index 4fefdc5b9..0f74e375b 100644 --- a/geo/sloppy_test.go +++ b/geo/sloppy_test.go @@ -21,7 +21,7 @@ import ( func TestCos(t *testing.T) { - cosDelta := 1E-15 + cosDelta := 1e-15 tests := []struct { in float64 @@ -55,7 +55,7 @@ func TestCos(t *testing.T) { func TestAsin(t *testing.T) { - asinDelta := 1E-7 + asinDelta := 1e-7 tests := []struct { in float64 From ca3bfc95ff942e5846c005e05c6d0b5236e30de0 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Fri, 3 Apr 2020 11:58:09 -0700 
Subject: [PATCH 655/728] Upgrade etcd-io/bbolt SHA to v1.3.4 --- vendor/manifest | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/vendor/manifest b/vendor/manifest index 431363eef..1c0b482ef 100644 --- a/vendor/manifest +++ b/vendor/manifest @@ -29,7 +29,7 @@ "importpath": "go.etcd.io/bbolt", "repository": "https://github.com/etcd-io/bbolt", "vcs": "", - "revision": "7ee3ded59d4835e10f3e7d0f7603c42aa5e83820", + "revision": "68cc10a767ea1c6b9e8dcb9847317ff192d6d974", "branch": "master", "notests": true }, From ff2708cf7f5025afa81c670299339b527b48131a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:03:56 -0400 Subject: [PATCH 656/728] adopt Go modules As of this commit, the Bleve project has adopted Go modules. Further, the zap file format used by scorch has been broken out into it's own module: github.com/blevesearch/zap This allows us to develop breaking changes to the file format as new major versions of the zap module. Applications can now also build Bleve with support for multiple versions of zap. This allows compatibility with existing indexes, while still allowing for new indexes to take advantage of newer file formats. Applications that want to directly control which versions are supported can use two new functions inside scorch: func RegisterPlugin(plugin segment.Plugin, makeDefault bool) func ResetPlugins() The bleve command-line tool no longer includes the zap sub-command. This functionality is now available as it's own command-line tool in the zap repository. Finally, this commit also ends support for traditional GOPATH builds using the standard 'go get' tool. 
Closes #1350 --- cmd/bleve/cmd/scorch/snapshot.go | 4 +- cmd/bleve/cmd/zap.go | 25 - cmd/bleve/cmd/zap/dict.go | 116 --- cmd/bleve/cmd/zap/docvalue.go | 280 ------ cmd/bleve/cmd/zap/explore.go | 142 --- cmd/bleve/cmd/zap/fields.go | 66 -- cmd/bleve/cmd/zap/footer.go | 44 - cmd/bleve/cmd/zap/root.go | 58 -- cmd/bleve/cmd/zap/stored.go | 78 -- go.mod | 26 + index/scorch/introducer.go | 13 +- index/scorch/merge.go | 42 +- index/scorch/optimize.go | 74 +- index/scorch/persister.go | 61 +- index/scorch/scorch.go | 26 +- index/scorch/scorch_test.go | 29 + index/scorch/segment/empty.go | 8 +- index/scorch/segment/plugin.go | 58 ++ index/scorch/segment/segment.go | 16 + index/scorch/segment/unadorned.go | 148 +++ index/scorch/segment/zap/README.md | 158 --- index/scorch/segment/zap/build.go | 151 --- index/scorch/segment/zap/build_test.go | 556 ----------- index/scorch/segment/zap/contentcoder.go | 230 ----- index/scorch/segment/zap/contentcoder_test.go | 125 --- index/scorch/segment/zap/count.go | 61 -- index/scorch/segment/zap/dict.go | 263 ----- index/scorch/segment/zap/dict_test.go | 337 ------- index/scorch/segment/zap/docvalues.go | 311 ------ index/scorch/segment/zap/enumerator.go | 126 --- index/scorch/segment/zap/enumerator_test.go | 237 ----- index/scorch/segment/zap/intcoder.go | 172 ---- index/scorch/segment/zap/intcoder_test.go | 269 ------ index/scorch/segment/zap/merge.go | 862 ----------------- index/scorch/segment/zap/merge_test.go | 870 ----------------- index/scorch/segment/zap/new.go | 839 ---------------- index/scorch/segment/zap/posting.go | 897 ------------------ index/scorch/segment/zap/read.go | 43 - index/scorch/segment/zap/segment.go | 572 ----------- index/scorch/segment/zap/segment_test.go | 737 -------------- index/scorch/segment/zap/write.go | 145 --- index/scorch/segment/zap/write_test.go | 86 -- index/scorch/segment/zap/zap.md | 177 ---- index/scorch/segment_plugin.go | 77 ++ vendor/manifest | 145 --- 45 files changed, 468 insertions(+), 
9292 deletions(-) delete mode 100644 cmd/bleve/cmd/zap.go delete mode 100644 cmd/bleve/cmd/zap/dict.go delete mode 100644 cmd/bleve/cmd/zap/docvalue.go delete mode 100644 cmd/bleve/cmd/zap/explore.go delete mode 100644 cmd/bleve/cmd/zap/fields.go delete mode 100644 cmd/bleve/cmd/zap/footer.go delete mode 100644 cmd/bleve/cmd/zap/root.go delete mode 100644 cmd/bleve/cmd/zap/stored.go create mode 100644 go.mod create mode 100644 index/scorch/segment/plugin.go create mode 100644 index/scorch/segment/unadorned.go delete mode 100644 index/scorch/segment/zap/README.md delete mode 100644 index/scorch/segment/zap/build.go delete mode 100644 index/scorch/segment/zap/build_test.go delete mode 100644 index/scorch/segment/zap/contentcoder.go delete mode 100644 index/scorch/segment/zap/contentcoder_test.go delete mode 100644 index/scorch/segment/zap/count.go delete mode 100644 index/scorch/segment/zap/dict.go delete mode 100644 index/scorch/segment/zap/dict_test.go delete mode 100644 index/scorch/segment/zap/docvalues.go delete mode 100644 index/scorch/segment/zap/enumerator.go delete mode 100644 index/scorch/segment/zap/enumerator_test.go delete mode 100644 index/scorch/segment/zap/intcoder.go delete mode 100644 index/scorch/segment/zap/intcoder_test.go delete mode 100644 index/scorch/segment/zap/merge.go delete mode 100644 index/scorch/segment/zap/merge_test.go delete mode 100644 index/scorch/segment/zap/new.go delete mode 100644 index/scorch/segment/zap/posting.go delete mode 100644 index/scorch/segment/zap/read.go delete mode 100644 index/scorch/segment/zap/segment.go delete mode 100644 index/scorch/segment/zap/segment_test.go delete mode 100644 index/scorch/segment/zap/write.go delete mode 100644 index/scorch/segment/zap/write_test.go delete mode 100644 index/scorch/segment/zap/zap.md create mode 100644 index/scorch/segment_plugin.go delete mode 100644 vendor/manifest diff --git a/cmd/bleve/cmd/scorch/snapshot.go b/cmd/bleve/cmd/scorch/snapshot.go index 
df13d4901..3dee55895 100644 --- a/cmd/bleve/cmd/scorch/snapshot.go +++ b/cmd/bleve/cmd/scorch/snapshot.go @@ -18,7 +18,7 @@ import ( "fmt" "strconv" - "github.com/blevesearch/bleve/index/scorch/segment/zap" + seg "github.com/blevesearch/bleve/index/scorch/segment" "github.com/spf13/cobra" ) @@ -49,7 +49,7 @@ var snapshotCmd = &cobra.Command{ segments := snapshot.Segments() for i, segmentSnap := range segments { segment := segmentSnap.Segment() - if segment, ok := segment.(*zap.Segment); ok { + if segment, ok := segment.(seg.PersistedSegment); ok { fmt.Printf("%d %s\n", i, segment.Path()) } } diff --git a/cmd/bleve/cmd/zap.go b/cmd/bleve/cmd/zap.go deleted file mode 100644 index b84d5f2b3..000000000 --- a/cmd/bleve/cmd/zap.go +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package cmd - -import ( - "github.com/blevesearch/bleve/cmd/bleve/cmd/zap" -) - -// make zap command-line tool a bleve sub-command - -func init() { - RootCmd.AddCommand(zap.RootCmd) -} diff --git a/cmd/bleve/cmd/zap/dict.go b/cmd/bleve/cmd/zap/dict.go deleted file mode 100644 index 3cd256fa5..000000000 --- a/cmd/bleve/cmd/zap/dict.go +++ /dev/null @@ -1,116 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "encoding/binary" - "fmt" - "math" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/couchbase/vellum" - "github.com/spf13/cobra" -) - -// dictCmd represents the dict command -var dictCmd = &cobra.Command{ - Use: "dict [path] [field]", - Short: "dict prints the term dictionary for the specified field", - Long: `The dict command lets you print the term dictionary for the specified field.`, - RunE: func(cmd *cobra.Command, args []string) error { - if len(args) < 2 { - return fmt.Errorf("must specify field") - } - - data := segment.Data() - - addr, err := segment.DictAddr(args[1]) - if err != nil { - return fmt.Errorf("error determining address: %v", err) - } - fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) - - vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) - fmt.Printf("vellum length: %d\n", vellumLen) - fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] - fmt.Printf("raw vellum data:\n % x\n", fstBytes) - fmt.Printf("dictionary:\n") - var termsCount, hit1Count int64 - if fstBytes != nil { - fst, err := vellum.Load(fstBytes) - if err != nil { - return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err) - } - - itr, err := fst.Iterator(nil, nil) - for err == nil { - currTerm, currVal := itr.Current() - extra := "" - if currVal&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit { - docNum, normBits := zap.FSTValDecode1Hit(currVal) - norm := math.Float32frombits(uint32(normBits)) - extra = 
fmt.Sprintf("-- docNum: %d, norm: %f", docNum, norm) - fmt.Printf(" %s - %d (%x) %s\n", currTerm, currVal, currVal, extra) - hit1Count++ - } else { - // fetch the postings size, cardinality in case of non 1 hits - l, c := readPostingCardinality(currVal, data) - fmt.Printf(" %s - %d (%x) posting byteSize: %d cardinality: %d\n", - currTerm, currVal, currVal, l, c) - } - termsCount++ - err = itr.Next() - } - if err != nil && err != vellum.ErrIteratorDone { - return fmt.Errorf("error iterating dictionary: %v", err) - } - fmt.Printf("Total terms in dictionary : %d 1hit count: %d\n", termsCount, hit1Count) - } - - return nil - }, -} - -func init() { - RootCmd.AddCommand(dictCmd) -} - -func readPostingCardinality(postingsOffset uint64, data []byte) (int, uint64) { - // read the location of the freq/norm details - var n uint64 - var read int - - _, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) - n += uint64(read) - - _, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var postingsLen uint64 - postingsLen, read = binary.Uvarint(data[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - roaringBytes := data[postingsOffset+n : postingsOffset+n+postingsLen] - - r := roaring.NewBitmap() - - _, err := r.FromBuffer(roaringBytes) - if err != nil { - fmt.Printf("error loading roaring bitmap: %v", err) - } - - return len(roaringBytes), r.GetCardinality() -} diff --git a/cmd/bleve/cmd/zap/docvalue.go b/cmd/bleve/cmd/zap/docvalue.go deleted file mode 100644 index b8563be93..000000000 --- a/cmd/bleve/cmd/zap/docvalue.go +++ /dev/null @@ -1,280 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "encoding/binary" - "fmt" - "log" - "math" - "sort" - "strconv" - - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/golang/snappy" - "github.com/spf13/cobra" -) - -// docvalueCmd represents the docvalue command -var docvalueCmd = &cobra.Command{ - Use: "docvalue [path] optional optional", - Short: "docvalue prints the docvalue details by field, and docNum", - Long: `The docvalue command lets you explore the docValues in order of field and by doc number.`, - RunE: func(cmd *cobra.Command, args []string) error { - if len(args) < 1 { - return fmt.Errorf("must specify index file path") - } - - data := segment.Data() - crcOffset := len(data) - 4 - verOffset := crcOffset - 4 - chunkOffset := verOffset - 4 - fieldsOffset := chunkOffset - 16 - fieldsIndexOffset := binary.BigEndian.Uint64(data[fieldsOffset : fieldsOffset+8]) - fieldsIndexEnd := uint64(len(data) - zap.FooterSize) - - // iterate through fields index - var fieldInv []string - var id uint64 - for fieldsIndexOffset+(8*id) < fieldsIndexEnd { - addr := binary.BigEndian.Uint64( - data[fieldsIndexOffset+(8*id) : fieldsIndexOffset+(8*id)+8]) - var n uint64 - _, read := binary.Uvarint(data[addr+n : fieldsIndexEnd]) - n += uint64(read) - - var nameLen uint64 - nameLen, read = binary.Uvarint(data[addr+n : fieldsIndexEnd]) - n += uint64(read) - - name := string(data[addr+n : addr+n+nameLen]) - - id++ - fieldInv = append(fieldInv, name) - } - - dvLoc := segment.DocValueOffset() - var n int - var fieldName string - var fieldID uint16 - var 
fieldDvSize float64 - var read, fieldStartLoc, fieldEndLoc uint64 - var fieldChunkCount, fieldDvStart, fieldDvEnd, totalDvSize uint64 - var fieldChunkLens []uint64 - - // if no fields are specified then print the docValue offsets for all fields set - for id, field := range fieldInv { - fieldStartLoc, n = binary.Uvarint( - data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) - if n <= 0 { - return fmt.Errorf("loadDvIterators: failed to read the "+ - " docvalue offsets for field %d", fieldID) - } - - read += uint64(n) - fieldEndLoc, n = binary.Uvarint( - data[dvLoc+read : dvLoc+read+binary.MaxVarintLen64]) - if n <= 0 { - return fmt.Errorf("Failed to read the docvalue offset "+ - "end for field %d", fieldID) - } - - read += uint64(n) - if fieldStartLoc == math.MaxUint64 && len(args) == 1 { - fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) not "+ - " persisted \n", id, field, fieldStartLoc, fieldStartLoc) - continue - } - - var chunkOffsetsPosition, offset, numChunks uint64 - if fieldEndLoc-fieldStartLoc > 16 { - numChunks = binary.BigEndian.Uint64(data[fieldEndLoc-8 : fieldEndLoc]) - // read the length of chunk offsets - chunkOffsetsLen := binary.BigEndian.Uint64(data[fieldEndLoc-16 : fieldEndLoc-8]) - // acquire position of chunk offsets - chunkOffsetsPosition = (fieldEndLoc - 16) - chunkOffsetsLen - } - - // read the chunk offsets - chunkLens := make([]uint64, numChunks) - dvSize := uint64(0) - for i := 0; i < int(numChunks); i++ { - length, read := binary.Uvarint( - data[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+ - binary.MaxVarintLen64]) - if read <= 0 { - return fmt.Errorf("Corrupted chunk offset during segment load") - } - - offset += uint64(read) - chunkLens[i] = length - dvSize += length - } - - totalDvSize += dvSize - // if no field args are given, then print out the dv locations for all fields - if len(args) == 1 { - mbsize := float64(dvSize) / (1024 * 1024) - fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) numChunks "+ - "%d diskSize %.6f 
MB\n", id, field, fieldStartLoc, - fieldStartLoc, numChunks, mbsize) - continue - } - - // if the field is the requested one for more details, - // then remember the details - if field == args[1] { - fieldDvStart = fieldStartLoc - fieldDvEnd = fieldEndLoc - fieldName = field - fieldID = uint16(id) - fieldDvSize = float64(dvSize) / (1024 * 1024) - fieldChunkLens = append(fieldChunkLens, chunkLens...) - fieldChunkCount = numChunks - } - } - - mbsize := float64(totalDvSize) / (1024 * 1024) - fmt.Printf("Total Doc Values Size on Disk: %.6f MB\n", mbsize) - - // done with the fields dv locs printing for the given zap file - if len(args) == 1 { - return nil - } - - if fieldName == "" || fieldDvEnd == 0 { - return fmt.Errorf("No docvalue persisted for given field arg: %s", - args[1]) - } - - if len(args) == 2 { - fmt.Printf("FieldID: %d '%s' docvalue at %d (%x) numChunks "+ - "%d diskSize %.6f MB\n", fieldID, fieldName, fieldDvStart, - fieldDvStart, fieldChunkCount, fieldDvSize) - fmt.Printf("Number of docvalue chunks: %d\n", fieldChunkCount) - return nil - } - - localDocNum, err := strconv.Atoi(args[2]) - if err != nil { - return fmt.Errorf("Unable to parse doc number: %v", err) - } - - if localDocNum >= int(segment.NumDocs()) { - return fmt.Errorf("Invalid doc number %d (valid 0 - %d)", - localDocNum, segment.NumDocs()-1) - } - - // find the chunkNumber where the docValues are stored - docInChunk := uint64(localDocNum) / uint64(segment.ChunkFactor()) - - if fieldChunkCount < docInChunk { - return fmt.Errorf("No chunk exists for chunk number: %d for "+ - "localDocNum: %d", docInChunk, localDocNum) - } - - start, end := readChunkBoundary(int(docInChunk), fieldChunkLens) - destChunkDataLoc := fieldDvStart + start - curChunkEnd := fieldDvStart + end - - // read the number of docs reside in the chunk - var numDocs uint64 - var nr int - numDocs, nr = binary.Uvarint( - data[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) - if nr <= 0 { - return fmt.Errorf("Failed 
to read the chunk") - } - - chunkMetaLoc := destChunkDataLoc + uint64(nr) - curChunkHeader := make([]zap.MetaData, int(numDocs)) - offset := uint64(0) - for i := 0; i < int(numDocs); i++ { - curChunkHeader[i].DocNum, nr = binary.Uvarint( - data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(nr) - curChunkHeader[i].DocDvOffset, nr = binary.Uvarint( - data[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(nr) - } - - compressedDataLoc := chunkMetaLoc + offset - dataLength := curChunkEnd - compressedDataLoc - curChunkData := data[compressedDataLoc : compressedDataLoc+dataLength] - - start, end = getDocValueLocs(uint64(localDocNum), curChunkHeader) - if start == math.MaxUint64 || end == math.MaxUint64 { - fmt.Printf("No field values found for localDocNum: %d\n", - localDocNum) - fmt.Printf("Try docNums present in chunk: %s\n", - metaDataDocNums(curChunkHeader)) - return nil - } - // uncompress the already loaded data - uncompressed, err := snappy.Decode(nil, curChunkData) - if err != nil { - log.Printf("snappy err %+v ", err) - return err - } - - var termSeparator byte = 0xff - var termSeparatorSplitSlice = []byte{termSeparator} - - // pick the terms for the given docNum - uncompressed = uncompressed[start:end] - for { - i := bytes.Index(uncompressed, termSeparatorSplitSlice) - if i < 0 { - break - } - - fmt.Printf(" %s ", uncompressed[0:i]) - uncompressed = uncompressed[i+1:] - } - fmt.Printf(" \n ") - return nil - }, -} - -func getDocValueLocs(docNum uint64, metaHeader []zap.MetaData) (uint64, uint64) { - i := sort.Search(len(metaHeader), func(i int) bool { - return metaHeader[i].DocNum >= docNum - }) - if i < len(metaHeader) && metaHeader[i].DocNum == docNum { - return zap.ReadDocValueBoundary(i, metaHeader) - } - return math.MaxUint64, math.MaxUint64 -} - -func metaDataDocNums(metaHeader []zap.MetaData) string { - docNums := "" - for _, meta := range metaHeader { - docNums += fmt.Sprintf("%d", 
meta.DocNum) + ", " - } - return docNums -} - -func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { - var start uint64 - if chunk > 0 { - start = offsets[chunk-1] - } - return start, offsets[chunk] -} - -func init() { - RootCmd.AddCommand(docvalueCmd) -} diff --git a/cmd/bleve/cmd/zap/explore.go b/cmd/bleve/cmd/zap/explore.go deleted file mode 100644 index deac086cb..000000000 --- a/cmd/bleve/cmd/zap/explore.go +++ /dev/null @@ -1,142 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "encoding/binary" - "fmt" - "math" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/couchbase/vellum" - "github.com/spf13/cobra" -) - -// exploreCmd represents the explore command -var exploreCmd = &cobra.Command{ - Use: "explore [path] [field] ", - Short: "explores the index by field, then term (optional), and then docNum (optional)", - Long: `The explore command lets you explore the index in order of field, then optionally by term, then optionally again by doc number.`, - RunE: func(cmd *cobra.Command, args []string) error { - if len(args) < 2 { - return fmt.Errorf("must specify field") - } - - data := segment.Data() - - addr, err := segment.DictAddr(args[1]) - if err != nil { - return fmt.Errorf("error determining address: %v", err) - } - fmt.Printf("dictionary for field starts at %d (%x)\n", addr, addr) - - vellumLen, read := binary.Uvarint(data[addr : addr+binary.MaxVarintLen64]) - fmt.Printf("vellum length: %d\n", vellumLen) - fstBytes := data[addr+uint64(read) : addr+uint64(read)+vellumLen] - fmt.Printf("raw vellum data:\n % x\n", fstBytes) - - if len(args) >= 3 { - if fstBytes != nil { - fst, err := vellum.Load(fstBytes) - if err != nil { - return fmt.Errorf("dictionary field %s vellum err: %v", args[1], err) - } - postingsAddr, exists, err := fst.Get([]byte(args[2])) - if err != nil { - return fmt.Errorf("error looking for term : %v", err) - } - if exists { - fmt.Printf("FST val is %d (%x)\n", postingsAddr, postingsAddr) - - if postingsAddr&zap.FSTValEncodingMask == zap.FSTValEncoding1Hit { - docNum, normBits := zap.FSTValDecode1Hit(postingsAddr) - norm := math.Float32frombits(uint32(normBits)) - fmt.Printf("Posting List is 1-hit encoded, docNum: %d, norm: %f\n", - docNum, norm) - return nil - } - - if postingsAddr&zap.FSTValEncodingMask != zap.FSTValEncodingGeneral { - return fmt.Errorf("unknown fst val encoding") - } - - var n uint64 - freqAddr, read := 
binary.Uvarint(data[postingsAddr : postingsAddr+binary.MaxVarintLen64]) - n += uint64(read) - - var locAddr uint64 - locAddr, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - - var postingListLen uint64 - postingListLen, read = binary.Uvarint(data[postingsAddr+n : postingsAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - - fmt.Printf("Posting List Length: %d\n", postingListLen) - bitmap := roaring.New() - _, err = bitmap.FromBuffer(data[postingsAddr+n : postingsAddr+n+postingListLen]) - if err != nil { - return err - } - fmt.Printf("Posting List: %v\n", bitmap) - - fmt.Printf("Freq details at: %d (%x)\n", freqAddr, freqAddr) - numChunks, r2 := binary.Uvarint(data[freqAddr : freqAddr+binary.MaxVarintLen64]) - n = uint64(r2) - - var freqOffsets []uint64 - for j := uint64(0); j < numChunks; j++ { - chunkLen, r3 := binary.Uvarint(data[freqAddr+n : freqAddr+n+binary.MaxVarintLen64]) - n += uint64(r3) - freqOffsets = append(freqOffsets, chunkLen) - } - running := freqAddr + n - for k, offset := range freqOffsets { - fmt.Printf("freq chunk: %d, len %d, start at %d (%x) end %d (%x)\n", k, offset, running, running, running+offset, running+offset) - running += offset - } - - fmt.Printf("Loc details at: %d (%x)\n", locAddr, locAddr) - numLChunks, r4 := binary.Uvarint(data[locAddr : locAddr+binary.MaxVarintLen64]) - n = uint64(r4) - fmt.Printf("there are %d loc chunks\n", numLChunks) - - var locOffsets []uint64 - for j := uint64(0); j < numLChunks; j++ { - lchunkLen, r4 := binary.Uvarint(data[locAddr+n : locAddr+n+binary.MaxVarintLen64]) - n += uint64(r4) - locOffsets = append(locOffsets, lchunkLen) - } - - running2 := locAddr + n - for k, offset := range locOffsets { - fmt.Printf("loc chunk: %d, len %d(%x), start at %d (%x) end %d (%x)\n", k, offset, offset, running2, running2, running2+offset, running2+offset) - running2 += offset - } - - } else { - fmt.Printf("dictionary does not contain term '%s'\n", args[2]) - } - 
} - } - - return nil - }, -} - -func init() { - RootCmd.AddCommand(exploreCmd) -} diff --git a/cmd/bleve/cmd/zap/fields.go b/cmd/bleve/cmd/zap/fields.go deleted file mode 100644 index cf8cc3d86..000000000 --- a/cmd/bleve/cmd/zap/fields.go +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "encoding/binary" - "fmt" - - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/spf13/cobra" -) - -// fieldsCmd represents the fields command -var fieldsCmd = &cobra.Command{ - Use: "fields [path]", - Short: "fields prints the fields in the specified file", - Long: `The fields command lets you print the fields in the specified file.`, - RunE: func(cmd *cobra.Command, args []string) error { - - data := segment.Data() - - crcOffset := len(data) - 4 - verOffset := crcOffset - 4 - chunkOffset := verOffset - 4 - fieldsOffset := chunkOffset - 16 - fieldsIndexOffset := binary.BigEndian.Uint64(data[fieldsOffset : fieldsOffset+8]) - fieldsIndexEnd := uint64(len(data) - zap.FooterSize) - - // iterate through fields index - var fieldID uint64 - for fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { - addr := binary.BigEndian.Uint64(data[fieldsIndexOffset+(8*fieldID) : fieldsIndexOffset+(8*fieldID)+8]) - var n uint64 - dictLoc, read := binary.Uvarint(data[addr+n : fieldsIndexEnd]) - n += uint64(read) - - var nameLen uint64 - nameLen, read = binary.Uvarint(data[addr+n : 
fieldsIndexEnd]) - n += uint64(read) - - name := string(data[addr+n : addr+n+nameLen]) - - fmt.Printf("field %d '%s' starts at %d (%x)\n", fieldID, name, dictLoc, dictLoc) - - fieldID++ - } - - return nil - }, -} - -func init() { - RootCmd.AddCommand(fieldsCmd) -} diff --git a/cmd/bleve/cmd/zap/footer.go b/cmd/bleve/cmd/zap/footer.go deleted file mode 100644 index 96078ded6..000000000 --- a/cmd/bleve/cmd/zap/footer.go +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "fmt" - - "github.com/spf13/cobra" -) - -// footerCmd represents the footer command -var footerCmd = &cobra.Command{ - Use: "footer [path]", - Short: "prints the contents of the zap footer", - Long: `The footer command will print the contents of the footer.`, - RunE: func(cmd *cobra.Command, args []string) error { - data := segment.Data() - fmt.Printf("Length: %d\n", len(data)) - fmt.Printf("CRC: %#x\n", segment.CRC()) - fmt.Printf("Version: %d\n", segment.Version()) - fmt.Printf("Chunk Factor: %d\n", segment.ChunkFactor()) - fmt.Printf("Fields Idx: %d (%#x)\n", segment.FieldsIndexOffset(), segment.FieldsIndexOffset()) - fmt.Printf("Stored Idx: %d (%#x)\n", segment.StoredIndexOffset(), segment.StoredIndexOffset()) - fmt.Printf("DocValue Idx: %d (%#x)\n", segment.DocValueOffset(), segment.DocValueOffset()) - fmt.Printf("Num Docs: %d\n", segment.NumDocs()) - return nil - }, -} - -func init() { - RootCmd.AddCommand(footerCmd) -} diff --git a/cmd/bleve/cmd/zap/root.go b/cmd/bleve/cmd/zap/root.go deleted file mode 100644 index ee2b62602..000000000 --- a/cmd/bleve/cmd/zap/root.go +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "fmt" - "os" - - "github.com/blevesearch/bleve/index/scorch/segment/zap" - "github.com/spf13/cobra" -) - -var segment *zap.Segment - -// RootCmd represents the base command when called without any subcommands -var RootCmd = &cobra.Command{ - Use: "zap", - Short: "command-line tool to interact with a zap file", - Long: `Zap is a command-line tool to interact with a zap file.`, - PersistentPreRunE: func(cmd *cobra.Command, args []string) error { - - if len(args) < 1 { - return fmt.Errorf("must specify path to zap file") - } - - segInf, err := zap.Open(args[0]) - if err != nil { - return fmt.Errorf("error opening zap file: %v", err) - } - segment = segInf.(*zap.Segment) - - return nil - }, - PersistentPostRunE: func(cmd *cobra.Command, args []string) error { - return nil - }, -} - -// Execute adds all child commands to the root command sets flags appropriately. -// This is called by main.main(). It only needs to happen once to the rootCmd. -func Execute() { - if err := RootCmd.Execute(); err != nil { - fmt.Println(err) - os.Exit(-1) - } -} diff --git a/cmd/bleve/cmd/zap/stored.go b/cmd/bleve/cmd/zap/stored.go deleted file mode 100644 index 28d62c0cb..000000000 --- a/cmd/bleve/cmd/zap/stored.go +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "encoding/binary" - "fmt" - "strconv" - - "github.com/golang/snappy" - "github.com/spf13/cobra" -) - -// storedCmd represents the stored command -var storedCmd = &cobra.Command{ - Use: "stored [path] [docNum]", - Short: "prints the stored section for a doc number", - Long: `The stored command will print the raw stored data bytes for the specified document number.`, - RunE: func(cmd *cobra.Command, args []string) error { - if len(args) < 2 { - return fmt.Errorf("must specify doc number") - } - docNum, err := strconv.Atoi(args[1]) - if err != nil { - return fmt.Errorf("unable to parse doc number: %v", err) - } - if docNum >= int(segment.NumDocs()) { - return fmt.Errorf("invalid doc number %d (valid 0 - %d)", docNum, segment.NumDocs()-1) - } - data := segment.Data() - storedIdx := segment.StoredIndexOffset() - // read docNum entry in the index - indexPos := storedIdx + (8 * uint64(docNum)) - storedStartAddr := binary.BigEndian.Uint64(data[indexPos : indexPos+8]) - fmt.Printf("Stored field starts at %d (%#x)\n", storedStartAddr, storedStartAddr) - - var n uint64 - metaLen, read := binary.Uvarint(data[storedStartAddr : storedStartAddr+binary.MaxVarintLen64]) - n += uint64(read) - fmt.Printf("Meta Len: %d\n", metaLen) - var dataLen uint64 - dataLen, read = binary.Uvarint(data[storedStartAddr+n : storedStartAddr+n+binary.MaxVarintLen64]) - n += uint64(read) - fmt.Printf("Data Len: %d\n", dataLen) - meta := data[storedStartAddr+n : storedStartAddr+n+metaLen] - fmt.Printf("Raw meta: % x\n", meta) - raw := data[storedStartAddr+n+metaLen : storedStartAddr+n+metaLen+dataLen] - fmt.Printf("Raw data (len %d): % x\n", len(raw), raw) - - // handle _id field special case - idFieldValLen, _ := binary.Uvarint(meta) - fmt.Printf("Raw _id (len %d): % x\n", idFieldValLen, raw[:idFieldValLen]) - fmt.Printf("Raw fields (len %d): % x\n", dataLen-idFieldValLen, raw[idFieldValLen:]) - uncompressed, err := snappy.Decode(nil, raw[idFieldValLen:]) - if err != nil { 
- panic(err) - } - fmt.Printf("Uncompressed fields (len %d): % x\n", len(uncompressed), uncompressed) - - return nil - }, -} - -func init() { - RootCmd.AddCommand(storedCmd) -} diff --git a/go.mod b/go.mod new file mode 100644 index 000000000..7a1483824 --- /dev/null +++ b/go.mod @@ -0,0 +1,26 @@ +module github.com/blugelabs/bleve + +go 1.13 + +require ( + github.com/RoaringBitmap/roaring v0.4.21 + github.com/blevesearch/bleve v1.0.0 + github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 + github.com/blevesearch/go-porterstemmer v1.0.3 + github.com/blevesearch/segment v0.9.0 + github.com/blevesearch/snowballstem v0.9.0 + github.com/blevesearch/zap/v11 v11.0.1 + github.com/blevesearch/zap/v12 v12.0.1 + github.com/couchbase/ghistogram v0.1.0 // indirect + github.com/couchbase/moss v0.0.0-20190322010551-a0cae174c498 + github.com/couchbase/vellum v1.0.0 + github.com/golang/protobuf v1.3.2 + github.com/kljensen/snowball v0.6.0 + github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563 + github.com/spf13/cobra v0.0.5 + github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2 + github.com/syndtr/goleveldb v1.0.0 + github.com/willf/bitset v1.1.10 + go.etcd.io/bbolt v1.3.4 + golang.org/x/text v0.3.0 +) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index d0f1f2318..e5f00f80e 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -21,7 +21,6 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/zap" ) type segmentIntroduction struct { @@ -406,11 +405,11 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { atomic.AddUint64(&s.stats.TotIntroducedSegmentsMerge, 1) switch nextMerge.new.(type) { - case *zap.SegmentBase: + case segment.PersistedSegment: + fileSegments++ + default: docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() 
memSegments++ - case *zap.Segment: - fileSegments++ } } @@ -442,9 +441,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { func isMemorySegment(s *SegmentSnapshot) bool { switch s.segment.(type) { - case *zap.SegmentBase: - return true - default: + case segment.PersistedSegment: return false + default: + return true } } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 6faad6809..37dca529a 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -25,7 +25,6 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index/scorch/mergeplan" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/zap" ) func (s *Scorch) mergerLoop() { @@ -131,18 +130,18 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, options *mergeplan.MergePlanOptions) error { - // build list of zap segments in this snapshot - var onlyZapSnapshots []mergeplan.Segment + // build list of persisted segments in this snapshot + var onlyPersistedSnapshots []mergeplan.Segment for _, segmentSnapshot := range ourSnapshot.segment { - if _, ok := segmentSnapshot.segment.(*zap.Segment); ok { - onlyZapSnapshots = append(onlyZapSnapshots, segmentSnapshot) + if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { + onlyPersistedSnapshots = append(onlyPersistedSnapshots, segmentSnapshot) } } atomic.AddUint64(&s.stats.TotFileMergePlan, 1) // give this list to the planner - resultMergePlan, err := mergeplan.Plan(onlyZapSnapshots, options) + resultMergePlan, err := mergeplan.Plan(onlyPersistedSnapshots, options) if err != nil { atomic.AddUint64(&s.stats.TotFileMergePlanErr, 1) return fmt.Errorf("merge planning err: %v", err) @@ -169,24 +168,24 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, oldMap := make(map[uint64]*SegmentSnapshot) newSegmentID := atomic.AddUint64(&s.nextSegmentID, 1) - 
segmentsToMerge := make([]*zap.Segment, 0, len(task.Segments)) + segmentsToMerge := make([]segment.Segment, 0, len(task.Segments)) docsToDrop := make([]*roaring.Bitmap, 0, len(task.Segments)) for _, planSegment := range task.Segments { if segSnapshot, ok := planSegment.(*SegmentSnapshot); ok { oldMap[segSnapshot.id] = segSnapshot - if zapSeg, ok := segSnapshot.segment.(*zap.Segment); ok { + if persistedSeg, ok := segSnapshot.segment.(segment.PersistedSegment); ok { if segSnapshot.LiveSize() == 0 { atomic.AddUint64(&s.stats.TotFileMergeSegmentsEmpty, 1) oldMap[segSnapshot.id] = nil } else { - segmentsToMerge = append(segmentsToMerge, zapSeg) + segmentsToMerge = append(segmentsToMerge, segSnapshot.segment) docsToDrop = append(docsToDrop, segSnapshot.deleted) } // track the files getting merged for unsetting the // removal ineligibility. This helps to unflip files // even with fast merger, slow persister work flows. - path := zapSeg.Path() + path := persistedSeg.Path() filenames = append(filenames, strings.TrimPrefix(path, s.path+string(os.PathSeparator))) } @@ -203,8 +202,8 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, fileMergeZapStartTime := time.Now() atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) - newDocNums, _, err := zap.Merge(segmentsToMerge, docsToDrop, path, - DefaultChunkFactor, s.closeCh, s) + newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, + s.closeCh, s) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -222,17 +221,12 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, return fmt.Errorf("merging failed: %v", err) } - seg, err = zap.Open(path) + seg, err = s.segPlugin.Open(path) if err != nil { s.unmarkIneligibleForRemoval(filename) atomic.AddUint64(&s.stats.TotFileMergePlanTasksErr, 1) return err } - err = zap.ValidateMerge(segmentsToMerge, nil, docsToDrop, seg.(*zap.Segment)) - if err != nil { - 
s.unmarkIneligibleForRemoval(filename) - return fmt.Errorf("merge validation failed: %v", err) - } oldNewDocNums = make(map[uint64][]uint64) for i, segNewDocNums := range newDocNums { oldNewDocNums[task.Segments[i].Id()] = segNewDocNums @@ -297,8 +291,8 @@ type segmentMerge struct { // persisted segment, and synchronously introduce that new segment // into the root func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, - sbs []*zap.SegmentBase, sbsDrops []*roaring.Bitmap, sbsIndexes []int, - chunkFactor uint32) (*IndexSnapshot, uint64, error) { + sbs []segment.Segment, sbsDrops []*roaring.Bitmap, + sbsIndexes []int) (*IndexSnapshot, uint64, error) { atomic.AddUint64(&s.stats.TotMemMergeBeg, 1) memMergeZapStartTime := time.Now() @@ -310,7 +304,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, path := s.path + string(os.PathSeparator) + filename newDocNums, _, err := - zap.MergeSegmentBases(sbs, sbsDrops, path, chunkFactor, s.closeCh, s) + s.segPlugin.Merge(sbs, sbsDrops, path, s.closeCh, s) atomic.AddUint64(&s.stats.TotMemMergeZapEnd, 1) @@ -325,15 +319,11 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, return nil, 0, err } - seg, err := zap.Open(path) + seg, err := s.segPlugin.Open(path) if err != nil { atomic.AddUint64(&s.stats.TotMemMergeErr, 1) return nil, 0, err } - err = zap.ValidateMerge(nil, sbs, sbsDrops, seg.(*zap.Segment)) - if err != nil { - return nil, 0, fmt.Errorf("in-memory merge validation failed: %v", err) - } // update persisted stats atomic.AddUint64(&s.stats.TotPersistedItems, seg.Count()) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index b33e3be3d..b9cb9228a 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -18,10 +18,8 @@ import ( "fmt" "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/zap" ) var OptimizeConjunction = true @@ -81,25 +79,25 
@@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { } for i := range o.snapshot.segment { - itr0, ok := o.tfrs[0].iterators[i].(*zap.PostingsIterator) - if !ok || itr0.ActualBM == nil { + itr0, ok := o.tfrs[0].iterators[i].(segment.OptimizablePostingsIterator) + if !ok || itr0.ActualBitmap() == nil { continue } - itr1, ok := o.tfrs[1].iterators[i].(*zap.PostingsIterator) - if !ok || itr1.ActualBM == nil { + itr1, ok := o.tfrs[1].iterators[i].(segment.OptimizablePostingsIterator) + if !ok || itr1.ActualBitmap() == nil { continue } - bm := roaring.And(itr0.ActualBM, itr1.ActualBM) + bm := roaring.And(itr0.ActualBitmap(), itr1.ActualBitmap()) for _, tfr := range o.tfrs[2:] { - itr, ok := tfr.iterators[i].(*zap.PostingsIterator) - if !ok || itr.ActualBM == nil { + itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator) + if !ok || itr.ActualBitmap() == nil { continue } - bm.And(itr.ActualBM) + bm.And(itr.ActualBitmap()) } // in this conjunction optimization, the postings iterators @@ -107,10 +105,9 @@ func (o *OptimizeTFRConjunction) Finish() (index.Optimized, error) { // regular conjunction searcher machinery will still be used, // but the underlying bitmap will be smaller. for _, tfr := range o.tfrs { - itr, ok := tfr.iterators[i].(*zap.PostingsIterator) - if ok && itr.ActualBM != nil { - itr.ActualBM = bm - itr.Actual = bm.Iterator() + itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator) + if ok && itr.ActualBitmap() != nil { + itr.ReplaceActual(bm) } } } @@ -191,9 +188,9 @@ OUTER: continue OUTER } - itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator) if !ok { - // We optimize zap postings iterators only. + // We only optimize postings iterators that support this operation. return nil, nil } @@ -201,12 +198,6 @@ OUTER: // can perform several optimizations up-front here. 
docNum1Hit, ok := itr.DocNum1Hit() if ok { - if docNum1Hit == zap.DocNum1HitFinished { - // An empty docNum here means the entire AND is empty. - oTFR.iterators[i] = segment.AnEmptyPostingsIterator - continue OUTER - } - if docNum1HitLastOk && docNum1HitLast != docNum1Hit { // The docNum1Hit doesn't match the previous // docNum1HitLast, so the entire AND is empty. @@ -220,14 +211,14 @@ OUTER: continue } - if itr.ActualBM == nil { + if itr.ActualBitmap() == nil { // An empty actual bitmap means the entire AND is empty. oTFR.iterators[i] = segment.AnEmptyPostingsIterator continue OUTER } // Collect the actual bitmap for more processing later. - actualBMs = append(actualBMs, itr.ActualBM) + actualBMs = append(actualBMs, itr.ActualBitmap()) } if docNum1HitLastOk { @@ -245,11 +236,7 @@ OUTER: // The actual bitmaps and docNum1Hits all contain or have // the same 1-hit docNum, so that's our AND'ed result. - oTFR.iterators[i], err = zap.PostingsIteratorFrom1Hit( - docNum1HitLast, zap.NormBits1Hit, false, false) - if err != nil { - return nil, nil - } + oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFrom1Hit(docNum1HitLast) continue OUTER } @@ -263,11 +250,7 @@ OUTER: if len(actualBMs) == 1 { // If we've only 1 actual bitmap, then that's our result. 
- oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( - actualBMs[0], false, false) - if err != nil { - return nil, nil - } + oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(actualBMs[0]) continue OUTER } @@ -279,11 +262,7 @@ OUTER: bm.And(actualBM) } - oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap( - bm, false, false) - if err != nil { - return nil, nil - } + oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm) } return oTFR, nil @@ -337,13 +316,13 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro var cMax uint64 for _, tfr := range o.tfrs { - itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator) if !ok { return nil, nil } - if itr.ActualBM != nil { - c := itr.ActualBM.GetCardinality() + if itr.ActualBitmap() != nil { + c := itr.ActualBitmap().GetCardinality() if cMax < c { cMax = c } @@ -379,7 +358,7 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro actualBMs = actualBMs[:0] for _, tfr := range o.tfrs { - itr, ok := tfr.iterators[i].(*zap.PostingsIterator) + itr, ok := tfr.iterators[i].(segment.OptimizablePostingsIterator) if !ok { return nil, nil } @@ -390,8 +369,8 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro continue } - if itr.ActualBM != nil { - actualBMs = append(actualBMs, itr.ActualBM) + if itr.ActualBitmap() != nil { + actualBMs = append(actualBMs, itr.ActualBitmap()) } } @@ -410,10 +389,7 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro bm.AddMany(docNums) - oTFR.iterators[i], err = zap.PostingsIteratorFromBitmap(bm, false, false) - if err != nil { - return nil, nil - } + oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm) } return oTFR, nil diff --git a/index/scorch/persister.go b/index/scorch/persister.go index d7c335047..30e75df77 100644 --- a/index/scorch/persister.go +++ 
b/index/scorch/persister.go @@ -32,12 +32,9 @@ import ( "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/zap" bolt "go.etcd.io/bbolt" ) -var DefaultChunkFactor uint32 = 1024 - // DefaultPersisterNapTimeMSec is kept to zero as this helps in direct // persistence of segments with the default safe batch option. // If the default safe batch option results in high number of @@ -360,13 +357,13 @@ var DefaultMinSegmentsForInMemoryMerge = 2 func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( bool, error) { // collect the in-memory zap segments (SegmentBase instances) - var sbs []*zap.SegmentBase + var sbs []segment.Segment var sbsDrops []*roaring.Bitmap var sbsIndexes []int for i, segmentSnapshot := range snapshot.segment { - if sb, ok := segmentSnapshot.segment.(*zap.SegmentBase); ok { - sbs = append(sbs, sb) + if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); !ok { + sbs = append(sbs, segmentSnapshot.segment) sbsDrops = append(sbsDrops, segmentSnapshot.deleted) sbsIndexes = append(sbsIndexes, i) } @@ -377,7 +374,7 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( } newSnapshot, newSegmentID, err := s.mergeSegmentBases( - snapshot, sbs, sbsDrops, sbsIndexes, DefaultChunkFactor) + snapshot, sbs, sbsDrops, sbsIndexes) if err != nil { return false, err } @@ -459,13 +456,13 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { if err != nil { return err } - err = metaBucket.Put([]byte("type"), []byte(zap.Type)) + err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(s.segPlugin.Type())) if err != nil { return err } buf := make([]byte, binary.MaxVarintLen32) - binary.BigEndian.PutUint32(buf, zap.Version) - err = metaBucket.Put([]byte("version"), buf) + binary.BigEndian.PutUint32(buf, s.segPlugin.Version()) + err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf) if 
err != nil { return err } @@ -494,11 +491,19 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { return err } switch seg := segmentSnapshot.segment.(type) { - case *zap.SegmentBase: + case segment.PersistedSegment: + path := seg.Path() + filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) + err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) + if err != nil { + return err + } + filenames = append(filenames, filename) + case segment.UnpersistedSegment: // need to persist this to disk filename := zapFileName(segmentSnapshot.id) path := s.path + string(os.PathSeparator) + filename - err = zap.PersistSegmentBase(seg, path) + err = seg.Persist(path) if err != nil { return fmt.Errorf("error persisting segment: %v", err) } @@ -508,14 +513,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { return err } filenames = append(filenames, filename) - case *zap.Segment: - path := seg.Path() - filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) - err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) - if err != nil { - return err - } - filenames = append(filenames, filename) + default: return fmt.Errorf("unknown segment type: %T", seg) } @@ -553,7 +551,7 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { } }() for segmentID, path := range newSegmentPaths { - newSegments[segmentID], err = zap.Open(path) + newSegments[segmentID], err = s.segPlugin.Open(path) if err != nil { return fmt.Errorf("error opening new segment at %s, %v", path, err) } @@ -609,6 +607,8 @@ var boltPathKey = []byte{'p'} var boltDeletedKey = []byte{'d'} var boltInternalKey = []byte{'i'} var boltMetaDataKey = []byte{'m'} +var boltMetaDataSegmentTypeKey = []byte("type") +var boltMetaDataSegmentVersionKey = []byte("version") func (s *Scorch) loadFromBolt() error { return s.rootBolt.View(func(tx *bolt.Tx) error { @@ -693,6 +693,23 @@ func (s *Scorch) loadSnapshot(snapshot 
*bolt.Bucket) (*IndexSnapshot, error) { refs: 1, creator: "loadSnapshot", } + // first we look for the meta-data bucket, this will tell us + // which segment type/version was used for this snapshot + // all operations for this scorch will use this type/version + metaBucket := snapshot.Bucket(boltMetaDataKey) + if metaBucket == nil { + _ = rv.DecRef() + return nil, fmt.Errorf("meta-data bucket missing") + } + segmentType := string(metaBucket.Get(boltMetaDataSegmentTypeKey)) + segmentVersion := binary.BigEndian.Uint32( + metaBucket.Get(boltMetaDataSegmentVersionKey)) + err := s.loadSegmentPlugin(segmentType, segmentVersion) + if err != nil { + _ = rv.DecRef() + return nil, fmt.Errorf( + "unable to load correct segment wrapper: %v", err) + } var running uint64 c := snapshot.Cursor() for k, _ := c.First(); k != nil; k, _ = c.Next() { @@ -737,7 +754,7 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return nil, fmt.Errorf("segment path missing") } segmentPath := s.path + string(os.PathSeparator) + string(pathBytes) - segment, err := zap.Open(segmentPath) + segment, err := s.segPlugin.Open(segmentPath) if err != nil { return nil, fmt.Errorf("error opening bolt segment: %v", err) } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index da6a53357..80f9e3a79 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -28,7 +28,6 @@ import ( "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/index/scorch/segment/zap" "github.com/blevesearch/bleve/index/store" "github.com/blevesearch/bleve/registry" bolt "go.etcd.io/bbolt" @@ -77,6 +76,8 @@ type Scorch struct { pauseLock sync.RWMutex pauseCount uint64 + + segPlugin segment.Plugin } type internalStats struct { @@ -100,7 +101,25 @@ func NewScorch(storeName string, nextSnapshotEpoch: 1, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, + 
segPlugin: defaultSegmentPlugin, } + + // check if the caller has requested a specific segment type/version + forcedSegmentVersion, ok := config["forceSegmentVersion"].(int) + if ok { + forcedSegmentType, ok2 := config["forceSegmentType"].(string) + if !ok2 { + return nil, fmt.Errorf( + "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) + } + + err := rv.loadSegmentPlugin(forcedSegmentType, + uint32(forcedSegmentVersion)) + if err != nil { + return nil, err + } + } + rv.root = &IndexSnapshot{parent: rv, refs: 1, creator: "NewScorch"} ro, ok := config["read_only"].(bool) if ok { @@ -351,7 +370,7 @@ func (s *Scorch) Batch(batch *index.Batch) (err error) { var newSegment segment.Segment var bufBytes uint64 if len(analysisResults) > 0 { - newSegment, bufBytes, err = zap.AnalysisResultsToSegmentBase(analysisResults, DefaultChunkFactor) + newSegment, bufBytes, err = s.segPlugin.New(analysisResults) if err != nil { return err } @@ -499,8 +518,7 @@ func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64, func (s *Scorch) rootDiskSegmentsPaths() map[string]struct{} { rv := make(map[string]struct{}, len(s.root.segment)) for _, segmentSnapshot := range s.root.segment { - switch seg := segmentSnapshot.segment.(type) { - case *zap.Segment: + if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { rv[seg.Path()] = struct{}{} } } diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 302a86a5e..b168728ee 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -2121,3 +2121,32 @@ func TestAllFieldWithDifferentTermVectorsEnabled(t *testing.T) { t.Errorf("Error updating index: %v", err) } } + +func TestForceVersion(t *testing.T) { + cfg := map[string]interface{}{} + cfg["forceSegmentType"] = "zap" + cfg["forceSegmentVersion"] = 11 + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatalf("error 
opening a supported version: %v", err) + } + s := idx.(*Scorch) + if s.segPlugin.Version() != 11 { + t.Fatalf("wrong segment wrapper version loaded, expected %d got %d", 11, s.segPlugin.Version()) + } + cfg["forceSegmentVersion"] = 12 + idx, err = NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatalf("error opening a supported version: %v", err) + } + s = idx.(*Scorch) + if s.segPlugin.Version() != 12 { + t.Fatalf("wrong segment wrapper version loaded, expected %d got %d", 12, s.segPlugin.Version()) + } + cfg["forceSegmentVersion"] = 10 + idx, err = NewScorch(Name, cfg, analysisQueue) + if err == nil { + t.Fatalf("expected an error opening an unsupported version, got nil") + } +} \ No newline at end of file diff --git a/index/scorch/segment/empty.go b/index/scorch/segment/empty.go index fdc407a74..340db73a6 100644 --- a/index/scorch/segment/empty.go +++ b/index/scorch/segment/empty.go @@ -105,10 +105,6 @@ func (e *EmptyDictionaryIterator) Contains(key []byte) (bool, error) { return false, nil } -func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { - return nil, nil - } - type EmptyPostingsList struct{} func (e *EmptyPostingsList) Iterator(includeFreq, includeNorm, includeLocations bool, @@ -130,6 +126,10 @@ func (e *EmptyPostingsIterator) Next() (Posting, error) { return nil, nil } +func (e *EmptyPostingsIterator) Advance(uint64) (Posting, error) { + return nil, nil +} + func (e *EmptyPostingsIterator) Size() int { return 0 } diff --git a/index/scorch/segment/plugin.go b/index/scorch/segment/plugin.go new file mode 100644 index 000000000..d8aaa0b6d --- /dev/null +++ b/index/scorch/segment/plugin.go @@ -0,0 +1,58 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package segment + +import ( + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/index" +) + +// Plugin represents the essential functions required by a package to plug in +// its segment implementation +type Plugin interface { + + // Type is the name for this segment plugin + Type() string + + // Version is a numeric value identifying a specific version of this type. + // When incompatible changes are made to a particular type of plugin, the + // version must be incremented. + Version() uint32 + + // New takes a set of AnalysisResults and turns them into a new Segment + New(results []*index.AnalysisResult) (Segment, uint64, error) + + // Open attempts to open the file at the specified path and + // return the corresponding Segment + Open(path string) (Segment, error) + + // Merge takes a set of Segments, and creates a new segment on disk at + // the specified path. + // Drops is a set of bitmaps (one for each segment) indicating which + // documents can be dropped from the segments during the merge. + // If the closeCh channel is closed, Merge will cease doing work at + // the next opportunity, and return an error (closed). + // StatsReporter can optionally be provided, in which case progress + // made during the merge is reported while operation continues. + // Returns: + // A slice of new document numbers (one for each input segment), + // this allows the caller to know a particular document's new + // document number in the newly merged segment. + // The number of bytes written to the new segment file. 
+ // An error, if any occurred. + Merge(segments []Segment, drops []*roaring.Bitmap, path string, + closeCh chan struct{}, s StatsReporter) ( + [][]uint64, uint64, error) +} diff --git a/index/scorch/segment/segment.go b/index/scorch/segment/segment.go index 34c2bc204..ddd0d0910 100644 --- a/index/scorch/segment/segment.go +++ b/index/scorch/segment/segment.go @@ -50,6 +50,16 @@ type Segment interface { DecRef() error } +type UnpersistedSegment interface { + Segment + Persist(path string) error +} + +type PersistedSegment interface { + Segment + Path() string +} + type TermDictionary interface { PostingsList(term []byte, except *roaring.Bitmap, prealloc PostingsList) (PostingsList, error) @@ -96,6 +106,12 @@ type PostingsIterator interface { Size() int } +type OptimizablePostingsIterator interface { + ActualBitmap() *roaring.Bitmap + DocNum1Hit() (uint64, bool) + ReplaceActual(*roaring.Bitmap) +} + type Posting interface { Number() uint64 diff --git a/index/scorch/segment/unadorned.go b/index/scorch/segment/unadorned.go new file mode 100644 index 000000000..9a4d6c76c --- /dev/null +++ b/index/scorch/segment/unadorned.go @@ -0,0 +1,148 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package segment + +import ( + "github.com/RoaringBitmap/roaring" + "math" + "reflect" +) + +var reflectStaticSizeUnadornedPostingsIteratorBitmap int +var reflectStaticSizeUnadornedPostingsIterator1Hit int +var reflectStaticSizeUnadornedPosting int + + +func init() { + var pib UnadornedPostingsIteratorBitmap + reflectStaticSizeUnadornedPostingsIteratorBitmap = int(reflect.TypeOf(pib).Size()) + var pi1h UnadornedPostingsIterator1Hit + reflectStaticSizeUnadornedPostingsIterator1Hit = int(reflect.TypeOf(pi1h).Size()) + var up UnadornedPosting + reflectStaticSizeUnadornedPosting = int(reflect.TypeOf(up).Size()) +} + +type UnadornedPostingsIteratorBitmap struct{ + actual roaring.IntPeekable + actualBM *roaring.Bitmap +} + +func (i *UnadornedPostingsIteratorBitmap) Next() (Posting, error) { + return i.nextAtOrAfter(0) +} + +func (i *UnadornedPostingsIteratorBitmap) Advance(docNum uint64) (Posting, error) { + return i.nextAtOrAfter(docNum) +} + +func (i *UnadornedPostingsIteratorBitmap) nextAtOrAfter(atOrAfter uint64) (Posting, error) { + docNum, exists := i.nextDocNumAtOrAfter(atOrAfter) + if !exists { + return nil, nil + } + return UnadornedPosting(docNum), nil +} + +func (i *UnadornedPostingsIteratorBitmap) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { + if i.actual == nil || !i.actual.HasNext() { + return 0, false + } + i.actual.AdvanceIfNeeded(uint32(atOrAfter)) + + if !i.actual.HasNext() { + return 0, false // couldn't find anything + } + + return uint64(i.actual.Next()), true +} + +func (i *UnadornedPostingsIteratorBitmap) Size() int { + return reflectStaticSizeUnadornedPostingsIteratorBitmap +} + +func NewUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) PostingsIterator { + return &UnadornedPostingsIteratorBitmap{ + actualBM: bm, + actual: bm.Iterator(), + } +} + +const docNum1HitFinished = math.MaxUint64 + +type UnadornedPostingsIterator1Hit struct{ + docNum uint64 +} + +func (i *UnadornedPostingsIterator1Hit) Next() (Posting, error) { + return 
i.nextAtOrAfter(0) +} + +func (i *UnadornedPostingsIterator1Hit) Advance(docNum uint64) (Posting, error) { + return i.nextAtOrAfter(docNum) +} + +func (i *UnadornedPostingsIterator1Hit) nextAtOrAfter(atOrAfter uint64) (Posting, error) { + docNum, exists := i.nextDocNumAtOrAfter(atOrAfter) + if !exists { + return nil, nil + } + return UnadornedPosting(docNum), nil +} + +func (i *UnadornedPostingsIterator1Hit) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool) { + if i.docNum == docNum1HitFinished { + return 0, false + } + if i.docNum < atOrAfter { + // advanced past our 1-hit + i.docNum = docNum1HitFinished // consume our 1-hit docNum + return 0, false + } + docNum := i.docNum + i.docNum = docNum1HitFinished // consume our 1-hit docNum + return docNum, true +} + +func (i *UnadornedPostingsIterator1Hit) Size() int { + return reflectStaticSizeUnadornedPostingsIterator1Hit +} + +func NewUnadornedPostingsIteratorFrom1Hit(docNum1Hit uint64) PostingsIterator { + return &UnadornedPostingsIterator1Hit{ + docNum1Hit, + } +} + +type UnadornedPosting uint64 + +func (p UnadornedPosting) Number() uint64 { + return uint64(p) +} + +func (p UnadornedPosting) Frequency() uint64 { + return 0 +} + +func (p UnadornedPosting) Norm() float64 { + return 0 +} + +func (p UnadornedPosting) Locations() []Location { + return nil +} + +func (p UnadornedPosting) Size() int { + return reflectStaticSizeUnadornedPosting +} \ No newline at end of file diff --git a/index/scorch/segment/zap/README.md b/index/scorch/segment/zap/README.md deleted file mode 100644 index 0facb669f..000000000 --- a/index/scorch/segment/zap/README.md +++ /dev/null @@ -1,158 +0,0 @@ -# zap file format - -Advanced ZAP File Format Documentation is [here](zap.md). - -The file is written in the reverse order that we typically access data. This helps us write in one pass since later sections of the file require file offsets of things we've already written. 
- -Current usage: - -- mmap the entire file -- crc-32 bytes and version are in fixed position at end of the file -- reading remainder of footer could be version specific -- remainder of footer gives us: - - 3 important offsets (docValue , fields index and stored data index) - - 2 important values (number of docs and chunk factor) -- field data is processed once and memoized onto the heap so that we never have to go back to disk for it -- access to stored data by doc number means first navigating to the stored data index, then accessing a fixed position offset into that slice, which gives us the actual address of the data. the first bytes of that section tell us the size of data so that we know where it ends. -- access to all other indexed data follows the following pattern: - - first know the field name -> convert to id - - next navigate to term dictionary for that field - - some operations stop here and do dictionary ops - - next use dictionary to navigate to posting list for a specific term - - walk posting list - - if necessary, walk posting details as we go - - if location info is desired, consult location bitmap to see if it is there - -## stored fields section - -- for each document - - preparation phase: - - produce a slice of metadata bytes and data bytes - - produce these slices in field id order - - field value is appended to the data slice - - metadata slice is varint encoded with the following values for each field value - - field id (uint16) - - field type (byte) - - field value start offset in uncompressed data slice (uint64) - - field value length (uint64) - - field number of array positions (uint64) - - one additional value for each array position (uint64) - - compress the data slice using snappy - - file writing phase: - - remember the start offset for this document - - write out meta data length (varint uint64) - - write out compressed data length (varint uint64) - - write out the metadata bytes - - write out the compressed data bytes - -## stored 
fields idx - -- for each document - - write start offset (remembered from previous section) of stored data (big endian uint64) - -With this index and a known document number, we have direct access to all the stored field data. - -## posting details (freq/norm) section - -- for each posting list - - produce a slice containing multiple consecutive chunks (each chunk is varint stream) - - produce a slice remembering offsets of where each chunk starts - - preparation phase: - - for each hit in the posting list - - if this hit is in next chunk close out encoding of last chunk and record offset start of next - - encode term frequency (uint64) - - encode norm factor (float32) - - file writing phase: - - remember start position for this posting list details - - write out number of chunks that follow (varint uint64) - - write out length of each chunk (each a varint uint64) - - write out the byte slice containing all the chunk data - -If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
- -## posting details (location) section - -- for each posting list - - produce a slice containing multiple consecutive chunks (each chunk is varint stream) - - produce a slice remembering offsets of where each chunk starts - - preparation phase: - - for each hit in the posting list - - if this hit is in next chunk close out encoding of last chunk and record offset start of next - - encode field (uint16) - - encode field pos (uint64) - - encode field start (uint64) - - encode field end (uint64) - - encode number of array positions to follow (uint64) - - encode each array position (each uint64) - - file writing phase: - - remember start position for this posting list details - - write out number of chunks that follow (varint uint64) - - write out length of each chunk (each a varint uint64) - - write out the byte slice containing all the chunk data - -If you know the doc number you're interested in, this format lets you jump to the correct chunk (docNum/chunkFactor) directly and then seek within that chunk until you find it. 
- -## postings list section - -- for each posting list - - preparation phase: - - encode roaring bitmap posting list to bytes (so we know the length) - - file writing phase: - - remember the start position for this posting list - - write freq/norm details offset (remembered from previous, as varint uint64) - - write location details offset (remembered from previous, as varint uint64) - - write length of encoded roaring bitmap - - write the serialized roaring bitmap data - -## dictionary - -- for each field - - preparation phase: - - encode vellum FST with dictionary data pointing to file offset of posting list (remembered from previous) - - file writing phase: - - remember the start position of this persistDictionary - - write length of vellum data (varint uint64) - - write out vellum data - -## fields section - -- for each field - - file writing phase: - - remember start offset for each field - - write dictionary address (remembered from previous) (varint uint64) - - write length of field name (varint uint64) - - write field name bytes - -## fields idx - -- for each field - - file writing phase: - - write big endian uint64 of start offset for each field - -NOTE: currently we don't know or record the length of this fields index. Instead we rely on the fact that we know it immediately precedes a footer of known size. 
- -## fields DocValue - -- for each field - - preparation phase: - - produce a slice containing multiple consecutive chunks, where each chunk is composed of a meta section followed by compressed columnar field data - - produce a slice remembering the length of each chunk - - file writing phase: - - remember the start position of this first field DocValue offset in the footer - - write out number of chunks that follow (varint uint64) - - write out length of each chunk (each a varint uint64) - - write out the byte slice containing all the chunk data - -NOTE: currently the meta header inside each chunk gives clue to the location offsets and size of the data pertaining to a given docID and any -read operation leverage that meta information to extract the document specific data from the file. - -## footer - -- file writing phase - - write number of docs (big endian uint64) - - write stored field index location (big endian uint64) - - write field index location (big endian uint64) - - write field docValue location (big endian uint64) - - write out chunk factor (big endian uint32) - - write out version (big endian uint32) - - write out file CRC of everything preceding this (big endian uint32) diff --git a/index/scorch/segment/zap/build.go b/index/scorch/segment/zap/build.go deleted file mode 100644 index c02333cee..000000000 --- a/index/scorch/segment/zap/build.go +++ /dev/null @@ -1,151 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bufio" - "github.com/couchbase/vellum" - "math" - "os" -) - -const Version uint32 = 11 - -const Type string = "zap" - -const fieldNotUninverted = math.MaxUint64 - -// PersistSegmentBase persists SegmentBase in the zap file format. -func PersistSegmentBase(sb *SegmentBase, path string) error { - flag := os.O_RDWR | os.O_CREATE - - f, err := os.OpenFile(path, flag, 0600) - if err != nil { - return err - } - - cleanup := func() { - _ = f.Close() - _ = os.Remove(path) - } - - br := bufio.NewWriter(f) - - _, err = br.Write(sb.mem) - if err != nil { - cleanup() - return err - } - - err = persistFooter(sb.numDocs, sb.storedIndexOffset, sb.fieldsIndexOffset, sb.docValueOffset, - sb.chunkFactor, sb.memCRC, br) - if err != nil { - cleanup() - return err - } - - err = br.Flush() - if err != nil { - cleanup() - return err - } - - err = f.Sync() - if err != nil { - cleanup() - return err - } - - err = f.Close() - if err != nil { - cleanup() - return err - } - - return nil -} - -func persistStoredFieldValues(fieldID int, - storedFieldValues [][]byte, stf []byte, spf [][]uint64, - curr int, metaEncode varintEncoder, data []byte) ( - int, []byte, error) { - for i := 0; i < len(storedFieldValues); i++ { - // encode field - _, err := metaEncode(uint64(fieldID)) - if err != nil { - return 0, nil, err - } - // encode type - _, err = metaEncode(uint64(stf[i])) - if err != nil { - return 0, nil, err - } - // encode start offset - _, err = metaEncode(uint64(curr)) - if err != nil { - return 0, nil, err - } - // end len - _, err = metaEncode(uint64(len(storedFieldValues[i]))) - if err != nil { - return 0, nil, err - } - // encode number of array pos - _, err = metaEncode(uint64(len(spf[i]))) - if err != nil { - return 0, nil, err - } - // encode all array positions - for _, pos := range spf[i] { - _, err = metaEncode(pos) - if err != nil { - return 0, 
nil, err - } - } - - data = append(data, storedFieldValues[i]...) - curr += len(storedFieldValues[i]) - } - - return curr, data, nil -} - -func InitSegmentBase(mem []byte, memCRC uint32, chunkFactor uint32, - fieldsMap map[string]uint16, fieldsInv []string, numDocs uint64, - storedIndexOffset uint64, fieldsIndexOffset uint64, docValueOffset uint64, - dictLocs []uint64) (*SegmentBase, error) { - sb := &SegmentBase{ - mem: mem, - memCRC: memCRC, - chunkFactor: chunkFactor, - fieldsMap: fieldsMap, - fieldsInv: fieldsInv, - numDocs: numDocs, - storedIndexOffset: storedIndexOffset, - fieldsIndexOffset: fieldsIndexOffset, - docValueOffset: docValueOffset, - dictLocs: dictLocs, - fieldDvReaders: make(map[uint16]*docValueReader), - fieldFSTs: make(map[uint16]*vellum.FST), - } - sb.updateSize() - - err := sb.loadDvReaders() - if err != nil { - return nil, err - } - - return sb, nil -} diff --git a/index/scorch/segment/zap/build_test.go b/index/scorch/segment/zap/build_test.go deleted file mode 100644 index 9d1b584f2..000000000 --- a/index/scorch/segment/zap/build_test.go +++ /dev/null @@ -1,556 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "os" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -func TestBuild(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - sb, _, err := buildTestSegment() - if err != nil { - t.Fatal(err) - } - err = PersistSegmentBase(sb, "/tmp/scorch.zap") - if err != nil { - t.Fatal(err) - } -} - -func buildTestSegment() (*SegmentBase, uint64, error) { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, 
true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - return AnalysisResultsToSegmentBase(results, 1024) -} - -func buildTestSegmentMulti() (*SegmentBase, uint64, error) { - results := buildTestAnalysisResultsMulti() - - return AnalysisResultsToSegmentBase(results, 1024) -} - -func buildTestSegmentMultiWithChunkFactor(chunkFactor uint32) (*SegmentBase, uint64, error) { - results := buildTestAnalysisResultsMulti() - - return AnalysisResultsToSegmentBase(results, chunkFactor) -} - -func buildTestSegmentMultiWithDifferentFields(includeDocA, includeDocB bool) (*SegmentBase, uint64, error) { - results := buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB) - - return AnalysisResultsToSegmentBase(results, 1024) -} - -func buildTestAnalysisResultsMulti() []*index.AnalysisResult { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("wow"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, 
[]byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - doc2 := &document.Document{ - ID: "b", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("b"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("who"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 
4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - &index.AnalysisResult{ - Document: doc2, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("b"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("who"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - return results -} - -func buildTestAnalysisResultsMultiWithDifferentFields(includeDocA, includeDocB bool) []*index.AnalysisResult { - results := []*index.AnalysisResult{} - - if includeDocA { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), - document.IndexField|document.StoreField, nil), - document.NewTextField("name", []uint64{}, []byte("ABC")), - document.NewTextField("dept", []uint64{}, []byte("ABC dept")), - document.NewTextField("manages.id", []uint64{}, []byte("XYZ")), - document.NewTextField("manages.count", []uint64{}, []byte("1")), - }, - CompositeFields: 
[]*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - result := &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - &analysis.Token{ - Start: 4, - End: 8, - Position: 2, - Term: []byte("dept"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("XYZ"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("1"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - } - - results = append(results, result) - } - - if includeDocB { - doc := &document.Document{ - ID: "b", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("b"), - document.IndexField|document.StoreField, nil), - document.NewTextField("name", []uint64{}, []byte("XYZ")), - document.NewTextField("dept", []uint64{}, []byte("ABC dept")), - document.NewTextField("reportsTo.id", []uint64{}, []byte("ABC")), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - result := &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("b"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, 
- Term: []byte("XYZ"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - &analysis.Token{ - Start: 4, - End: 8, - Position: 2, - Term: []byte("dept"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("ABC"), - }, - }, []uint64{0}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - }, - } - - results = append(results, result) - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - return results -} - -func buildTestSegmentWithDefaultFieldMapping(chunkFactor uint32) ( - *SegmentBase, []string, error) { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), - document.IndexField|document.StoreField, nil), - document.NewTextField("name", nil, []byte("wow")), - document.NewTextField("desc", nil, []byte("some thing")), - document.NewTextField("tag", []uint64{0}, []byte("cold")), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - var fields []string - fields = append(fields, "_id") - fields = append(fields, "name") - fields = append(fields, "desc") - fields = append(fields, "tag") - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("wow"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ 
- &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - sb, _, err := AnalysisResultsToSegmentBase(results, chunkFactor) - - return sb, fields, err -} diff --git a/index/scorch/segment/zap/contentcoder.go b/index/scorch/segment/zap/contentcoder.go deleted file mode 100644 index b9ff8179b..000000000 --- a/index/scorch/segment/zap/contentcoder.go +++ /dev/null @@ -1,230 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "bytes" - "encoding/binary" - "io" - "reflect" - - "github.com/golang/snappy" -) - -var reflectStaticSizeMetaData int - -func init() { - var md MetaData - reflectStaticSizeMetaData = int(reflect.TypeOf(md).Size()) -} - -var termSeparator byte = 0xff -var termSeparatorSplitSlice = []byte{termSeparator} - -type chunkedContentCoder struct { - final []byte - chunkSize uint64 - currChunk uint64 - chunkLens []uint64 - - w io.Writer - progressiveWrite bool - - chunkMetaBuf bytes.Buffer - chunkBuf bytes.Buffer - - chunkMeta []MetaData - - compressed []byte // temp buf for snappy compression -} - -// MetaData represents the data information inside a -// chunk. -type MetaData struct { - DocNum uint64 // docNum of the data inside the chunk - DocDvOffset uint64 // offset of data inside the chunk for the given docid -} - -// newChunkedContentCoder returns a new chunk content coder which -// packs data into chunks based on the provided chunkSize -func newChunkedContentCoder(chunkSize uint64, maxDocNum uint64, - w io.Writer, progressiveWrite bool) *chunkedContentCoder { - total := maxDocNum/chunkSize + 1 - rv := &chunkedContentCoder{ - chunkSize: chunkSize, - chunkLens: make([]uint64, total), - chunkMeta: make([]MetaData, 0, total), - w: w, - progressiveWrite: progressiveWrite, - } - - return rv -} - -// Reset lets you reuse this chunked content coder. Buffers are reset -// and re used. You cannot change the chunk size. -func (c *chunkedContentCoder) Reset() { - c.currChunk = 0 - c.final = c.final[:0] - c.chunkBuf.Reset() - c.chunkMetaBuf.Reset() - for i := range c.chunkLens { - c.chunkLens[i] = 0 - } - c.chunkMeta = c.chunkMeta[:0] -} - -// Close indicates you are done calling Add() this allows -// the final chunk to be encoded. 
-func (c *chunkedContentCoder) Close() error { - return c.flushContents() -} - -func (c *chunkedContentCoder) flushContents() error { - // flush the contents, with meta information at first - buf := make([]byte, binary.MaxVarintLen64) - n := binary.PutUvarint(buf, uint64(len(c.chunkMeta))) - _, err := c.chunkMetaBuf.Write(buf[:n]) - if err != nil { - return err - } - - // write out the metaData slice - for _, meta := range c.chunkMeta { - _, err := writeUvarints(&c.chunkMetaBuf, meta.DocNum, meta.DocDvOffset) - if err != nil { - return err - } - } - - // write the metadata to final data - metaData := c.chunkMetaBuf.Bytes() - c.final = append(c.final, c.chunkMetaBuf.Bytes()...) - // write the compressed data to the final data - c.compressed = snappy.Encode(c.compressed[:cap(c.compressed)], c.chunkBuf.Bytes()) - c.final = append(c.final, c.compressed...) - - c.chunkLens[c.currChunk] = uint64(len(c.compressed) + len(metaData)) - - if c.progressiveWrite { - _, err := c.w.Write(c.final) - if err != nil { - return err - } - c.final = c.final[:0] - } - - return nil -} - -// Add encodes the provided byte slice into the correct chunk for the provided -// doc num. You MUST call Add() with increasing docNums. -func (c *chunkedContentCoder) Add(docNum uint64, vals []byte) error { - chunk := docNum / c.chunkSize - if chunk != c.currChunk { - // flush out the previous chunk details - err := c.flushContents() - if err != nil { - return err - } - // clearing the chunk specific meta for next chunk - c.chunkBuf.Reset() - c.chunkMetaBuf.Reset() - c.chunkMeta = c.chunkMeta[:0] - c.currChunk = chunk - } - - // get the starting offset for this doc - dvOffset := c.chunkBuf.Len() - dvSize, err := c.chunkBuf.Write(vals) - if err != nil { - return err - } - - c.chunkMeta = append(c.chunkMeta, MetaData{ - DocNum: docNum, - DocDvOffset: uint64(dvOffset + dvSize), - }) - return nil -} - -// Write commits all the encoded chunked contents to the provided writer. -// -// | ..... data ..... 
| chunk offsets (varints) -// | position of chunk offsets (uint64) | number of offsets (uint64) | -// -func (c *chunkedContentCoder) Write() (int, error) { - var tw int - - if c.final != nil { - // write out the data section first - nw, err := c.w.Write(c.final) - tw += nw - if err != nil { - return tw, err - } - } - - chunkOffsetsStart := uint64(tw) - - if cap(c.final) < binary.MaxVarintLen64 { - c.final = make([]byte, binary.MaxVarintLen64) - } else { - c.final = c.final[0:binary.MaxVarintLen64] - } - chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) - // write out the chunk offsets - for _, chunkOffset := range chunkOffsets { - n := binary.PutUvarint(c.final, chunkOffset) - nw, err := c.w.Write(c.final[:n]) - tw += nw - if err != nil { - return tw, err - } - } - - chunkOffsetsLen := uint64(tw) - chunkOffsetsStart - - c.final = c.final[0:8] - // write out the length of chunk offsets - binary.BigEndian.PutUint64(c.final, chunkOffsetsLen) - nw, err := c.w.Write(c.final) - tw += nw - if err != nil { - return tw, err - } - - // write out the number of chunks - binary.BigEndian.PutUint64(c.final, uint64(len(c.chunkLens))) - nw, err = c.w.Write(c.final) - tw += nw - if err != nil { - return tw, err - } - - c.final = c.final[:0] - - return tw, nil -} - -// ReadDocValueBoundary elicits the start, end offsets from a -// metaData header slice -func ReadDocValueBoundary(chunk int, metaHeaders []MetaData) (uint64, uint64) { - var start uint64 - if chunk > 0 { - start = metaHeaders[chunk-1].DocDvOffset - } - return start, metaHeaders[chunk].DocDvOffset -} diff --git a/index/scorch/segment/zap/contentcoder_test.go b/index/scorch/segment/zap/contentcoder_test.go deleted file mode 100644 index 62ffde413..000000000 --- a/index/scorch/segment/zap/contentcoder_test.go +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "testing" -) - -func TestChunkedContentCoder(t *testing.T) { - - tests := []struct { - maxDocNum uint64 - chunkSize uint64 - docNums []uint64 - vals [][]byte - expected []byte - }{ - { - maxDocNum: 0, - chunkSize: 1, - docNums: []uint64{0}, - vals: [][]byte{[]byte("bleve")}, - // 1 chunk, chunk-0 length 11(b), value - expected: []byte{0x1, 0x0, 0x5, 0x5, 0x10, 0x62, 0x6c, 0x65, 0x76, 0x65, - 0xa, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x1}, - }, - { - maxDocNum: 1, - chunkSize: 1, - docNums: []uint64{0, 1}, - vals: [][]byte{ - []byte("upside"), - []byte("scorch"), - }, - - expected: []byte{0x1, 0x0, 0x6, 0x6, 0x14, 0x75, 0x70, 0x73, 0x69, 0x64, - 0x65, 0x1, 0x1, 0x6, 0x6, 0x14, 0x73, 0x63, 0x6f, 0x72, 0x63, 0x68, - 0xb, 0x16, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, - 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2}, - }, - } - - for _, test := range tests { - - var actual bytes.Buffer - cic := newChunkedContentCoder(test.chunkSize, test.maxDocNum, &actual, false) - for i, docNum := range test.docNums { - err := cic.Add(docNum, test.vals[i]) - if err != nil { - t.Fatalf("error adding to intcoder: %v", err) - } - } - _ = cic.Close() - _, err := cic.Write() - if err != nil { - t.Fatalf("error writing: %v", err) - } - - if !bytes.Equal(test.expected, actual.Bytes()) { - t.Errorf("got:%s, expected:%s", string(actual.Bytes()), string(test.expected)) - } - } -} - -func TestChunkedContentCoders(t *testing.T) { - maxDocNum := uint64(5) - chunkSize := uint64(1) - docNums := []uint64{0, 
1, 2, 3, 4, 5} - vals := [][]byte{ - []byte("scorch"), - []byte("does"), - []byte("better"), - []byte("than"), - []byte("upside"), - []byte("down"), - } - - var actual1, actual2 bytes.Buffer - // chunkedContentCoder that writes out at the end - cic1 := newChunkedContentCoder(chunkSize, maxDocNum, &actual1, false) - // chunkedContentCoder that writes out in chunks - cic2 := newChunkedContentCoder(chunkSize, maxDocNum, &actual2, true) - - for i, docNum := range docNums { - err := cic1.Add(docNum, vals[i]) - if err != nil { - t.Fatalf("error adding to intcoder: %v", err) - } - err = cic2.Add(docNum, vals[i]) - if err != nil { - t.Fatalf("error adding to intcoder: %v", err) - } - } - _ = cic1.Close() - _ = cic2.Close() - - _, err := cic1.Write() - if err != nil { - t.Fatalf("error writing: %v", err) - } - _, err = cic2.Write() - if err != nil { - t.Fatalf("error writing: %v", err) - } - - if !bytes.Equal(actual1.Bytes(), actual2.Bytes()) { - t.Errorf("%s != %s", string(actual1.Bytes()), string(actual2.Bytes())) - } -} diff --git a/index/scorch/segment/zap/count.go b/index/scorch/segment/zap/count.go deleted file mode 100644 index 50290f888..000000000 --- a/index/scorch/segment/zap/count.go +++ /dev/null @@ -1,61 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "hash/crc32" - "io" - - "github.com/blevesearch/bleve/index/scorch/segment" -) - -// CountHashWriter is a wrapper around a Writer which counts the number of -// bytes which have been written and computes a crc32 hash -type CountHashWriter struct { - w io.Writer - crc uint32 - n int - s segment.StatsReporter -} - -// NewCountHashWriter returns a CountHashWriter which wraps the provided Writer -func NewCountHashWriter(w io.Writer) *CountHashWriter { - return &CountHashWriter{w: w} -} - -func NewCountHashWriterWithStatsReporter(w io.Writer, s segment.StatsReporter) *CountHashWriter { - return &CountHashWriter{w: w, s: s} -} - -// Write writes the provided bytes to the wrapped writer and counts the bytes -func (c *CountHashWriter) Write(b []byte) (int, error) { - n, err := c.w.Write(b) - c.crc = crc32.Update(c.crc, crc32.IEEETable, b[:n]) - c.n += n - if c.s != nil { - c.s.ReportBytesWritten(uint64(n)) - } - return n, err -} - -// Count returns the number of bytes written -func (c *CountHashWriter) Count() int { - return c.n -} - -// Sum32 returns the CRC-32 hash of the content written to this writer -func (c *CountHashWriter) Sum32() uint32 { - return c.crc -} diff --git a/index/scorch/segment/zap/dict.go b/index/scorch/segment/zap/dict.go deleted file mode 100644 index ad4a8f8dc..000000000 --- a/index/scorch/segment/zap/dict.go +++ /dev/null @@ -1,263 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "fmt" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbase/vellum" -) - -// Dictionary is the zap representation of the term dictionary -type Dictionary struct { - sb *SegmentBase - field string - fieldID uint16 - fst *vellum.FST - fstReader *vellum.Reader -} - -// PostingsList returns the postings list for the specified term -func (d *Dictionary) PostingsList(term []byte, except *roaring.Bitmap, - prealloc segment.PostingsList) (segment.PostingsList, error) { - var preallocPL *PostingsList - pl, ok := prealloc.(*PostingsList) - if ok && pl != nil { - preallocPL = pl - } - return d.postingsList(term, except, preallocPL) -} - -func (d *Dictionary) postingsList(term []byte, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - if d.fstReader == nil { - if rv == nil || rv == emptyPostingsList { - return emptyPostingsList, nil - } - return d.postingsListInit(rv, except), nil - } - - postingsOffset, exists, err := d.fstReader.Get(term) - if err != nil { - return nil, fmt.Errorf("vellum err: %v", err) - } - if !exists { - if rv == nil || rv == emptyPostingsList { - return emptyPostingsList, nil - } - return d.postingsListInit(rv, except), nil - } - - return d.postingsListFromOffset(postingsOffset, except, rv) -} - -func (d *Dictionary) postingsListFromOffset(postingsOffset uint64, except *roaring.Bitmap, rv *PostingsList) (*PostingsList, error) { - rv = d.postingsListInit(rv, except) - - err := rv.read(postingsOffset, d) - if err != nil { - return nil, err - } - - return rv, nil -} - -func (d *Dictionary) postingsListInit(rv *PostingsList, except *roaring.Bitmap) *PostingsList { - if rv == nil || rv == emptyPostingsList { - rv = &PostingsList{} - } else { - postings := rv.postings - if postings != nil { - 
postings.Clear() - } - - *rv = PostingsList{} // clear the struct - - rv.postings = postings - } - rv.sb = d.sb - rv.except = except - return rv -} - -func (d *Dictionary) Contains(key []byte) (bool, error) { - return d.fst.Contains(key) -} - -// Iterator returns an iterator for this dictionary -func (d *Dictionary) Iterator() segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - if d.fst != nil { - itr, err := d.fst.Iterator(nil, nil) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - } - - return rv -} - -// PrefixIterator returns an iterator which only visits terms having the -// the specified prefix -func (d *Dictionary) PrefixIterator(prefix string) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - kBeg := []byte(prefix) - kEnd := segment.IncrementBytes(kBeg) - - if d.fst != nil { - itr, err := d.fst.Iterator(kBeg, kEnd) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - } - - return rv -} - -// RangeIterator returns an iterator which only visits terms between the -// start and end terms. NOTE: bleve.index API specifies the end is inclusive. 
-func (d *Dictionary) RangeIterator(start, end string) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - // need to increment the end position to be inclusive - var endBytes []byte - if len(end) > 0 { - endBytes = []byte(end) - if endBytes[len(endBytes)-1] < 0xff { - endBytes[len(endBytes)-1]++ - } else { - endBytes = append(endBytes, 0xff) - } - } - - if d.fst != nil { - itr, err := d.fst.Iterator([]byte(start), endBytes) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - } - - return rv -} - -// AutomatonIterator returns an iterator which only visits terms -// having the the vellum automaton and start/end key range -func (d *Dictionary) AutomatonIterator(a vellum.Automaton, - startKeyInclusive, endKeyExclusive []byte) segment.DictionaryIterator { - rv := &DictionaryIterator{ - d: d, - } - - if d.fst != nil { - itr, err := d.fst.Search(a, startKeyInclusive, endKeyExclusive) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - } - - return rv -} - -func (d *Dictionary) OnlyIterator(onlyTerms [][]byte, - includeCount bool) segment.DictionaryIterator { - - rv := &DictionaryIterator{ - d: d, - omitCount: !includeCount, - } - - var buf bytes.Buffer - builder, err := vellum.New(&buf, nil) - if err != nil { - rv.err = err - return rv - } - for _, term := range onlyTerms { - err = builder.Insert(term, 0) - if err != nil { - rv.err = err - return rv - } - } - err = builder.Close() - if err != nil { - rv.err = err - return rv - } - - onlyFST, err := vellum.Load(buf.Bytes()) - if err != nil { - rv.err = err - return rv - } - - itr, err := d.fst.Search(onlyFST, nil, nil) - if err == nil { - rv.itr = itr - } else if err != vellum.ErrIteratorDone { - rv.err = err - } - - return rv -} - -// DictionaryIterator is an iterator for term dictionary -type DictionaryIterator struct { - d *Dictionary - itr vellum.Iterator - err error - tmp PostingsList - entry 
index.DictEntry - omitCount bool -} - -// Next returns the next entry in the dictionary -func (i *DictionaryIterator) Next() (*index.DictEntry, error) { - if i.err != nil && i.err != vellum.ErrIteratorDone { - return nil, i.err - } else if i.itr == nil || i.err == vellum.ErrIteratorDone { - return nil, nil - } - term, postingsOffset := i.itr.Current() - i.entry.Term = string(term) - if !i.omitCount { - i.err = i.tmp.read(postingsOffset, i.d) - if i.err != nil { - return nil, i.err - } - i.entry.Count = i.tmp.Count() - } - i.err = i.itr.Next() - return &i.entry, nil -} diff --git a/index/scorch/segment/zap/dict_test.go b/index/scorch/segment/zap/dict_test.go deleted file mode 100644 index c632917bc..000000000 --- a/index/scorch/segment/zap/dict_test.go +++ /dev/null @@ -1,337 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "os" - "reflect" - "testing" - - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" - "github.com/couchbase/vellum/levenshtein" -) - -func buildTestSegmentForDict() (*SegmentBase, uint64, error) { - doc := &document.Document{ - ID: "a", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte("a"), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("desc", nil, []byte("apple ball cat dog egg fish bat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte("a"), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 5, - Position: 1, - Term: []byte("apple"), - }, - &analysis.Token{ - Start: 6, - End: 10, - Position: 2, - Term: []byte("ball"), - }, - &analysis.Token{ - Start: 11, - End: 14, - Position: 3, - Term: []byte("cat"), - }, - &analysis.Token{ - Start: 15, - End: 18, - Position: 4, - Term: []byte("dog"), - }, - &analysis.Token{ - Start: 19, - End: 22, - Position: 5, - Term: []byte("egg"), - }, - &analysis.Token{ - Start: 20, - End: 24, - Position: 6, - Term: []byte("fish"), - }, - &analysis.Token{ - Start: 25, - End: 28, - Position: 7, - Term: []byte("bat"), - }, - }, nil, true), - }, - Length: []int{ - 1, - 7, - }, - }, - } - - return AnalysisResultsToSegmentBase(results, 1024) -} - -func TestDictionary(t *testing.T) { - - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentForDict() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") 
- if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - - // test basic full iterator - expected := []string{"apple", "ball", "bat", "cat", "dog", "egg", "fish"} - var got []string - itr := dict.Iterator() - next, err := itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test prefix iterator - expected = []string{"ball", "bat"} - got = got[:0] - itr = dict.PrefixIterator("b") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } - - // test range iterator - expected = []string{"cat", "dog", "egg"} - got = got[:0] - itr = dict.RangeIterator("cat", "egg") - next, err = itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } -} - -func TestDictionaryError(t *testing.T) { - hash := make(map[uint8]levenshtein.LevenshteinAutomatonBuilder, 4) - for i := 1; i <= 3; i++ { - lb, err := levenshtein.NewLevenshteinAutomatonBuilder(uint8(i), false) - if err != nil { - t.Errorf("NewLevenshteinAutomatonBuilder(%d, false) failed, err: %v", i, err) - } - hash[uint8(i)] = *lb - } - - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentForDict() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - 
t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - - lb := hash[uint8(2)] - a, err := lb.BuildDfa("summer", 2) - if err != nil { - t.Fatal(err) - } - itr := dict.AutomatonIterator(a, nil, nil) - if itr == nil { - t.Fatalf("got nil itr") - } - nxt, err := itr.Next() - if nxt != nil { - t.Fatalf("expected nil next") - } - if err != nil { - t.Fatalf("expected nil error from iterator, got: %v", err) - } - - lb = hash[uint8(1)] - a, err = lb.BuildDfa("cat", 1) // cat & bat - if err != nil { - t.Fatal(err) - } - itr = dict.AutomatonIterator(a, nil, nil) - if itr == nil { - t.Fatalf("got nil itr") - } - for i := 0; i < 2; i++ { - nxt, err = itr.Next() - if nxt == nil || err != nil { - t.Fatalf("expected non-nil next and nil err, got: %v, %v", nxt, err) - } - } - nxt, err = itr.Next() - if nxt != nil || err != nil { - t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) - } - - lb = hash[uint8(2)] - a, err = lb.BuildDfa("cat", 2) // cat & bat - if err != nil { - t.Fatal(err) - } - itr = dict.AutomatonIterator(a, nil, nil) - if itr == nil { - t.Fatalf("got nil itr") - } - for i := 0; i < 2; i++ { - nxt, err = itr.Next() - if nxt == nil || err != nil { - t.Fatalf("expected non-nil next and nil err, got: %v, %v", nxt, err) - } - } - nxt, err = itr.Next() - if nxt != nil || err != nil { - t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) - } - - lb = hash[uint8(3)] - a, err = lb.BuildDfa("cat", 3) - if err != nil { - t.Fatal(err) - } - itr = dict.AutomatonIterator(a, nil, nil) - if itr == nil { - t.Fatalf("got nil itr") - } - for i := 0; i < 5; i++ { - nxt, err = itr.Next() - if nxt == nil || err != nil { - t.Fatalf("expected non-nil next and nil 
err, got: %v, %v", nxt, err) - } - } - nxt, err = itr.Next() - if nxt != nil || err != nil { - t.Fatalf("expected nil next and nil err, got: %v, %v", nxt, err) - } -} - -func TestDictionaryBug1156(t *testing.T) { - - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentForDict() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - - // test range iterator - expected := []string{"cat", "dog", "egg", "fish"} - var got []string - itr := dict.RangeIterator("cat", "") - next, err := itr.Next() - for next != nil && err == nil { - got = append(got, next.Term) - next, err = itr.Next() - } - if err != nil { - t.Fatalf("dict itr error: %v", err) - } - - if !reflect.DeepEqual(expected, got) { - t.Errorf("expected: %v, got: %v", expected, got) - } -} diff --git a/index/scorch/segment/zap/docvalues.go b/index/scorch/segment/zap/docvalues.go deleted file mode 100644 index a819ca239..000000000 --- a/index/scorch/segment/zap/docvalues.go +++ /dev/null @@ -1,311 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "bytes" - "encoding/binary" - "fmt" - "math" - "reflect" - "sort" - - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" - "github.com/golang/snappy" -) - -var reflectStaticSizedocValueReader int - -func init() { - var dvi docValueReader - reflectStaticSizedocValueReader = int(reflect.TypeOf(dvi).Size()) -} - -type docNumTermsVisitor func(docNum uint64, terms []byte) error - -type docVisitState struct { - dvrs map[uint16]*docValueReader - segment *SegmentBase -} - -type docValueReader struct { - field string - curChunkNum uint64 - chunkOffsets []uint64 - dvDataLoc uint64 - curChunkHeader []MetaData - curChunkData []byte // compressed data cache - uncompressed []byte // temp buf for snappy decompression -} - -func (di *docValueReader) size() int { - return reflectStaticSizedocValueReader + size.SizeOfPtr + - len(di.field) + - len(di.chunkOffsets)*size.SizeOfUint64 + - len(di.curChunkHeader)*reflectStaticSizeMetaData + - len(di.curChunkData) -} - -func (di *docValueReader) cloneInto(rv *docValueReader) *docValueReader { - if rv == nil { - rv = &docValueReader{} - } - - rv.field = di.field - rv.curChunkNum = math.MaxUint64 - rv.chunkOffsets = di.chunkOffsets // immutable, so it's sharable - rv.dvDataLoc = di.dvDataLoc - rv.curChunkHeader = rv.curChunkHeader[:0] - rv.curChunkData = nil - rv.uncompressed = rv.uncompressed[:0] - - return rv -} - -func (di *docValueReader) fieldName() string { - return di.field -} - -func (di *docValueReader) curChunkNumber() uint64 { - return di.curChunkNum -} - -func (s *SegmentBase) loadFieldDocValueReader(field string, - fieldDvLocStart, fieldDvLocEnd uint64) (*docValueReader, error) { - // get the docValue offset for the given fields - if fieldDvLocStart == fieldNotUninverted { - // no docValues found, nothing to do - return nil, nil - } - - // read the number of chunks, and chunk offsets position - var numChunks, 
chunkOffsetsPosition uint64 - - if fieldDvLocEnd-fieldDvLocStart > 16 { - numChunks = binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-8 : fieldDvLocEnd]) - // read the length of chunk offsets - chunkOffsetsLen := binary.BigEndian.Uint64(s.mem[fieldDvLocEnd-16 : fieldDvLocEnd-8]) - // acquire position of chunk offsets - chunkOffsetsPosition = (fieldDvLocEnd - 16) - chunkOffsetsLen - } else { - return nil, fmt.Errorf("loadFieldDocValueReader: fieldDvLoc too small: %d-%d", fieldDvLocEnd, fieldDvLocStart) - } - - fdvIter := &docValueReader{ - curChunkNum: math.MaxUint64, - field: field, - chunkOffsets: make([]uint64, int(numChunks)), - } - - // read the chunk offsets - var offset uint64 - for i := 0; i < int(numChunks); i++ { - loc, read := binary.Uvarint(s.mem[chunkOffsetsPosition+offset : chunkOffsetsPosition+offset+binary.MaxVarintLen64]) - if read <= 0 { - return nil, fmt.Errorf("corrupted chunk offset during segment load") - } - fdvIter.chunkOffsets[i] = loc - offset += uint64(read) - } - - // set the data offset - fdvIter.dvDataLoc = fieldDvLocStart - - return fdvIter, nil -} - -func (di *docValueReader) loadDvChunk(chunkNumber uint64, s *SegmentBase) error { - // advance to the chunk where the docValues - // reside for the given docNum - destChunkDataLoc, curChunkEnd := di.dvDataLoc, di.dvDataLoc - start, end := readChunkBoundary(int(chunkNumber), di.chunkOffsets) - if start >= end { - di.curChunkHeader = di.curChunkHeader[:0] - di.curChunkData = nil - di.curChunkNum = chunkNumber - di.uncompressed = di.uncompressed[:0] - return nil - } - - destChunkDataLoc += start - curChunkEnd += end - - // read the number of docs reside in the chunk - numDocs, read := binary.Uvarint(s.mem[destChunkDataLoc : destChunkDataLoc+binary.MaxVarintLen64]) - if read <= 0 { - return fmt.Errorf("failed to read the chunk") - } - chunkMetaLoc := destChunkDataLoc + uint64(read) - - offset := uint64(0) - if cap(di.curChunkHeader) < int(numDocs) { - di.curChunkHeader = make([]MetaData, 
int(numDocs)) - } else { - di.curChunkHeader = di.curChunkHeader[:int(numDocs)] - } - for i := 0; i < int(numDocs); i++ { - di.curChunkHeader[i].DocNum, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(read) - di.curChunkHeader[i].DocDvOffset, read = binary.Uvarint(s.mem[chunkMetaLoc+offset : chunkMetaLoc+offset+binary.MaxVarintLen64]) - offset += uint64(read) - } - - compressedDataLoc := chunkMetaLoc + offset - dataLength := curChunkEnd - compressedDataLoc - di.curChunkData = s.mem[compressedDataLoc : compressedDataLoc+dataLength] - di.curChunkNum = chunkNumber - di.uncompressed = di.uncompressed[:0] - return nil -} - -func (di *docValueReader) iterateAllDocValues(s *SegmentBase, visitor docNumTermsVisitor) error { - for i := 0; i < len(di.chunkOffsets); i++ { - err := di.loadDvChunk(uint64(i), s) - if err != nil { - return err - } - if di.curChunkData == nil || len(di.curChunkHeader) == 0 { - continue - } - - // uncompress the already loaded data - uncompressed, err := snappy.Decode(di.uncompressed[:cap(di.uncompressed)], di.curChunkData) - if err != nil { - return err - } - di.uncompressed = uncompressed - - start := uint64(0) - for _, entry := range di.curChunkHeader { - err = visitor(entry.DocNum, uncompressed[start:entry.DocDvOffset]) - if err != nil { - return err - } - - start = entry.DocDvOffset - } - } - - return nil -} - -func (di *docValueReader) visitDocValues(docNum uint64, - visitor index.DocumentFieldTermVisitor) error { - // binary search the term locations for the docNum - start, end := di.getDocValueLocs(docNum) - if start == math.MaxUint64 || end == math.MaxUint64 || start == end { - return nil - } - - var uncompressed []byte - var err error - // use the uncompressed copy if available - if len(di.uncompressed) > 0 { - uncompressed = di.uncompressed - } else { - // uncompress the already loaded data - uncompressed, err = snappy.Decode(di.uncompressed[:cap(di.uncompressed)], 
di.curChunkData) - if err != nil { - return err - } - di.uncompressed = uncompressed - } - - // pick the terms for the given docNum - uncompressed = uncompressed[start:end] - for { - i := bytes.Index(uncompressed, termSeparatorSplitSlice) - if i < 0 { - break - } - - visitor(di.field, uncompressed[0:i]) - uncompressed = uncompressed[i+1:] - } - - return nil -} - -func (di *docValueReader) getDocValueLocs(docNum uint64) (uint64, uint64) { - i := sort.Search(len(di.curChunkHeader), func(i int) bool { - return di.curChunkHeader[i].DocNum >= docNum - }) - if i < len(di.curChunkHeader) && di.curChunkHeader[i].DocNum == docNum { - return ReadDocValueBoundary(i, di.curChunkHeader) - } - return math.MaxUint64, math.MaxUint64 -} - -// VisitDocumentFieldTerms is an implementation of the -// DocumentFieldTermVisitable interface -func (s *SegmentBase) VisitDocumentFieldTerms(localDocNum uint64, fields []string, - visitor index.DocumentFieldTermVisitor, dvsIn segment.DocVisitState) ( - segment.DocVisitState, error) { - dvs, ok := dvsIn.(*docVisitState) - if !ok || dvs == nil { - dvs = &docVisitState{} - } else { - if dvs.segment != s { - dvs.segment = s - dvs.dvrs = nil - } - } - - var fieldIDPlus1 uint16 - if dvs.dvrs == nil { - dvs.dvrs = make(map[uint16]*docValueReader, len(fields)) - for _, field := range fields { - if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { - continue - } - fieldID := fieldIDPlus1 - 1 - if dvIter, exists := s.fieldDvReaders[fieldID]; exists && - dvIter != nil { - dvs.dvrs[fieldID] = dvIter.cloneInto(dvs.dvrs[fieldID]) - } - } - } - - // find the chunkNumber where the docValues are stored - docInChunk := localDocNum / uint64(s.chunkFactor) - var dvr *docValueReader - for _, field := range fields { - if fieldIDPlus1, ok = s.fieldsMap[field]; !ok { - continue - } - fieldID := fieldIDPlus1 - 1 - if dvr, ok = dvs.dvrs[fieldID]; ok && dvr != nil { - // check if the chunk is already loaded - if docInChunk != dvr.curChunkNumber() { - err := 
dvr.loadDvChunk(docInChunk, s) - if err != nil { - return dvs, err - } - } - - _ = dvr.visitDocValues(localDocNum, visitor) - } - } - return dvs, nil -} - -// VisitableDocValueFields returns the list of fields with -// persisted doc value terms ready to be visitable using the -// VisitDocumentFieldTerms method. -func (s *SegmentBase) VisitableDocValueFields() ([]string, error) { - return s.fieldDvNames, nil -} diff --git a/index/scorch/segment/zap/enumerator.go b/index/scorch/segment/zap/enumerator.go deleted file mode 100644 index cd6ff73c7..000000000 --- a/index/scorch/segment/zap/enumerator.go +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - - "github.com/couchbase/vellum" -) - -// enumerator provides an ordered traversal of multiple vellum -// iterators. Like JOIN of iterators, the enumerator produces a -// sequence of (key, iteratorIndex, value) tuples, sorted by key ASC, -// then iteratorIndex ASC, where the same key might be seen or -// repeated across multiple child iterators. 
-type enumerator struct { - itrs []vellum.Iterator - currKs [][]byte - currVs []uint64 - - lowK []byte - lowIdxs []int - lowCurr int -} - -// newEnumerator returns a new enumerator over the vellum Iterators -func newEnumerator(itrs []vellum.Iterator) (*enumerator, error) { - rv := &enumerator{ - itrs: itrs, - currKs: make([][]byte, len(itrs)), - currVs: make([]uint64, len(itrs)), - lowIdxs: make([]int, 0, len(itrs)), - } - for i, itr := range rv.itrs { - rv.currKs[i], rv.currVs[i] = itr.Current() - } - rv.updateMatches(false) - if rv.lowK == nil && len(rv.lowIdxs) == 0 { - return rv, vellum.ErrIteratorDone - } - return rv, nil -} - -// updateMatches maintains the low key matches based on the currKs -func (m *enumerator) updateMatches(skipEmptyKey bool) { - m.lowK = nil - m.lowIdxs = m.lowIdxs[:0] - m.lowCurr = 0 - - for i, key := range m.currKs { - if (key == nil && m.currVs[i] == 0) || // in case of empty iterator - (len(key) == 0 && skipEmptyKey) { // skip empty keys - continue - } - - cmp := bytes.Compare(key, m.lowK) - if cmp < 0 || len(m.lowIdxs) == 0 { - // reached a new low - m.lowK = key - m.lowIdxs = m.lowIdxs[:0] - m.lowIdxs = append(m.lowIdxs, i) - } else if cmp == 0 { - m.lowIdxs = append(m.lowIdxs, i) - } - } -} - -// Current returns the enumerator's current key, iterator-index, and -// value. If the enumerator is not pointing at a valid value (because -// Next returned an error previously), Current will return nil,0,0. -func (m *enumerator) Current() ([]byte, int, uint64) { - var i int - var v uint64 - if m.lowCurr < len(m.lowIdxs) { - i = m.lowIdxs[m.lowCurr] - v = m.currVs[i] - } - return m.lowK, i, v -} - -// Next advances the enumerator to the next key/iterator/value result, -// else vellum.ErrIteratorDone is returned. 
-func (m *enumerator) Next() error { - m.lowCurr += 1 - if m.lowCurr >= len(m.lowIdxs) { - // move all the current low iterators forwards - for _, vi := range m.lowIdxs { - err := m.itrs[vi].Next() - if err != nil && err != vellum.ErrIteratorDone { - return err - } - m.currKs[vi], m.currVs[vi] = m.itrs[vi].Current() - } - // can skip any empty keys encountered at this point - m.updateMatches(true) - } - if m.lowK == nil && len(m.lowIdxs) == 0 { - return vellum.ErrIteratorDone - } - return nil -} - -// Close all the underlying Iterators. The first error, if any, will -// be returned. -func (m *enumerator) Close() error { - var rv error - for _, itr := range m.itrs { - err := itr.Close() - if rv == nil { - rv = err - } - } - return rv -} diff --git a/index/scorch/segment/zap/enumerator_test.go b/index/scorch/segment/zap/enumerator_test.go deleted file mode 100644 index 4ba4aa4c0..000000000 --- a/index/scorch/segment/zap/enumerator_test.go +++ /dev/null @@ -1,237 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or -// implied. See the License for the specific language governing -// permissions and limitations under the License. 
- -package zap - -import ( - "fmt" - "testing" - - "github.com/couchbase/vellum" -) - -type enumTestEntry struct { - key string - val uint64 -} - -type enumTestWant struct { - key string - idx int - val uint64 -} - -func TestEnumerator(t *testing.T) { - tests := []struct { - desc string - in [][]enumTestEntry - want []enumTestWant - }{ - { - desc: "two non-empty enumerators with no duplicate keys", - in: [][]enumTestEntry{ - []enumTestEntry{ - {"a", 1}, - {"c", 3}, - {"e", 5}, - }, - []enumTestEntry{ - {"b", 2}, - {"d", 4}, - {"f", 6}, - }, - }, - want: []enumTestWant{ - {"a", 0, 1}, - {"b", 1, 2}, - {"c", 0, 3}, - {"d", 1, 4}, - {"e", 0, 5}, - {"f", 1, 6}, - }, - }, - { - desc: "two non-empty enumerators with duplicate keys", - in: [][]enumTestEntry{ - []enumTestEntry{ - {"a", 1}, - {"c", 3}, - {"e", 5}, - }, - []enumTestEntry{ - {"a", 2}, - {"c", 4}, - {"e", 6}, - }, - }, - want: []enumTestWant{ - {"a", 0, 1}, - {"a", 1, 2}, - {"c", 0, 3}, - {"c", 1, 4}, - {"e", 0, 5}, - {"e", 1, 6}, - }, - }, - { - desc: "first iterator is empty", - in: [][]enumTestEntry{ - []enumTestEntry{}, - []enumTestEntry{ - {"a", 2}, - {"c", 4}, - {"e", 6}, - }, - }, - want: []enumTestWant{ - {"a", 1, 2}, - {"c", 1, 4}, - {"e", 1, 6}, - }, - }, - { - desc: "last iterator is empty", - in: [][]enumTestEntry{ - []enumTestEntry{ - {"a", 1}, - {"c", 3}, - {"e", 5}, - }, - []enumTestEntry{}, - }, - want: []enumTestWant{ - {"a", 0, 1}, - {"c", 0, 3}, - {"e", 0, 5}, - }, - }, - { - desc: "two different length enumerators with duplicate keys", - in: [][]enumTestEntry{ - []enumTestEntry{ - {"a", 1}, - {"c", 3}, - {"e", 5}, - }, - []enumTestEntry{ - {"a", 2}, - {"b", 4}, - {"d", 1000}, - {"e", 6}, - }, - }, - want: []enumTestWant{ - {"a", 0, 1}, - {"a", 1, 2}, - {"b", 1, 4}, - {"c", 0, 3}, - {"d", 1, 1000}, - {"e", 0, 5}, - {"e", 1, 6}, - }, - }, - } - - for _, test := range tests { - var itrs []vellum.Iterator - for _, entries := range test.in { - itrs = append(itrs, &testIterator{entries: entries}) 
- } - - enumerator, err := newEnumerator(itrs) - if err != nil { - t.Fatalf("%s - expected no err on newNumerator, got: %v", test.desc, err) - } - - wanti := 0 - for wanti < len(test.want) { - if err != nil { - t.Fatalf("%s - wanted no err, got: %v", test.desc, err) - } - - currK, currIdx, currV := enumerator.Current() - - want := test.want[wanti] - if want.key != string(currK) { - t.Fatalf("%s - wrong key, wanted: %#v, got: %q, %d, %d", test.desc, - want, currK, currIdx, currV) - } - if want.idx != currIdx { - t.Fatalf("%s - wrong idx, wanted: %#v, got: %q, %d, %d", test.desc, - want, currK, currIdx, currV) - } - if want.val != currV { - t.Fatalf("%s - wrong val, wanted: %#v, got: %q, %d, %d", test.desc, - want, currK, currIdx, currV) - } - - wanti += 1 - - err = enumerator.Next() - } - - if err != vellum.ErrIteratorDone { - t.Fatalf("%s - expected ErrIteratorDone, got: %v", test.desc, err) - } - - err = enumerator.Close() - if err != nil { - t.Fatalf("%s - expected nil err on close, got: %v", test.desc, err) - } - - for _, itr := range itrs { - if itr.(*testIterator).curr != 654321 { - t.Fatalf("%s - expected child iter to be closed", test.desc) - } - } - } -} - -type testIterator struct { - entries []enumTestEntry - curr int -} - -func (m *testIterator) Current() ([]byte, uint64) { - if m.curr >= len(m.entries) { - return nil, 0 - } - return []byte(m.entries[m.curr].key), m.entries[m.curr].val -} - -func (m *testIterator) Next() error { - m.curr++ - if m.curr >= len(m.entries) { - return vellum.ErrIteratorDone - } - return nil -} - -func (m *testIterator) Seek(key []byte) error { - return fmt.Errorf("not implemented for enumerator unit tests") -} - -func (m *testIterator) Reset(f *vellum.FST, - startKeyInclusive, endKeyExclusive []byte, aut vellum.Automaton) error { - return fmt.Errorf("not implemented for enumerator unit tests") -} - -func (m *testIterator) Close() error { - m.curr = 654321 - return nil -} - -func (m *testIterator) Exists(key []byte) (bool, 
error) { - return false, fmt.Errorf("not implemented for enumerator unit tests") -} diff --git a/index/scorch/segment/zap/intcoder.go b/index/scorch/segment/zap/intcoder.go deleted file mode 100644 index 571d06edb..000000000 --- a/index/scorch/segment/zap/intcoder.go +++ /dev/null @@ -1,172 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "encoding/binary" - "io" -) - -type chunkedIntCoder struct { - final []byte - chunkSize uint64 - chunkBuf bytes.Buffer - chunkLens []uint64 - currChunk uint64 - - buf []byte -} - -// newChunkedIntCoder returns a new chunk int coder which packs data into -// chunks based on the provided chunkSize and supports up to the specified -// maxDocNum -func newChunkedIntCoder(chunkSize uint64, maxDocNum uint64) *chunkedIntCoder { - total := maxDocNum/chunkSize + 1 - rv := &chunkedIntCoder{ - chunkSize: chunkSize, - chunkLens: make([]uint64, total), - final: make([]byte, 0, 64), - } - - return rv -} - -// Reset lets you reuse this chunked int coder. buffers are reset and reused -// from previous use. you cannot change the chunk size or max doc num. -func (c *chunkedIntCoder) Reset() { - c.final = c.final[:0] - c.chunkBuf.Reset() - c.currChunk = 0 - for i := range c.chunkLens { - c.chunkLens[i] = 0 - } -} - -// Add encodes the provided integers into the correct chunk for the provided -// doc num. You MUST call Add() with increasing docNums. 
-func (c *chunkedIntCoder) Add(docNum uint64, vals ...uint64) error { - chunk := docNum / c.chunkSize - if chunk != c.currChunk { - // starting a new chunk - c.Close() - c.chunkBuf.Reset() - c.currChunk = chunk - } - - if len(c.buf) < binary.MaxVarintLen64 { - c.buf = make([]byte, binary.MaxVarintLen64) - } - - for _, val := range vals { - wb := binary.PutUvarint(c.buf, val) - _, err := c.chunkBuf.Write(c.buf[:wb]) - if err != nil { - return err - } - } - - return nil -} - -func (c *chunkedIntCoder) AddBytes(docNum uint64, buf []byte) error { - chunk := docNum / c.chunkSize - if chunk != c.currChunk { - // starting a new chunk - c.Close() - c.chunkBuf.Reset() - c.currChunk = chunk - } - - _, err := c.chunkBuf.Write(buf) - return err -} - -// Close indicates you are done calling Add() this allows the final chunk -// to be encoded. -func (c *chunkedIntCoder) Close() { - encodingBytes := c.chunkBuf.Bytes() - c.chunkLens[c.currChunk] = uint64(len(encodingBytes)) - c.final = append(c.final, encodingBytes...) - c.currChunk = uint64(cap(c.chunkLens)) // sentinel to detect double close -} - -// Write commits all the encoded chunked integers to the provided writer. 
-func (c *chunkedIntCoder) Write(w io.Writer) (int, error) { - bufNeeded := binary.MaxVarintLen64 * (1 + len(c.chunkLens)) - if len(c.buf) < bufNeeded { - c.buf = make([]byte, bufNeeded) - } - buf := c.buf - - // convert the chunk lengths into chunk offsets - chunkOffsets := modifyLengthsToEndOffsets(c.chunkLens) - - // write out the number of chunks & each chunk offsets - n := binary.PutUvarint(buf, uint64(len(chunkOffsets))) - for _, chunkOffset := range chunkOffsets { - n += binary.PutUvarint(buf[n:], chunkOffset) - } - - tw, err := w.Write(buf[:n]) - if err != nil { - return tw, err - } - - // write out the data - nw, err := w.Write(c.final) - tw += nw - if err != nil { - return tw, err - } - return tw, nil -} - -func (c *chunkedIntCoder) FinalSize() int { - return len(c.final) -} - -// modifyLengthsToEndOffsets converts the chunk length array -// to a chunk offset array. The readChunkBoundary -// will figure out the start and end of every chunk from -// these offsets. Starting offset of i'th index is stored -// in i-1'th position except for 0'th index and ending offset -// is stored at i'th index position. -// For 0'th element, starting position is always zero. 
-// eg: -// Lens -> 5 5 5 5 => 5 10 15 20 -// Lens -> 0 5 0 5 => 0 5 5 10 -// Lens -> 0 0 0 5 => 0 0 0 5 -// Lens -> 5 0 0 0 => 5 5 5 5 -// Lens -> 0 5 0 0 => 0 5 5 5 -// Lens -> 0 0 5 0 => 0 0 5 5 -func modifyLengthsToEndOffsets(lengths []uint64) []uint64 { - var runningOffset uint64 - var index, i int - for i = 1; i <= len(lengths); i++ { - runningOffset += lengths[i-1] - lengths[index] = runningOffset - index++ - } - return lengths -} - -func readChunkBoundary(chunk int, offsets []uint64) (uint64, uint64) { - var start uint64 - if chunk > 0 { - start = offsets[chunk-1] - } - return start, offsets[chunk] -} diff --git a/index/scorch/segment/zap/intcoder_test.go b/index/scorch/segment/zap/intcoder_test.go deleted file mode 100644 index 952e0669d..000000000 --- a/index/scorch/segment/zap/intcoder_test.go +++ /dev/null @@ -1,269 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "bytes" - "reflect" - "testing" -) - -func TestChunkIntCoder(t *testing.T) { - tests := []struct { - maxDocNum uint64 - chunkSize uint64 - docNums []uint64 - vals [][]uint64 - expected []byte - }{ - { - maxDocNum: 0, - chunkSize: 1, - docNums: []uint64{0}, - vals: [][]uint64{ - []uint64{3}, - }, - // 1 chunk, chunk-0 length 1, value 3 - expected: []byte{0x1, 0x1, 0x3}, - }, - { - maxDocNum: 1, - chunkSize: 1, - docNums: []uint64{0, 1}, - vals: [][]uint64{ - []uint64{3}, - []uint64{7}, - }, - // 2 chunks, chunk-0 offset 1, chunk-1 offset 2, value 3, value 7 - expected: []byte{0x2, 0x1, 0x2, 0x3, 0x7}, - }, - } - - for _, test := range tests { - - cic := newChunkedIntCoder(test.chunkSize, test.maxDocNum) - for i, docNum := range test.docNums { - err := cic.Add(docNum, test.vals[i]...) - if err != nil { - t.Fatalf("error adding to intcoder: %v", err) - } - } - cic.Close() - var actual bytes.Buffer - _, err := cic.Write(&actual) - if err != nil { - t.Fatalf("error writing: %v", err) - } - if !reflect.DeepEqual(test.expected, actual.Bytes()) { - t.Errorf("got % x, expected % x", actual.Bytes(), test.expected) - } - } -} - -func TestChunkLengthToOffsets(t *testing.T) { - - tests := []struct { - lengths []uint64 - expectedOffsets []uint64 - }{ - { - lengths: []uint64{5, 5, 5, 5, 5}, - expectedOffsets: []uint64{5, 10, 15, 20, 25}, - }, - { - lengths: []uint64{0, 5, 0, 5, 0}, - expectedOffsets: []uint64{0, 5, 5, 10, 10}, - }, - { - lengths: []uint64{0, 0, 0, 0, 5}, - expectedOffsets: []uint64{0, 0, 0, 0, 5}, - }, - { - lengths: []uint64{5, 0, 0, 0, 0}, - expectedOffsets: []uint64{5, 5, 5, 5, 5}, - }, - { - lengths: []uint64{0, 5, 0, 0, 0}, - expectedOffsets: []uint64{0, 5, 5, 5, 5}, - }, - { - lengths: []uint64{0, 0, 0, 5, 0}, - expectedOffsets: []uint64{0, 0, 0, 5, 5}, - }, - { - lengths: []uint64{0, 0, 0, 5, 5}, - expectedOffsets: []uint64{0, 0, 0, 5, 10}, - }, - { - lengths: []uint64{5, 5, 5, 0, 0}, - expectedOffsets: []uint64{5, 10, 15, 15, 
15}, - }, - { - lengths: []uint64{5}, - expectedOffsets: []uint64{5}, - }, - { - lengths: []uint64{5, 5}, - expectedOffsets: []uint64{5, 10}, - }, - } - - for i, test := range tests { - modifyLengthsToEndOffsets(test.lengths) - if !reflect.DeepEqual(test.expectedOffsets, test.lengths) { - t.Errorf("Test: %d failed, got %+v, expected %+v", i, test.lengths, test.expectedOffsets) - } - } -} - -func TestChunkReadBoundaryFromOffsets(t *testing.T) { - - tests := []struct { - chunkNumber int - offsets []uint64 - expectedStart uint64 - expectedEnd uint64 - }{ - { - offsets: []uint64{5, 10, 15, 20, 25}, - chunkNumber: 4, - expectedStart: 20, - expectedEnd: 25, - }, - { - offsets: []uint64{5, 10, 15, 20, 25}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 10, 15, 20, 25}, - chunkNumber: 2, - expectedStart: 10, - expectedEnd: 15, - }, - { - offsets: []uint64{0, 5, 5, 10, 10}, - chunkNumber: 4, - expectedStart: 10, - expectedEnd: 10, - }, - { - offsets: []uint64{0, 5, 5, 10, 10}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 5, 5, 5, 5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 5, 5, 5, 5}, - chunkNumber: 4, - expectedStart: 5, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 5, 5, 5, 5}, - chunkNumber: 1, - expectedStart: 5, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 5, 5, 5, 5}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 5, 5, 5, 5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{0, 0, 0, 5, 5}, - chunkNumber: 2, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{0, 0, 0, 5, 5}, - chunkNumber: 1, - expectedStart: 0, - expectedEnd: 0, - }, - { - offsets: []uint64{0, 0, 0, 0, 5}, - chunkNumber: 4, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{0, 0, 0, 0, 5}, - chunkNumber: 2, - expectedStart: 0, - expectedEnd: 0, - }, - { - 
offsets: []uint64{5, 10, 15, 15, 15}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 5, - }, - { - offsets: []uint64{5, 10, 15, 15, 15}, - chunkNumber: 1, - expectedStart: 5, - expectedEnd: 10, - }, - { - offsets: []uint64{5, 10, 15, 15, 15}, - chunkNumber: 2, - expectedStart: 10, - expectedEnd: 15, - }, - { - offsets: []uint64{5, 10, 15, 15, 15}, - chunkNumber: 3, - expectedStart: 15, - expectedEnd: 15, - }, - { - offsets: []uint64{5, 10, 15, 15, 15}, - chunkNumber: 4, - expectedStart: 15, - expectedEnd: 15, - }, - { - offsets: []uint64{5}, - chunkNumber: 0, - expectedStart: 0, - expectedEnd: 5, - }, - } - - for i, test := range tests { - s, e := readChunkBoundary(test.chunkNumber, test.offsets) - if test.expectedStart != s || test.expectedEnd != e { - t.Errorf("Test: %d failed for chunkNumber: %d got start: %d end: %d,"+ - " expected start: %d end: %d", i, test.chunkNumber, s, e, - test.expectedStart, test.expectedEnd) - } - } -} diff --git a/index/scorch/segment/zap/merge.go b/index/scorch/segment/zap/merge.go deleted file mode 100644 index 50bd7207a..000000000 --- a/index/scorch/segment/zap/merge.go +++ /dev/null @@ -1,862 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -package zap - -import ( - "bufio" - "bytes" - "encoding/binary" - "fmt" - "math" - "os" - "sort" - - "github.com/RoaringBitmap/roaring" - seg "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/couchbase/vellum" - "github.com/golang/snappy" -) - -var DefaultFileMergerBufferSize = 1024 * 1024 - -// ValidateMerge can be set by applications to perform additional checks -// on a new segment produced by a merge, by default this does nothing. -// Caller should provide EITHER segments or memSegments, but not both. -// This API is experimental and may be removed at any time. -var ValidateMerge = func(segments []*Segment, memSegments []*SegmentBase, drops []*roaring.Bitmap, newSegment *Segment) error { - return nil -} - -const docDropped = math.MaxUint64 // sentinel docNum to represent a deleted doc - -// Merge takes a slice of zap segments and bit masks describing which -// documents may be dropped, and creates a new segment containing the -// remaining data. This new segment is built at the specified path, -// with the provided chunkFactor. 
-func Merge(segments []*Segment, drops []*roaring.Bitmap, path string, - chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( - [][]uint64, uint64, error) { - segmentBases := make([]*SegmentBase, len(segments)) - for segmenti, segment := range segments { - segmentBases[segmenti] = &segment.SegmentBase - } - - return MergeSegmentBases(segmentBases, drops, path, chunkFactor, closeCh, s) -} - -func MergeSegmentBases(segmentBases []*SegmentBase, drops []*roaring.Bitmap, path string, - chunkFactor uint32, closeCh chan struct{}, s seg.StatsReporter) ( - [][]uint64, uint64, error) { - flag := os.O_RDWR | os.O_CREATE - - f, err := os.OpenFile(path, flag, 0600) - if err != nil { - return nil, 0, err - } - - cleanup := func() { - _ = f.Close() - _ = os.Remove(path) - } - - // buffer the output - br := bufio.NewWriterSize(f, DefaultFileMergerBufferSize) - - // wrap it for counting (tracking offsets) - cr := NewCountHashWriterWithStatsReporter(br, s) - - newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, _, _, _, err := - MergeToWriter(segmentBases, drops, chunkFactor, cr, closeCh) - if err != nil { - cleanup() - return nil, 0, err - } - - err = persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, - docValueOffset, chunkFactor, cr.Sum32(), cr) - if err != nil { - cleanup() - return nil, 0, err - } - - err = br.Flush() - if err != nil { - cleanup() - return nil, 0, err - } - - err = f.Sync() - if err != nil { - cleanup() - return nil, 0, err - } - - err = f.Close() - if err != nil { - cleanup() - return nil, 0, err - } - - return newDocNums, uint64(cr.Count()), nil -} - -func MergeToWriter(segments []*SegmentBase, drops []*roaring.Bitmap, - chunkFactor uint32, cr *CountHashWriter, closeCh chan struct{}) ( - newDocNums [][]uint64, - numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, - dictLocs []uint64, fieldsInv []string, fieldsMap map[string]uint16, - err error) { - docValueOffset = uint64(fieldNotUninverted) - - 
var fieldsSame bool - fieldsSame, fieldsInv = mergeFields(segments) - fieldsMap = mapFields(fieldsInv) - - numDocs = computeNewDocCount(segments, drops) - - if isClosed(closeCh) { - return nil, 0, 0, 0, 0, nil, nil, nil, seg.ErrClosed - } - - if numDocs > 0 { - storedIndexOffset, newDocNums, err = mergeStoredAndRemap(segments, drops, - fieldsMap, fieldsInv, fieldsSame, numDocs, cr, closeCh) - if err != nil { - return nil, 0, 0, 0, 0, nil, nil, nil, err - } - - dictLocs, docValueOffset, err = persistMergedRest(segments, drops, - fieldsInv, fieldsMap, fieldsSame, - newDocNums, numDocs, chunkFactor, cr, closeCh) - if err != nil { - return nil, 0, 0, 0, 0, nil, nil, nil, err - } - } else { - dictLocs = make([]uint64, len(fieldsInv)) - } - - fieldsIndexOffset, err = persistFields(fieldsInv, cr, dictLocs) - if err != nil { - return nil, 0, 0, 0, 0, nil, nil, nil, err - } - - return newDocNums, numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset, dictLocs, fieldsInv, fieldsMap, nil -} - -// mapFields takes the fieldsInv list and returns a map of fieldName -// to fieldID+1 -func mapFields(fields []string) map[string]uint16 { - rv := make(map[string]uint16, len(fields)) - for i, fieldName := range fields { - rv[fieldName] = uint16(i) + 1 - } - return rv -} - -// computeNewDocCount determines how many documents will be in the newly -// merged segment when obsoleted docs are dropped -func computeNewDocCount(segments []*SegmentBase, drops []*roaring.Bitmap) uint64 { - var newDocCount uint64 - for segI, segment := range segments { - newDocCount += segment.numDocs - if drops[segI] != nil { - newDocCount -= drops[segI].GetCardinality() - } - } - return newDocCount -} - -func persistMergedRest(segments []*SegmentBase, dropsIn []*roaring.Bitmap, - fieldsInv []string, fieldsMap map[string]uint16, fieldsSame bool, - newDocNumsIn [][]uint64, newSegDocCount uint64, chunkFactor uint32, - w *CountHashWriter, closeCh chan struct{}) ([]uint64, uint64, error) { - - var 
bufMaxVarintLen64 []byte = make([]byte, binary.MaxVarintLen64) - var bufLoc []uint64 - - var postings *PostingsList - var postItr *PostingsIterator - - rv := make([]uint64, len(fieldsInv)) - fieldDvLocsStart := make([]uint64, len(fieldsInv)) - fieldDvLocsEnd := make([]uint64, len(fieldsInv)) - - tfEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - locEncoder := newChunkedIntCoder(uint64(chunkFactor), newSegDocCount-1) - - var vellumBuf bytes.Buffer - newVellum, err := vellum.New(&vellumBuf, nil) - if err != nil { - return nil, 0, err - } - - newRoaring := roaring.NewBitmap() - - // for each field - for fieldID, fieldName := range fieldsInv { - - // collect FST iterators from all active segments for this field - var newDocNums [][]uint64 - var drops []*roaring.Bitmap - var dicts []*Dictionary - var itrs []vellum.Iterator - - var segmentsInFocus []*SegmentBase - - for segmentI, segment := range segments { - - // check for the closure in meantime - if isClosed(closeCh) { - return nil, 0, seg.ErrClosed - } - - dict, err2 := segment.dictionary(fieldName) - if err2 != nil { - return nil, 0, err2 - } - if dict != nil && dict.fst != nil { - itr, err2 := dict.fst.Iterator(nil, nil) - if err2 != nil && err2 != vellum.ErrIteratorDone { - return nil, 0, err2 - } - if itr != nil { - newDocNums = append(newDocNums, newDocNumsIn[segmentI]) - if dropsIn[segmentI] != nil && !dropsIn[segmentI].IsEmpty() { - drops = append(drops, dropsIn[segmentI]) - } else { - drops = append(drops, nil) - } - dicts = append(dicts, dict) - itrs = append(itrs, itr) - segmentsInFocus = append(segmentsInFocus, segment) - } - } - } - - var prevTerm []byte - - newRoaring.Clear() - - var lastDocNum, lastFreq, lastNorm uint64 - - // determines whether to use "1-hit" encoding optimization - // when a term appears in only 1 doc, with no loc info, - // has freq of 1, and the docNum fits into 31-bits - use1HitEncoding := func(termCardinality uint64) (bool, uint64, uint64) { - if 
termCardinality == uint64(1) && locEncoder.FinalSize() <= 0 { - docNum := uint64(newRoaring.Minimum()) - if under32Bits(docNum) && docNum == lastDocNum && lastFreq == 1 { - return true, docNum, lastNorm - } - } - return false, 0, 0 - } - - finishTerm := func(term []byte) error { - tfEncoder.Close() - locEncoder.Close() - - postingsOffset, err := writePostings(newRoaring, - tfEncoder, locEncoder, use1HitEncoding, w, bufMaxVarintLen64) - if err != nil { - return err - } - - if postingsOffset > 0 { - err = newVellum.Insert(term, postingsOffset) - if err != nil { - return err - } - } - - newRoaring.Clear() - - tfEncoder.Reset() - locEncoder.Reset() - - lastDocNum = 0 - lastFreq = 0 - lastNorm = 0 - - return nil - } - - enumerator, err := newEnumerator(itrs) - - for err == nil { - term, itrI, postingsOffset := enumerator.Current() - - if !bytes.Equal(prevTerm, term) { - // check for the closure in meantime - if isClosed(closeCh) { - return nil, 0, seg.ErrClosed - } - - // if the term changed, write out the info collected - // for the previous term - err = finishTerm(prevTerm) - if err != nil { - return nil, 0, err - } - } - - postings, err = dicts[itrI].postingsListFromOffset( - postingsOffset, drops[itrI], postings) - if err != nil { - return nil, 0, err - } - - postItr = postings.iterator(true, true, true, postItr) - - if fieldsSame { - // can optimize by copying freq/norm/loc bytes directly - lastDocNum, lastFreq, lastNorm, err = mergeTermFreqNormLocsByCopying( - term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder) - } else { - lastDocNum, lastFreq, lastNorm, bufLoc, err = mergeTermFreqNormLocs( - fieldsMap, term, postItr, newDocNums[itrI], newRoaring, - tfEncoder, locEncoder, bufLoc) - } - if err != nil { - return nil, 0, err - } - - prevTerm = prevTerm[:0] // copy to prevTerm in case Next() reuses term mem - prevTerm = append(prevTerm, term...) 
- - err = enumerator.Next() - } - if err != vellum.ErrIteratorDone { - return nil, 0, err - } - - err = finishTerm(prevTerm) - if err != nil { - return nil, 0, err - } - - dictOffset := uint64(w.Count()) - - err = newVellum.Close() - if err != nil { - return nil, 0, err - } - vellumData := vellumBuf.Bytes() - - // write out the length of the vellum data - n := binary.PutUvarint(bufMaxVarintLen64, uint64(len(vellumData))) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return nil, 0, err - } - - // write this vellum to disk - _, err = w.Write(vellumData) - if err != nil { - return nil, 0, err - } - - rv[fieldID] = dictOffset - - // get the field doc value offset (start) - fieldDvLocsStart[fieldID] = uint64(w.Count()) - - // update the field doc values - fdvEncoder := newChunkedContentCoder(uint64(chunkFactor), newSegDocCount-1, w, true) - - fdvReadersAvailable := false - var dvIterClone *docValueReader - for segmentI, segment := range segmentsInFocus { - // check for the closure in meantime - if isClosed(closeCh) { - return nil, 0, seg.ErrClosed - } - - fieldIDPlus1 := uint16(segment.fieldsMap[fieldName]) - if dvIter, exists := segment.fieldDvReaders[fieldIDPlus1-1]; exists && - dvIter != nil { - fdvReadersAvailable = true - dvIterClone = dvIter.cloneInto(dvIterClone) - err = dvIterClone.iterateAllDocValues(segment, func(docNum uint64, terms []byte) error { - if newDocNums[segmentI][docNum] == docDropped { - return nil - } - err := fdvEncoder.Add(newDocNums[segmentI][docNum], terms) - if err != nil { - return err - } - return nil - }) - if err != nil { - return nil, 0, err - } - } - } - - if fdvReadersAvailable { - err = fdvEncoder.Close() - if err != nil { - return nil, 0, err - } - - // persist the doc value details for this field - _, err = fdvEncoder.Write() - if err != nil { - return nil, 0, err - } - - // get the field doc value offset (end) - fieldDvLocsEnd[fieldID] = uint64(w.Count()) - } else { - fieldDvLocsStart[fieldID] = fieldNotUninverted 
- fieldDvLocsEnd[fieldID] = fieldNotUninverted - } - - // reset vellum buffer and vellum builder - vellumBuf.Reset() - err = newVellum.Reset(&vellumBuf) - if err != nil { - return nil, 0, err - } - } - - fieldDvLocsOffset := uint64(w.Count()) - - buf := bufMaxVarintLen64 - for i := 0; i < len(fieldDvLocsStart); i++ { - n := binary.PutUvarint(buf, fieldDvLocsStart[i]) - _, err := w.Write(buf[:n]) - if err != nil { - return nil, 0, err - } - n = binary.PutUvarint(buf, fieldDvLocsEnd[i]) - _, err = w.Write(buf[:n]) - if err != nil { - return nil, 0, err - } - } - - return rv, fieldDvLocsOffset, nil -} - -func mergeTermFreqNormLocs(fieldsMap map[string]uint16, term []byte, postItr *PostingsIterator, - newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder, bufLoc []uint64) ( - lastDocNum uint64, lastFreq uint64, lastNorm uint64, bufLocOut []uint64, err error) { - next, err := postItr.Next() - for next != nil && err == nil { - hitNewDocNum := newDocNums[next.Number()] - if hitNewDocNum == docDropped { - return 0, 0, 0, nil, fmt.Errorf("see hit with dropped docNum") - } - - newRoaring.Add(uint32(hitNewDocNum)) - - nextFreq := next.Frequency() - nextNorm := uint64(math.Float32bits(float32(next.Norm()))) - - locs := next.Locations() - - err = tfEncoder.Add(hitNewDocNum, - encodeFreqHasLocs(nextFreq, len(locs) > 0), nextNorm) - if err != nil { - return 0, 0, 0, nil, err - } - - if len(locs) > 0 { - numBytesLocs := 0 - for _, loc := range locs { - ap := loc.ArrayPositions() - numBytesLocs += totalUvarintBytes(uint64(fieldsMap[loc.Field()]-1), - loc.Pos(), loc.Start(), loc.End(), uint64(len(ap)), ap) - } - - err = locEncoder.Add(hitNewDocNum, uint64(numBytesLocs)) - if err != nil { - return 0, 0, 0, nil, err - } - - for _, loc := range locs { - ap := loc.ArrayPositions() - if cap(bufLoc) < 5+len(ap) { - bufLoc = make([]uint64, 0, 5+len(ap)) - } - args := bufLoc[0:5] - args[0] = uint64(fieldsMap[loc.Field()] - 1) - args[1] = 
loc.Pos() - args[2] = loc.Start() - args[3] = loc.End() - args[4] = uint64(len(ap)) - args = append(args, ap...) - err = locEncoder.Add(hitNewDocNum, args...) - if err != nil { - return 0, 0, 0, nil, err - } - } - } - - lastDocNum = hitNewDocNum - lastFreq = nextFreq - lastNorm = nextNorm - - next, err = postItr.Next() - } - - return lastDocNum, lastFreq, lastNorm, bufLoc, err -} - -func mergeTermFreqNormLocsByCopying(term []byte, postItr *PostingsIterator, - newDocNums []uint64, newRoaring *roaring.Bitmap, - tfEncoder *chunkedIntCoder, locEncoder *chunkedIntCoder) ( - lastDocNum uint64, lastFreq uint64, lastNorm uint64, err error) { - nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err := - postItr.nextBytes() - for err == nil && len(nextFreqNormBytes) > 0 { - hitNewDocNum := newDocNums[nextDocNum] - if hitNewDocNum == docDropped { - return 0, 0, 0, fmt.Errorf("see hit with dropped doc num") - } - - newRoaring.Add(uint32(hitNewDocNum)) - err = tfEncoder.AddBytes(hitNewDocNum, nextFreqNormBytes) - if err != nil { - return 0, 0, 0, err - } - - if len(nextLocBytes) > 0 { - err = locEncoder.AddBytes(hitNewDocNum, nextLocBytes) - if err != nil { - return 0, 0, 0, err - } - } - - lastDocNum = hitNewDocNum - lastFreq = nextFreq - lastNorm = nextNorm - - nextDocNum, nextFreq, nextNorm, nextFreqNormBytes, nextLocBytes, err = - postItr.nextBytes() - } - - return lastDocNum, lastFreq, lastNorm, err -} - -func writePostings(postings *roaring.Bitmap, tfEncoder, locEncoder *chunkedIntCoder, - use1HitEncoding func(uint64) (bool, uint64, uint64), - w *CountHashWriter, bufMaxVarintLen64 []byte) ( - offset uint64, err error) { - termCardinality := postings.GetCardinality() - if termCardinality <= 0 { - return 0, nil - } - - if use1HitEncoding != nil { - encodeAs1Hit, docNum1Hit, normBits1Hit := use1HitEncoding(termCardinality) - if encodeAs1Hit { - return FSTValEncode1Hit(docNum1Hit, normBits1Hit), nil - } - } - - tfOffset := uint64(w.Count()) - _, err = 
tfEncoder.Write(w) - if err != nil { - return 0, err - } - - locOffset := uint64(w.Count()) - _, err = locEncoder.Write(w) - if err != nil { - return 0, err - } - - postingsOffset := uint64(w.Count()) - - n := binary.PutUvarint(bufMaxVarintLen64, tfOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return 0, err - } - - n = binary.PutUvarint(bufMaxVarintLen64, locOffset) - _, err = w.Write(bufMaxVarintLen64[:n]) - if err != nil { - return 0, err - } - - _, err = writeRoaringWithLen(postings, w, bufMaxVarintLen64) - if err != nil { - return 0, err - } - - return postingsOffset, nil -} - -type varintEncoder func(uint64) (int, error) - -func mergeStoredAndRemap(segments []*SegmentBase, drops []*roaring.Bitmap, - fieldsMap map[string]uint16, fieldsInv []string, fieldsSame bool, newSegDocCount uint64, - w *CountHashWriter, closeCh chan struct{}) (uint64, [][]uint64, error) { - var rv [][]uint64 // The remapped or newDocNums for each segment. - - var newDocNum uint64 - - var curr int - var data, compressed []byte - var metaBuf bytes.Buffer - varBuf := make([]byte, binary.MaxVarintLen64) - metaEncode := func(val uint64) (int, error) { - wb := binary.PutUvarint(varBuf, val) - return metaBuf.Write(varBuf[:wb]) - } - - vals := make([][][]byte, len(fieldsInv)) - typs := make([][]byte, len(fieldsInv)) - poss := make([][][]uint64, len(fieldsInv)) - - var posBuf []uint64 - - docNumOffsets := make([]uint64, newSegDocCount) - - vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) - defer visitDocumentCtxPool.Put(vdc) - - // for each segment - for segI, segment := range segments { - // check for the closure in meantime - if isClosed(closeCh) { - return 0, nil, seg.ErrClosed - } - - segNewDocNums := make([]uint64, segment.numDocs) - - dropsI := drops[segI] - - // optimize when the field mapping is the same across all - // segments and there are no deletions, via byte-copying - // of stored docs bytes directly to the writer - if fieldsSame && (dropsI == nil || 
dropsI.GetCardinality() == 0) { - err := segment.copyStoredDocs(newDocNum, docNumOffsets, w) - if err != nil { - return 0, nil, err - } - - for i := uint64(0); i < segment.numDocs; i++ { - segNewDocNums[i] = newDocNum - newDocNum++ - } - rv = append(rv, segNewDocNums) - - continue - } - - // for each doc num - for docNum := uint64(0); docNum < segment.numDocs; docNum++ { - // TODO: roaring's API limits docNums to 32-bits? - if dropsI != nil && dropsI.Contains(uint32(docNum)) { - segNewDocNums[docNum] = docDropped - continue - } - - segNewDocNums[docNum] = newDocNum - - curr = 0 - metaBuf.Reset() - data = data[:0] - - posTemp := posBuf - - // collect all the data - for i := 0; i < len(fieldsInv); i++ { - vals[i] = vals[i][:0] - typs[i] = typs[i][:0] - poss[i] = poss[i][:0] - } - err := segment.visitDocument(vdc, docNum, func(field string, typ byte, value []byte, pos []uint64) bool { - fieldID := int(fieldsMap[field]) - 1 - vals[fieldID] = append(vals[fieldID], value) - typs[fieldID] = append(typs[fieldID], typ) - - // copy array positions to preserve them beyond the scope of this callback - var curPos []uint64 - if len(pos) > 0 { - if cap(posTemp) < len(pos) { - posBuf = make([]uint64, len(pos)*len(fieldsInv)) - posTemp = posBuf - } - curPos = posTemp[0:len(pos)] - copy(curPos, pos) - posTemp = posTemp[len(pos):] - } - poss[fieldID] = append(poss[fieldID], curPos) - - return true - }) - if err != nil { - return 0, nil, err - } - - // _id field special case optimizes ExternalID() lookups - idFieldVal := vals[uint16(0)][0] - _, err = metaEncode(uint64(len(idFieldVal))) - if err != nil { - return 0, nil, err - } - - // now walk the non-"_id" fields in order - for fieldID := 1; fieldID < len(fieldsInv); fieldID++ { - storedFieldValues := vals[fieldID] - - stf := typs[fieldID] - spf := poss[fieldID] - - var err2 error - curr, data, err2 = persistStoredFieldValues(fieldID, - storedFieldValues, stf, spf, curr, metaEncode, data) - if err2 != nil { - return 0, nil, err2 - } 
- } - - metaBytes := metaBuf.Bytes() - - compressed = snappy.Encode(compressed[:cap(compressed)], data) - - // record where we're about to start writing - docNumOffsets[newDocNum] = uint64(w.Count()) - - // write out the meta len and compressed data len - _, err = writeUvarints(w, - uint64(len(metaBytes)), - uint64(len(idFieldVal)+len(compressed))) - if err != nil { - return 0, nil, err - } - // now write the meta - _, err = w.Write(metaBytes) - if err != nil { - return 0, nil, err - } - // now write the _id field val (counted as part of the 'compressed' data) - _, err = w.Write(idFieldVal) - if err != nil { - return 0, nil, err - } - // now write the compressed data - _, err = w.Write(compressed) - if err != nil { - return 0, nil, err - } - - newDocNum++ - } - - rv = append(rv, segNewDocNums) - } - - // return value is the start of the stored index - storedIndexOffset := uint64(w.Count()) - - // now write out the stored doc index - for _, docNumOffset := range docNumOffsets { - err := binary.Write(w, binary.BigEndian, docNumOffset) - if err != nil { - return 0, nil, err - } - } - - return storedIndexOffset, rv, nil -} - -// copyStoredDocs writes out a segment's stored doc info, optimized by -// using a single Write() call for the entire set of bytes. The -// newDocNumOffsets is filled with the new offsets for each doc. 
-func (s *SegmentBase) copyStoredDocs(newDocNum uint64, newDocNumOffsets []uint64, - w *CountHashWriter) error { - if s.numDocs <= 0 { - return nil - } - - indexOffset0, storedOffset0, _, _, _ := - s.getDocStoredOffsets(0) // the segment's first doc - - indexOffsetN, storedOffsetN, readN, metaLenN, dataLenN := - s.getDocStoredOffsets(s.numDocs - 1) // the segment's last doc - - storedOffset0New := uint64(w.Count()) - - storedBytes := s.mem[storedOffset0 : storedOffsetN+readN+metaLenN+dataLenN] - _, err := w.Write(storedBytes) - if err != nil { - return err - } - - // remap the storedOffset's for the docs into new offsets relative - // to storedOffset0New, filling the given docNumOffsetsOut array - for indexOffset := indexOffset0; indexOffset <= indexOffsetN; indexOffset += 8 { - storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) - storedOffsetNew := storedOffset - storedOffset0 + storedOffset0New - newDocNumOffsets[newDocNum] = storedOffsetNew - newDocNum += 1 - } - - return nil -} - -// mergeFields builds a unified list of fields used across all the -// input segments, and computes whether the fields are the same across -// segments (which depends on fields to be sorted in the same way -// across segments) -func mergeFields(segments []*SegmentBase) (bool, []string) { - fieldsSame := true - - var segment0Fields []string - if len(segments) > 0 { - segment0Fields = segments[0].Fields() - } - - fieldsExist := map[string]struct{}{} - for _, segment := range segments { - fields := segment.Fields() - for fieldi, field := range fields { - fieldsExist[field] = struct{}{} - if len(segment0Fields) != len(fields) || segment0Fields[fieldi] != field { - fieldsSame = false - } - } - } - - rv := make([]string, 0, len(fieldsExist)) - // ensure _id stays first - rv = append(rv, "_id") - for k := range fieldsExist { - if k != "_id" { - rv = append(rv, k) - } - } - - sort.Strings(rv[1:]) // leave _id as first - - return fieldsSame, rv -} - -func 
isClosed(closeCh chan struct{}) bool { - select { - case <-closeCh: - return true - default: - return false - } -} diff --git a/index/scorch/segment/zap/merge_test.go b/index/scorch/segment/zap/merge_test.go deleted file mode 100644 index 175671226..000000000 --- a/index/scorch/segment/zap/merge_test.go +++ /dev/null @@ -1,870 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "fmt" - "os" - "reflect" - "sort" - "strings" - "testing" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" -) - -func TestMerge(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - _ = os.RemoveAll("/tmp/scorch2.zap") - _ = os.RemoveAll("/tmp/scorch3.zap") - - testSeg, _, _ := buildTestSegmentMulti() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatal(err) - } - - testSeg2, _, _ := buildTestSegmentMulti2() - err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") - if err != nil { - t.Fatal(err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - segment2, err := Open("/tmp/scorch2.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := 
segment2.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - segsToMerge := make([]*Segment, 2) - segsToMerge[0] = segment.(*Segment) - segsToMerge[1] = segment2.(*Segment) - - _, _, err = Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - segm, err := Open("/tmp/scorch3.zap") - if err != nil { - t.Fatalf("error opening merged segment: %v", err) - } - seg3 := segm.(*Segment) - defer func() { - cerr := seg3.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if seg3.Path() != "/tmp/scorch3.zap" { - t.Fatalf("wrong path") - } - if seg3.Count() != 4 { - t.Fatalf("wrong count") - } - if len(seg3.Fields()) != 5 { - t.Fatalf("wrong # fields: %#v\n", seg3.Fields()) - } - - testMergeWithSelf(t, seg3, 4) -} - -func TestMergeWithEmptySegment(t *testing.T) { - testMergeWithEmptySegments(t, true, 1) -} - -func TestMergeWithEmptySegments(t *testing.T) { - testMergeWithEmptySegments(t, true, 5) -} - -func TestMergeWithEmptySegmentFirst(t *testing.T) { - testMergeWithEmptySegments(t, false, 1) -} - -func TestMergeWithEmptySegmentsFirst(t *testing.T) { - testMergeWithEmptySegments(t, false, 5) -} - -func testMergeWithEmptySegments(t *testing.T, before bool, numEmptySegments int) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentMulti() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatal(err) - } - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - var segsToMerge []*Segment - - if before { - segsToMerge = append(segsToMerge, segment.(*Segment)) - } - - for i := 0; i < numEmptySegments; i++ { - fname := fmt.Sprintf("scorch-empty-%d.zap", i) - - _ = os.RemoveAll("/tmp/" + fname) - - emptySegment, _, err := 
AnalysisResultsToSegmentBase([]*index.AnalysisResult{}, 1024) - if err != nil { - t.Fatal(err) - } - err = PersistSegmentBase(emptySegment, "/tmp/"+fname) - if err != nil { - t.Fatal(err) - } - - emptyFileSegment, err := Open("/tmp/" + fname) - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func(emptyFileSegment *Segment) { - cerr := emptyFileSegment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }(emptyFileSegment.(*Segment)) - - segsToMerge = append(segsToMerge, emptyFileSegment.(*Segment)) - } - - if !before { - segsToMerge = append(segsToMerge, segment.(*Segment)) - } - - _ = os.RemoveAll("/tmp/scorch3.zap") - - drops := make([]*roaring.Bitmap, len(segsToMerge)) - - _, _, err = Merge(segsToMerge, drops, "/tmp/scorch3.zap", 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - segm, err := Open("/tmp/scorch3.zap") - if err != nil { - t.Fatalf("error opening merged segment: %v", err) - } - segCur := segm.(*Segment) - defer func() { - cerr := segCur.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if segCur.Path() != "/tmp/scorch3.zap" { - t.Fatalf("wrong path") - } - if segCur.Count() != 2 { - t.Fatalf("wrong count, numEmptySegments: %d, got count: %d", numEmptySegments, segCur.Count()) - } - if len(segCur.Fields()) != 5 { - t.Fatalf("wrong # fields: %#v\n", segCur.Fields()) - } - - testMergeWithSelf(t, segCur, 2) -} - -func testMergeWithSelf(t *testing.T, segCur *Segment, expectedCount uint64) { - // trying merging the segment with itself for a few rounds - var diffs []string - - for i := 0; i < 10; i++ { - fname := fmt.Sprintf("scorch-self-%d.zap", i) - - _ = os.RemoveAll("/tmp/" + fname) - - segsToMerge := make([]*Segment, 1) - segsToMerge[0] = segCur - - _, _, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/"+fname, 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - segm, err := Open("/tmp/" + fname) - if err != nil { - t.Fatalf("error opening 
merged segment: %v", err) - } - segNew := segm.(*Segment) - defer func(s *Segment) { - cerr := s.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }(segNew) - - if segNew.Count() != expectedCount { - t.Fatalf("wrong count") - } - if len(segNew.Fields()) != 5 { - t.Fatalf("wrong # fields: %#v\n", segNew.Fields()) - } - - diff := compareSegments(segCur, segNew) - if diff != "" { - diffs = append(diffs, fname+" is different than previous:\n"+diff) - } - - segCur = segNew - } - - if len(diffs) > 0 { - t.Errorf("mismatches after repeated self-merging: %v", strings.Join(diffs, "\n")) - } -} - -func compareSegments(a, b *Segment) string { - var rv []string - - if a.Count() != b.Count() { - return "counts" - } - - afields := append([]string(nil), a.Fields()...) - bfields := append([]string(nil), b.Fields()...) - sort.Strings(afields) - sort.Strings(bfields) - if !reflect.DeepEqual(afields, bfields) { - return "fields" - } - - for _, fieldName := range afields { - adict, err := a.Dictionary(fieldName) - if err != nil { - return fmt.Sprintf("adict err: %v", err) - } - bdict, err := b.Dictionary(fieldName) - if err != nil { - return fmt.Sprintf("bdict err: %v", err) - } - - if adict.(*Dictionary).fst.Len() != bdict.(*Dictionary).fst.Len() { - rv = append(rv, fmt.Sprintf("field %s, dict fst Len()'s different: %v %v", - fieldName, adict.(*Dictionary).fst.Len(), bdict.(*Dictionary).fst.Len())) - } - - aitr := adict.Iterator() - bitr := bdict.Iterator() - for { - anext, aerr := aitr.Next() - bnext, berr := bitr.Next() - if aerr != berr { - rv = append(rv, fmt.Sprintf("field %s, dict iterator Next() errors different: %v %v", - fieldName, aerr, berr)) - break - } - if !reflect.DeepEqual(anext, bnext) { - rv = append(rv, fmt.Sprintf("field %s, dict iterator Next() results different: %#v %#v", - fieldName, anext, bnext)) - // keep going to try to see more diff details at the postingsList level - } - if aerr != nil || anext == nil || - berr != nil || bnext 
== nil { - break - } - - for _, next := range []*index.DictEntry{anext, bnext} { - if next == nil { - continue - } - - aplist, aerr := adict.(*Dictionary).postingsList([]byte(next.Term), nil, nil) - bplist, berr := bdict.(*Dictionary).postingsList([]byte(next.Term), nil, nil) - if aerr != berr { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList() errors different: %v %v", - fieldName, next.Term, aerr, berr)) - } - - if (aplist != nil) != (bplist != nil) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList() results different: %v %v", - fieldName, next.Term, aplist, bplist)) - break - } - - if aerr != nil || aplist == nil || - berr != nil || bplist == nil { - break - } - - if aplist.Count() != bplist.Count() { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList().Count()'s different: %v %v", - fieldName, next.Term, aplist.Count(), bplist.Count())) - } - - apitr := aplist.Iterator(true, true, true, nil) - bpitr := bplist.Iterator(true, true, true, nil) - if (apitr != nil) != (bpitr != nil) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsList.Iterator() results different: %v %v", - fieldName, next.Term, apitr, bpitr)) - break - } - - for { - apitrn, aerr := apitr.Next() - bpitrn, berr := bpitr.Next() - if aerr != berr { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() errors different: %v %v", - fieldName, next.Term, aerr, berr)) - } - - if (apitrn != nil) != (bpitrn != nil) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() results different: %v %v", - fieldName, next.Term, apitrn, bpitrn)) - break - } - - if aerr != nil || apitrn == nil || - berr != nil || bpitrn == nil { - break - } - - if apitrn.Number() != bpitrn.Number() { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Number()'s different: %v %v", - fieldName, next.Term, apitrn.Number(), bpitrn.Number())) - } - - if apitrn.Frequency() != bpitrn.Frequency() { - rv = 
append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Frequency()'s different: %v %v", - fieldName, next.Term, apitrn.Frequency(), bpitrn.Frequency())) - } - - if apitrn.Norm() != bpitrn.Norm() { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Norm()'s different: %v %v", - fieldName, next.Term, apitrn.Norm(), bpitrn.Norm())) - } - - if len(apitrn.Locations()) != len(bpitrn.Locations()) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() Locations() len's different: %v %v", - fieldName, next.Term, len(apitrn.Locations()), len(bpitrn.Locations()))) - } - - for loci, aloc := range apitrn.Locations() { - bloc := bpitrn.Locations()[loci] - - if (aloc != nil) != (bloc != nil) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() loc different: %v %v", - fieldName, next.Term, aloc, bloc)) - break - } - - if aloc.Field() != bloc.Field() || - aloc.Start() != bloc.Start() || - aloc.End() != bloc.End() || - aloc.Pos() != bloc.Pos() || - !reflect.DeepEqual(aloc.ArrayPositions(), bloc.ArrayPositions()) { - rv = append(rv, fmt.Sprintf("field %s, term: %s, postingsListIterator Next() loc details different: %v %v", - fieldName, next.Term, aloc, bloc)) - } - } - - if fieldName == "_id" { - docId := next.Term - docNumA := apitrn.Number() - docNumB := bpitrn.Number() - afields := map[string]interface{}{} - err = a.VisitDocument(apitrn.Number(), - func(field string, typ byte, value []byte, pos []uint64) bool { - afields[field+"-typ"] = typ - afields[field+"-value"] = append([]byte(nil), value...) - afields[field+"-pos"] = append([]uint64(nil), pos...) - return true - }) - if err != nil { - rv = append(rv, fmt.Sprintf("a.VisitDocument err: %v", err)) - } - bfields := map[string]interface{}{} - err = b.VisitDocument(bpitrn.Number(), - func(field string, typ byte, value []byte, pos []uint64) bool { - bfields[field+"-typ"] = typ - bfields[field+"-value"] = append([]byte(nil), value...) 
- bfields[field+"-pos"] = append([]uint64(nil), pos...) - return true - }) - if err != nil { - rv = append(rv, fmt.Sprintf("b.VisitDocument err: %v", err)) - } - if !reflect.DeepEqual(afields, bfields) { - rv = append(rv, fmt.Sprintf("afields != bfields,"+ - " id: %s, docNumA: %d, docNumB: %d,"+ - " afields: %#v, bfields: %#v", - docId, docNumA, docNumB, afields, bfields)) - } - } - } - } - } - } - - return strings.Join(rv, "\n") -} - -func TestMergeAndDrop(t *testing.T) { - docsToDrop := make([]*roaring.Bitmap, 2) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(1) - docsToDrop[1] = roaring.NewBitmap() - docsToDrop[1].AddInt(1) - testMergeAndDrop(t, docsToDrop) -} - -func TestMergeAndDropAllFromOneSegment(t *testing.T) { - docsToDrop := make([]*roaring.Bitmap, 2) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(0) - docsToDrop[0].AddInt(1) - docsToDrop[1] = roaring.NewBitmap() - testMergeAndDrop(t, docsToDrop) -} - -func testMergeAndDrop(t *testing.T, docsToDrop []*roaring.Bitmap) { - _ = os.RemoveAll("/tmp/scorch.zap") - _ = os.RemoveAll("/tmp/scorch2.zap") - - testSeg, _, _ := buildTestSegmentMulti() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatal(err) - } - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - testSeg2, _, _ := buildTestSegmentMulti2() - err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") - if err != nil { - t.Fatal(err) - } - - segment2, err := Open("/tmp/scorch2.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment2.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - segsToMerge := make([]*Segment, 2) - segsToMerge[0] = segment.(*Segment) - segsToMerge[1] = segment2.(*Segment) - - testMergeAndDropSegments(t, segsToMerge, 
docsToDrop, 2) -} - -func TestMergeWithUpdates(t *testing.T) { - segmentDocIds := [][]string{ - []string{"a", "b"}, - []string{"b", "c"}, // doc "b" updated - } - - docsToDrop := make([]*roaring.Bitmap, 2) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(1) // doc "b" updated - docsToDrop[1] = roaring.NewBitmap() - - testMergeWithUpdates(t, segmentDocIds, docsToDrop, 3) -} - -func TestMergeWithUpdatesOnManySegments(t *testing.T) { - segmentDocIds := [][]string{ - []string{"a", "b"}, - []string{"b", "c"}, // doc "b" updated - []string{"c", "d"}, // doc "c" updated - []string{"d", "e"}, // doc "d" updated - } - - docsToDrop := make([]*roaring.Bitmap, 4) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(1) // doc "b" updated - docsToDrop[1] = roaring.NewBitmap() - docsToDrop[1].AddInt(1) // doc "c" updated - docsToDrop[2] = roaring.NewBitmap() - docsToDrop[2].AddInt(1) // doc "d" updated - docsToDrop[3] = roaring.NewBitmap() - - testMergeWithUpdates(t, segmentDocIds, docsToDrop, 5) -} - -func TestMergeWithUpdatesOnOneDoc(t *testing.T) { - segmentDocIds := [][]string{ - []string{"a", "b"}, - []string{"a", "c"}, // doc "a" updated - []string{"a", "d"}, // doc "a" updated - []string{"a", "e"}, // doc "a" updated - } - - docsToDrop := make([]*roaring.Bitmap, 4) - docsToDrop[0] = roaring.NewBitmap() - docsToDrop[0].AddInt(0) // doc "a" updated - docsToDrop[1] = roaring.NewBitmap() - docsToDrop[1].AddInt(0) // doc "a" updated - docsToDrop[2] = roaring.NewBitmap() - docsToDrop[2].AddInt(0) // doc "a" updated - docsToDrop[3] = roaring.NewBitmap() - - testMergeWithUpdates(t, segmentDocIds, docsToDrop, 5) -} - -func testMergeWithUpdates(t *testing.T, segmentDocIds [][]string, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { - var segsToMerge []*Segment - - // convert segmentDocIds to segsToMerge - for i, docIds := range segmentDocIds { - fname := fmt.Sprintf("scorch%d.zap", i) - - _ = os.RemoveAll("/tmp/" + fname) - - testSeg, _, _ := 
buildTestSegmentMultiHelper(docIds) - err := PersistSegmentBase(testSeg, "/tmp/"+fname) - if err != nil { - t.Fatal(err) - } - segment, err := Open("/tmp/" + fname) - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func(segment *Segment) { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }(segment.(*Segment)) - - segsToMerge = append(segsToMerge, segment.(*Segment)) - } - - testMergeAndDropSegments(t, segsToMerge, docsToDrop, expectedNumDocs) -} - -func testMergeAndDropSegments(t *testing.T, segsToMerge []*Segment, docsToDrop []*roaring.Bitmap, expectedNumDocs uint64) { - _ = os.RemoveAll("/tmp/scorch-merged.zap") - - _, _, err := Merge(segsToMerge, docsToDrop, "/tmp/scorch-merged.zap", 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - segm, err := Open("/tmp/scorch-merged.zap") - if err != nil { - t.Fatalf("error opening merged segment: %v", err) - } - defer func() { - cerr := segm.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if segm.Count() != expectedNumDocs { - t.Fatalf("wrong count, got: %d, wanted: %d", segm.Count(), expectedNumDocs) - } - if len(segm.Fields()) != 5 { - t.Fatalf("wrong # fields: %#v\n", segm.Fields()) - } - - testMergeWithSelf(t, segm.(*Segment), expectedNumDocs) -} - -func buildTestSegmentMulti2() (*SegmentBase, uint64, error) { - return buildTestSegmentMultiHelper([]string{"c", "d"}) -} - -func buildTestSegmentMultiHelper(docIds []string) (*SegmentBase, uint64, error) { - doc := &document.Document{ - ID: "c", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte(docIds[0]), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("mat"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - 
document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - doc2 := &document.Document{ - ID: "d", - Fields: []document.Field{ - document.NewTextFieldCustom("_id", nil, []byte(docIds[1]), document.IndexField|document.StoreField, nil), - document.NewTextFieldCustom("name", nil, []byte("joa"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("desc", nil, []byte("some thing"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{0}, []byte("cold"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - document.NewTextFieldCustom("tag", []uint64{1}, []byte("dark"), document.IndexField|document.StoreField|document.IncludeTermVectors, nil), - }, - CompositeFields: []*document.CompositeField{ - document.NewCompositeField("_all", true, nil, []string{"_id"}), - }, - } - - // forge analyzed docs - results := []*index.AnalysisResult{ - &index.AnalysisResult{ - Document: doc, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte(docIds[0]), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("mat"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - 
analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - &index.AnalysisResult{ - Document: doc2, - Analyzed: []analysis.TokenFrequencies{ - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 1, - Position: 1, - Term: []byte(docIds[1]), - }, - }, nil, false), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 3, - Position: 1, - Term: []byte("joa"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("some"), - }, - &analysis.Token{ - Start: 5, - End: 10, - Position: 2, - Term: []byte("thing"), - }, - }, nil, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("cold"), - }, - }, []uint64{0}, true), - analysis.TokenFrequency(analysis.TokenStream{ - &analysis.Token{ - Start: 0, - End: 4, - Position: 1, - Term: []byte("dark"), - }, - }, []uint64{1}, true), - }, - Length: []int{ - 1, - 1, - 2, - 1, - 1, - }, - }, - } - - // fix up composite fields - for _, ar := range results { - for i, f := range ar.Document.Fields { - for _, cf := range ar.Document.CompositeFields { - cf.Compose(f.Name(), ar.Length[i], ar.Analyzed[i]) - } - } - } - - return AnalysisResultsToSegmentBase(results, 1024) -} - -func TestMergeBytesWritten(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - _ = os.RemoveAll("/tmp/scorch2.zap") - _ = os.RemoveAll("/tmp/scorch3.zap") - - testSeg, _, _ := buildTestSegmentMulti() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatal(err) - } - - testSeg2, _, _ := buildTestSegmentMulti2() - 
err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") - if err != nil { - t.Fatal(err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - segment2, err := Open("/tmp/scorch2.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment2.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - segsToMerge := make([]*Segment, 2) - segsToMerge[0] = segment.(*Segment) - segsToMerge[1] = segment2.(*Segment) - - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - if nBytes == 0 { - t.Fatalf("expected a non zero total_compaction_written_bytes") - } - - segm, err := Open("/tmp/scorch3.zap") - if err != nil { - t.Fatalf("error opening merged segment: %v", err) - } - seg3 := segm.(*Segment) - defer func() { - cerr := seg3.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", err) - } - }() - - if seg3.Path() != "/tmp/scorch3.zap" { - t.Fatalf("wrong path") - } - if seg3.Count() != 4 { - t.Fatalf("wrong count") - } - if len(seg3.Fields()) != 5 { - t.Fatalf("wrong # fields: %#v\n", seg3.Fields()) - } - - testMergeWithSelf(t, seg3, 4) -} - -func TestUnder32Bits(t *testing.T) { - if !under32Bits(0) || !under32Bits(uint64(0x7fffffff)) { - t.Errorf("under32Bits bad") - } - if under32Bits(uint64(0x80000000)) || under32Bits(uint64(0x80000001)) { - t.Errorf("under32Bits wrong") - } -} diff --git a/index/scorch/segment/zap/new.go b/index/scorch/segment/zap/new.go deleted file mode 100644 index c108ec16d..000000000 --- a/index/scorch/segment/zap/new.go +++ /dev/null @@ -1,839 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "encoding/binary" - "math" - "sort" - "sync" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/analysis" - "github.com/blevesearch/bleve/document" - "github.com/blevesearch/bleve/index" - "github.com/couchbase/vellum" - "github.com/golang/snappy" -) - -var NewSegmentBufferNumResultsBump int = 100 -var NewSegmentBufferNumResultsFactor float64 = 1.0 -var NewSegmentBufferAvgBytesPerDocFactor float64 = 1.0 - -// ValidateDocFields can be set by applications to perform additional checks -// on fields in a document being added to a new segment, by default it does -// nothing. -// This API is experimental and may be removed at any time. 
-var ValidateDocFields = func(field document.Field) error { - return nil -} - -// AnalysisResultsToSegmentBase produces an in-memory zap-encoded -// SegmentBase from analysis results -func AnalysisResultsToSegmentBase(results []*index.AnalysisResult, - chunkFactor uint32) (*SegmentBase, uint64, error) { - s := interimPool.Get().(*interim) - - var br bytes.Buffer - if s.lastNumDocs > 0 { - // use previous results to initialize the buf with an estimate - // size, but note that the interim instance comes from a - // global interimPool, so multiple scorch instances indexing - // different docs can lead to low quality estimates - estimateAvgBytesPerDoc := int(float64(s.lastOutSize/s.lastNumDocs) * - NewSegmentBufferNumResultsFactor) - estimateNumResults := int(float64(len(results)+NewSegmentBufferNumResultsBump) * - NewSegmentBufferAvgBytesPerDocFactor) - br.Grow(estimateAvgBytesPerDoc * estimateNumResults) - } - - s.results = results - s.chunkFactor = chunkFactor - s.w = NewCountHashWriter(&br) - - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, - err := s.convert() - if err != nil { - return nil, uint64(0), err - } - - sb, err := InitSegmentBase(br.Bytes(), s.w.Sum32(), chunkFactor, - s.FieldsMap, s.FieldsInv, uint64(len(results)), - storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets) - - if err == nil && s.reset() == nil { - s.lastNumDocs = len(results) - s.lastOutSize = len(br.Bytes()) - interimPool.Put(s) - } - - return sb, uint64(len(br.Bytes())), err -} - -var interimPool = sync.Pool{New: func() interface{} { return &interim{} }} - -// interim holds temporary working data used while converting from -// analysis results to a zap-encoded segment -type interim struct { - results []*index.AnalysisResult - - chunkFactor uint32 - - w *CountHashWriter - - // FieldsMap adds 1 to field id to avoid zero value issues - // name -> field id + 1 - FieldsMap map[string]uint16 - - // FieldsInv is the inverse of FieldsMap - // field id -> name - 
FieldsInv []string - - // Term dictionaries for each field - // field id -> term -> postings list id + 1 - Dicts []map[string]uint64 - - // Terms for each field, where terms are sorted ascending - // field id -> []term - DictKeys [][]string - - // Fields whose IncludeDocValues is true - // field id -> bool - IncludeDocValues []bool - - // postings id -> bitmap of docNums - Postings []*roaring.Bitmap - - // postings id -> freq/norm's, one for each docNum in postings - FreqNorms [][]interimFreqNorm - freqNormsBacking []interimFreqNorm - - // postings id -> locs, one for each freq - Locs [][]interimLoc - locsBacking []interimLoc - - numTermsPerPostingsList []int // key is postings list id - numLocsPerPostingsList []int // key is postings list id - - builder *vellum.Builder - builderBuf bytes.Buffer - - metaBuf bytes.Buffer - - tmp0 []byte - tmp1 []byte - - lastNumDocs int - lastOutSize int -} - -func (s *interim) reset() (err error) { - s.results = nil - s.chunkFactor = 0 - s.w = nil - s.FieldsMap = nil - s.FieldsInv = nil - for i := range s.Dicts { - s.Dicts[i] = nil - } - s.Dicts = s.Dicts[:0] - for i := range s.DictKeys { - s.DictKeys[i] = s.DictKeys[i][:0] - } - s.DictKeys = s.DictKeys[:0] - for i := range s.IncludeDocValues { - s.IncludeDocValues[i] = false - } - s.IncludeDocValues = s.IncludeDocValues[:0] - for _, idn := range s.Postings { - idn.Clear() - } - s.Postings = s.Postings[:0] - s.FreqNorms = s.FreqNorms[:0] - for i := range s.freqNormsBacking { - s.freqNormsBacking[i] = interimFreqNorm{} - } - s.freqNormsBacking = s.freqNormsBacking[:0] - s.Locs = s.Locs[:0] - for i := range s.locsBacking { - s.locsBacking[i] = interimLoc{} - } - s.locsBacking = s.locsBacking[:0] - s.numTermsPerPostingsList = s.numTermsPerPostingsList[:0] - s.numLocsPerPostingsList = s.numLocsPerPostingsList[:0] - s.builderBuf.Reset() - if s.builder != nil { - err = s.builder.Reset(&s.builderBuf) - } - s.metaBuf.Reset() - s.tmp0 = s.tmp0[:0] - s.tmp1 = s.tmp1[:0] - s.lastNumDocs = 0 - 
s.lastOutSize = 0 - - return err -} - -func (s *interim) grabBuf(size int) []byte { - buf := s.tmp0 - if cap(buf) < size { - buf = make([]byte, size) - s.tmp0 = buf - } - return buf[0:size] -} - -type interimStoredField struct { - vals [][]byte - typs []byte - arrayposs [][]uint64 // array positions -} - -type interimFreqNorm struct { - freq uint64 - norm float32 - numLocs int -} - -type interimLoc struct { - fieldID uint16 - pos uint64 - start uint64 - end uint64 - arrayposs []uint64 -} - -func (s *interim) convert() (uint64, uint64, uint64, []uint64, error) { - s.FieldsMap = map[string]uint16{} - - s.getOrDefineField("_id") // _id field is fieldID 0 - - for _, result := range s.results { - for _, field := range result.Document.CompositeFields { - s.getOrDefineField(field.Name()) - } - for _, field := range result.Document.Fields { - s.getOrDefineField(field.Name()) - } - } - - sort.Strings(s.FieldsInv[1:]) // keep _id as first field - - for fieldID, fieldName := range s.FieldsInv { - s.FieldsMap[fieldName] = uint16(fieldID + 1) - } - - if cap(s.IncludeDocValues) >= len(s.FieldsInv) { - s.IncludeDocValues = s.IncludeDocValues[:len(s.FieldsInv)] - } else { - s.IncludeDocValues = make([]bool, len(s.FieldsInv)) - } - - s.prepareDicts() - - for _, dict := range s.DictKeys { - sort.Strings(dict) - } - - s.processDocuments() - - storedIndexOffset, err := s.writeStoredFields() - if err != nil { - return 0, 0, 0, nil, err - } - - var fdvIndexOffset uint64 - var dictOffsets []uint64 - - if len(s.results) > 0 { - fdvIndexOffset, dictOffsets, err = s.writeDicts() - if err != nil { - return 0, 0, 0, nil, err - } - } else { - dictOffsets = make([]uint64, len(s.FieldsInv)) - } - - fieldsIndexOffset, err := persistFields(s.FieldsInv, s.w, dictOffsets) - if err != nil { - return 0, 0, 0, nil, err - } - - return storedIndexOffset, fieldsIndexOffset, fdvIndexOffset, dictOffsets, nil -} - -func (s *interim) getOrDefineField(fieldName string) int { - fieldIDPlus1, exists := 
s.FieldsMap[fieldName] - if !exists { - fieldIDPlus1 = uint16(len(s.FieldsInv) + 1) - s.FieldsMap[fieldName] = fieldIDPlus1 - s.FieldsInv = append(s.FieldsInv, fieldName) - - s.Dicts = append(s.Dicts, make(map[string]uint64)) - - n := len(s.DictKeys) - if n < cap(s.DictKeys) { - s.DictKeys = s.DictKeys[:n+1] - s.DictKeys[n] = s.DictKeys[n][:0] - } else { - s.DictKeys = append(s.DictKeys, []string(nil)) - } - } - - return int(fieldIDPlus1 - 1) -} - -// fill Dicts and DictKeys from analysis results -func (s *interim) prepareDicts() { - var pidNext int - - var totTFs int - var totLocs int - - visitField := func(fieldID uint16, tfs analysis.TokenFrequencies) { - dict := s.Dicts[fieldID] - dictKeys := s.DictKeys[fieldID] - - for term, tf := range tfs { - pidPlus1, exists := dict[term] - if !exists { - pidNext++ - pidPlus1 = uint64(pidNext) - - dict[term] = pidPlus1 - dictKeys = append(dictKeys, term) - - s.numTermsPerPostingsList = append(s.numTermsPerPostingsList, 0) - s.numLocsPerPostingsList = append(s.numLocsPerPostingsList, 0) - } - - pid := pidPlus1 - 1 - - s.numTermsPerPostingsList[pid] += 1 - s.numLocsPerPostingsList[pid] += len(tf.Locations) - - totLocs += len(tf.Locations) - } - - totTFs += len(tfs) - - s.DictKeys[fieldID] = dictKeys - } - - for _, result := range s.results { - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := uint16(s.getOrDefineField(field.Name())) - _, tf := field.Analyze() - visitField(fieldID, tf) - } - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - tf := result.Analyzed[i] - visitField(fieldID, tf) - } - } - - numPostingsLists := pidNext - - if cap(s.Postings) >= numPostingsLists { - s.Postings = s.Postings[:numPostingsLists] - } else { - postings := make([]*roaring.Bitmap, numPostingsLists) - copy(postings, s.Postings[:cap(s.Postings)]) - for i := 0; i < numPostingsLists; i++ { - if postings[i] == nil { - 
postings[i] = roaring.New() - } - } - s.Postings = postings - } - - if cap(s.FreqNorms) >= numPostingsLists { - s.FreqNorms = s.FreqNorms[:numPostingsLists] - } else { - s.FreqNorms = make([][]interimFreqNorm, numPostingsLists) - } - - if cap(s.freqNormsBacking) >= totTFs { - s.freqNormsBacking = s.freqNormsBacking[:totTFs] - } else { - s.freqNormsBacking = make([]interimFreqNorm, totTFs) - } - - freqNormsBacking := s.freqNormsBacking - for pid, numTerms := range s.numTermsPerPostingsList { - s.FreqNorms[pid] = freqNormsBacking[0:0] - freqNormsBacking = freqNormsBacking[numTerms:] - } - - if cap(s.Locs) >= numPostingsLists { - s.Locs = s.Locs[:numPostingsLists] - } else { - s.Locs = make([][]interimLoc, numPostingsLists) - } - - if cap(s.locsBacking) >= totLocs { - s.locsBacking = s.locsBacking[:totLocs] - } else { - s.locsBacking = make([]interimLoc, totLocs) - } - - locsBacking := s.locsBacking - for pid, numLocs := range s.numLocsPerPostingsList { - s.Locs[pid] = locsBacking[0:0] - locsBacking = locsBacking[numLocs:] - } -} - -func (s *interim) processDocuments() { - numFields := len(s.FieldsInv) - reuseFieldLens := make([]int, numFields) - reuseFieldTFs := make([]analysis.TokenFrequencies, numFields) - - for docNum, result := range s.results { - for i := 0; i < numFields; i++ { // clear these for reuse - reuseFieldLens[i] = 0 - reuseFieldTFs[i] = nil - } - - s.processDocument(uint64(docNum), result, - reuseFieldLens, reuseFieldTFs) - } -} - -func (s *interim) processDocument(docNum uint64, - result *index.AnalysisResult, - fieldLens []int, fieldTFs []analysis.TokenFrequencies) { - visitField := func(fieldID uint16, fieldName string, - ln int, tf analysis.TokenFrequencies) { - fieldLens[fieldID] += ln - - existingFreqs := fieldTFs[fieldID] - if existingFreqs != nil { - existingFreqs.MergeAll(fieldName, tf) - } else { - fieldTFs[fieldID] = tf - } - } - - // walk each composite field - for _, field := range result.Document.CompositeFields { - fieldID := 
uint16(s.getOrDefineField(field.Name())) - ln, tf := field.Analyze() - visitField(fieldID, field.Name(), ln, tf) - } - - // walk each field - for i, field := range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - ln := result.Length[i] - tf := result.Analyzed[i] - visitField(fieldID, field.Name(), ln, tf) - } - - // now that it's been rolled up into fieldTFs, walk that - for fieldID, tfs := range fieldTFs { - dict := s.Dicts[fieldID] - norm := float32(1.0 / math.Sqrt(float64(fieldLens[fieldID]))) - - for term, tf := range tfs { - pid := dict[term] - 1 - bs := s.Postings[pid] - bs.Add(uint32(docNum)) - - s.FreqNorms[pid] = append(s.FreqNorms[pid], - interimFreqNorm{ - freq: uint64(tf.Frequency()), - norm: norm, - numLocs: len(tf.Locations), - }) - - if len(tf.Locations) > 0 { - locs := s.Locs[pid] - - for _, loc := range tf.Locations { - var locf = uint16(fieldID) - if loc.Field != "" { - locf = uint16(s.getOrDefineField(loc.Field)) - } - var arrayposs []uint64 - if len(loc.ArrayPositions) > 0 { - arrayposs = loc.ArrayPositions - } - locs = append(locs, interimLoc{ - fieldID: locf, - pos: uint64(loc.Position), - start: uint64(loc.Start), - end: uint64(loc.End), - arrayposs: arrayposs, - }) - } - - s.Locs[pid] = locs - } - } - } -} - -func (s *interim) writeStoredFields() ( - storedIndexOffset uint64, err error) { - varBuf := make([]byte, binary.MaxVarintLen64) - metaEncode := func(val uint64) (int, error) { - wb := binary.PutUvarint(varBuf, val) - return s.metaBuf.Write(varBuf[:wb]) - } - - data, compressed := s.tmp0[:0], s.tmp1[:0] - defer func() { s.tmp0, s.tmp1 = data, compressed }() - - // keyed by docNum - docStoredOffsets := make([]uint64, len(s.results)) - - // keyed by fieldID, for the current doc in the loop - docStoredFields := map[uint16]interimStoredField{} - - for docNum, result := range s.results { - for fieldID := range docStoredFields { // reset for next doc - delete(docStoredFields, fieldID) - } - - for _, field := 
range result.Document.Fields { - fieldID := uint16(s.getOrDefineField(field.Name())) - - opts := field.Options() - - if opts.IsStored() { - isf := docStoredFields[fieldID] - isf.vals = append(isf.vals, field.Value()) - isf.typs = append(isf.typs, encodeFieldType(field)) - isf.arrayposs = append(isf.arrayposs, field.ArrayPositions()) - docStoredFields[fieldID] = isf - } - - if opts.IncludeDocValues() { - s.IncludeDocValues[fieldID] = true - } - - err := ValidateDocFields(field) - if err != nil { - return 0, err - } - } - - var curr int - - s.metaBuf.Reset() - data = data[:0] - - // _id field special case optimizes ExternalID() lookups - idFieldVal := docStoredFields[uint16(0)].vals[0] - _, err = metaEncode(uint64(len(idFieldVal))) - if err != nil { - return 0, err - } - - // handle non-"_id" fields - for fieldID := 1; fieldID < len(s.FieldsInv); fieldID++ { - isf, exists := docStoredFields[uint16(fieldID)] - if exists { - curr, data, err = persistStoredFieldValues( - fieldID, isf.vals, isf.typs, isf.arrayposs, - curr, metaEncode, data) - if err != nil { - return 0, err - } - } - } - - metaBytes := s.metaBuf.Bytes() - - compressed = snappy.Encode(compressed[:cap(compressed)], data) - - docStoredOffsets[docNum] = uint64(s.w.Count()) - - _, err := writeUvarints(s.w, - uint64(len(metaBytes)), - uint64(len(idFieldVal)+len(compressed))) - if err != nil { - return 0, err - } - - _, err = s.w.Write(metaBytes) - if err != nil { - return 0, err - } - - _, err = s.w.Write(idFieldVal) - if err != nil { - return 0, err - } - - _, err = s.w.Write(compressed) - if err != nil { - return 0, err - } - } - - storedIndexOffset = uint64(s.w.Count()) - - for _, docStoredOffset := range docStoredOffsets { - err = binary.Write(s.w, binary.BigEndian, docStoredOffset) - if err != nil { - return 0, err - } - } - - return storedIndexOffset, nil -} - -func (s *interim) writeDicts() (fdvIndexOffset uint64, dictOffsets []uint64, err error) { - dictOffsets = make([]uint64, len(s.FieldsInv)) - - 
fdvOffsetsStart := make([]uint64, len(s.FieldsInv)) - fdvOffsetsEnd := make([]uint64, len(s.FieldsInv)) - - buf := s.grabBuf(binary.MaxVarintLen64) - - tfEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) - locEncoder := newChunkedIntCoder(uint64(s.chunkFactor), uint64(len(s.results)-1)) - fdvEncoder := newChunkedContentCoder(uint64(s.chunkFactor), uint64(len(s.results)-1), s.w, false) - - var docTermMap [][]byte - - if s.builder == nil { - s.builder, err = vellum.New(&s.builderBuf, nil) - if err != nil { - return 0, nil, err - } - } - - for fieldID, terms := range s.DictKeys { - if cap(docTermMap) < len(s.results) { - docTermMap = make([][]byte, len(s.results)) - } else { - docTermMap = docTermMap[0:len(s.results)] - for docNum := range docTermMap { // reset the docTermMap - docTermMap[docNum] = docTermMap[docNum][:0] - } - } - - dict := s.Dicts[fieldID] - - for _, term := range terms { // terms are already sorted - pid := dict[term] - 1 - - postingsBS := s.Postings[pid] - - freqNorms := s.FreqNorms[pid] - freqNormOffset := 0 - - locs := s.Locs[pid] - locOffset := 0 - - postingsItr := postingsBS.Iterator() - for postingsItr.HasNext() { - docNum := uint64(postingsItr.Next()) - - freqNorm := freqNorms[freqNormOffset] - - err = tfEncoder.Add(docNum, - encodeFreqHasLocs(freqNorm.freq, freqNorm.numLocs > 0), - uint64(math.Float32bits(freqNorm.norm))) - if err != nil { - return 0, nil, err - } - - if freqNorm.numLocs > 0 { - numBytesLocs := 0 - for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { - numBytesLocs += totalUvarintBytes( - uint64(loc.fieldID), loc.pos, loc.start, loc.end, - uint64(len(loc.arrayposs)), loc.arrayposs) - } - - err = locEncoder.Add(docNum, uint64(numBytesLocs)) - if err != nil { - return 0, nil, err - } - - for _, loc := range locs[locOffset : locOffset+freqNorm.numLocs] { - err = locEncoder.Add(docNum, - uint64(loc.fieldID), loc.pos, loc.start, loc.end, - uint64(len(loc.arrayposs))) - if err != nil { - 
return 0, nil, err - } - - err = locEncoder.Add(docNum, loc.arrayposs...) - if err != nil { - return 0, nil, err - } - } - - locOffset += freqNorm.numLocs - } - - freqNormOffset++ - - docTermMap[docNum] = append( - append(docTermMap[docNum], term...), - termSeparator) - } - - tfEncoder.Close() - locEncoder.Close() - - postingsOffset, err := - writePostings(postingsBS, tfEncoder, locEncoder, nil, s.w, buf) - if err != nil { - return 0, nil, err - } - - if postingsOffset > uint64(0) { - err = s.builder.Insert([]byte(term), postingsOffset) - if err != nil { - return 0, nil, err - } - } - - tfEncoder.Reset() - locEncoder.Reset() - } - - err = s.builder.Close() - if err != nil { - return 0, nil, err - } - - // record where this dictionary starts - dictOffsets[fieldID] = uint64(s.w.Count()) - - vellumData := s.builderBuf.Bytes() - - // write out the length of the vellum data - n := binary.PutUvarint(buf, uint64(len(vellumData))) - _, err = s.w.Write(buf[:n]) - if err != nil { - return 0, nil, err - } - - // write this vellum to disk - _, err = s.w.Write(vellumData) - if err != nil { - return 0, nil, err - } - - // reset vellum for reuse - s.builderBuf.Reset() - - err = s.builder.Reset(&s.builderBuf) - if err != nil { - return 0, nil, err - } - - // write the field doc values - if s.IncludeDocValues[fieldID] { - for docNum, docTerms := range docTermMap { - if len(docTerms) > 0 { - err = fdvEncoder.Add(uint64(docNum), docTerms) - if err != nil { - return 0, nil, err - } - } - } - err = fdvEncoder.Close() - if err != nil { - return 0, nil, err - } - - fdvOffsetsStart[fieldID] = uint64(s.w.Count()) - - _, err = fdvEncoder.Write() - if err != nil { - return 0, nil, err - } - - fdvOffsetsEnd[fieldID] = uint64(s.w.Count()) - - fdvEncoder.Reset() - } else { - fdvOffsetsStart[fieldID] = fieldNotUninverted - fdvOffsetsEnd[fieldID] = fieldNotUninverted - } - } - - fdvIndexOffset = uint64(s.w.Count()) - - for i := 0; i < len(fdvOffsetsStart); i++ { - n := binary.PutUvarint(buf, 
fdvOffsetsStart[i]) - _, err := s.w.Write(buf[:n]) - if err != nil { - return 0, nil, err - } - n = binary.PutUvarint(buf, fdvOffsetsEnd[i]) - _, err = s.w.Write(buf[:n]) - if err != nil { - return 0, nil, err - } - } - - return fdvIndexOffset, dictOffsets, nil -} - -func encodeFieldType(f document.Field) byte { - fieldType := byte('x') - switch f.(type) { - case *document.TextField: - fieldType = 't' - case *document.NumericField: - fieldType = 'n' - case *document.DateTimeField: - fieldType = 'd' - case *document.BooleanField: - fieldType = 'b' - case *document.GeoPointField: - fieldType = 'g' - case *document.CompositeField: - fieldType = 'c' - } - return fieldType -} - -// returns the total # of bytes needed to encode the given uint64's -// into binary.PutUVarint() encoding -func totalUvarintBytes(a, b, c, d, e uint64, more []uint64) (n int) { - n = numUvarintBytes(a) - n += numUvarintBytes(b) - n += numUvarintBytes(c) - n += numUvarintBytes(d) - n += numUvarintBytes(e) - for _, v := range more { - n += numUvarintBytes(v) - } - return n -} - -// returns # of bytes needed to encode x in binary.PutUvarint() encoding -func numUvarintBytes(x uint64) (n int) { - for x >= 0x80 { - x >>= 7 - n++ - } - return n + 1 -} diff --git a/index/scorch/segment/zap/posting.go b/index/scorch/segment/zap/posting.go deleted file mode 100644 index 4c43fdb9b..000000000 --- a/index/scorch/segment/zap/posting.go +++ /dev/null @@ -1,897 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "encoding/binary" - "fmt" - "math" - "reflect" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" -) - -var reflectStaticSizePostingsList int -var reflectStaticSizePostingsIterator int -var reflectStaticSizePosting int -var reflectStaticSizeLocation int - -func init() { - var pl PostingsList - reflectStaticSizePostingsList = int(reflect.TypeOf(pl).Size()) - var pi PostingsIterator - reflectStaticSizePostingsIterator = int(reflect.TypeOf(pi).Size()) - var p Posting - reflectStaticSizePosting = int(reflect.TypeOf(p).Size()) - var l Location - reflectStaticSizeLocation = int(reflect.TypeOf(l).Size()) -} - -// FST or vellum value (uint64) encoding is determined by the top two -// highest-order or most significant bits... -// -// encoding : MSB -// name : 63 62 61...to...bit #0 (LSB) -// ----------+---+---+--------------------------------------------------- -// general : 0 | 0 | 62-bits of postingsOffset. -// ~ : 0 | 1 | reserved for future. -// 1-hit : 1 | 0 | 31-bits of positive float31 norm | 31-bits docNum. -// ~ : 1 | 1 | reserved for future. -// -// Encoding "general" is able to handle all cases, where the -// postingsOffset points to more information about the postings for -// the term. -// -// Encoding "1-hit" is used to optimize a commonly seen case when a -// term has only a single hit. For example, a term in the _id field -// will have only 1 hit. The "1-hit" encoding is used for a term -// in a field when... -// -// - term vector info is disabled for that field; -// - and, the term appears in only a single doc for that field; -// - and, the term's freq is exactly 1 in that single doc for that field; -// - and, the docNum must fit into 31-bits; -// -// Otherwise, the "general" encoding is used instead. 
-// -// In the "1-hit" encoding, the field in that single doc may have -// other terms, which is supported in the "1-hit" encoding by the -// positive float31 norm. - -const FSTValEncodingMask = uint64(0xc000000000000000) -const FSTValEncodingGeneral = uint64(0x0000000000000000) -const FSTValEncoding1Hit = uint64(0x8000000000000000) - -func FSTValEncode1Hit(docNum uint64, normBits uint64) uint64 { - return FSTValEncoding1Hit | ((mask31Bits & normBits) << 31) | (mask31Bits & docNum) -} - -func FSTValDecode1Hit(v uint64) (docNum uint64, normBits uint64) { - return (mask31Bits & v), (mask31Bits & (v >> 31)) -} - -const mask31Bits = uint64(0x000000007fffffff) - -func under32Bits(x uint64) bool { - return x <= mask31Bits -} - -const DocNum1HitFinished = math.MaxUint64 - -var NormBits1Hit = uint64(math.Float32bits(float32(1))) - -// PostingsList is an in-memory representation of a postings list -type PostingsList struct { - sb *SegmentBase - postingsOffset uint64 - freqOffset uint64 - locOffset uint64 - postings *roaring.Bitmap - except *roaring.Bitmap - - // when normBits1Hit != 0, then this postings list came from a - // 1-hit encoding, and only the docNum1Hit & normBits1Hit apply - docNum1Hit uint64 - normBits1Hit uint64 -} - -// represents an immutable, empty postings list -var emptyPostingsList = &PostingsList{} - -func (p *PostingsList) Size() int { - sizeInBytes := reflectStaticSizePostingsList + size.SizeOfPtr - - if p.except != nil { - sizeInBytes += int(p.except.GetSizeInBytes()) - } - - return sizeInBytes -} - -func (p *PostingsList) OrInto(receiver *roaring.Bitmap) { - if p.normBits1Hit != 0 { - receiver.Add(uint32(p.docNum1Hit)) - return - } - - if p.postings != nil { - receiver.Or(p.postings) - } -} - -// Iterator returns an iterator for this postings list -func (p *PostingsList) Iterator(includeFreq, includeNorm, includeLocs bool, - prealloc segment.PostingsIterator) segment.PostingsIterator { - if p.normBits1Hit == 0 && p.postings == nil { - return 
emptyPostingsIterator - } - - var preallocPI *PostingsIterator - pi, ok := prealloc.(*PostingsIterator) - if ok && pi != nil { - preallocPI = pi - } - if preallocPI == emptyPostingsIterator { - preallocPI = nil - } - - return p.iterator(includeFreq, includeNorm, includeLocs, preallocPI) -} - -func (p *PostingsList) iterator(includeFreq, includeNorm, includeLocs bool, - rv *PostingsIterator) *PostingsIterator { - if rv == nil { - rv = &PostingsIterator{} - } else { - freqNormReader := rv.freqNormReader - if freqNormReader != nil { - freqNormReader.Reset([]byte(nil)) - } - - locReader := rv.locReader - if locReader != nil { - locReader.Reset([]byte(nil)) - } - - freqChunkOffsets := rv.freqChunkOffsets[:0] - locChunkOffsets := rv.locChunkOffsets[:0] - - nextLocs := rv.nextLocs[:0] - nextSegmentLocs := rv.nextSegmentLocs[:0] - - buf := rv.buf - - *rv = PostingsIterator{} // clear the struct - - rv.freqNormReader = freqNormReader - rv.locReader = locReader - - rv.freqChunkOffsets = freqChunkOffsets - rv.locChunkOffsets = locChunkOffsets - - rv.nextLocs = nextLocs - rv.nextSegmentLocs = nextSegmentLocs - - rv.buf = buf - } - - rv.postings = p - rv.includeFreqNorm = includeFreq || includeNorm || includeLocs - rv.includeLocs = includeLocs - - if p.normBits1Hit != 0 { - // "1-hit" encoding - rv.docNum1Hit = p.docNum1Hit - rv.normBits1Hit = p.normBits1Hit - - if p.except != nil && p.except.Contains(uint32(rv.docNum1Hit)) { - rv.docNum1Hit = DocNum1HitFinished - } - - return rv - } - - // "general" encoding, check if empty - if p.postings == nil { - return rv - } - - var n uint64 - var read int - - // prepare the freq chunk details - if rv.includeFreqNorm { - var numFreqChunks uint64 - numFreqChunks, read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - if cap(rv.freqChunkOffsets) >= int(numFreqChunks) { - rv.freqChunkOffsets = rv.freqChunkOffsets[:int(numFreqChunks)] - } else { - rv.freqChunkOffsets = make([]uint64, 
int(numFreqChunks)) - } - for i := 0; i < int(numFreqChunks); i++ { - rv.freqChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.freqOffset+n : p.freqOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - } - rv.freqChunkStart = p.freqOffset + n - } - - // prepare the loc chunk details - if rv.includeLocs { - n = 0 - var numLocChunks uint64 - numLocChunks, read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - if cap(rv.locChunkOffsets) >= int(numLocChunks) { - rv.locChunkOffsets = rv.locChunkOffsets[:int(numLocChunks)] - } else { - rv.locChunkOffsets = make([]uint64, int(numLocChunks)) - } - for i := 0; i < int(numLocChunks); i++ { - rv.locChunkOffsets[i], read = binary.Uvarint(p.sb.mem[p.locOffset+n : p.locOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - } - rv.locChunkStart = p.locOffset + n - } - - rv.all = p.postings.Iterator() - if p.except != nil { - rv.ActualBM = roaring.AndNot(p.postings, p.except) - rv.Actual = rv.ActualBM.Iterator() - } else { - rv.ActualBM = p.postings - rv.Actual = rv.all // Optimize to use same iterator for all & Actual. 
- } - - return rv -} - -// Count returns the number of items on this postings list -func (p *PostingsList) Count() uint64 { - var n, e uint64 - if p.normBits1Hit != 0 { - n = 1 - if p.except != nil && p.except.Contains(uint32(p.docNum1Hit)) { - e = 1 - } - } else if p.postings != nil { - n = p.postings.GetCardinality() - if p.except != nil { - e = p.postings.AndCardinality(p.except) - } - } - return n - e -} - -func (rv *PostingsList) read(postingsOffset uint64, d *Dictionary) error { - rv.postingsOffset = postingsOffset - - // handle "1-hit" encoding special case - if rv.postingsOffset&FSTValEncodingMask == FSTValEncoding1Hit { - return rv.init1Hit(postingsOffset) - } - - // read the location of the freq/norm details - var n uint64 - var read int - - rv.freqOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+binary.MaxVarintLen64]) - n += uint64(read) - - rv.locOffset, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - var postingsLen uint64 - postingsLen, read = binary.Uvarint(d.sb.mem[postingsOffset+n : postingsOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - roaringBytes := d.sb.mem[postingsOffset+n : postingsOffset+n+postingsLen] - - if rv.postings == nil { - rv.postings = roaring.NewBitmap() - } - _, err := rv.postings.FromBuffer(roaringBytes) - if err != nil { - return fmt.Errorf("error loading roaring bitmap: %v", err) - } - - return nil -} - -func (rv *PostingsList) init1Hit(fstVal uint64) error { - docNum, normBits := FSTValDecode1Hit(fstVal) - - rv.docNum1Hit = docNum - rv.normBits1Hit = normBits - - return nil -} - -// PostingsIterator provides a way to iterate through the postings list -type PostingsIterator struct { - postings *PostingsList - all roaring.IntPeekable - Actual roaring.IntPeekable - ActualBM *roaring.Bitmap - - currChunk uint32 - currChunkFreqNorm []byte - currChunkLoc []byte - - freqNormReader *segment.MemUvarintReader - locReader 
*segment.MemUvarintReader - - freqChunkOffsets []uint64 - freqChunkStart uint64 - - locChunkOffsets []uint64 - locChunkStart uint64 - - next Posting // reused across Next() calls - nextLocs []Location // reused across Next() calls - nextSegmentLocs []segment.Location // reused across Next() calls - - docNum1Hit uint64 - normBits1Hit uint64 - - buf []byte - - includeFreqNorm bool - includeLocs bool -} - -var emptyPostingsIterator = &PostingsIterator{} - -func (i *PostingsIterator) Size() int { - sizeInBytes := reflectStaticSizePostingsIterator + size.SizeOfPtr + - len(i.currChunkFreqNorm) + - len(i.currChunkLoc) + - len(i.freqChunkOffsets)*size.SizeOfUint64 + - len(i.locChunkOffsets)*size.SizeOfUint64 + - i.next.Size() - - for _, entry := range i.nextLocs { - sizeInBytes += entry.Size() - } - - return sizeInBytes -} - -func (i *PostingsIterator) loadChunk(chunk int) error { - if i.includeFreqNorm { - if chunk >= len(i.freqChunkOffsets) { - return fmt.Errorf("tried to load freq chunk that doesn't exist %d/(%d)", - chunk, len(i.freqChunkOffsets)) - } - - end, start := i.freqChunkStart, i.freqChunkStart - s, e := readChunkBoundary(chunk, i.freqChunkOffsets) - start += s - end += e - i.currChunkFreqNorm = i.postings.sb.mem[start:end] - if i.freqNormReader == nil { - i.freqNormReader = segment.NewMemUvarintReader(i.currChunkFreqNorm) - } else { - i.freqNormReader.Reset(i.currChunkFreqNorm) - } - } - - if i.includeLocs { - if chunk >= len(i.locChunkOffsets) { - return fmt.Errorf("tried to load loc chunk that doesn't exist %d/(%d)", - chunk, len(i.locChunkOffsets)) - } - - end, start := i.locChunkStart, i.locChunkStart - s, e := readChunkBoundary(chunk, i.locChunkOffsets) - start += s - end += e - i.currChunkLoc = i.postings.sb.mem[start:end] - if i.locReader == nil { - i.locReader = segment.NewMemUvarintReader(i.currChunkLoc) - } else { - i.locReader.Reset(i.currChunkLoc) - } - } - - i.currChunk = uint32(chunk) - return nil -} - -func (i *PostingsIterator) 
readFreqNormHasLocs() (uint64, uint64, bool, error) { - if i.normBits1Hit != 0 { - return 1, i.normBits1Hit, false, nil - } - - freqHasLocs, err := i.freqNormReader.ReadUvarint() - if err != nil { - return 0, 0, false, fmt.Errorf("error reading frequency: %v", err) - } - - freq, hasLocs := decodeFreqHasLocs(freqHasLocs) - - normBits, err := i.freqNormReader.ReadUvarint() - if err != nil { - return 0, 0, false, fmt.Errorf("error reading norm: %v", err) - } - - return freq, normBits, hasLocs, nil -} - -func (i *PostingsIterator) skipFreqNormReadHasLocs() (bool, error) { - if i.normBits1Hit != 0 { - return false, nil - } - - freqHasLocs, err := i.freqNormReader.ReadUvarint() - if err != nil { - return false, fmt.Errorf("error reading freqHasLocs: %v", err) - } - - i.freqNormReader.SkipUvarint() // Skip normBits. - - return freqHasLocs&0x01 != 0, nil // See decodeFreqHasLocs() / hasLocs. -} - -func encodeFreqHasLocs(freq uint64, hasLocs bool) uint64 { - rv := freq << 1 - if hasLocs { - rv = rv | 0x01 // 0'th LSB encodes whether there are locations - } - return rv -} - -func decodeFreqHasLocs(freqHasLocs uint64) (uint64, bool) { - freq := freqHasLocs >> 1 - hasLocs := freqHasLocs&0x01 != 0 - return freq, hasLocs -} - -// readLocation processes all the integers on the stream representing a single -// location. 
-func (i *PostingsIterator) readLocation(l *Location) error { - // read off field - fieldID, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location field: %v", err) - } - // read off pos - pos, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location pos: %v", err) - } - // read off start - start, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location start: %v", err) - } - // read off end - end, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location end: %v", err) - } - // read off num array pos - numArrayPos, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location num array pos: %v", err) - } - - l.field = i.postings.sb.fieldsInv[fieldID] - l.pos = pos - l.start = start - l.end = end - - if cap(l.ap) < int(numArrayPos) { - l.ap = make([]uint64, int(numArrayPos)) - } else { - l.ap = l.ap[:int(numArrayPos)] - } - - // read off array positions - for k := 0; k < int(numArrayPos); k++ { - ap, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading array position: %v", err) - } - - l.ap[k] = ap - } - - return nil -} - -// Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) Next() (segment.Posting, error) { - return i.nextAtOrAfter(0) -} - -// Advance returns the posting at the specified docNum or it is not present -// the next posting, or if the end is reached, nil -func (i *PostingsIterator) Advance(docNum uint64) (segment.Posting, error) { - return i.nextAtOrAfter(docNum) -} - -// Next returns the next posting on the postings list, or nil at the end -func (i *PostingsIterator) nextAtOrAfter(atOrAfter uint64) (segment.Posting, error) { - docNum, exists, err := i.nextDocNumAtOrAfter(atOrAfter) - if err != nil || !exists { - return nil, err - } - - i.next = Posting{} // clear the struct - rv := &i.next 
- rv.docNum = docNum - - if !i.includeFreqNorm { - return rv, nil - } - - var normBits uint64 - var hasLocs bool - - rv.freq, normBits, hasLocs, err = i.readFreqNormHasLocs() - if err != nil { - return nil, err - } - - rv.norm = math.Float32frombits(uint32(normBits)) - - if i.includeLocs && hasLocs { - // prepare locations into reused slices, where we assume - // rv.freq >= "number of locs", since in a composite field, - // some component fields might have their IncludeTermVector - // flags disabled while other component fields are enabled - if cap(i.nextLocs) >= int(rv.freq) { - i.nextLocs = i.nextLocs[0:rv.freq] - } else { - i.nextLocs = make([]Location, rv.freq, rv.freq*2) - } - if cap(i.nextSegmentLocs) < int(rv.freq) { - i.nextSegmentLocs = make([]segment.Location, rv.freq, rv.freq*2) - } - rv.locs = i.nextSegmentLocs[:0] - - numLocsBytes, err := i.locReader.ReadUvarint() - if err != nil { - return nil, fmt.Errorf("error reading location numLocsBytes: %v", err) - } - - j := 0 - startBytesRemaining := i.locReader.Len() // # bytes remaining in the locReader - for startBytesRemaining-i.locReader.Len() < int(numLocsBytes) { - err := i.readLocation(&i.nextLocs[j]) - if err != nil { - return nil, err - } - rv.locs = append(rv.locs, &i.nextLocs[j]) - j++ - } - } - - return rv, nil -} - -var freqHasLocs1Hit = encodeFreqHasLocs(1, false) - -// nextBytes returns the docNum and the encoded freq & loc bytes for -// the next posting -func (i *PostingsIterator) nextBytes() ( - docNumOut uint64, freq uint64, normBits uint64, - bytesFreqNorm []byte, bytesLoc []byte, err error) { - docNum, exists, err := i.nextDocNumAtOrAfter(0) - if err != nil || !exists { - return 0, 0, 0, nil, nil, err - } - - if i.normBits1Hit != 0 { - if i.buf == nil { - i.buf = make([]byte, binary.MaxVarintLen64*2) - } - n := binary.PutUvarint(i.buf, freqHasLocs1Hit) - n += binary.PutUvarint(i.buf[n:], i.normBits1Hit) - return docNum, uint64(1), i.normBits1Hit, i.buf[:n], nil, nil - } - - startFreqNorm 
:= len(i.currChunkFreqNorm) - i.freqNormReader.Len() - - var hasLocs bool - - freq, normBits, hasLocs, err = i.readFreqNormHasLocs() - if err != nil { - return 0, 0, 0, nil, nil, err - } - - endFreqNorm := len(i.currChunkFreqNorm) - i.freqNormReader.Len() - bytesFreqNorm = i.currChunkFreqNorm[startFreqNorm:endFreqNorm] - - if hasLocs { - startLoc := len(i.currChunkLoc) - i.locReader.Len() - - numLocsBytes, err := i.locReader.ReadUvarint() - if err != nil { - return 0, 0, 0, nil, nil, - fmt.Errorf("error reading location nextBytes numLocs: %v", err) - } - - // skip over all the location bytes - i.locReader.SkipBytes(int(numLocsBytes)) - - endLoc := len(i.currChunkLoc) - i.locReader.Len() - bytesLoc = i.currChunkLoc[startLoc:endLoc] - } - - return docNum, freq, normBits, bytesFreqNorm, bytesLoc, nil -} - -// nextDocNum returns the next docNum on the postings list, and also -// sets up the currChunk / loc related fields of the iterator. -func (i *PostingsIterator) nextDocNumAtOrAfter(atOrAfter uint64) (uint64, bool, error) { - if i.normBits1Hit != 0 { - if i.docNum1Hit == DocNum1HitFinished { - return 0, false, nil - } - if i.docNum1Hit < atOrAfter { - // advanced past our 1-hit - i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum - return 0, false, nil - } - docNum := i.docNum1Hit - i.docNum1Hit = DocNum1HitFinished // consume our 1-hit docNum - return docNum, true, nil - } - - if i.Actual == nil || !i.Actual.HasNext() { - return 0, false, nil - } - - if i.postings == nil || i.postings.postings == i.ActualBM { - return i.nextDocNumAtOrAfterClean(atOrAfter) - } - - i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) - - if !i.Actual.HasNext() { - // couldn't find anything - return 0, false, nil - } - - n := i.Actual.Next() - allN := i.all.Next() - - nChunk := n / i.postings.sb.chunkFactor - - // when allN becomes >= to here, then allN is in the same chunk as nChunk. 
- allNReachesNChunk := nChunk * i.postings.sb.chunkFactor - - // n is the next actual hit (excluding some postings), and - // allN is the next hit in the full postings, and - // if they don't match, move 'all' forwards until they do - for allN != n { - // we've reached same chunk, so move the freq/norm/loc decoders forward - if i.includeFreqNorm && allN >= allNReachesNChunk { - err := i.currChunkNext(nChunk) - if err != nil { - return 0, false, err - } - } - - allN = i.all.Next() - } - - if i.includeFreqNorm && (i.currChunk != nChunk || i.currChunkFreqNorm == nil) { - err := i.loadChunk(int(nChunk)) - if err != nil { - return 0, false, fmt.Errorf("error loading chunk: %v", err) - } - } - - return uint64(n), true, nil -} - -// optimization when the postings list is "clean" (e.g., no updates & -// no deletions) where the all bitmap is the same as the actual bitmap -func (i *PostingsIterator) nextDocNumAtOrAfterClean( - atOrAfter uint64) (uint64, bool, error) { - - if !i.includeFreqNorm { - i.Actual.AdvanceIfNeeded(uint32(atOrAfter)) - - if !i.Actual.HasNext() { - return 0, false, nil // couldn't find anything - } - - return uint64(i.Actual.Next()), true, nil - } - - // freq-norm's needed, so maintain freq-norm chunk reader - sameChunkNexts := 0 // # of times we called Next() in the same chunk - n := i.Actual.Next() - nChunk := n / i.postings.sb.chunkFactor - - for uint64(n) < atOrAfter && i.Actual.HasNext() { - n = i.Actual.Next() - - nChunkPrev := nChunk - nChunk = n / i.postings.sb.chunkFactor - - if nChunk != nChunkPrev { - sameChunkNexts = 0 - } else { - sameChunkNexts += 1 - } - } - - if uint64(n) < atOrAfter { - // couldn't find anything - return 0, false, nil - } - - for j := 0; j < sameChunkNexts; j++ { - err := i.currChunkNext(nChunk) - if err != nil { - return 0, false, fmt.Errorf("error optimized currChunkNext: %v", err) - } - } - - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return 0, 
false, fmt.Errorf("error loading chunk: %v", err) - } - } - - return uint64(n), true, nil -} - -func (i *PostingsIterator) currChunkNext(nChunk uint32) error { - if i.currChunk != nChunk || i.currChunkFreqNorm == nil { - err := i.loadChunk(int(nChunk)) - if err != nil { - return fmt.Errorf("error loading chunk: %v", err) - } - } - - // read off freq/offsets even though we don't care about them - hasLocs, err := i.skipFreqNormReadHasLocs() - if err != nil { - return err - } - - if i.includeLocs && hasLocs { - numLocsBytes, err := i.locReader.ReadUvarint() - if err != nil { - return fmt.Errorf("error reading location numLocsBytes: %v", err) - } - - // skip over all the location bytes - i.locReader.SkipBytes(int(numLocsBytes)) - } - - return nil -} - -// DocNum1Hit returns the docNum and true if this is "1-hit" optimized -// and the docNum is available. -func (p *PostingsIterator) DocNum1Hit() (uint64, bool) { - if p.normBits1Hit != 0 && p.docNum1Hit != DocNum1HitFinished { - return p.docNum1Hit, true - } - return 0, false -} - -// PostingsIteratorFromBitmap constructs a PostingsIterator given an -// "actual" bitmap. -func PostingsIteratorFromBitmap(bm *roaring.Bitmap, - includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { - return &PostingsIterator{ - ActualBM: bm, - Actual: bm.Iterator(), - includeFreqNorm: includeFreqNorm, - includeLocs: includeLocs, - }, nil -} - -// PostingsIteratorFrom1Hit constructs a PostingsIterator given a -// 1-hit docNum. 
-func PostingsIteratorFrom1Hit(docNum1Hit, normBits1Hit uint64, - includeFreqNorm, includeLocs bool) (*PostingsIterator, error) { - return &PostingsIterator{ - docNum1Hit: docNum1Hit, - normBits1Hit: normBits1Hit, - includeFreqNorm: includeFreqNorm, - includeLocs: includeLocs, - }, nil -} - -// Posting is a single entry in a postings list -type Posting struct { - docNum uint64 - freq uint64 - norm float32 - locs []segment.Location -} - -func (p *Posting) Size() int { - sizeInBytes := reflectStaticSizePosting - - for _, entry := range p.locs { - sizeInBytes += entry.Size() - } - - return sizeInBytes -} - -// Number returns the document number of this posting in this segment -func (p *Posting) Number() uint64 { - return p.docNum -} - -// Frequency returns the frequencies of occurrence of this term in this doc/field -func (p *Posting) Frequency() uint64 { - return p.freq -} - -// Norm returns the normalization factor for this posting -func (p *Posting) Norm() float64 { - return float64(p.norm) -} - -// Locations returns the location information for each occurrence -func (p *Posting) Locations() []segment.Location { - return p.locs -} - -// Location represents the location of a single occurrence -type Location struct { - field string - pos uint64 - start uint64 - end uint64 - ap []uint64 -} - -func (l *Location) Size() int { - return reflectStaticSizeLocation + - len(l.field) + - len(l.ap)*size.SizeOfUint64 -} - -// Field returns the name of the field (useful in composite fields to know -// which original field the value came from) -func (l *Location) Field() string { - return l.field -} - -// Start returns the start byte offset of this occurrence -func (l *Location) Start() uint64 { - return l.start -} - -// End returns the end byte offset of this occurrence -func (l *Location) End() uint64 { - return l.end -} - -// Pos returns the 1-based phrase position of this occurrence -func (l *Location) Pos() uint64 { - return l.pos -} - -// ArrayPositions returns the array 
position vector associated with this occurrence -func (l *Location) ArrayPositions() []uint64 { - return l.ap -} diff --git a/index/scorch/segment/zap/read.go b/index/scorch/segment/zap/read.go deleted file mode 100644 index e47d4c6ab..000000000 --- a/index/scorch/segment/zap/read.go +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import "encoding/binary" - -func (s *SegmentBase) getDocStoredMetaAndCompressed(docNum uint64) ([]byte, []byte) { - _, storedOffset, n, metaLen, dataLen := s.getDocStoredOffsets(docNum) - - meta := s.mem[storedOffset+n : storedOffset+n+metaLen] - data := s.mem[storedOffset+n+metaLen : storedOffset+n+metaLen+dataLen] - - return meta, data -} - -func (s *SegmentBase) getDocStoredOffsets(docNum uint64) ( - uint64, uint64, uint64, uint64, uint64) { - indexOffset := s.storedIndexOffset + (8 * docNum) - - storedOffset := binary.BigEndian.Uint64(s.mem[indexOffset : indexOffset+8]) - - var n uint64 - - metaLen, read := binary.Uvarint(s.mem[storedOffset : storedOffset+binary.MaxVarintLen64]) - n += uint64(read) - - dataLen, read := binary.Uvarint(s.mem[storedOffset+n : storedOffset+n+binary.MaxVarintLen64]) - n += uint64(read) - - return indexOffset, storedOffset, n, metaLen, dataLen -} diff --git a/index/scorch/segment/zap/segment.go b/index/scorch/segment/zap/segment.go deleted file mode 100644 index 5aa33a26c..000000000 --- 
a/index/scorch/segment/zap/segment.go +++ /dev/null @@ -1,572 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "bytes" - "encoding/binary" - "fmt" - "io" - "os" - "sync" - "unsafe" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index/scorch/segment" - "github.com/blevesearch/bleve/size" - "github.com/couchbase/vellum" - mmap "github.com/edsrzf/mmap-go" - "github.com/golang/snappy" -) - -var reflectStaticSizeSegmentBase int - -func init() { - var sb SegmentBase - reflectStaticSizeSegmentBase = int(unsafe.Sizeof(sb)) -} - -// Open returns a zap impl of a segment -func Open(path string) (segment.Segment, error) { - f, err := os.Open(path) - if err != nil { - return nil, err - } - mm, err := mmap.Map(f, mmap.RDONLY, 0) - if err != nil { - // mmap failed, try to close the file - _ = f.Close() - return nil, err - } - - rv := &Segment{ - SegmentBase: SegmentBase{ - mem: mm[0 : len(mm)-FooterSize], - fieldsMap: make(map[string]uint16), - fieldDvReaders: make(map[uint16]*docValueReader), - fieldFSTs: make(map[uint16]*vellum.FST), - }, - f: f, - mm: mm, - path: path, - refs: 1, - } - rv.SegmentBase.updateSize() - - err = rv.loadConfig() - if err != nil { - _ = rv.Close() - return nil, err - } - - err = rv.loadFields() - if err != nil { - _ = rv.Close() - return nil, err - } - - err = rv.loadDvReaders() - if err != nil { - _ = rv.Close() - return nil, err - } - - return 
rv, nil -} - -// SegmentBase is a memory only, read-only implementation of the -// segment.Segment interface, using zap's data representation. -type SegmentBase struct { - mem []byte - memCRC uint32 - chunkFactor uint32 - fieldsMap map[string]uint16 // fieldName -> fieldID+1 - fieldsInv []string // fieldID -> fieldName - numDocs uint64 - storedIndexOffset uint64 - fieldsIndexOffset uint64 - docValueOffset uint64 - dictLocs []uint64 - fieldDvReaders map[uint16]*docValueReader // naive chunk cache per field - fieldDvNames []string // field names cached in fieldDvReaders - size uint64 - - m sync.Mutex - fieldFSTs map[uint16]*vellum.FST -} - -func (sb *SegmentBase) Size() int { - return int(sb.size) -} - -func (sb *SegmentBase) updateSize() { - sizeInBytes := reflectStaticSizeSegmentBase + - cap(sb.mem) - - // fieldsMap - for k, _ := range sb.fieldsMap { - sizeInBytes += (len(k) + size.SizeOfString) + size.SizeOfUint16 - } - - // fieldsInv, dictLocs - for _, entry := range sb.fieldsInv { - sizeInBytes += len(entry) + size.SizeOfString - } - sizeInBytes += len(sb.dictLocs) * size.SizeOfUint64 - - // fieldDvReaders - for _, v := range sb.fieldDvReaders { - sizeInBytes += size.SizeOfUint16 + size.SizeOfPtr - if v != nil { - sizeInBytes += v.size() - } - } - - sb.size = uint64(sizeInBytes) -} - -func (sb *SegmentBase) AddRef() {} -func (sb *SegmentBase) DecRef() (err error) { return nil } -func (sb *SegmentBase) Close() (err error) { return nil } - -// Segment implements a persisted segment.Segment interface, by -// embedding an mmap()'ed SegmentBase. -type Segment struct { - SegmentBase - - f *os.File - mm mmap.MMap - path string - version uint32 - crc uint32 - - m sync.Mutex // Protects the fields that follow. 
- refs int64 -} - -func (s *Segment) Size() int { - // 8 /* size of file pointer */ - // 4 /* size of version -> uint32 */ - // 4 /* size of crc -> uint32 */ - sizeOfUints := 16 - - sizeInBytes := (len(s.path) + size.SizeOfString) + sizeOfUints - - // mutex, refs -> int64 - sizeInBytes += 16 - - // do not include the mmap'ed part - return sizeInBytes + s.SegmentBase.Size() - cap(s.mem) -} - -func (s *Segment) AddRef() { - s.m.Lock() - s.refs++ - s.m.Unlock() -} - -func (s *Segment) DecRef() (err error) { - s.m.Lock() - s.refs-- - if s.refs == 0 { - err = s.closeActual() - } - s.m.Unlock() - return err -} - -func (s *Segment) loadConfig() error { - crcOffset := len(s.mm) - 4 - s.crc = binary.BigEndian.Uint32(s.mm[crcOffset : crcOffset+4]) - - verOffset := crcOffset - 4 - s.version = binary.BigEndian.Uint32(s.mm[verOffset : verOffset+4]) - if s.version != Version { - return fmt.Errorf("unsupported version %d", s.version) - } - - chunkOffset := verOffset - 4 - s.chunkFactor = binary.BigEndian.Uint32(s.mm[chunkOffset : chunkOffset+4]) - - docValueOffset := chunkOffset - 8 - s.docValueOffset = binary.BigEndian.Uint64(s.mm[docValueOffset : docValueOffset+8]) - - fieldsIndexOffset := docValueOffset - 8 - s.fieldsIndexOffset = binary.BigEndian.Uint64(s.mm[fieldsIndexOffset : fieldsIndexOffset+8]) - - storedIndexOffset := fieldsIndexOffset - 8 - s.storedIndexOffset = binary.BigEndian.Uint64(s.mm[storedIndexOffset : storedIndexOffset+8]) - - numDocsOffset := storedIndexOffset - 8 - s.numDocs = binary.BigEndian.Uint64(s.mm[numDocsOffset : numDocsOffset+8]) - return nil -} - -func (s *SegmentBase) loadFields() error { - // NOTE for now we assume the fields index immediately precedes - // the footer, and if this changes, need to adjust accordingly (or - // store explicit length), where s.mem was sliced from s.mm in Open(). 
- fieldsIndexEnd := uint64(len(s.mem)) - - // iterate through fields index - var fieldID uint64 - for s.fieldsIndexOffset+(8*fieldID) < fieldsIndexEnd { - addr := binary.BigEndian.Uint64(s.mem[s.fieldsIndexOffset+(8*fieldID) : s.fieldsIndexOffset+(8*fieldID)+8]) - - dictLoc, read := binary.Uvarint(s.mem[addr:fieldsIndexEnd]) - n := uint64(read) - s.dictLocs = append(s.dictLocs, dictLoc) - - var nameLen uint64 - nameLen, read = binary.Uvarint(s.mem[addr+n : fieldsIndexEnd]) - n += uint64(read) - - name := string(s.mem[addr+n : addr+n+nameLen]) - s.fieldsInv = append(s.fieldsInv, name) - s.fieldsMap[name] = uint16(fieldID + 1) - - fieldID++ - } - return nil -} - -// Dictionary returns the term dictionary for the specified field -func (s *SegmentBase) Dictionary(field string) (segment.TermDictionary, error) { - dict, err := s.dictionary(field) - if err == nil && dict == nil { - return &segment.EmptyDictionary{}, nil - } - return dict, err -} - -func (sb *SegmentBase) dictionary(field string) (rv *Dictionary, err error) { - fieldIDPlus1 := sb.fieldsMap[field] - if fieldIDPlus1 > 0 { - rv = &Dictionary{ - sb: sb, - field: field, - fieldID: fieldIDPlus1 - 1, - } - - dictStart := sb.dictLocs[rv.fieldID] - if dictStart > 0 { - var ok bool - sb.m.Lock() - if rv.fst, ok = sb.fieldFSTs[rv.fieldID]; !ok { - // read the length of the vellum data - vellumLen, read := binary.Uvarint(sb.mem[dictStart : dictStart+binary.MaxVarintLen64]) - fstBytes := sb.mem[dictStart+uint64(read) : dictStart+uint64(read)+vellumLen] - rv.fst, err = vellum.Load(fstBytes) - if err != nil { - sb.m.Unlock() - return nil, fmt.Errorf("dictionary field %s vellum err: %v", field, err) - } - - sb.fieldFSTs[rv.fieldID] = rv.fst - } - - sb.m.Unlock() - rv.fstReader, err = rv.fst.Reader() - if err != nil { - return nil, fmt.Errorf("dictionary field %s vellum reader err: %v", field, err) - } - - } - } - - return rv, nil -} - -// visitDocumentCtx holds data structures that are reusable across -// multiple 
VisitDocument() calls to avoid memory allocations -type visitDocumentCtx struct { - buf []byte - reader bytes.Reader - arrayPos []uint64 -} - -var visitDocumentCtxPool = sync.Pool{ - New: func() interface{} { - reuse := &visitDocumentCtx{} - return reuse - }, -} - -// VisitDocument invokes the DocFieldValueVistor for each stored field -// for the specified doc number -func (s *SegmentBase) VisitDocument(num uint64, visitor segment.DocumentFieldValueVisitor) error { - vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) - defer visitDocumentCtxPool.Put(vdc) - return s.visitDocument(vdc, num, visitor) -} - -func (s *SegmentBase) visitDocument(vdc *visitDocumentCtx, num uint64, - visitor segment.DocumentFieldValueVisitor) error { - // first make sure this is a valid number in this segment - if num < s.numDocs { - meta, compressed := s.getDocStoredMetaAndCompressed(num) - - vdc.reader.Reset(meta) - - // handle _id field special case - idFieldValLen, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - idFieldVal := compressed[:idFieldValLen] - - keepGoing := visitor("_id", byte('t'), idFieldVal, nil) - if !keepGoing { - visitDocumentCtxPool.Put(vdc) - return nil - } - - // handle non-"_id" fields - compressed = compressed[idFieldValLen:] - - uncompressed, err := snappy.Decode(vdc.buf[:cap(vdc.buf)], compressed) - if err != nil { - return err - } - - for keepGoing { - field, err := binary.ReadUvarint(&vdc.reader) - if err == io.EOF { - break - } - if err != nil { - return err - } - typ, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - offset, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - l, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - numap, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - var arrayPos []uint64 - if numap > 0 { - if cap(vdc.arrayPos) < int(numap) { - vdc.arrayPos = make([]uint64, numap) - } - arrayPos = 
vdc.arrayPos[:numap] - for i := 0; i < int(numap); i++ { - ap, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return err - } - arrayPos[i] = ap - } - } - - value := uncompressed[offset : offset+l] - keepGoing = visitor(s.fieldsInv[field], byte(typ), value, arrayPos) - } - - vdc.buf = uncompressed - } - return nil -} - -// DocID returns the value of the _id field for the given docNum -func (s *SegmentBase) DocID(num uint64) ([]byte, error) { - if num >= s.numDocs { - return nil, nil - } - - vdc := visitDocumentCtxPool.Get().(*visitDocumentCtx) - - meta, compressed := s.getDocStoredMetaAndCompressed(num) - - vdc.reader.Reset(meta) - - // handle _id field special case - idFieldValLen, err := binary.ReadUvarint(&vdc.reader) - if err != nil { - return nil, err - } - idFieldVal := compressed[:idFieldValLen] - - visitDocumentCtxPool.Put(vdc) - - return idFieldVal, nil -} - -// Count returns the number of documents in this segment. -func (s *SegmentBase) Count() uint64 { - return s.numDocs -} - -// DocNumbers returns a bitset corresponding to the doc numbers of all the -// provided _id strings -func (s *SegmentBase) DocNumbers(ids []string) (*roaring.Bitmap, error) { - rv := roaring.New() - - if len(s.fieldsMap) > 0 { - idDict, err := s.dictionary("_id") - if err != nil { - return nil, err - } - - postingsList := emptyPostingsList - - sMax, err := idDict.fst.GetMaxKey() - if err != nil { - return nil, err - } - sMaxStr := string(sMax) - filteredIds := make([]string, 0, len(ids)) - for _, id := range ids { - if id <= sMaxStr { - filteredIds = append(filteredIds, id) - } - } - - for _, id := range filteredIds { - postingsList, err = idDict.postingsList([]byte(id), nil, postingsList) - if err != nil { - return nil, err - } - postingsList.OrInto(rv) - } - } - - return rv, nil -} - -// Fields returns the field names used in this segment -func (s *SegmentBase) Fields() []string { - return s.fieldsInv -} - -// Path returns the path of this segment on disk -func (s 
*Segment) Path() string { - return s.path -} - -// Close releases all resources associated with this segment -func (s *Segment) Close() (err error) { - return s.DecRef() -} - -func (s *Segment) closeActual() (err error) { - if s.mm != nil { - err = s.mm.Unmap() - } - // try to close file even if unmap failed - if s.f != nil { - err2 := s.f.Close() - if err == nil { - // try to return first error - err = err2 - } - } - return -} - -// some helpers i started adding for the command-line utility - -// Data returns the underlying mmaped data slice -func (s *Segment) Data() []byte { - return s.mm -} - -// CRC returns the CRC value stored in the file footer -func (s *Segment) CRC() uint32 { - return s.crc -} - -// Version returns the file version in the file footer -func (s *Segment) Version() uint32 { - return s.version -} - -// ChunkFactor returns the chunk factor in the file footer -func (s *Segment) ChunkFactor() uint32 { - return s.chunkFactor -} - -// FieldsIndexOffset returns the fields index offset in the file footer -func (s *Segment) FieldsIndexOffset() uint64 { - return s.fieldsIndexOffset -} - -// StoredIndexOffset returns the stored value index offset in the file footer -func (s *Segment) StoredIndexOffset() uint64 { - return s.storedIndexOffset -} - -// DocValueOffset returns the docValue offset in the file footer -func (s *Segment) DocValueOffset() uint64 { - return s.docValueOffset -} - -// NumDocs returns the number of documents in the file footer -func (s *Segment) NumDocs() uint64 { - return s.numDocs -} - -// DictAddr is a helper function to compute the file offset where the -// dictionary is stored for the specified field. 
-func (s *Segment) DictAddr(field string) (uint64, error) { - fieldIDPlus1, ok := s.fieldsMap[field] - if !ok { - return 0, fmt.Errorf("no such field '%s'", field) - } - - return s.dictLocs[fieldIDPlus1-1], nil -} - -func (s *SegmentBase) loadDvReaders() error { - if s.docValueOffset == fieldNotUninverted || s.numDocs == 0 { - return nil - } - - var read uint64 - for fieldID, field := range s.fieldsInv { - var fieldLocStart, fieldLocEnd uint64 - var n int - fieldLocStart, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) - if n <= 0 { - return fmt.Errorf("loadDvReaders: failed to read the docvalue offset start for field %d", fieldID) - } - read += uint64(n) - fieldLocEnd, n = binary.Uvarint(s.mem[s.docValueOffset+read : s.docValueOffset+read+binary.MaxVarintLen64]) - if n <= 0 { - return fmt.Errorf("loadDvReaders: failed to read the docvalue offset end for field %d", fieldID) - } - read += uint64(n) - - fieldDvReader, err := s.loadFieldDocValueReader(field, fieldLocStart, fieldLocEnd) - if err != nil { - return err - } - if fieldDvReader != nil { - s.fieldDvReaders[uint16(fieldID)] = fieldDvReader - s.fieldDvNames = append(s.fieldDvNames, field) - } - } - - return nil -} diff --git a/index/scorch/segment/zap/segment_test.go b/index/scorch/segment/zap/segment_test.go deleted file mode 100644 index ffe4c7c80..000000000 --- a/index/scorch/segment/zap/segment_test.go +++ /dev/null @@ -1,737 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "math" - "os" - "reflect" - "testing" - - "github.com/RoaringBitmap/roaring" - "github.com/blevesearch/bleve/index" - "github.com/blevesearch/bleve/index/scorch/segment" -) - -func TestOpen(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegment() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "desc": struct{}{}, - "tag": struct{}{}, - } - fields := segment.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) - } - for _, field := range fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } - - docCount := segment.Count() - if docCount != 1 { - t.Errorf("expected count 1, got %d", docCount) - } - - // check the _id field - dict, err := segment.Dictionary("_id") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList([]byte("a"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", 
nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the name field - dict, err = segment.Dictionary("name") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList([]byte("wow"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - if nextPosting.Norm() != 1.0 { - t.Errorf("expected norm 1.0, got %f", nextPosting.Norm()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - 
nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } - - // check the _all field (composite) - dict, err = segment.Dictionary("_all") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList([]byte("wow"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - expectedNorm := float32(1.0 / math.Sqrt(float64(5))) - if nextPosting.Norm() != float64(expectedNorm) { - t.Errorf("expected norm %f, got %f", expectedNorm, nextPosting.Norm()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "name" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 3 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - if loc.ArrayPositions() != nil { - t.Errorf("expect loc array pos to be nil, got %v", loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", 
count) - } - - // now try a field with array positions - dict, err = segment.Dictionary("tag") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err = dict.PostingsList([]byte("dark"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr = postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - nextPosting, err = postingsItr.Next() - for nextPosting != nil && err == nil { - - if nextPosting.Frequency() != 1 { - t.Errorf("expected frequency 1, got %d", nextPosting.Frequency()) - } - if nextPosting.Number() != 0 { - t.Errorf("expected doc number 0, got %d", nextPosting.Number()) - } - var numLocs uint64 - for _, loc := range nextPosting.Locations() { - numLocs++ - if loc.Field() != "tag" { - t.Errorf("expected loc field to be 'name', got '%s'", loc.Field()) - } - if loc.Start() != 0 { - t.Errorf("expected loc start to be 0, got %d", loc.Start()) - } - if loc.End() != 4 { - t.Errorf("expected loc end to be 3, got %d", loc.End()) - } - if loc.Pos() != 1 { - t.Errorf("expected loc pos to be 1, got %d", loc.Pos()) - } - expectArrayPos := []uint64{1} - if !reflect.DeepEqual(loc.ArrayPositions(), expectArrayPos) { - t.Errorf("expect loc array pos to be %v, got %v", expectArrayPos, loc.ArrayPositions()) - } - } - if numLocs != nextPosting.Frequency() { - t.Errorf("expected %d locations, got %d", nextPosting.Frequency(), numLocs) - } - - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - // now try and visit a document - var fieldValuesSeen int - err = segment.VisitDocument(0, func(field string, typ byte, value []byte, pos []uint64) bool { - fieldValuesSeen++ - return true - }) - if err != nil { - t.Fatal(err) - } - if fieldValuesSeen != 5 { - t.Errorf("expected 5 field values, got %d", fieldValuesSeen) - } -} 
- -func TestOpenMulti(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentMulti() - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - // check the desc field - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList([]byte("thing"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 2 { - t.Errorf("expected count to be 2, got %d", count) - } - - // get docnum of a - exclude, err := segment.DocNumbers([]string{"a"}) - if err != nil { - t.Fatal(err) - } - - // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList([]byte("thing"), exclude, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsListExcludingCount := postingsListExcluding.Count() - if postingsListExcludingCount != 1 { - t.Errorf("expected count from postings list to be 1, got %d", postingsListExcludingCount) - } - - postingsItrExcluding := postingsListExcluding.Iterator(true, true, true, nil) - if postingsItr == 
nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItrExcluding.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItrExcluding.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } -} - -func TestOpenMultiWithTwoChunks(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentMultiWithChunkFactor(1) - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - if segment.Count() != 2 { - t.Errorf("expected count 2, got %d", segment.Count()) - } - - // check the desc field - dict, err := segment.Dictionary("desc") - if err != nil { - t.Fatal(err) - } - if dict == nil { - t.Fatal("got nil dict, expected non-nil") - } - - postingsList, err := dict.PostingsList([]byte("thing"), nil, nil) - if err != nil { - t.Fatal(err) - } - if postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItr := postingsList.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count := 0 - nextPosting, err := postingsItr.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItr.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 2 { - t.Errorf("expected count to be 2, got %d", count) - } - - // get docnum of a - exclude, err := segment.DocNumbers([]string{"a"}) - if err != nil { - t.Fatal(err) - } - - // look for term 'thing' excluding doc 'a' - postingsListExcluding, err := dict.PostingsList([]byte("thing"), exclude, nil) - if err != nil { - t.Fatal(err) - } - if 
postingsList == nil { - t.Fatal("got nil postings list, expected non-nil") - } - - postingsItrExcluding := postingsListExcluding.Iterator(true, true, true, nil) - if postingsItr == nil { - t.Fatal("got nil iterator, expected non-nil") - } - - count = 0 - nextPosting, err = postingsItrExcluding.Next() - for nextPosting != nil && err == nil { - count++ - nextPosting, err = postingsItrExcluding.Next() - } - if err != nil { - t.Fatal(err) - } - - if count != 1 { - t.Errorf("expected count to be 1, got %d", count) - } -} - -func TestSegmentVisitableDocValueFieldsList(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ := buildTestSegmentMultiWithChunkFactor(1) - err := PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - seg, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - - if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok { - fields, err := zaps.VisitableDocValueFields() - if err != nil { - t.Fatalf("segment VisitableDocValueFields err: %v", err) - } - // no persisted doc value fields - if len(fields) != 0 { - t.Errorf("expected no persisted fields for doc values, got: %#v", fields) - } - } - - err = seg.Close() - if err != nil { - t.Fatalf("error closing segment: %v", err) - } - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, _ = buildTestSegmentWithDefaultFieldMapping(1) - err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - seg, err = Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - - defer func() { - cerr := seg.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - if zaps, ok := seg.(segment.DocumentFieldTermVisitable); ok { - fields, err := zaps.VisitableDocValueFields() - if err != nil { - t.Fatalf("segment VisitableDocValueFields err: %v", err) - } - - 
expectedFields := []string{"desc", "name", "tag"} - if !reflect.DeepEqual(fields, expectedFields) { - t.Errorf("expected field terms: %#v, got: %#v", expectedFields, fields) - } - - fieldTerms := make(index.FieldTerms) - _, err = zaps.VisitDocumentFieldTerms(0, fields, func(field string, term []byte) { - fieldTerms[field] = append(fieldTerms[field], string(term)) - }, nil) - if err != nil { - t.Error(err) - } - - expectedFieldTerms := index.FieldTerms{ - "name": []string{"wow"}, - "desc": []string{"some", "thing"}, - "tag": []string{"cold"}, - } - if !reflect.DeepEqual(fieldTerms, expectedFieldTerms) { - t.Errorf("expected field terms: %#v, got: %#v", expectedFieldTerms, fieldTerms) - } - - } -} - -func TestSegmentDocsWithNonOverlappingFields(t *testing.T) { - _ = os.RemoveAll("/tmp/scorch.zap") - - testSeg, _, err := buildTestSegmentMultiWithDifferentFields(true, true) - if err != nil { - t.Fatalf("error building segment: %v", err) - } - err = PersistSegmentBase(testSeg, "/tmp/scorch.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment, err := Open("/tmp/scorch.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - if segment.Count() != 2 { - t.Errorf("expected 2, got %d", segment.Count()) - } - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "dept": struct{}{}, - "manages.id": struct{}{}, - "manages.count": struct{}{}, - "reportsTo.id": struct{}{}, - } - - fields := segment.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", len(expectFields), len(fields)) - } - for _, field := range fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } -} - -func TestMergedSegmentDocsWithNonOverlappingFields(t *testing.T) { - _ = 
os.RemoveAll("/tmp/scorch1.zap") - _ = os.RemoveAll("/tmp/scorch2.zap") - _ = os.RemoveAll("/tmp/scorch3.zap") - - testSeg1, _, _ := buildTestSegmentMultiWithDifferentFields(true, false) - err := PersistSegmentBase(testSeg1, "/tmp/scorch1.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - testSeg2, _, _ := buildTestSegmentMultiWithDifferentFields(false, true) - err = PersistSegmentBase(testSeg2, "/tmp/scorch2.zap") - if err != nil { - t.Fatalf("error persisting segment: %v", err) - } - - segment1, err := Open("/tmp/scorch1.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment1.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - segment2, err := Open("/tmp/scorch2.zap") - if err != nil { - t.Fatalf("error opening segment: %v", err) - } - defer func() { - cerr := segment2.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - segsToMerge := make([]*Segment, 2) - segsToMerge[0] = segment1.(*Segment) - segsToMerge[1] = segment2.(*Segment) - - _, nBytes, err := Merge(segsToMerge, []*roaring.Bitmap{nil, nil}, "/tmp/scorch3.zap", 1024, nil, nil) - if err != nil { - t.Fatal(err) - } - - if nBytes == 0 { - t.Fatalf("expected a non zero total_compaction_written_bytes") - } - - segmentM, err := Open("/tmp/scorch3.zap") - if err != nil { - t.Fatalf("error opening merged segment: %v", err) - } - defer func() { - cerr := segmentM.Close() - if cerr != nil { - t.Fatalf("error closing segment: %v", cerr) - } - }() - - if segmentM.Count() != 2 { - t.Errorf("expected 2, got %d", segmentM.Count()) - } - - expectFields := map[string]struct{}{ - "_id": struct{}{}, - "_all": struct{}{}, - "name": struct{}{}, - "dept": struct{}{}, - "manages.id": struct{}{}, - "manages.count": struct{}{}, - "reportsTo.id": struct{}{}, - } - - fields := segmentM.Fields() - if len(fields) != len(expectFields) { - t.Errorf("expected %d fields, only got %d", 
len(expectFields), len(fields)) - } - for _, field := range fields { - if _, ok := expectFields[field]; !ok { - t.Errorf("got unexpected field: %s", field) - } - } -} diff --git a/index/scorch/segment/zap/write.go b/index/scorch/segment/zap/write.go deleted file mode 100644 index cddaedd00..000000000 --- a/index/scorch/segment/zap/write.go +++ /dev/null @@ -1,145 +0,0 @@ -// Copyright (c) 2017 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "encoding/binary" - "io" - - "github.com/RoaringBitmap/roaring" -) - -// writes out the length of the roaring bitmap in bytes as varint -// then writes out the roaring bitmap itself -func writeRoaringWithLen(r *roaring.Bitmap, w io.Writer, - reuseBufVarint []byte) (int, error) { - buf, err := r.ToBytes() - if err != nil { - return 0, err - } - - var tw int - - // write out the length - n := binary.PutUvarint(reuseBufVarint, uint64(len(buf))) - nw, err := w.Write(reuseBufVarint[:n]) - tw += nw - if err != nil { - return tw, err - } - - // write out the roaring bytes - nw, err = w.Write(buf) - tw += nw - if err != nil { - return tw, err - } - - return tw, nil -} - -func persistFields(fieldsInv []string, w *CountHashWriter, dictLocs []uint64) (uint64, error) { - var rv uint64 - var fieldsOffsets []uint64 - - for fieldID, fieldName := range fieldsInv { - // record start of this field - fieldsOffsets = append(fieldsOffsets, uint64(w.Count())) - - // write out the dict location and 
field name length - _, err := writeUvarints(w, dictLocs[fieldID], uint64(len(fieldName))) - if err != nil { - return 0, err - } - - // write out the field name - _, err = w.Write([]byte(fieldName)) - if err != nil { - return 0, err - } - } - - // now write out the fields index - rv = uint64(w.Count()) - for fieldID := range fieldsInv { - err := binary.Write(w, binary.BigEndian, fieldsOffsets[fieldID]) - if err != nil { - return 0, err - } - } - - return rv, nil -} - -// FooterSize is the size of the footer record in bytes -// crc + ver + chunk + field offset + stored offset + num docs + docValueOffset -const FooterSize = 4 + 4 + 4 + 8 + 8 + 8 + 8 - -func persistFooter(numDocs, storedIndexOffset, fieldsIndexOffset, docValueOffset uint64, - chunkFactor uint32, crcBeforeFooter uint32, writerIn io.Writer) error { - w := NewCountHashWriter(writerIn) - w.crc = crcBeforeFooter - - // write out the number of docs - err := binary.Write(w, binary.BigEndian, numDocs) - if err != nil { - return err - } - // write out the stored field index location: - err = binary.Write(w, binary.BigEndian, storedIndexOffset) - if err != nil { - return err - } - // write out the field index location - err = binary.Write(w, binary.BigEndian, fieldsIndexOffset) - if err != nil { - return err - } - // write out the fieldDocValue location - err = binary.Write(w, binary.BigEndian, docValueOffset) - if err != nil { - return err - } - // write out 32-bit chunk factor - err = binary.Write(w, binary.BigEndian, chunkFactor) - if err != nil { - return err - } - // write out 32-bit version - err = binary.Write(w, binary.BigEndian, Version) - if err != nil { - return err - } - // write out CRC-32 of everything upto but not including this CRC - err = binary.Write(w, binary.BigEndian, w.crc) - if err != nil { - return err - } - return nil -} - -func writeUvarints(w io.Writer, vals ...uint64) (tw int, err error) { - buf := make([]byte, binary.MaxVarintLen64) - for _, val := range vals { - n := 
binary.PutUvarint(buf, val) - var nw int - nw, err = w.Write(buf[:n]) - tw += nw - if err != nil { - return tw, err - } - } - return tw, err -} diff --git a/index/scorch/segment/zap/write_test.go b/index/scorch/segment/zap/write_test.go deleted file mode 100644 index 2e72d4b82..000000000 --- a/index/scorch/segment/zap/write_test.go +++ /dev/null @@ -1,86 +0,0 @@ -// Copyright (c) 2018 Couchbase, Inc. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -package zap - -import ( - "math" - "testing" - - "github.com/RoaringBitmap/roaring" -) - -func TestRoaringSizes(t *testing.T) { - tests := []struct { - vals []uint32 - expectedSize int // expected serialized # bytes - optimizedSize int // after calling roaring's RunOptimize() API - }{ - {[]uint32{}, 8, 8}, // empty roaring is 8 bytes - - {[]uint32{0}, 18, 18}, // single entry roaring is 18 bytes - {[]uint32{1}, 18, 18}, - {[]uint32{4}, 18, 18}, - {[]uint32{4000}, 18, 18}, - {[]uint32{40000000}, 18, 18}, - {[]uint32{math.MaxUint32}, 18, 18}, - {[]uint32{math.MaxUint32 - 1}, 18, 18}, - - {[]uint32{0, 1}, 20, 20}, - {[]uint32{0, 10000000}, 28, 28}, - - {[]uint32{0, 1, 2}, 22, 15}, - {[]uint32{0, 1, 20000000}, 30, 30}, - - {[]uint32{0, 1, 2, 3}, 24, 15}, - {[]uint32{0, 1, 2, 30000000}, 32, 21}, - } - - for _, test := range tests { - bm := roaring.New() - for _, val := range test.vals { - bm.Add(val) - } - - b, err := bm.ToBytes() - if err != nil { - t.Errorf("expected no ToBytes() err, got: %v", err) - } - if 
len(b) != test.expectedSize { - t.Errorf("size did not match,"+ - " got: %d, test: %#v", len(b), test) - } - if int(bm.GetSerializedSizeInBytes()) != test.expectedSize { - t.Errorf("GetSerializedSizeInBytes did not match,"+ - " got: %d, test: %#v", - bm.GetSerializedSizeInBytes(), test) - } - - bm.RunOptimize() - - b, err = bm.ToBytes() - if err != nil { - t.Errorf("expected no ToBytes() err, got: %v", err) - } - if len(b) != test.optimizedSize { - t.Errorf("optimized size did not match,"+ - " got: %d, test: %#v", len(b), test) - } - if int(bm.GetSerializedSizeInBytes()) != test.optimizedSize { - t.Errorf("optimized GetSerializedSizeInBytes did not match,"+ - " got: %d, test: %#v", - bm.GetSerializedSizeInBytes(), test) - } - } -} diff --git a/index/scorch/segment/zap/zap.md b/index/scorch/segment/zap/zap.md deleted file mode 100644 index d74dc548b..000000000 --- a/index/scorch/segment/zap/zap.md +++ /dev/null @@ -1,177 +0,0 @@ -# ZAP File Format - -## Legend - -### Sections - - |========| - | | section - |========| - -### Fixed-size fields - - |--------| |----| |--| |-| - | | uint64 | | uint32 | | uint16 | | uint8 - |--------| |----| |--| |-| - -### Varints - - |~~~~~~~~| - | | varint(up to uint64) - |~~~~~~~~| - -### Arbitrary-length fields - - |--------...---| - | | arbitrary-length field (string, vellum, roaring bitmap) - |--------...---| - -### Chunked data - - [--------] - [ ] - [--------] - -## Overview - -Footer section describes the configuration of particular ZAP file. The format of footer is version-dependent, so it is necessary to check `V` field before the parsing. 
- - |==================================================| - | Stored Fields | - |==================================================| - |-----> | Stored Fields Index | - | |==================================================| - | | Dictionaries + Postings + DocValues | - | |==================================================| - | |---> | DocValues Index | - | | |==================================================| - | | | Fields | - | | |==================================================| - | | |-> | Fields Index | - | | | |========|========|========|========|====|====|====| - | | | | D# | SF | F | FDV | CF | V | CC | (Footer) - | | | |========|====|===|====|===|====|===|====|====|====| - | | | | | | - |-+-+-----------------| | | - | |--------------------------| | - |-------------------------------------| - - D#. Number of Docs. - SF. Stored Fields Index Offset. - F. Field Index Offset. - FDV. Field DocValue Offset. - CF. Chunk Factor. - V. Version. - CC. CRC32. - -## Stored Fields - -Stored Fields Index is `D#` consecutive 64-bit unsigned integers - offsets, where relevant Stored Fields Data records are located. - - 0 [SF] [SF + D# * 8] - | Stored Fields | Stored Fields Index | - |================================|==================================| - | | | - | |--------------------| ||--------|--------|. . .|--------|| - | |-> | Stored Fields Data | || 0 | 1 | | D# - 1 || - | | |--------------------| ||--------|----|---|. . .|--------|| - | | | | | - |===|============================|==============|===================| - | | - |-------------------------------------------| - -Stored Fields Data is an arbitrary size record, which consists of metadata and [Snappy](https://github.com/golang/snappy)-compressed data. - - Stored Fields Data - |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| - | MDS | CDS | MD | CD | - |~~~~~~~~|~~~~~~~~|~~~~~~~~...~~~~~~~~|~~~~~~~~...~~~~~~~~| - - MDS. Metadata size. - CDS. Compressed data size. - MD. Metadata. - CD. 
Snappy-compressed data. - -## Fields - -Fields Index section located between addresses `F` and `len(file) - len(footer)` and consist of `uint64` values (`F1`, `F2`, ...) which are offsets to records in Fields section. We have `F# = (len(file) - len(footer) - F) / sizeof(uint64)` fields. - - - (...) [F] [F + F#] - | Fields | Fields Index. | - |================================|================================| - | | | - | |~~~~~~~~|~~~~~~~~|---...---|||--------|--------|...|--------|| - ||->| Dict | Length | Name ||| 0 | 1 | | F# - 1 || - || |~~~~~~~~|~~~~~~~~|---...---|||--------|----|---|...|--------|| - || | | | - ||===============================|==============|=================| - | | - |----------------------------------------------| - - -## Dictionaries + Postings - -Each of fields has its own dictionary, encoded in [Vellum](https://github.com/couchbase/vellum) format. Dictionary consists of pairs `(term, offset)`, where `offset` indicates the position of postings (list of documents) for this particular term. - - |================================================================|- Dictionaries + - | | Postings + - | | DocValues - | Freq/Norm (chunked) | - | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | - | |->[ Freq | Norm (float32 under varint) ] | - | | [~~~~~~|~~~~~~~~~~~~~~~~~~~~~~~~~~~~~] | - | | | - | |------------------------------------------------------------| | - | Location Details (chunked) | | - | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | - | |->[ Size | Pos | Start | End | Arr# | ArrPos | ... 
] | | - | | [~~~~~~|~~~~~|~~~~~~~|~~~~~|~~~~~~|~~~~~~~~|~~~~~] | | - | | | | - | |----------------------| | | - | Postings List | | | - | |~~~~~~~~|~~~~~|~~|~~~~~~~~|-----------...--| | | - | |->| F/N | LD | Length | ROARING BITMAP | | | - | | |~~~~~|~~|~~~~~~~~|~~~~~~~~|-----------...--| | | - | | |----------------------------------------------| | - | |--------------------------------------| | - | Dictionary | | - | |~~~~~~~~|--------------------------|-...-| | - | |->| Length | VELLUM DATA : (TERM -> OFFSET) | | - | | |~~~~~~~~|----------------------------...-| | - | | | - |======|=========================================================|- DocValues Index - | | | - |======|=========================================================|- Fields - | | | - | |~~~~|~~~|~~~~~~~~|---...---| | - | | Dict | Length | Name | | - | |~~~~~~~~|~~~~~~~~|---...---| | - | | - |================================================================| - -## DocValues - -DocValues Index is `F#` pairs of varints, one pair per field. Each pair of varints indicates start and end point of DocValues slice. - - |================================================================| - | |------...--| | - | |->| DocValues |<-| | - | | |------...--| | | - |==|=================|===========================================|- DocValues Index - ||~|~~~~~~~~~|~~~~~~~|~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| - || DV1 START | DV1 STOP | . . . . . | DV(F#) START | DV(F#) END || - ||~~~~~~~~~~~|~~~~~~~~~~| |~~~~~~~~~~~~~~|~~~~~~~~~~~~|| - |================================================================| - -DocValues is chunked Snappy-compressed values for each document and field. - - [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] - [ Doc# in Chunk | Doc1 | Offset1 | ... | DocN | OffsetN | SNAPPY COMPRESSED DATA ] - [~~~~~~~~~~~~~~~|~~~~~~|~~~~~~~~~|-...-|~~~~~~|~~~~~~~~~|--------------------...-] - -Last 16 bytes are description of chunks. 
- - |~~~~~~~~~~~~...~|----------------|----------------| - | Chunk Sizes | Chunk Size Arr | Chunk# | - |~~~~~~~~~~~~...~|----------------|----------------| diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go new file mode 100644 index 000000000..01eda7fbd --- /dev/null +++ b/index/scorch/segment_plugin.go @@ -0,0 +1,77 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + + "github.com/blevesearch/bleve/index/scorch/segment" + + zapv11 "github.com/blevesearch/zap/v11" + zapv12 "github.com/blevesearch/zap/v12" +) + +var supportedSegmentPlugins map[string]map[uint32]segment.Plugin +var defaultSegmentPlugin segment.Plugin + +func init() { + ResetPlugins() + RegisterPlugin(zapv12.Plugin(), false) + RegisterPlugin(zapv11.Plugin(), true) +} + +func ResetPlugins() { + supportedSegmentPlugins = map[string]map[uint32]segment.Plugin{} +} + +func RegisterPlugin(plugin segment.Plugin, makeDefault bool) { + if _, ok := supportedSegmentPlugins[plugin.Type()]; !ok { + supportedSegmentPlugins[plugin.Type()] = map[uint32]segment.Plugin{} + } + supportedSegmentPlugins[plugin.Type()][plugin.Version()] = plugin + if makeDefault { + defaultSegmentPlugin = plugin + } +} + +func SupportedSegmentTypes() (rv []string) { + for k := range supportedSegmentPlugins { + rv = append(rv, k) + } + return +} + +func SupportedSegmentTypeVersions(typ string) (rv []uint32) { + for k := range 
supportedSegmentPlugins[typ] { + rv = append(rv, k) + } + return rv +} + +func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, + forcedSegmentVersion uint32) error { + if versions, ok := supportedSegmentPlugins[forcedSegmentType]; ok { + if segPlugin, ok := versions[uint32(forcedSegmentVersion)]; ok { + s.segPlugin = segPlugin + return nil + } + return fmt.Errorf( + "unsupported version %d for segment type: %s, supported: %v", + forcedSegmentVersion, forcedSegmentType, + SupportedSegmentTypeVersions(forcedSegmentType)) + } + return fmt.Errorf("unsupported segment type: %s, supported: %v", + forcedSegmentType, SupportedSegmentTypes()) +} diff --git a/vendor/manifest b/vendor/manifest deleted file mode 100644 index 1c0b482ef..000000000 --- a/vendor/manifest +++ /dev/null @@ -1,145 +0,0 @@ -{ - "version": 0, - "dependencies": [ - { - "importpath": "github.com/blevesearch/go-porterstemmer", - "repository": "https://github.com/blevesearch/go-porterstemmer", - "vcs": "", - "revision": "23a2c8e5cf1f380f27722c6d2ae8896431dc7d0e", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/blevesearch/segment", - "repository": "https://github.com/blevesearch/segment", - "vcs": "", - "revision": "db70c57796cc8c310613541dfade3dce627d09c7", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/blevesearch/snowballstem", - "repository": "https://github.com/blevesearch/snowballstem", - "vcs": "", - "revision": "26b06a2c243d4f8ca5db3486f94409dd5b2a7467", - "branch": "master", - "notests": true - }, - { - "importpath": "go.etcd.io/bbolt", - "repository": "https://github.com/etcd-io/bbolt", - "vcs": "", - "revision": "68cc10a767ea1c6b9e8dcb9847317ff192d6d974", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/couchbase/moss", - "repository": "https://github.com/couchbase/moss", - "vcs": "git", - "revision": "013a19c55df3e689a66b632c7c8074e37162217d", - "branch": "master", - "notests": true - }, - { - 
"importpath": "github.com/golang/protobuf/proto", - "repository": "https://github.com/golang/protobuf", - "vcs": "", - "revision": "655cdfa588ea190e901bc5590e65d5621688847c", - "branch": "master", - "path": "/proto", - "notests": true - }, - { - "importpath": "github.com/golang/snappy", - "repository": "https://github.com/golang/snappy", - "vcs": "", - "revision": "cef980a12b316c5b7e5bb3a8e168eb43ae999a88", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/leemcloughlin/gofarmhash", - "repository": "https://github.com/leemcloughlin/gofarmhash", - "vcs": "git", - "revision": "b3cc1466b93e9c540eda444faaeba9876682c61b", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/rcrowley/go-metrics", - "repository": "https://github.com/rcrowley/go-metrics", - "vcs": "", - "revision": "dee209f2455f101a5e4e593dea94872d2c62d85d", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/RoaringBitmap/roaring", - "repository": "https://github.com/RoaringBitmap/roaring", - "vcs": "", - "revision": "4208ad825dda03a6a3d2197df8ec57948aebcc12", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/seiflotfy/cuckoofilter", - "repository": "https://github.com/seiflotfy/cuckoofilter", - "vcs": "git", - "revision": "d04838794ab86926d32b124345777e55e6f43974", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/steveyen/gtreap", - "repository": "https://github.com/steveyen/gtreap", - "vcs": "", - "revision": "0abe01ef9be25c4aedc174758ec2d917314d6d70", - "branch": "master", - "notests": true - }, - { - "importpath": "github.com/syndtr/goleveldb/leveldb", - "repository": "https://github.com/syndtr/goleveldb", - "vcs": "", - "revision": "93fc893f2dadb96ffde441c7546cc67ea290a3a8", - "branch": "master", - "path": "/leveldb", - "notests": true - }, - { - "importpath": "github.com/willf/bitset", - "repository": "https://github.com/willf/bitset", - "vcs": "", - "revision": 
"2e6e8094ef4745224150c88c16191c7dceaad16f", - "branch": "master", - "notests": true - }, - { - "importpath": "golang.org/x/text/transform", - "repository": "https://go.googlesource.com/text", - "vcs": "", - "revision": "5ee49cfe751141f8017047bab800d1f528ee3be1", - "branch": "master", - "path": "/transform", - "notests": true - }, - { - "importpath": "golang.org/x/text/unicode/norm", - "repository": "https://go.googlesource.com/text", - "vcs": "", - "revision": "5ee49cfe751141f8017047bab800d1f528ee3be1", - "branch": "master", - "path": "/unicode/norm", - "notests": true - }, - { - "importpath": "github.com/couchbase/vellum", - "repository": "https://github.com/couchbase/vellum", - "vcs": "git", - "revision": "dc222902e86f298bfae0b3dec6ba8b9d874ad5f8", - "branch": "master", - "notests": true - } - ] -} From e862319dd29284fadd8e41bae2c885e031c32ece Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:16:55 -0400 Subject: [PATCH 657/728] remove bleve self reference from go.mod --- go.mod | 1 - 1 file changed, 1 deletion(-) diff --git a/go.mod b/go.mod index 7a1483824..b3a82c3d8 100644 --- a/go.mod +++ b/go.mod @@ -4,7 +4,6 @@ go 1.13 require ( github.com/RoaringBitmap/roaring v0.4.21 - github.com/blevesearch/bleve v1.0.0 github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 From 660a798904d25c02bf96ac118ed5f0bec39a8cf2 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:20:56 -0400 Subject: [PATCH 658/728] fix module name --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index b3a82c3d8..11f5e5906 100644 --- a/go.mod +++ b/go.mod @@ -1,4 +1,4 @@ -module github.com/blugelabs/bleve +module github.com/blevesearch/bleve go 1.13 From a9dc7b417a7f440ff36f231879bdabfe08928ef4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:28:02 -0400 Subject: [PATCH 659/728] bump zap versions used by 
bleve --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 11f5e5906..942224c79 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,8 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.1 - github.com/blevesearch/zap/v12 v12.0.1 + github.com/blevesearch/zap/v11 v11.0.2 + github.com/blevesearch/zap/v12 v12.0.2 github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.0.0-20190322010551-a0cae174c498 github.com/couchbase/vellum v1.0.0 From c34fdc810a196c03fc2b8aee19f37f520fd5c575 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:41:15 -0400 Subject: [PATCH 660/728] bump zap versions again --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index 942224c79..c9cffbc7a 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,8 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.2 - github.com/blevesearch/zap/v12 v12.0.2 + github.com/blevesearch/zap/v11 v11.0.3 + github.com/blevesearch/zap/v12 v12.0.3 github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.0.0-20190322010551-a0cae174c498 github.com/couchbase/vellum v1.0.0 From 81b1327ecc52ddaec8e6b373c84297c6253feee6 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sun, 5 Apr 2020 17:47:38 -0400 Subject: [PATCH 661/728] bump again again again --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index c9cffbc7a..8702417e7 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,8 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.3 - github.com/blevesearch/zap/v12 v12.0.3 + 
github.com/blevesearch/zap/v11 v11.0.4 + github.com/blevesearch/zap/v12 v12.0.4 github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.0.0-20190322010551-a0cae174c498 github.com/couchbase/vellum v1.0.0 From 487ec05ccabc3abad32f9d32d4a1b31aad76beee Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 15 Apr 2020 10:28:43 -0700 Subject: [PATCH 662/728] Fix slice out-of-bounds panic within highlighter Addresses https://github.com/blevesearch/bleve/issues/1370 --- search/highlight/fragmenter/simple/simple.go | 5 ++ search_test.go | 68 ++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/search/highlight/fragmenter/simple/simple.go b/search/highlight/fragmenter/simple/simple.go index 6f6ecedf5..9c63f7fb6 100644 --- a/search/highlight/fragmenter/simple/simple.go +++ b/search/highlight/fragmenter/simple/simple.go @@ -58,6 +58,11 @@ OUTER: // push back towards beginning // without cross maxbegin for start > 0 && used < s.fragmentSize { + if start > len(orig) { + // bail if out of bounds, possibly due to token replacement + // e.g with a regexp replacement + continue OUTER + } r, size := utf8.DecodeLastRune(orig[0:start]) if r == utf8.RuneError { continue OUTER // bail diff --git a/search_test.go b/search_test.go index 7cfd44737..50faaedd8 100644 --- a/search_test.go +++ b/search_test.go @@ -28,6 +28,7 @@ import ( "github.com/blevesearch/bleve/analysis/analyzer/custom" "github.com/blevesearch/bleve/analysis/analyzer/keyword" "github.com/blevesearch/bleve/analysis/analyzer/standard" + regexp_char_filter "github.com/blevesearch/bleve/analysis/char/regexp" "github.com/blevesearch/bleve/analysis/token/length" "github.com/blevesearch/bleve/analysis/token/lowercase" "github.com/blevesearch/bleve/analysis/token/shingle" @@ -38,6 +39,7 @@ import ( "github.com/blevesearch/bleve/index/upsidedown" "github.com/blevesearch/bleve/mapping" "github.com/blevesearch/bleve/search" + "github.com/blevesearch/bleve/search/highlight/highlighter/ansi" 
"github.com/blevesearch/bleve/search/highlight/highlighter/html" "github.com/blevesearch/bleve/search/query" ) @@ -1639,3 +1641,69 @@ func TestGeoDistanceIssue1301(t *testing.T) { t.Fatalf("Size expected: 3, actual %d\n", sr.Total) } } + +func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { + idxMapping := NewIndexMapping() + if err := idxMapping.AddCustomCharFilter(regexp_char_filter.Name, map[string]interface{}{ + "regexp": `([a-z])\s+(\d)`, + "replace": "ooooo$1-$2", + "type": regexp_char_filter.Name, + }); err != nil { + t.Fatal(err) + } + if err := idxMapping.AddCustomAnalyzer("regexp_replace", map[string]interface{}{ + "type": custom.Name, + "tokenizer": "unicode", + "char_filters": []string{ + regexp_char_filter.Name, + }, + }); err != nil { + t.Fatal(err) + } + + idxMapping.DefaultAnalyzer = "regexp_replace" + idxMapping.StoreDynamic = true + idx, err := NewUsing("testidx", idxMapping, scorch.Name, Config.DefaultKVStore, nil) + if err != nil { + t.Fatal(err) + } + + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + + err = os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + doc := map[string]interface{}{ + "status": "fool 10", + } + + batch := idx.NewBatch() + if err = batch.Index("doc", doc); err != nil { + t.Fatal(err) + } + + if err = idx.Batch(batch); err != nil { + t.Fatal(err) + } + + query := NewMatchQuery("fool 10") + sreq := NewSearchRequest(query) + sreq.Fields = []string{"*"} + sreq.Highlight = NewHighlightWithStyle(ansi.Name) + + sres, err := idx.Search(sreq) + if err != nil { + t.Fatal(err) + } + + if sres.Total != 1 { + t.Fatalf("Expected 1 hit, got: %v", sres.Total) + } +} From 2fc361aa497e658d8a0f9bfe1c52c6faa00a68fd Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 16 Mar 2020 15:51:02 -0700 Subject: [PATCH 663/728] MB-38312: regexp to use ReplaceAll which uses `Expand` + Expand essentially interprets $ signs, so for example $1 represents the text of the first submatch. 
+ For example .. - Consider the following regex: ([a-z])\s+(\d) - For the string "temp 1", the above regex matches: "p 1" - Let the replacement be "$1-$2", so the expectation is that "p 1" gets replaced by "p-1". - The code before the fix replaces "p 1" with: "$1-$2$1-$2$1-$2" --- analysis/char/regexp/regexp.go | 3 +-- analysis/char/regexp/regexp_test.go | 22 ++++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/analysis/char/regexp/regexp.go b/analysis/char/regexp/regexp.go index c290d39ad..fc344b312 100644 --- a/analysis/char/regexp/regexp.go +++ b/analysis/char/regexp/regexp.go @@ -15,7 +15,6 @@ package regexp import ( - "bytes" "fmt" "regexp" @@ -38,7 +37,7 @@ func New(r *regexp.Regexp, replacement []byte) *CharFilter { } func (s *CharFilter) Filter(input []byte) []byte { - return s.r.ReplaceAllFunc(input, func(in []byte) []byte { return bytes.Repeat(s.replacement, len(in)) }) + return s.r.ReplaceAll(input, s.replacement) } func CharFilterConstructor(config map[string]interface{}, cache *registry.Cache) (analysis.CharFilter, error) { diff --git a/analysis/char/regexp/regexp_test.go b/analysis/char/regexp/regexp_test.go index 3d25b3f92..86a375438 100644 --- a/analysis/char/regexp/regexp_test.go +++ b/analysis/char/regexp/regexp_test.go @@ -21,7 +21,6 @@ import ( ) func TestRegexpCharFilter(t *testing.T) { - htmlTagPattern := `\s]+))?)+\s*|\s*)/?>` htmlRegex := regexp.MustCompile(htmlTagPattern) @@ -31,7 +30,7 @@ func TestRegexpCharFilter(t *testing.T) { }{ { input: []byte(`test`), - output: []byte(` test `), + output: []byte(` test `), }, } @@ -45,7 +44,6 @@ func TestRegexpCharFilter(t *testing.T) { } func TestZeroWidthNonJoinerCharFilter(t *testing.T) { - zeroWidthNonJoinerPattern := `\x{200C}` zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern) @@ -55,7 +53,7 @@ func TestZeroWidthNonJoinerCharFilter(t *testing.T) { }{ { input: []byte("water\u200Cunder\u200Cthe\u200Cbridge"), - output: []byte("water under the 
bridge"), + output: []byte("water under the bridge"), }, } @@ -67,3 +65,19 @@ func TestZeroWidthNonJoinerCharFilter(t *testing.T) { } } } + +func TestRegexpCustomReplace(t *testing.T) { + regexStr := `([a-z])\s+(\d)` + replace := []byte(`$1-$2`) + + regex := regexp.MustCompile(regexStr) + filter := New(regex, replace) + + input := []byte("temp 1") + expectOutput := []byte("temp-1") + + output := filter.Filter(input) + if !reflect.DeepEqual(output, expectOutput) { + t.Errorf("Expected: %s, Got: %s\n", string(expectOutput), string(output)) + } +} From e94966ded5931686b1eaa9cf14b4c810e4c4b711 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 16 Mar 2020 17:53:47 -0700 Subject: [PATCH 664/728] MB-38312: More test cases for regexp-replace --- analysis/char/regexp/regexp_test.go | 54 +++++++++++++++++++++++------ 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/analysis/char/regexp/regexp_test.go b/analysis/char/regexp/regexp_test.go index 86a375438..a3430555e 100644 --- a/analysis/char/regexp/regexp_test.go +++ b/analysis/char/regexp/regexp_test.go @@ -67,17 +67,51 @@ func TestZeroWidthNonJoinerCharFilter(t *testing.T) { } func TestRegexpCustomReplace(t *testing.T) { - regexStr := `([a-z])\s+(\d)` - replace := []byte(`$1-$2`) - - regex := regexp.MustCompile(regexStr) - filter := New(regex, replace) + tests := []struct { + regexStr string + replace []byte + input []byte + output []byte + }{ + { + regexStr: `([a-z])\s+(\d)`, + replace: []byte(`$1-$2`), + input: []byte(`temp 1`), + output: []byte(`temp-1`), + }, + { + regexStr: `foo.?`, + replace: []byte(`X`), + input: []byte(`seafood, fool`), + output: []byte(`seaX, X`), + }, + { + regexStr: `def`, + replace: []byte(`_`), + input: []byte(`abcdefghi`), + output: []byte(`abc_ghi`), + }, + { + regexStr: `456`, + replace: []byte(`000000`), + input: []byte(`123456789`), + output: []byte(`123000000789`), + }, + { + regexStr: `“|”`, + replace: []byte(`"`), + input: []byte(`“hello”`), + output: 
[]byte(`"hello"`), + }, + } - input := []byte("temp 1") - expectOutput := []byte("temp-1") + for i := range tests { + regex := regexp.MustCompile(tests[i].regexStr) + filter := New(regex, tests[i].replace) - output := filter.Filter(input) - if !reflect.DeepEqual(output, expectOutput) { - t.Errorf("Expected: %s, Got: %s\n", string(expectOutput), string(output)) + output := filter.Filter(tests[i].input) + if !reflect.DeepEqual(tests[i].output, output) { + t.Errorf("[%d] Expected: `%s`, Got: `%s`\n", i, string(tests[i].output), string(output)) + } } } From dad40529ec1681dd3d1f772c5f5914a3661879b7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 17 Apr 2020 19:57:28 -0400 Subject: [PATCH 665/728] update to latest zap everyone moving to blevesearch/mmap-go --- go.mod | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 8702417e7..d38cf8f92 100644 --- a/go.mod +++ b/go.mod @@ -8,16 +8,16 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.4 - github.com/blevesearch/zap/v12 v12.0.4 + github.com/blevesearch/zap/v11 v11.0.7 + github.com/blevesearch/zap/v12 v12.0.7 github.com/couchbase/ghistogram v0.1.0 // indirect - github.com/couchbase/moss v0.0.0-20190322010551-a0cae174c498 - github.com/couchbase/vellum v1.0.0 + github.com/couchbase/moss v0.1.0 + github.com/couchbase/vellum v1.0.1 github.com/golang/protobuf v1.3.2 github.com/kljensen/snowball v0.6.0 github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563 github.com/spf13/cobra v0.0.5 - github.com/steveyen/gtreap v0.0.0-20150807155958-0abe01ef9be2 + github.com/steveyen/gtreap v0.1.0 github.com/syndtr/goleveldb v1.0.0 github.com/willf/bitset v1.1.10 go.etcd.io/bbolt v1.3.4 From 4dc13100a7fd0120229e81450e6ed7ebcf8951fb Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 20 Apr 2020 13:17:13 -0400 Subject: [PATCH 666/728] add CI for 
unit tests (#1368) * add CI for unit tests * fix some tests that did not close their index * increase delays for tests running via github actions seems as though on the machines these tests run sometimes by the time the cancel fires all the results have been returned anyway in spite of their delays --- .github/workflows/tests.yml | 24 ++++++++++++++++++++++++ index_alias_impl_test.go | 4 ++-- search_test.go | 12 ++++++++++-- 3 files changed, 36 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/tests.yml diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml new file mode 100644 index 000000000..bcafb812b --- /dev/null +++ b/.github/workflows/tests.yml @@ -0,0 +1,24 @@ +on: + push: + branches: + - master + pull_request: +name: Tests +jobs: + test: + strategy: + matrix: + go-version: [1.13.x, 1.14.x] + platform: [ubuntu-latest, macos-latest, windows-latest] + runs-on: ${{ matrix.platform }} + steps: + - name: Install Go + uses: actions/setup-go@v1 + with: + go-version: ${{ matrix.go-version }} + - name: Checkout code + uses: actions/checkout@v2 + - name: Test + run: | + go version + go test -race ./... 
diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 6e25157de..3d05b1e54 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -1013,7 +1013,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { select { case <-ctx.Done(): return ctx.Err() - case <-time.After(50 * time.Millisecond): + case <-time.After(250 * time.Millisecond): return nil } }, @@ -1042,7 +1042,7 @@ func TestIndexAliasMultipleLayer(t *testing.T) { select { case <-ctx.Done(): return ctx.Err() - case <-time.After(50 * time.Millisecond): + case <-time.After(250 * time.Millisecond): return nil } }, diff --git a/search_test.go b/search_test.go index 50faaedd8..453657ee3 100644 --- a/search_test.go +++ b/search_test.go @@ -1558,7 +1558,11 @@ func TestSearchScoreNone(t *testing.T) { } defer func() { - err := os.RemoveAll("testidx") + err := idx.Close() + if err != nil { + t.Fatal(err) + } + err = os.RemoveAll("testidx") if err != nil { t.Fatal(err) } @@ -1609,7 +1613,11 @@ func TestGeoDistanceIssue1301(t *testing.T) { } defer func() { - err := os.RemoveAll("testidx") + err := idx.Close() + if err != nil { + t.Fatal(err) + } + err = os.RemoveAll("testidx") if err != nil { t.Fatal(err) } From b658b028ecf09c97c79e2d8d29401ba6e56b4a11 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 20 Apr 2020 13:24:56 -0400 Subject: [PATCH 667/728] misc cleanup while reviewing chans/goroutines (#1375) - renamed snapshot_rollback.go and snapshot_rollback_test.go to rollback.go and rollback_test.go, the snapshot_ prefix should be limited to code that relates directly to either an index snapshot or segment snapshot. rollback can be seen as operating on the index as a whole. 
- renamed mainLoop to introducerLoop, consistent with persisterLoop and mergerLoop - remove snapshotReversion this is no longer used - remove type uint64Descending and related methods, no longer used --- index/scorch/introducer.go | 8 +------- index/scorch/persister.go | 6 ------ index/scorch/{snapshot_rollback.go => rollback.go} | 0 .../{snapshot_rollback_test.go => rollback_test.go} | 0 index/scorch/scorch.go | 2 +- 5 files changed, 2 insertions(+), 14 deletions(-) rename index/scorch/{snapshot_rollback.go => rollback.go} (100%) rename index/scorch/{snapshot_rollback_test.go => rollback_test.go} (100%) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index e5f00f80e..64ca969bd 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -45,13 +45,7 @@ type epochWatcher struct { notifyCh notificationChan } -type snapshotReversion struct { - snapshot *IndexSnapshot - applied chan error - persisted chan error -} - -func (s *Scorch) mainLoop() { +func (s *Scorch) introducerLoop() { var epochWatchers []*epochWatcher OUTER: for { diff --git a/index/scorch/persister.go b/index/scorch/persister.go index 30e75df77..ffa656693 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -780,12 +780,6 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } -type uint64Descending []uint64 - -func (p uint64Descending) Len() int { return len(p) } -func (p uint64Descending) Less(i, j int) bool { return p[i] > p[j] } -func (p uint64Descending) Swap(i, j int) { p[i], p[j] = p[j], p[i] } - func (s *Scorch) removeOldData() { removed, err := s.removeOldBoltSnapshots() if err != nil { diff --git a/index/scorch/snapshot_rollback.go b/index/scorch/rollback.go similarity index 100% rename from index/scorch/snapshot_rollback.go rename to index/scorch/rollback.go diff --git a/index/scorch/snapshot_rollback_test.go b/index/scorch/rollback_test.go similarity index 100% rename from 
index/scorch/snapshot_rollback_test.go rename to index/scorch/rollback_test.go diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 80f9e3a79..698aaf16a 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -181,7 +181,7 @@ func (s *Scorch) Open() error { } s.asyncTasks.Add(1) - go s.mainLoop() + go s.introducerLoop() if !s.readOnly && s.path != "" { s.asyncTasks.Add(1) From fb361a72a60251e8103e7bf8c2f01bfdb171553e Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 20 Apr 2020 13:56:41 -0400 Subject: [PATCH 668/728] update README (#1376) move each badge to it's own line, as this seems to still format correctly, but is easier to maintain replaced build badge from travis with tests badge using the new github workflow --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7c1a7c7c4..eff0be97e 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,13 @@ # ![bleve](docs/bleve.png) bleve -[![Build Status](https://travis-ci.org/blevesearch/bleve.svg?branch=master)](https://travis-ci.org/blevesearch/bleve) [![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) [![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) +[![Tests](https://github.com/blevesearch/bleve/workflows/Tests/badge.svg?branch=master&event=push)](https://github.com/blevesearch/bleve/actions?query=workflow%3ATests+event%3Apush+branch%3Amaster) +[![Coverage Status](https://coveralls.io/repos/github/blevesearch/bleve/badge.svg?branch=master)](https://coveralls.io/github/blevesearch/bleve?branch=master) +[![GoDoc](https://godoc.org/github.com/blevesearch/bleve?status.svg)](https://godoc.org/github.com/blevesearch/bleve) [![Join the chat at 
https://gitter.im/blevesearch/bleve](https://badges.gitter.im/Join%20Chat.svg)](https://gitter.im/blevesearch/bleve?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge&utm_content=badge) [![codebeat](https://codebeat.co/badges/38a7cbc9-9cf5-41c0-a315-0746178230f4)](https://codebeat.co/projects/github-com-blevesearch-bleve) [![Go Report Card](https://goreportcard.com/badge/blevesearch/bleve)](https://goreportcard.com/report/blevesearch/bleve) -[![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) [![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) +[![Sourcegraph](https://sourcegraph.com/github.com/blevesearch/bleve/-/badge.svg)](https://sourcegraph.com/github.com/blevesearch/bleve?badge) +[![License](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0) modern text indexing in go - [blevesearch.com](http://www.blevesearch.com/) From ba54e05e30587b38e10e3571ca1c2faeb846dcd8 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 29 Apr 2020 16:39:18 -0400 Subject: [PATCH 669/728] improve reliability of test on mac observed this test failing frequently on mac environment through github actions simple change seems to still test the same behavior instead of arranging for one request to sleep longer than the context timeout, instead arrange for it to never return. that way no matter when the context timeout eventually happens, we correctly observe partial results. 
--- index_alias_impl_test.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/index_alias_impl_test.go b/index_alias_impl_test.go index 3d05b1e54..0c07390a3 100644 --- a/index_alias_impl_test.go +++ b/index_alias_impl_test.go @@ -913,8 +913,6 @@ func TestMultiSearchTimeoutPartial(t *testing.T) { select { case <-ctx.Done(): return ctx.Err() - case <-time.After(50 * time.Millisecond): - return nil } }, err: nil, From 5353f4da6d3048f47db1da17bd65d0adcf497ed3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 30 Apr 2020 11:06:37 -0400 Subject: [PATCH 670/728] improve unit test execution time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before: $ go test -v -run=TestSearchQueryCallback === RUN TestSearchQueryCallback --- PASS: TestSearchQueryCallback (20.99s) PASS ok github.com/blevesearch/bleve 20.997s After: $ ✗ go test -v -run=TestSearchQueryCallback === RUN TestSearchQueryCallback --- PASS: TestSearchQueryCallback (1.50s) PASS ok github.com/blevesearch/bleve 1.509s --- index_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/index_test.go b/index_test.go index 65ddedaa8..57c6fc0fb 100644 --- a/index_test.go +++ b/index_test.go @@ -1892,13 +1892,18 @@ func TestSearchQueryCallback(t *testing.T) { }() elements := []string{"air", "water", "fire", "earth"} + b := index.NewBatch() for j := 0; j < 10000; j++ { - err = index.Index(fmt.Sprintf("%d", j), + err = b.Index(fmt.Sprintf("%d", j), map[string]interface{}{"name": elements[j%len(elements)]}) if err != nil { t.Fatal(err) } } + err = index.Batch(b) + if err != nil { + t.Fatal(err) + } query := NewTermQuery("water") req := NewSearchRequest(query) From 98b3d3b5b7fac7aab20172eb438ff3e04e0cef04 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 30 Apr 2020 14:10:16 -0400 Subject: [PATCH 671/728] improve unit test perf using batch --- index_test.go | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/index_test.go 
b/index_test.go index 57c6fc0fb..73db16bf6 100644 --- a/index_test.go +++ b/index_test.go @@ -726,16 +726,21 @@ func TestSortMatchSearch(t *testing.T) { names := []string{"Noam", "Uri", "David", "Yosef", "Eitan", "Itay", "Ariel", "Daniel", "Omer", "Yogev", "Yehonatan", "Moshe", "Mohammed", "Yusuf", "Omar"} days := []string{"Sunday", "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"} numbers := []string{"One", "Two", "Three", "Four", "Five", "Six", "Seven", "Eight", "Nine", "Ten", "Eleven", "Twelve"} + b := index.NewBatch() for i := 0; i < 200; i++ { doc := make(map[string]interface{}) doc["Name"] = names[i%len(names)] doc["Day"] = days[i%len(days)] doc["Number"] = numbers[i%len(numbers)] - err = index.Index(fmt.Sprintf("%d", i), doc) + err = b.Index(fmt.Sprintf("%d", i), doc) if err != nil { t.Fatal(err) } } + err = index.Batch(b) + if err != nil { + t.Fatal(err) + } req := NewSearchRequest(NewMatchQuery("One")) req.SortBy([]string{"Day", "Name"}) From 94161c7d81e5ca622d1652877ec2b407d1d04966 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 30 Apr 2020 16:16:45 -0400 Subject: [PATCH 672/728] improve performance of query callback test the behavior being tested did not depend on any data actually being indexed, so that was removed additional documentation was added to clarify what this test was testing, since it was not clear and we did not remember --- index_test.go | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/index_test.go b/index_test.go index 73db16bf6..0ee1bf53f 100644 --- a/index_test.go +++ b/index_test.go @@ -1896,29 +1896,15 @@ func TestSearchQueryCallback(t *testing.T) { } }() - elements := []string{"air", "water", "fire", "earth"} - b := index.NewBatch() - for j := 0; j < 10000; j++ { - err = b.Index(fmt.Sprintf("%d", j), - map[string]interface{}{"name": elements[j%len(elements)]}) - if err != nil { - t.Fatal(err) - } - } - err = index.Batch(b) - if err != nil { - t.Fatal(err) - } - query 
:= NewTermQuery("water") req := NewSearchRequest(query) expErr := fmt.Errorf("MEM_LIMIT_EXCEEDED") f := func(size uint64) error { - if size > 1000 { - return expErr - } - return nil + // the intended usage of this callback is to see the estimated + // memory usage before executing, and possibly abort early + // in this test we simulate returning such an error + return expErr } ctx := context.WithValue(context.Background(), SearchQueryStartCallbackKey, From 4638928d99bdde25a5c75a36dfd63b4e5ce30ddd Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 30 Apr 2020 20:01:43 -0400 Subject: [PATCH 673/728] replace testidx with proper ioutil.TempDir (#1386) --- index_test.go | 355 +++++++++++++++----------------------- search_test.go | 176 +++++++++---------- test/versus_score_test.go | 17 +- 3 files changed, 227 insertions(+), 321 deletions(-) diff --git a/index_test.go b/index_test.go index 0ee1bf53f..1662c60e3 100644 --- a/index_test.go +++ b/index_test.go @@ -21,6 +21,7 @@ import ( "log" "math" "os" + "path/filepath" "reflect" "sort" "strconv" @@ -42,15 +43,30 @@ import ( "github.com/blevesearch/bleve/index/upsidedown" ) +type Fatalfable interface { + Fatalf(format string, args ...interface{}) +} + +func createTmpIndexPath(f Fatalfable) string { + tmpIndexPath, err := ioutil.TempDir("", "bleve-testidx") + if err != nil { + f.Fatalf("error creating temp dir: %v", err) + } + return tmpIndexPath +} + +func cleanupTmpIndexPath(f Fatalfable, path string) { + err := os.RemoveAll(path) + if err != nil { + f.Fatalf("error removing temp dir: %v", err) + } +} + func TestCrud(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -157,7 +173,7 @@ func TestCrud(t *testing.T) { t.Fatal(err) } - index, err = 
Open("testidx") + index, err = Open(tmpIndexPath) if err != nil { t.Fatal(err) } @@ -216,14 +232,10 @@ func TestCrud(t *testing.T) { } func TestIndexCreateNewOverExisting(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -231,7 +243,7 @@ func TestIndexCreateNewOverExisting(t *testing.T) { if err != nil { t.Fatal(err) } - index, err = New("testidx", NewIndexMapping()) + index, err = New(tmpIndexPath, NewIndexMapping()) if err != ErrorIndexPathExists { t.Fatalf("expected error index path exists, got %v", err) } @@ -245,14 +257,10 @@ func TestIndexOpenNonExisting(t *testing.T) { } func TestIndexOpenMetaMissingOrCorrupt(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -261,35 +269,37 @@ func TestIndexOpenMetaMissingOrCorrupt(t *testing.T) { t.Fatal(err) } + tmpIndexPathMeta := filepath.Join(tmpIndexPath, "index_meta.json") + // now intentionally change the storage type - err = ioutil.WriteFile("testidx/index_meta.json", []byte(`{"storage":"mystery"}`), 0666) + err = ioutil.WriteFile(tmpIndexPathMeta, []byte(`{"storage":"mystery"}`), 0666) if err != nil { t.Fatal(err) } - index, err = Open("testidx") + index, err = Open(tmpIndexPath) if err != ErrorUnknownStorageType { t.Fatalf("expected error unknown storage type, got %v", err) } // now intentionally corrupt the metadata - err = ioutil.WriteFile("testidx/index_meta.json", []byte("corrupted"), 0666) + err = ioutil.WriteFile(tmpIndexPathMeta, []byte("corrupted"), 0666) 
if err != nil { t.Fatal(err) } - index, err = Open("testidx") + index, err = Open(tmpIndexPath) if err != ErrorIndexMetaCorrupt { t.Fatalf("expected error index metadata corrupted, got %v", err) } // now intentionally remove the metadata - err = os.Remove("testidx/index_meta.json") + err = os.Remove(tmpIndexPathMeta) if err != nil { t.Fatal(err) } - index, err = Open("testidx") + index, err = Open(tmpIndexPath) if err != ErrorIndexMetaMissing { t.Fatalf("expected error index metadata missing, got %v", err) } @@ -365,12 +375,8 @@ func (s *slowQuery) Searcher(i index.IndexReader, m mapping.IndexMapping, option } func TestSlowSearch(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) defer func() { // reset logger back to normal @@ -380,7 +386,7 @@ func TestSlowSearch(t *testing.T) { var sdw sawDataWriter SetLog(log.New(&sdw, "bleve", log.LstdFlags)) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -430,14 +436,10 @@ func (s *sawDataWriter) Write(p []byte) (n int, err error) { } func TestStoredFieldPreserved(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -485,14 +487,10 @@ func TestStoredFieldPreserved(t *testing.T) { } func TestDict(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ 
-612,14 +610,10 @@ func TestDict(t *testing.T) { } func TestBatchString(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -659,14 +653,10 @@ func TestBatchString(t *testing.T) { } func TestIndexMetadataRaceBug198(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -711,14 +701,10 @@ func TestIndexMetadataRaceBug198(t *testing.T) { } func TestSortMatchSearch(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -764,14 +750,10 @@ func TestSortMatchSearch(t *testing.T) { } func TestIndexCountMatchSearch(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -831,14 +813,10 @@ func TestIndexCountMatchSearch(t *testing.T) { } func TestBatchReset(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", 
NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -874,14 +852,10 @@ func TestBatchReset(t *testing.T) { } func TestDocumentFieldArrayPositions(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -983,12 +957,8 @@ func TestDocumentFieldArrayPositions(t *testing.T) { } func TestKeywordSearchBug207(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) f := NewTextFieldMapping() f.Analyzer = keyword.Name @@ -997,7 +967,7 @@ func TestKeywordSearchBug207(t *testing.T) { m.DefaultMapping = NewDocumentMapping() m.DefaultMapping.AddFieldMappingsAt("Body", f) - index, err := New("testidx", m) + index, err := New(tmpIndexPath, m) if err != nil { t.Fatal(err) } @@ -1081,14 +1051,10 @@ func TestKeywordSearchBug207(t *testing.T) { } func TestTermVectorArrayPositions(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1158,12 +1124,8 @@ func TestTermVectorArrayPositions(t *testing.T) { } func TestDocumentStaticMapping(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) m := NewIndexMapping() m.DefaultMapping = NewDocumentStaticMapping() @@ -1171,7 +1133,7 @@ func TestDocumentStaticMapping(t 
*testing.T) { m.DefaultMapping.AddFieldMappingsAt("Date", NewDateTimeFieldMapping()) m.DefaultMapping.AddFieldMappingsAt("Numeric", NewNumericFieldMapping()) - index, err := New("testidx", m) + index, err := New(tmpIndexPath, m) if err != nil { t.Fatal(err) } @@ -1219,14 +1181,10 @@ func TestDocumentStaticMapping(t *testing.T) { } func TestIndexEmptyDocId(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1264,12 +1222,8 @@ func TestIndexEmptyDocId(t *testing.T) { } func TestDateTimeFieldMappingIssue287(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) f := NewDateTimeFieldMapping() @@ -1277,7 +1231,7 @@ func TestDateTimeFieldMappingIssue287(t *testing.T) { m.DefaultMapping = NewDocumentMapping() m.DefaultMapping.AddFieldMappingsAt("Date", f) - index, err := New("testidx", m) + index, err := New(tmpIndexPath, m) if err != nil { t.Fatal(err) } @@ -1347,14 +1301,10 @@ func TestDateTimeFieldMappingIssue287(t *testing.T) { } func TestDocumentFieldArrayPositionsBug295(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1431,18 +1381,14 @@ func TestDocumentFieldArrayPositionsBug295(t *testing.T) { } func TestBooleanFieldMappingIssue109(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := 
createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) m := NewIndexMapping() m.DefaultMapping = NewDocumentMapping() m.DefaultMapping.AddFieldMappingsAt("Bool", NewBooleanFieldMapping()) - index, err := New("testidx", m) + index, err := New(tmpIndexPath, m) if err != nil { t.Fatal(err) } @@ -1497,14 +1443,10 @@ func TestBooleanFieldMappingIssue109(t *testing.T) { } func TestSearchTimeout(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1565,13 +1507,9 @@ func TestConfigCache(t *testing.T) { } func TestBatchRaceBug260(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() - i, err := New("testidx", NewIndexMapping()) + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + i, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1603,14 +1541,10 @@ func TestBatchRaceBug260(t *testing.T) { } func BenchmarkBatchOverhead(b *testing.B) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - b.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(b) + defer cleanupTmpIndexPath(b, tmpIndexPath) m := NewIndexMapping() - i, err := NewUsing("testidx", m, Config.DefaultIndexType, null.Name, nil) + i, err := NewUsing(tmpIndexPath, m, Config.DefaultIndexType, null.Name, nil) if err != nil { b.Fatal(err) } @@ -1632,15 +1566,11 @@ func BenchmarkBatchOverhead(b *testing.B) { } func TestOpenReadonlyMultiple(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) // build an index and close it - index, err := New("testidx", 
NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1660,7 +1590,7 @@ func TestOpenReadonlyMultiple(t *testing.T) { } // now open it read-only - index, err = OpenUsing("testidx", map[string]interface{}{ + index, err = OpenUsing(tmpIndexPath, map[string]interface{}{ "read_only": true, }) @@ -1669,7 +1599,7 @@ func TestOpenReadonlyMultiple(t *testing.T) { } // now open it again - index2, err := OpenUsing("testidx", map[string]interface{}{ + index2, err := OpenUsing(tmpIndexPath, map[string]interface{}{ "read_only": true, }) @@ -1753,14 +1683,10 @@ func TestBug408(t *testing.T) { } func TestIndexAdvancedCountMatchSearch(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1826,14 +1752,10 @@ func TestIndexAdvancedCountMatchSearch(t *testing.T) { } func benchmarkSearchOverhead(indexType string, b *testing.B) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - b.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(b) + defer cleanupTmpIndexPath(b, tmpIndexPath) - index, err := NewUsing("testidx", NewIndexMapping(), + index, err := NewUsing(tmpIndexPath, NewIndexMapping(), indexType, Config.DefaultKVStore, nil) if err != nil { b.Fatal(err) @@ -1878,14 +1800,10 @@ func BenchmarkScorchSearchOverhead(b *testing.B) { } func TestSearchQueryCallback(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1916,14 +1834,10 @@ func TestSearchQueryCallback(t 
*testing.T) { } func TestBatchMerge(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) - index, err := New("testidx", NewIndexMapping()) + index, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1991,7 +1905,7 @@ func TestBatchMerge(t *testing.T) { t.Fatal(err) } - index, err = Open("testidx") + index, err = Open(tmpIndexPath) if err != nil { t.Fatal(err) } @@ -2072,19 +1986,15 @@ func TestBatchMerge(t *testing.T) { } func TestBug1096(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) // use default mapping mapping := NewIndexMapping() // create a scorch index with default SAFE batches var idx Index - idx, err = NewUsing("testidx", mapping, "scorch", "scorch", nil) + idx, err = NewUsing(tmpIndexPath, mapping, "scorch", "scorch", nil) if err != nil { log.Fatal(err) } @@ -2148,18 +2058,14 @@ func TestBug1096(t *testing.T) { } func TestDataRaceBug1092(t *testing.T) { - defer func() { - rerr := os.RemoveAll("testidx") - if rerr != nil { - t.Fatal(rerr) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) // use default mapping mapping := NewIndexMapping() var idx Index - idx, err = NewUsing("testidx", mapping, upsidedown.Name, boltdb.Name, nil) + idx, err = NewUsing(tmpIndexPath, mapping, upsidedown.Name, boltdb.Name, nil) if err != nil { log.Fatal(err) } @@ -2182,14 +2088,25 @@ func TestDataRaceBug1092(t *testing.T) { } func TestBatchRaceBug1149(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + i, err := New(tmpIndexPath, NewIndexMapping()) + if err != nil { + t.Fatal(err) + } defer func() { - err := os.RemoveAll("testidx") + err := i.Close() if err != nil { 
t.Fatal(err) } }() - i, err := New("testidx", NewIndexMapping()) - //i, err := NewUsing("testidx", NewIndexMapping(), "scorch", "scorch", nil) + testBatchRaceBug1149(t, i) +} + +func TestBatchRaceBug1149Scorch(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + i, err := NewUsing(tmpIndexPath, NewIndexMapping(), "scorch", "scorch", nil) if err != nil { t.Fatal(err) } @@ -2199,6 +2116,10 @@ func TestBatchRaceBug1149(t *testing.T) { t.Fatal(err) } }() + testBatchRaceBug1149(t, i) +} + +func testBatchRaceBug1149(t *testing.T, i Index) { b := i.NewBatch() b.Delete("1") err = i.Batch(b) diff --git a/search_test.go b/search_test.go index 453657ee3..755999db5 100644 --- a/search_test.go +++ b/search_test.go @@ -17,7 +17,6 @@ package bleve import ( "encoding/json" "fmt" - "os" "reflect" "strconv" "strings" @@ -221,7 +220,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Date_Range_Success_With_StartEnd", Size: 1, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName", Start: time.Unix(0, 0), End: time.Now()}, + {Name: "testName", Start: time.Unix(0, 0), End: time.Now()}, }, }, result: nil, @@ -231,7 +230,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Date_Range_Success_With_Start", Size: 1, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName", Start: time.Unix(0, 0)}, + {Name: "testName", Start: time.Unix(0, 0)}, }, }, result: nil, @@ -241,7 +240,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Date_Range_Success_With_End", Size: 1, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName", End: time.Now()}, + {Name: "testName", End: time.Now()}, }, }, result: nil, @@ -251,7 +250,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_Range_Success_With_MinMax", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName", Min: &value, Max: &value}, + {Name: "testName", Min: &value, Max: &value}, 
}, }, result: nil, @@ -261,7 +260,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_Range_Success_With_Min", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName", Min: &value}, + {Name: "testName", Min: &value}, }, }, result: nil, @@ -271,7 +270,7 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_Range_Success_With_Max", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName", Max: &value}, + {Name: "testName", Max: &value}, }, }, result: nil, @@ -281,9 +280,9 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Date_Range_Missing_Failure", Size: 1, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName2", Start: time.Unix(0, 0)}, - &dateTimeRange{Name: "testName1", End: time.Now()}, - &dateTimeRange{Name: "testName"}, + {Name: "testName2", Start: time.Unix(0, 0)}, + {Name: "testName1", End: time.Now()}, + {Name: "testName"}, }, }, result: drMissingErr, @@ -293,9 +292,9 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_Range_Missing_Failure", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName2", Min: &value}, - &numericRange{Name: "testName1", Max: &value}, - &numericRange{Name: "testName"}, + {Name: "testName2", Min: &value}, + {Name: "testName1", Max: &value}, + {Name: "testName"}, }, }, result: nrMissingErr, @@ -305,10 +304,10 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_And_DateRanges_Failure", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName", Max: &value}, + {Name: "testName", Max: &value}, }, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName", End: time.Now()}, + {Name: "testName", End: time.Now()}, }, }, result: drNrErr, @@ -318,8 +317,8 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Numeric_Range_Name_Repeat_Failure", Size: 1, NumericRanges: []*numericRange{ - &numericRange{Name: "testName", Min: &value}, - 
&numericRange{Name: "testName", Max: &value}, + {Name: "testName", Min: &value}, + {Name: "testName", Max: &value}, }, }, result: nrNameDupErr, @@ -329,8 +328,8 @@ func TestFacetNumericDateRangeRequests(t *testing.T) { Field: "Date_Range_Name_Repeat_Failure", Size: 1, DateTimeRanges: []*dateTimeRange{ - &dateTimeRange{Name: "testName", Start: time.Unix(0, 0)}, - &dateTimeRange{Name: "testName", End: time.Now()}, + {Name: "testName", Start: time.Unix(0, 0)}, + {Name: "testName", End: time.Now()}, }, }, result: drNameDupErr, @@ -447,7 +446,11 @@ func TestNestedBooleanSearchers(t *testing.T) { } idxMapping.DefaultAnalyzer = "3xbla" - idx, err := New("testidx", idxMapping) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } @@ -457,11 +460,6 @@ func TestNestedBooleanSearchers(t *testing.T) { if err != nil { t.Fatal(err) } - - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() // create and insert documents as a batch @@ -559,9 +557,12 @@ func TestNestedBooleanSearchers(t *testing.T) { } func TestNestedBooleanMustNotSearcherUpsidedown(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + // create an index with default settings idxMapping := NewIndexMapping() - idx, err := New("testidx", idxMapping) + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } @@ -571,11 +572,6 @@ func TestNestedBooleanMustNotSearcherUpsidedown(t *testing.T) { if err != nil { t.Fatal(err) } - - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() // create and insert documents as a batch @@ -706,7 +702,10 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { imap.DefaultMapping = dmap imap.DefaultAnalyzer = standard.Name - idx, err := New("testidx", imap) + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, 
imap) if err != nil { t.Fatal(err) } @@ -717,10 +716,6 @@ func TestSearchScorchOverEmptyKeyword(t *testing.T) { t.Fatal(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } Config.DefaultIndexType = defaultIndexType }() @@ -754,9 +749,12 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { defaultIndexType := Config.DefaultIndexType Config.DefaultIndexType = scorch.Name + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + // create an index with default settings idxMapping := NewIndexMapping() - idx, err := New("testidx", idxMapping) + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } @@ -767,10 +765,6 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { t.Fatal(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } Config.DefaultIndexType = defaultIndexType }() @@ -896,8 +890,11 @@ func TestMultipleNestedBooleanMustNotSearchersOnScorch(t *testing.T) { } func testBooleanMustNotSearcher(t *testing.T, indexName string) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + im := NewIndexMapping() - idx, err := NewUsing("testidx", im, indexName, Config.DefaultKVStore, nil) + idx, err := NewUsing(tmpIndexPath, im, indexName, Config.DefaultKVStore, nil) if err != nil { t.Fatal(err) } @@ -907,11 +904,6 @@ func testBooleanMustNotSearcher(t *testing.T, indexName string) { if err != nil { t.Fatal(err) } - - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() docs := []struct { @@ -1021,9 +1013,12 @@ func TestQueryStringEmptyConjunctionSearcher(t *testing.T) { } func TestDisjunctionQueryIncorrectMin(t *testing.T) { + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + // create an index with default settings idxMapping := NewIndexMapping() - idx, err := New("testidx", idxMapping) + idx, err := New(tmpIndexPath, idxMapping) if err != nil { 
t.Fatal(err) } @@ -1032,10 +1027,6 @@ func TestDisjunctionQueryIncorrectMin(t *testing.T) { if err != nil { t.Fatal(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() // create and insert documents as a batch @@ -1090,7 +1081,10 @@ func TestDisjunctionQueryIncorrectMin(t *testing.T) { } func TestBooleanShouldMinPropagation(t *testing.T) { - idx, err := New("testidx", NewIndexMapping()) + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1100,11 +1094,6 @@ func TestBooleanShouldMinPropagation(t *testing.T) { if err != nil { t.Fatal(err) } - - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc1 := map[string]interface{}{ @@ -1160,7 +1149,10 @@ func TestBooleanShouldMinPropagation(t *testing.T) { } func TestDisjunctionMinPropagation(t *testing.T) { - idx, err := New("testidx", NewIndexMapping()) + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, NewIndexMapping()) if err != nil { t.Fatal(err) } @@ -1170,11 +1162,6 @@ func TestDisjunctionMinPropagation(t *testing.T) { if err != nil { t.Fatal(err) } - - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc1 := map[string]interface{}{ @@ -1287,7 +1274,11 @@ func TestBooleanMustSingleMatchNone(t *testing.T) { } idxMapping.DefaultAnalyzer = "custom1" - idx, err := New("testidx", idxMapping) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } @@ -1297,11 +1288,6 @@ func TestBooleanMustSingleMatchNone(t *testing.T) { if err != nil { t.Fatal(err) } - - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc := map[string]interface{}{ @@ -1360,7 +1346,11 @@ func TestBooleanMustNotSingleMatchNone(t *testing.T) { } 
idxMapping.DefaultAnalyzer = "custom1" - idx, err := New("testidx", idxMapping) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } @@ -1370,11 +1360,6 @@ func TestBooleanMustNotSingleMatchNone(t *testing.T) { if err != nil { t.Fatal(err) } - - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc := map[string]interface{}{ @@ -1419,12 +1404,8 @@ func TestBooleanMustNotSingleMatchNone(t *testing.T) { } func TestBooleanSearchBug1185(t *testing.T) { - defer func() { - err := os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } - }() + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) of := NewTextFieldMapping() of.Analyzer = keyword.Name @@ -1436,7 +1417,7 @@ func TestBooleanSearchBug1185(t *testing.T) { m := NewIndexMapping() m.DefaultMapping = dm - idx, err := NewUsing("testidx", m, "scorch", "scorch", nil) + idx, err := NewUsing(tmpIndexPath, m, "scorch", "scorch", nil) if err != nil { t.Fatal(err) } @@ -1552,7 +1533,10 @@ func TestBooleanSearchBug1185(t *testing.T) { } func TestSearchScoreNone(t *testing.T) { - idx, err := NewUsing("testidx", NewIndexMapping(), scorch.Name, Config.DefaultKVStore, nil) + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := NewUsing(tmpIndexPath, NewIndexMapping(), scorch.Name, Config.DefaultKVStore, nil) if err != nil { t.Fatal(err) } @@ -1562,10 +1546,6 @@ func TestSearchScoreNone(t *testing.T) { if err != nil { t.Fatal(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc := map[string]interface{}{ @@ -1607,7 +1587,10 @@ func TestGeoDistanceIssue1301(t *testing.T) { shopIndexMapping := NewIndexMapping() shopIndexMapping.DefaultMapping = shopMapping - idx, err := NewUsing("testidx", shopIndexMapping, scorch.Name, Config.DefaultKVStore, nil) + tmpIndexPath := createTmpIndexPath(t) + 
defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := NewUsing(tmpIndexPath, shopIndexMapping, scorch.Name, Config.DefaultKVStore, nil) if err != nil { t.Fatal(err) } @@ -1617,10 +1600,6 @@ func TestGeoDistanceIssue1301(t *testing.T) { if err != nil { t.Fatal(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() for i, g := range []string{"wecpkbeddsmf", "wecpk8tne453", "wecpkb80s09t"} { @@ -1671,7 +1650,11 @@ func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { idxMapping.DefaultAnalyzer = "regexp_replace" idxMapping.StoreDynamic = true - idx, err := NewUsing("testidx", idxMapping, scorch.Name, Config.DefaultKVStore, nil) + + tmpIndexPath := createTmpIndexPath(t) + defer cleanupTmpIndexPath(t, tmpIndexPath) + + idx, err := NewUsing(tmpIndexPath, idxMapping, scorch.Name, Config.DefaultKVStore, nil) if err != nil { t.Fatal(err) } @@ -1681,11 +1664,6 @@ func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { if err != nil { t.Fatal(err) } - - err = os.RemoveAll("testidx") - if err != nil { - t.Fatal(err) - } }() doc := map[string]interface{}{ diff --git a/test/versus_score_test.go b/test/versus_score_test.go index dcaf8d650..b0cb4afe1 100644 --- a/test/versus_score_test.go +++ b/test/versus_score_test.go @@ -16,6 +16,7 @@ package test import ( "fmt" + "io/ioutil" "os" "strconv" "testing" @@ -48,9 +49,19 @@ func TestDisjunctionSearchScoreIndexWithCompositeFields(t *testing.T) { func disjunctionQueryiOnIndexWithCompositeFields(indexName string, t *testing.T) []*search.DocumentMatch { + tmpIndexPath, err := ioutil.TempDir("", "bleve-testidx") + if err != nil { + t.Fatalf("error creating temp dir: %v", err) + } + defer func() { + err := os.RemoveAll(tmpIndexPath) + if err != nil { + t.Fatalf("error removing temp dir: %v", err) + } + }() // create an index idxMapping := mapping.NewIndexMapping() - idx, err := bleve.NewUsing("testidx", idxMapping, indexName, + idx, err := bleve.NewUsing(tmpIndexPath, idxMapping, 
indexName, bleve.Config.DefaultKVStore, nil) if err != nil { t.Error(err) @@ -61,10 +72,6 @@ func disjunctionQueryiOnIndexWithCompositeFields(indexName string, if err != nil { t.Error(err) } - err = os.RemoveAll("testidx") - if err != nil { - t.Error(err) - } }() // create and insert documents as a batch From 837a9e6e1b6a248d87e001202659ecbacc4b52bc Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 4 May 2020 15:39:18 -0700 Subject: [PATCH 674/728] MB-38957: Document mapping's analyzer to be inherited correctly Check if the parent document mapping's default analyzer is available while determining the analyzer to use for a child field in the event that neither the child field has an analyzer set nor the intermediate parent property. Fixes: https://github.com/blevesearch/bleve/issues/1390 --- go.mod | 1 - mapping/document.go | 6 ++++++ search_test.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 1 deletion(-) diff --git a/go.mod b/go.mod index d38cf8f92..22133ea7a 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,6 @@ require ( github.com/blevesearch/snowballstem v0.9.0 github.com/blevesearch/zap/v11 v11.0.7 github.com/blevesearch/zap/v12 v12.0.7 - github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.1 github.com/golang/protobuf v1.3.2 diff --git a/mapping/document.go b/mapping/document.go index 15cb6b5fa..58ad06c79 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -316,10 +316,16 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { if !ok { break } + if current.DefaultAnalyzer != "" { rv = current.DefaultAnalyzer } } + + if rv == "" { + rv = dm.DefaultAnalyzer + } + return rv } diff --git a/search_test.go b/search_test.go index 755999db5..3ee0827b4 100644 --- a/search_test.go +++ b/search_test.go @@ -1693,3 +1693,47 @@ func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { t.Fatalf("Expected 1 hit, got: %v", 
sres.Total) } } + +func TestAnalyzerInheritance(t *testing.T) { + dMapping := mapping.NewDocumentStaticMapping() + dMapping.DefaultAnalyzer = keyword.Name + + fMapping := mapping.NewTextFieldMapping() + dMapping.AddFieldMappingsAt("city", fMapping) + + idxMapping := NewIndexMapping() + idxMapping.DefaultMapping = dMapping + + tmpIndexPath := createTmpIndexPath(t) + idx, err := New(tmpIndexPath, idxMapping) + if err != nil { + t.Fatal(err) + } + + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + doc := map[string]interface{}{ + "city": "San Francisco", + } + + if err = idx.Index("doc", doc); err != nil { + t.Fatal(err) + } + + q := NewTermQuery("San Francisco") + q.SetField("city") + + res, err := idx.Search(NewSearchRequest(q)) + if err != nil { + t.Fatal(err) + } + + if len(res.Hits) != 1 { + t.Fatalf("unexpected number of hits: %v", len(res.Hits)) + } +} From ead5428c6109e5c2abe03315f8942eb190fa2a0f Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Tue, 5 May 2020 16:12:54 -0700 Subject: [PATCH 675/728] MB-38957: Adding more test cases + Note that the bug is only with inheriting the top level type mapping's analyzer. + The inheritance of any child mapping's analyzers to child fields works as expected. 
--- search_test.go | 125 +++++++++++++++++++++++++++++++++++++------------ 1 file changed, 95 insertions(+), 30 deletions(-) diff --git a/search_test.go b/search_test.go index 3ee0827b4..c31750965 100644 --- a/search_test.go +++ b/search_test.go @@ -1695,45 +1695,110 @@ func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { } func TestAnalyzerInheritance(t *testing.T) { - dMapping := mapping.NewDocumentStaticMapping() - dMapping.DefaultAnalyzer = keyword.Name - - fMapping := mapping.NewTextFieldMapping() - dMapping.AddFieldMappingsAt("city", fMapping) - - idxMapping := NewIndexMapping() - idxMapping.DefaultMapping = dMapping - - tmpIndexPath := createTmpIndexPath(t) - idx, err := New(tmpIndexPath, idxMapping) - if err != nil { - t.Fatal(err) + tests := []struct { + mappingStr string + doc map[string]interface{} + queryField string + queryTerm string + }{ + { + /* + index_mapping: keyword + default_mapping: "" + -> child field (should inherit keyword) + */ + mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + + `"store":false,"index":true}]}}},"default_analyzer":"keyword"}`, + doc: map[string]interface{}{"city": "San Francisco"}, + queryField: "city", + queryTerm: "San Francisco", + }, + { + /* + index_mapping: standard + default_mapping: keyword + -> child field (should inherit keyword) + */ + mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + + `"index":true}]}},"default_analyzer":"keyword"},"default_analyzer":"standard"}`, + doc: map[string]interface{}{"city": "San Francisco"}, + queryField: "city", + queryTerm: "San Francisco", + }, + { + /* + index_mapping: standard + default_mapping: keyword + -> child mapping: "" + -> child field: (should inherit keyword) + */ + mappingStr: 
`{"default_mapping":{"enabled":true,"dynamic":false,"default_analyzer":` + + `"keyword","properties":{"address":{"enabled":true,"dynamic":false,"properties":` + + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + + `"index":true}]}}}}},"default_analyzer":"standard"}`, + doc: map[string]interface{}{ + "address": map[string]interface{}{"city": "San Francisco"}, + }, + queryField: "address.city", + queryTerm: "San Francisco", + }, + { + /* + index_mapping: standard + default_mapping: "" + -> child mapping: "keyword" + -> child mapping: "" + -> child field: (should inherit keyword) + */ + mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + + `{"address":{"enabled":true,"dynamic":false,"default_analyzer":"keyword",` + + `"properties":{"state":{"enabled":true,"dynamic":false,"properties":{"city":` + + `{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + + `"store":false,"index":true}]}}}}}}},"default_analyer":"standard"}`, + doc: map[string]interface{}{ + "address": map[string]interface{}{ + "state": map[string]interface{}{"city": "San Francisco"}, + }, + }, + queryField: "address.state.city", + queryTerm: "San Francisco", + }, } - defer func() { - err := idx.Close() + for i := range tests { + idxMapping := NewIndexMapping() + if err := idxMapping.UnmarshalJSON([]byte(tests[i].mappingStr)); err != nil { + t.Fatal(err) + } + + tmpIndexPath := createTmpIndexPath(t) + idx, err := New(tmpIndexPath, idxMapping) if err != nil { t.Fatal(err) } - }() - doc := map[string]interface{}{ - "city": "San Francisco", - } + defer func() { + if err := idx.Close(); err != nil { + t.Fatal(err) + } + }() - if err = idx.Index("doc", doc); err != nil { - t.Fatal(err) - } + if err = idx.Index("doc", tests[i].doc); err != nil { + t.Fatal(err) + } - q := NewTermQuery("San Francisco") - q.SetField("city") + q := NewTermQuery(tests[i].queryTerm) + q.SetField(tests[i].queryField) - res, err := 
idx.Search(NewSearchRequest(q)) - if err != nil { - t.Fatal(err) - } + res, err := idx.Search(NewSearchRequest(q)) + if err != nil { + t.Fatal(err) + } - if len(res.Hits) != 1 { - t.Fatalf("unexpected number of hits: %v", len(res.Hits)) + if len(res.Hits) != 1 { + t.Errorf("[%d] Unexpected number of hits: %v", i, len(res.Hits)) + } } } From e2db10a90111893a597bbbe261a4bdd1fbd516e6 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 6 May 2020 11:39:23 -0700 Subject: [PATCH 676/728] MB-38957: Initialize rv to dm.DefaultAnalyzer to start with + This is a better approach than to overwrite at the end after checking for an empty string. --- mapping/document.go | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index 58ad06c79..4083d55d5 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -308,8 +308,8 @@ func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { } func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { - rv := "" current := dm + rv := current.DefaultAnalyzer for _, pathElement := range path { var ok bool current, ok = current.Properties[pathElement] @@ -322,10 +322,6 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { } } - if rv == "" { - rv = dm.DefaultAnalyzer - } - return rv } From 124045f0fea96ef2f6dc2a5b1a0bd6e49691f42e Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 6 May 2020 13:31:39 -0700 Subject: [PATCH 677/728] Remove unintended extra lines --- mapping/document.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index 4083d55d5..355a602e5 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -251,7 +251,6 @@ func (dm *DocumentMapping) AddFieldMapping(fm *FieldMapping) { // UnmarshalJSON offers custom unmarshaling with optional strict validation func (dm *DocumentMapping) UnmarshalJSON(data []byte) error { - var tmp map[string]json.RawMessage err := 
json.Unmarshal(data, &tmp) if err != nil { @@ -316,12 +315,10 @@ func (dm *DocumentMapping) defaultAnalyzerName(path []string) string { if !ok { break } - if current.DefaultAnalyzer != "" { rv = current.DefaultAnalyzer } } - return rv } From 3d8737411a6785793b0ac33a3c9d0af54e3ca48e Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 7 May 2020 10:38:08 +0530 Subject: [PATCH 678/728] Api to trigger a force merge operation on an index Introducing a new synchronous ForceMerge api to enable users conveniently invoke a manual merge operation on an online scorch index. This api would let advanced bleve users to compact their index all the way down to a single file segment based index. The api takes an optional mergePlanOptions argument which would be applied for the forceful merge operations, while the original mergePlanOptions property of the index remains intact after the forceful merge cycle exits. In the absense of a mergePlanOptions argument, default policy assumed would be an aggressive one to target a single file segment index with a maximum segment size of 1B approx. The api would trigger the merger's work loop only once during it's invocation and hence it is the caller's responsibility to actively monitor the expected file segments count at the root and trigger any subsequent forceMerge invocations if needed to meet the expected segment's count at root. The caller could also pass a cancelCh for cancelling an ongoing forceMerge operation in case of any runtime anomalies during the merge operation. The api also takes an optional parameter to override the numSnapshotsToKeep for reducing the space wastage out of segments with duplicate contents. The caller should also ensure that there is enough disk space before invoking this compaction/merge operation as it needs at least double the disk space whenever there is a full compaction. 
--- index/scorch/introducer.go | 6 +- index/scorch/merge.go | 128 ++++++++++++++- index/scorch/mergeplan/merge_plan.go | 11 ++ index/scorch/persister.go | 25 ++- index/scorch/scorch.go | 3 + index/scorch/scorch_test.go | 237 ++++++++++++++++++++++++++- index/scorch/stats.go | 3 + index_alias_impl.go | 12 ++ 8 files changed, 411 insertions(+), 14 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 64ca969bd..3325c9b17 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -313,11 +313,15 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { defer func() { _ = root.DecRef() }() + if nextMerge.creator == "" { + nextMerge.creator = "introduceMerge" + } + newSnapshot := &IndexSnapshot{ parent: s, internal: root.internal, refs: 1, - creator: "introduceMerge", + creator: nextMerge.creator, } // iterate through current segments diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 37dca529a..3be781f73 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -27,14 +27,18 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment" ) +var numSnapShotsToKeepOverRuler = "introduceForceMerge" + func (s *Scorch) mergerLoop() { var lastEpochMergePlanned uint64 + var ctrlMsg *mergerCtrl mergePlannerOptions, err := s.parseMergePlannerOptions() if err != nil { s.fireAsyncError(fmt.Errorf("mergePlannerOption json parsing err: %v", err)) s.asyncTasks.Done() return } + ctrlMsgDflt := &mergerCtrl{options: mergePlannerOptions, doneCh: nil} OUTER: for { @@ -53,16 +57,30 @@ OUTER: atomic.StoreUint64(&s.iStats.mergeEpoch, ourSnapshot.epoch) s.rootLock.Unlock() - if ourSnapshot.epoch != lastEpochMergePlanned { + if ctrlMsg == nil && ourSnapshot.epoch != lastEpochMergePlanned { + ctrlMsg = ctrlMsgDflt + } + if ctrlMsg != nil { startTime := time.Now() // lets get started - err := s.planMergeAtSnapshot(ourSnapshot, mergePlannerOptions) + err := s.planMergeAtSnapshot(ourSnapshot, ctrlMsg.options, + 
ctrlMsg.creator, ctrlMsg.cancelCh) if err != nil { atomic.StoreUint64(&s.iStats.mergeEpoch, 0) if err == segment.ErrClosed { // index has been closed _ = ourSnapshot.DecRef() + + // continue the workloop on a user triggered cancel + if ctrlMsg.doneCh != nil { + close(ctrlMsg.doneCh) + ctrlMsg = nil + continue OUTER + } + + // exit the workloop on index closure + ctrlMsg = nil break OUTER } s.fireAsyncError(fmt.Errorf("merging err: %v", err)) @@ -70,6 +88,12 @@ OUTER: atomic.AddUint64(&s.stats.TotFileMergeLoopErr, 1) continue OUTER } + + if ctrlMsg.doneCh != nil { + close(ctrlMsg.doneCh) + } + ctrlMsg = nil + lastEpochMergePlanned = ourSnapshot.epoch atomic.StoreUint64(&s.stats.LastMergedEpoch, ourSnapshot.epoch) @@ -90,6 +114,8 @@ OUTER: case <-s.closeCh: break OUTER case s.persisterNotifier <- ew: + case ctrlMsg = <-s.mergerKickCh: + continue OUTER } // now wait for persister (but also detect close) @@ -97,6 +123,7 @@ OUTER: case <-s.closeCh: break OUTER case <-ew.notifyCh: + case ctrlMsg = <-s.mergerKickCh: } } @@ -106,6 +133,76 @@ OUTER: s.asyncTasks.Done() } +type mergerCtrl struct { + creator string + options *mergeplan.MergePlanOptions + doneCh chan struct{} + cancelCh chan struct{} +} + +// MergeRequest represents various control +// parameters for the ForceMerge API. +type MergeRequest struct { + // MergeOptions specify the merge policy applied during + // the forced merge cycles. This doesn't override the + // index's original merge policy. + MergeOptions *mergeplan.MergePlanOptions + + // OverrideNumSnapshotsToKeep specify whether to retain + // a number of older snapshots dictated by numSnapshotsToKeep + // during a forced merge cycle. Enabling this reduces the + // disk space requirements during a forced merge operation. + OverrideNumSnapshotsToKeep bool + + // CancelCh helps in cancelling an ongoing merge operation. + CancelCh chan struct{} +} + +// ForceMerge helps users trigger a merge operation on +// an online scorch index. 
+func (s *Scorch) ForceMerge(mr *MergeRequest) error { + // check whether force merge is already under processing + s.rootLock.Lock() + if s.stats.TotFileMergeForceOpsStarted > + s.stats.TotFileMergeForceOpsCompleted { + s.rootLock.Unlock() + return fmt.Errorf("force merge already in progress") + } + + s.stats.TotFileMergeForceOpsStarted++ + s.rootLock.Unlock() + + if mr.MergeOptions == nil { + // assume the default single segment merge policy + mr.MergeOptions = &mergeplan.SingleSegmentMergePlanOptions + } + var ssCreator string + if mr.OverrideNumSnapshotsToKeep { + ssCreator = numSnapShotsToKeepOverRuler + } + msg := &mergerCtrl{creator: ssCreator, + options: mr.MergeOptions, + doneCh: make(chan struct{}), + cancelCh: mr.CancelCh, + } + + // kick the merger workloop + select { + case s.mergerKickCh <- msg: + case <-s.closeCh: + return nil + } + + // wait for the force merge operation completion + select { + case <-msg.doneCh: + atomic.AddUint64(&s.stats.TotFileMergeForceOpsCompleted, 1) + case <-s.closeCh: + } + + return nil +} + func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, error) { mergePlannerOptions := mergeplan.DefaultMergePlanOptions @@ -129,7 +226,8 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, } func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, - options *mergeplan.MergePlanOptions) error { + options *mergeplan.MergePlanOptions, creator string, + cancelCh chan struct{}) error { // build list of persisted segments in this snapshot var onlyPersistedSnapshots []mergeplan.Segment for _, segmentSnapshot := range ourSnapshot.segment { @@ -158,6 +256,26 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // process tasks in serial for now var filenames []string + closeCh := make(chan struct{}) + defer func() { + select { + case <-closeCh: + default: + close(closeCh) + } + }() + // cancel the merge operation on events like the index closure + // or upon a user cancel. 
+ go func() { + select { + case <-s.closeCh: + close(closeCh) + case <-cancelCh: + close(closeCh) + case <-closeCh: + } + }() + for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { atomic.AddUint64(&s.stats.TotFileMergePlanTasksSegmentsEmpty, 1) @@ -203,7 +321,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - s.closeCh, s) + closeCh, s) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -241,6 +359,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, oldNewDocNums: oldNewDocNums, new: seg, notify: make(chan *IndexSnapshot), + creator: creator, } // give it to the introducer @@ -285,6 +404,7 @@ type segmentMerge struct { oldNewDocNums map[uint64][]uint64 new segment.Segment notify chan *IndexSnapshot + creator string } // perform a merging of the given SegmentBase instances into a new, diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index c2a0d3c64..e02923cc1 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -134,6 +134,17 @@ var DefaultMergePlanOptions = MergePlanOptions{ ReclaimDeletesWeight: 2.0, } +// SingleSegmentMergePlanOptions helps in creating a +// single segment index. 
+var SingleSegmentMergePlanOptions = MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1 << 30, + TierGrowth: 1.0, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 1 << 30, + ReclaimDeletesWeight: 2.0, +} + // ------------------------------------------- func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { diff --git a/index/scorch/persister.go b/index/scorch/persister.go index ffa656693..aec1c0fea 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -209,7 +209,7 @@ OUTER: case s.introducerNotifier <- w: } - s.removeOldData() // might as well cleanup while waiting + s.removeOldData(s.getNumSnapshotsToKeep(ourSnapshot)) // might as well cleanup while waiting atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) @@ -276,7 +276,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, // 1. Too many older snapshots awaiting the clean up. // 2. The merger could be lagging behind on merging the disk files. if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { - s.removeOldData() + s.removeOldData(s.numSnapshotsToKeep) numFilesOnDisk, _, _ = s.diskFileStats(nil) } @@ -780,8 +780,8 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } -func (s *Scorch) removeOldData() { - removed, err := s.removeOldBoltSnapshots() +func (s *Scorch) removeOldData(numSnapshotsToKeep int) { + removed, err := s.removeOldBoltSnapshots(numSnapshotsToKeep) if err != nil { s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err)) } @@ -798,22 +798,31 @@ func (s *Scorch) removeOldData() { // rollback'ability. 
var NumSnapshotsToKeep = 1 +func (s *Scorch) getNumSnapshotsToKeep(ourSnapshot *IndexSnapshot) int { + if ourSnapshot != nil && + ourSnapshot.creator != numSnapShotsToKeepOverRuler { + return s.numSnapshotsToKeep + } + return 1 +} + // Removes enough snapshots from the rootBolt so that the // s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. -func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { +func (s *Scorch) removeOldBoltSnapshots(numSnapshotsToKeep int) ( + numRemoved int, err error) { persistedEpochs, err := s.RootBoltSnapshotEpochs() if err != nil { return 0, err } - if len(persistedEpochs) <= s.numSnapshotsToKeep { + if len(persistedEpochs) <= numSnapshotsToKeep { // we need to keep everything return 0, nil } // make a map of epochs to protect from deletion - protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) - for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] { + protectedEpochs := make(map[uint64]struct{}, numSnapshotsToKeep) + for _, epoch := range persistedEpochs[0:numSnapshotsToKeep] { protectedEpochs[epoch] = struct{}{} } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 698aaf16a..affda93a3 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -77,6 +77,8 @@ type Scorch struct { pauseCount uint64 + mergerKickCh chan *mergerCtrl + segPlugin segment.Plugin } @@ -101,6 +103,7 @@ func NewScorch(storeName string, nextSnapshotEpoch: 1, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, + mergerKickCh: make(chan *mergerCtrl, 1), segPlugin: defaultSegmentPlugin, } diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index b168728ee..1e6f0ab7f 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -32,6 +32,7 @@ import ( regexpTokenizer "github.com/blevesearch/bleve/analysis/tokenizer/regexp" "github.com/blevesearch/bleve/document" "github.com/blevesearch/bleve/index" + 
"github.com/blevesearch/bleve/index/scorch/mergeplan" "github.com/blevesearch/bleve/mapping" ) @@ -2149,4 +2150,238 @@ func TestForceVersion(t *testing.T) { if err == nil { t.Fatalf("expected an error opening an unsupported vesion, got nil") } -} \ No newline at end of file +} + +func TestIndexForceMerge(t *testing.T) { + cfg := CreateConfig("TestIndexBatch") + err := InitTest(cfg) + tmp := struct { + MaxSegmentsPerTier int `json:"maxSegmentsPerTier"` + SegmentsPerMergeTask int `json:"segmentsPerMergeTask"` + FloorSegmentSize int64 `json:"floorSegmentSize"` + }{ + int(1), + int(1), + int64(2), + } + cfg["scorchMergePlanOptions"] = &tmp + + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + batch := index.NewBatch() + for i := 0; i < 10; i++ { + doc := document.NewDocument(fmt.Sprintf("doc1-%d", i)) + doc.AddField(document.NewTextField("name", []uint64{}, []byte(fmt.Sprintf("text1-%d", i)))) + batch.Update(doc) + doc = document.NewDocument(fmt.Sprintf("doc2-%d", i)) + doc.AddField(document.NewTextField("name", []uint64{}, []byte(fmt.Sprintf("text2-%d", i)))) + batch.Update(doc) + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + batch.Reset() + expectedCount += 2 + } + + // verify doc count + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + docCount, err := indexReader.DocCount() + if err != nil { + t.Fatal(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + if ns, ok := 
idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + ns != 10 { + t.Errorf("expected 10 root file segments, got: %d", ns) + } + + for { + if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + ns == 1 { + break + } + + if si, ok := idx.(*Scorch); ok { + err := si.ForceMerge(&MergeRequest{ + MergeOptions: &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 10000}, + OverrideNumSnapshotsToKeep: true}) + if err != nil { + t.Errorf("RequestMerge failed, err: %v", err) + } + } + } + + // verify the final root segment count + if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + ns != 1 { + t.Errorf("expected a single root file segments, got: %d", ns) + } + +} + +func TestCancelIndexForceMerge(t *testing.T) { + cfg := CreateConfig("TestIndexBatch") + err := InitTest(cfg) + tmp := struct { + MaxSegmentsPerTier int `json:"maxSegmentsPerTier"` + SegmentsPerMergeTask int `json:"segmentsPerMergeTask"` + FloorSegmentSize int64 `json:"floorSegmentSize"` + }{ + int(1), + int(1), + int64(2), + } + cfg["scorchMergePlanOptions"] = &tmp + + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + var expectedCount uint64 + batch := index.NewBatch() + for i := 0; i < 20; i++ { + doc := document.NewDocument(fmt.Sprintf("doc1-%d", i)) + doc.AddField(document.NewTextField("name", []uint64{}, []byte(fmt.Sprintf("text1-%d", i)))) + batch.Update(doc) + doc = document.NewDocument(fmt.Sprintf("doc2-%d", i)) + doc.AddField(document.NewTextField("name", []uint64{}, []byte(fmt.Sprintf("text2-%d", i)))) + batch.Update(doc) + 
err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + batch.Reset() + expectedCount += 2 + } + + // verify doc count + indexReader, err := idx.Reader() + if err != nil { + t.Error(err) + } + docCount, err := indexReader.DocCount() + if err != nil { + t.Fatal(err) + } + if docCount != expectedCount { + t.Errorf("Expected document count to be %d got %d", expectedCount, docCount) + } + err = indexReader.Close() + if err != nil { + t.Fatal(err) + } + + // no merge operations are expected as per the original merge policy. + if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + ns != 20 { + t.Errorf("expected 20 root file segments, got: %d", ns) + } + + fsar := uint64(0) + cancelCh := make(chan struct{}) + // cancel the force merge operation once the root has some new merge + // introductions. ie if the root has lesser file segments than earlier. + go func() { + for { + if nval, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + nval < fsar { + close(cancelCh) + return + } + time.Sleep(time.Millisecond * 5) + } + }() + +OUTER: + for { + select { + case <-cancelCh: + break OUTER + default: + } + // get the number of file segments at root right before + // the force merge operation. 
+ fsar, _ = idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64) + + if si, ok := idx.(*Scorch); ok { + err := si.ForceMerge(&MergeRequest{ + MergeOptions: &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 5, + FloorSegmentSize: 10000}, + OverrideNumSnapshotsToKeep: true, + CancelCh: cancelCh}) + if err != nil { + t.Errorf("RequestMerge failed, err: %v", err) + } + } + } + + // verify the final root file segment count or forceMerge completion + if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && + ns == 1 { + t.Errorf("expected many files at root, but got: %d segments", ns) + } +} diff --git a/index/scorch/stats.go b/index/scorch/stats.go index e638362a7..2900ac0df 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -82,6 +82,9 @@ type Stats struct { TotFileMergeLoopErr uint64 TotFileMergeLoopEnd uint64 + TotFileMergeForceOpsStarted uint64 + TotFileMergeForceOpsCompleted uint64 + TotFileMergePlan uint64 TotFileMergePlanErr uint64 TotFileMergePlanNone uint64 diff --git a/index_alias_impl.go b/index_alias_impl.go index 4366fc795..94eb50f4e 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -44,6 +44,18 @@ func NewIndexAlias(indexes ...Index) *indexAliasImpl { } } +// Indexes just returns the indexes included in the +// index alias at the moment in an unsafe way. +// Caller must be aware that the results will be +// inconsistent if there are concurrent Add/Remove +// operations on the alias. 
+func (i *indexAliasImpl) Indexes() []Index { + i.mutex.RLock() + rv := i.indexes + i.mutex.RUnlock() + return rv +} + func (i *indexAliasImpl) isAliasToSingleIndex() error { if len(i.indexes) < 1 { return ErrorAliasEmpty From 50532472b773525ac3606fb4529c812bf4578f5c Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 7 May 2020 11:49:12 +0530 Subject: [PATCH 679/728] fixing the raciness issue --- index/scorch/merge.go | 10 ++- index/scorch/scorch_test.go | 163 ++++++++++++++++-------------------- 2 files changed, 80 insertions(+), 93 deletions(-) diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 3be781f73..c0464fba8 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -257,21 +257,23 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, var filenames []string closeCh := make(chan struct{}) - defer func() { + cleanup := func() { select { case <-closeCh: default: close(closeCh) } - }() + } + defer cleanup() + // cancel the merge operation on events like the index closure // or upon a user cancel. 
go func() { select { case <-s.closeCh: - close(closeCh) + cleanup() case <-cancelCh: - close(closeCh) + cleanup() case <-closeCh: } }() diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 1e6f0ab7f..2e26f6b24 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -23,6 +23,7 @@ import ( "regexp" "strconv" "sync" + "sync/atomic" "testing" "time" @@ -2153,8 +2154,14 @@ func TestForceVersion(t *testing.T) { } func TestIndexForceMerge(t *testing.T) { - cfg := CreateConfig("TestIndexBatch") + cfg := CreateConfig("TestIndexForceMerge") err := InitTest(cfg) + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() tmp := struct { MaxSegmentsPerTier int `json:"maxSegmentsPerTier"` SegmentsPerMergeTask int `json:"segmentsPerMergeTask"` @@ -2166,16 +2173,6 @@ func TestIndexForceMerge(t *testing.T) { } cfg["scorchMergePlanOptions"] = &tmp - if err != nil { - t.Fatal(err) - } - defer func() { - err := DestroyTest(cfg) - if err != nil { - t.Log(err) - } - }() - analysisQueue := index.NewAnalysisQueue(1) idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { @@ -2185,13 +2182,6 @@ func TestIndexForceMerge(t *testing.T) { if err != nil { t.Fatalf("error opening index: %v", err) } - defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) - } - }() - var expectedCount uint64 batch := index.NewBatch() for i := 0; i < 10; i++ { @@ -2225,42 +2215,57 @@ func TestIndexForceMerge(t *testing.T) { if err != nil { t.Fatal(err) } - if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - ns != 10 { - t.Errorf("expected 10 root file segments, got: %d", ns) + var si *Scorch + var ok bool + if si, ok = idx.(*Scorch); !ok { + t.Errorf("expects a scorch index") + } + + nfs := atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) + if nfs != 10 { + t.Errorf("expected 10 root file segments, got: %d", nfs) } for { - if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - ns == 1 
{ + if atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) == 1 { break } - - if si, ok := idx.(*Scorch); ok { - err := si.ForceMerge(&MergeRequest{ - MergeOptions: &mergeplan.MergePlanOptions{ - MaxSegmentsPerTier: 1, - MaxSegmentSize: 10000, - SegmentsPerMergeTask: 10, - FloorSegmentSize: 10000}, - OverrideNumSnapshotsToKeep: true}) - if err != nil { - t.Errorf("RequestMerge failed, err: %v", err) - } + err := si.ForceMerge(&MergeRequest{ + MergeOptions: &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 10000}, + OverrideNumSnapshotsToKeep: true}) + if err != nil { + t.Errorf("ForceMerge failed, err: %v", err) } } // verify the final root segment count - if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - ns != 1 { - t.Errorf("expected a single root file segments, got: %d", ns) + if atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) != 1 { + t.Errorf("expected a single root file segments, got: %d", + atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot)) + } + err = idx.Close() + if err != nil { + t.Fatal(err) } - } func TestCancelIndexForceMerge(t *testing.T) { - cfg := CreateConfig("TestIndexBatch") + cfg := CreateConfig("TestCancelIndexForceMerge") err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + tmp := struct { MaxSegmentsPerTier int `json:"maxSegmentsPerTier"` SegmentsPerMergeTask int `json:"segmentsPerMergeTask"` @@ -2272,16 +2277,6 @@ func TestCancelIndexForceMerge(t *testing.T) { } cfg["scorchMergePlanOptions"] = &tmp - if err != nil { - t.Fatal(err) - } - defer func() { - err := DestroyTest(cfg) - if err != nil { - t.Log(err) - } - }() - analysisQueue := index.NewAnalysisQueue(1) idx, err := NewScorch(Name, cfg, analysisQueue) if err != nil { @@ -2291,12 +2286,6 @@ func TestCancelIndexForceMerge(t *testing.T) { if err != nil { t.Fatalf("error opening index: %v", err) } - 
defer func() { - err := idx.Close() - if err != nil { - t.Fatal(err) - } - }() var expectedCount uint64 batch := index.NewBatch() @@ -2332,20 +2321,25 @@ func TestCancelIndexForceMerge(t *testing.T) { t.Fatal(err) } + var si *Scorch + var ok bool + if si, ok = idx.(*Scorch); !ok { + t.Fatal("expects a scorch index") + } + // no merge operations are expected as per the original merge policy. - if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - ns != 20 { - t.Errorf("expected 20 root file segments, got: %d", ns) + nfsr := atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) + if nfsr != 20 { + t.Errorf("expected 20 root file segments, got: %d", nfsr) } - fsar := uint64(0) cancelCh := make(chan struct{}) // cancel the force merge operation once the root has some new merge // introductions. ie if the root has lesser file segments than earlier. go func() { for { - if nval, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - nval < fsar { + nval := atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) + if nval < nfsr { close(cancelCh) return } @@ -2353,35 +2347,26 @@ func TestCancelIndexForceMerge(t *testing.T) { } }() -OUTER: - for { - select { - case <-cancelCh: - break OUTER - default: - } - // get the number of file segments at root right before - // the force merge operation. 
- fsar, _ = idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64) - - if si, ok := idx.(*Scorch); ok { - err := si.ForceMerge(&MergeRequest{ - MergeOptions: &mergeplan.MergePlanOptions{ - MaxSegmentsPerTier: 1, - MaxSegmentSize: 10000, - SegmentsPerMergeTask: 5, - FloorSegmentSize: 10000}, - OverrideNumSnapshotsToKeep: true, - CancelCh: cancelCh}) - if err != nil { - t.Errorf("RequestMerge failed, err: %v", err) - } - } + err = si.ForceMerge(&MergeRequest{ + MergeOptions: &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 5, + FloorSegmentSize: 10000}, + OverrideNumSnapshotsToKeep: true, + CancelCh: cancelCh}) + if err != nil { + t.Errorf("ForceMerge failed, err: %v", err) } // verify the final root file segment count or forceMerge completion - if ns, ok := idx.StatsMap()["TotFileSegmentsAtRoot"].(uint64); ok && - ns == 1 { - t.Errorf("expected many files at root, but got: %d segments", ns) + if atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) == 1 { + t.Errorf("expected many files at root, but got: %d segments", + atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot)) + } + + err = idx.Close() + if err != nil { + t.Fatal(err) } } From 878e5f1d5187732c346bb1da6f48b7a8591e86d1 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Thu, 7 May 2020 10:17:16 -0700 Subject: [PATCH 680/728] MB-38957: Add a unit test for mapping API: defaultAnalyzerName --- mapping/mapping_test.go | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index b57283f3a..2b6870966 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -1111,7 +1111,6 @@ func TestClosestDocDynamicMapping(t *testing.T) { } func TestMappingPointerToTimeBug1152(t *testing.T) { - when, err := time.Parse(time.RFC3339, "2019-03-06T15:04:05Z") if err != nil { t.Fatal(err) @@ -1141,3 +1140,14 @@ func TestMappingPointerToTimeBug1152(t *testing.T) { t.Fatalf("expected field to be type 
*document.DateTimeField, got %T", doc.Fields[0]) } } + +func TestDefaultAnalyzerInheritance(t *testing.T) { + docMapping := NewDocumentMapping() + docMapping.DefaultAnalyzer = "xyz" + childMapping := NewTextFieldMapping() + docMapping.AddFieldMappingsAt("field", childMapping) + + if analyzer := docMapping.defaultAnalyzerName([]string{"field"}); analyzer != "xyz" { + t.Fatalf("Expected analyzer: xyz to be inherited by field, but got: '%v'", analyzer) + } +} From f194b0026293045dad84a48169035f3706ad415c Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Thu, 7 May 2020 10:18:08 -0700 Subject: [PATCH 681/728] Undo go.mod edit + Adding back the unnecessary go.mod entry. --- go.mod | 1 + 1 file changed, 1 insertion(+) diff --git a/go.mod b/go.mod index 22133ea7a..d38cf8f92 100644 --- a/go.mod +++ b/go.mod @@ -10,6 +10,7 @@ require ( github.com/blevesearch/snowballstem v0.9.0 github.com/blevesearch/zap/v11 v11.0.7 github.com/blevesearch/zap/v12 v12.0.7 + github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.1 github.com/golang/protobuf v1.3.2 From 1e93807deaf1fbdc77b85dde8487da86a7d7052f Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Thu, 7 May 2020 11:58:30 -0700 Subject: [PATCH 682/728] Adding sub-test functionality for table-driven tests --- search_test.go | 57 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 25 deletions(-) diff --git a/search_test.go b/search_test.go index c31750965..98623e1a2 100644 --- a/search_test.go +++ b/search_test.go @@ -1696,6 +1696,7 @@ func TestSearchHighlightingWithRegexpReplacement(t *testing.T) { func TestAnalyzerInheritance(t *testing.T) { tests := []struct { + name string mappingStr string doc map[string]interface{} queryField string @@ -1707,6 +1708,7 @@ func TestAnalyzerInheritance(t *testing.T) { default_mapping: "" -> child field (should inherit keyword) */ + name: "Child field to inherit index mapping's default analyzer", 
mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + `"store":false,"index":true}]}}},"default_analyzer":"keyword"}`, @@ -1720,6 +1722,7 @@ func TestAnalyzerInheritance(t *testing.T) { default_mapping: keyword -> child field (should inherit keyword) */ + name: "Child field to inherit default mapping's default analyzer", mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + `"index":true}]}},"default_analyzer":"keyword"},"default_analyzer":"standard"}`, @@ -1734,6 +1737,7 @@ func TestAnalyzerInheritance(t *testing.T) { -> child mapping: "" -> child field: (should inherit keyword) */ + name: "Nested child field to inherit default mapping's default analyzer", mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"default_analyzer":` + `"keyword","properties":{"address":{"enabled":true,"dynamic":false,"properties":` + `{"city":{"enabled":true,"dynamic":false,"fields":[{"name":"city","type":"text",` + @@ -1752,6 +1756,7 @@ func TestAnalyzerInheritance(t *testing.T) { -> child mapping: "" -> child field: (should inherit keyword) */ + name: "Nested child field to inherit first child mapping's default analyzer", mappingStr: `{"default_mapping":{"enabled":true,"dynamic":false,"properties":` + `{"address":{"enabled":true,"dynamic":false,"default_analyzer":"keyword",` + `"properties":{"state":{"enabled":true,"dynamic":false,"properties":{"city":` + @@ -1768,37 +1773,39 @@ func TestAnalyzerInheritance(t *testing.T) { } for i := range tests { - idxMapping := NewIndexMapping() - if err := idxMapping.UnmarshalJSON([]byte(tests[i].mappingStr)); err != nil { - t.Fatal(err) - } - - tmpIndexPath := createTmpIndexPath(t) - idx, err := New(tmpIndexPath, idxMapping) - if err != nil { - t.Fatal(err) - } + t.Run(fmt.Sprintf("%s", tests[i].name), func(t 
*testing.T) { + idxMapping := NewIndexMapping() + if err := idxMapping.UnmarshalJSON([]byte(tests[i].mappingStr)); err != nil { + t.Fatal(err) + } - defer func() { - if err := idx.Close(); err != nil { + tmpIndexPath := createTmpIndexPath(t) + idx, err := New(tmpIndexPath, idxMapping) + if err != nil { t.Fatal(err) } - }() - if err = idx.Index("doc", tests[i].doc); err != nil { - t.Fatal(err) - } + defer func() { + if err := idx.Close(); err != nil { + t.Fatal(err) + } + }() - q := NewTermQuery(tests[i].queryTerm) - q.SetField(tests[i].queryField) + if err = idx.Index("doc", tests[i].doc); err != nil { + t.Fatal(err) + } - res, err := idx.Search(NewSearchRequest(q)) - if err != nil { - t.Fatal(err) - } + q := NewTermQuery(tests[i].queryTerm) + q.SetField(tests[i].queryField) - if len(res.Hits) != 1 { - t.Errorf("[%d] Unexpected number of hits: %v", i, len(res.Hits)) - } + res, err := idx.Search(NewSearchRequest(q)) + if err != nil { + t.Fatal(err) + } + + if len(res.Hits) != 1 { + t.Errorf("Unexpected number of hits: %v", len(res.Hits)) + } + }) } } From c5a10892e327a422f169b9a81270cfcdf3d61307 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 12 May 2020 19:59:39 -0400 Subject: [PATCH 683/728] introduce new scorch index builder (#1282) The purpose of the index builder is to better support use cases where one would like to index a set of data, not search it during the build, ensure the resulting file is in an efficient optimized state, and then support opening this index read-only to serve queries. Useful implementation details: Only an Index() method is offered, meaning you do not control the batch size. The builder will group data into batches using a configurable 'batchSize' parameter (default 1000) All of these batches are persisted as segments into a temp location. You can control this using 'buildPathPrefix', or it will default to a system temp location. You must call Close() after indexing all data. 
This will flush the last batch, and begin merging segments. The builder will attempt to merge several segments at once. This is configurable with the 'mergeMax' setting, which defaults to 10. Merging continues until there is only 1 segment remaining. At this point, the final segment is moved to the final location (as specified by the original builder constructor). The root.bolt is created, pointing to the segment, and the remaining bleve index wrapping is completed. --- builder.go | 94 ++++++++++ builder_test.go | 89 +++++++++ index.go | 14 ++ index/index.go | 7 + index/scorch/builder.go | 333 +++++++++++++++++++++++++++++++++ index/scorch/builder_test.go | 160 ++++++++++++++++ index/scorch/persister.go | 75 ++++---- index/scorch/scorch.go | 34 +++- index/scorch/segment_plugin.go | 22 ++- 9 files changed, 780 insertions(+), 48 deletions(-) create mode 100644 builder.go create mode 100644 builder_test.go create mode 100644 index/scorch/builder.go create mode 100644 index/scorch/builder_test.go diff --git a/builder.go b/builder.go new file mode 100644 index 000000000..de00c97b6 --- /dev/null +++ b/builder.go @@ -0,0 +1,94 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package bleve + +import ( + "encoding/json" + "fmt" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch" + "github.com/blevesearch/bleve/mapping" +) + +type builderImpl struct { + b index.IndexBuilder + m mapping.IndexMapping +} + +func (b *builderImpl) Index(id string, data interface{}) error { + if id == "" { + return ErrorEmptyID + } + + doc := document.NewDocument(id) + err := b.m.MapDocument(doc, data) + if err != nil { + return err + } + err = b.b.Index(doc) + return err +} + +func (b *builderImpl) Close() error { + return b.b.Close() +} + +func newBuilder(path string, mapping mapping.IndexMapping, config map[string]interface{}) (Builder, error) { + if path == "" { + return nil, fmt.Errorf("builder requires path") + } + + err := mapping.Validate() + if err != nil { + return nil, err + } + + if config == nil { + config = map[string]interface{}{} + } + + // the builder does not have an API to interact with internal storage + // however we can pass k/v pairs through the config + mappingBytes, err := json.Marshal(mapping) + if err != nil { + return nil, err + } + config["internal"] = map[string][]byte{ + string(mappingInternalKey): mappingBytes, + } + + // do not use real config, as these are options for the builder, + // not the resulting index + meta := newIndexMeta(scorch.Name, scorch.Name, map[string]interface{}{}) + err = meta.Save(path) + if err != nil { + return nil, err + } + + config["path"] = indexStorePath(path) + + b, err := scorch.NewBuilder(config) + if err != nil { + return nil, err + } + rv := &builderImpl{ + b: b, + m: mapping, + } + + return rv, nil +} diff --git a/builder_test.go b/builder_test.go new file mode 100644 index 000000000..119f39f58 --- /dev/null +++ b/builder_test.go @@ -0,0 +1,89 @@ +// Copyright (c) 2019 Couchbase, Inc. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package bleve + +import ( + "fmt" + "io/ioutil" + "os" + "testing" +) + +func TestBuilder(t *testing.T) { + tmpDir, err := ioutil.TempDir("", "bleve-scorch-builder-test") + if err != nil { + t.Fatal(err) + } + defer func() { + err = os.RemoveAll(tmpDir) + if err != nil { + t.Fatalf("error cleaning up test index") + } + }() + + conf := map[string]interface{}{ + "batchSize": 2, + "mergeMax": 2, + } + b, err := NewBuilder(tmpDir, NewIndexMapping(), conf) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 10; i++ { + doc := map[string]interface{}{ + "name": "hello", + } + err = b.Index(fmt.Sprintf("%d", i), doc) + if err != nil { + t.Fatal(err) + } + } + + err = b.Close() + if err != nil { + t.Fatal(err) + } + + idx, err := Open(tmpDir) + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Errorf("error closing index: %v", err) + } + }() + + docCount, err := idx.DocCount() + if err != nil { + t.Errorf("error checking doc count: %v", err) + } + if docCount != 10 { + t.Errorf("expected doc count to be 10, got %d", docCount) + } + + q := NewTermQuery("hello") + q.SetField("name") + req := NewSearchRequest(q) + res, err := idx.Search(req) + if err != nil { + t.Errorf("error searching index: %v", err) + } + if res.Total != 10 { + t.Errorf("expected 10 search hits, got %d", res.Total) + } +} diff --git a/index.go b/index.go index 
ef6ede934..974358b81 100644 --- a/index.go +++ b/index.go @@ -293,3 +293,17 @@ func Open(path string) (Index, error) { func OpenUsing(path string, runtimeConfig map[string]interface{}) (Index, error) { return openIndexUsing(path, runtimeConfig) } + +// Builder is a limited interface, used to build indexes in an offline mode. +// Items cannot be updated or deleted, and the caller MUST ensure a document is +// indexed only once. +type Builder interface { + Index(id string, data interface{}) error + Close() error +} + +// NewBuilder creates a builder, which will build an index at the specified path, +// using the specified mapping and options. +func NewBuilder(path string, mapping mapping.IndexMapping, config map[string]interface{}) (Builder, error) { + return newBuilder(path, mapping, config) +} diff --git a/index/index.go b/index/index.go index 3e866f3aa..551f8de84 100644 --- a/index/index.go +++ b/index/index.go @@ -367,3 +367,10 @@ type OptimizableContext interface { type DocValueReader interface { VisitDocValues(id IndexInternalID, visitor DocumentFieldTermVisitor) error } + +// IndexBuilder is an interface supported by some index schemes +// to allow direct write-only index building +type IndexBuilder interface { + Index(doc *document.Document) error + Close() error +} diff --git a/index/scorch/builder.go b/index/scorch/builder.go new file mode 100644 index 000000000..cd400bd1d --- /dev/null +++ b/index/scorch/builder.go @@ -0,0 +1,333 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "io/ioutil" + "os" + "sync" + + "github.com/RoaringBitmap/roaring" + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" + "github.com/blevesearch/bleve/index/scorch/segment" + bolt "go.etcd.io/bbolt" +) + +const DefaultBuilderBatchSize = 1000 +const DefaultBuilderMergeMax = 10 + +type Builder struct { + m sync.Mutex + segCount uint64 + path string + buildPath string + segPaths []string + batchSize int + mergeMax int + batch *index.Batch + internal map[string][]byte + segPlugin segment.Plugin +} + +func NewBuilder(config map[string]interface{}) (*Builder, error) { + path, ok := config["path"].(string) + if !ok { + return nil, fmt.Errorf("must specify path") + } + + buildPathPrefix, _ := config["buildPathPrefix"].(string) + buildPath, err := ioutil.TempDir(buildPathPrefix, "scorch-offline-build") + if err != nil { + return nil, err + } + + rv := &Builder{ + path: path, + buildPath: buildPath, + mergeMax: DefaultBuilderMergeMax, + batchSize: DefaultBuilderBatchSize, + batch: index.NewBatch(), + segPlugin: defaultSegmentPlugin, + } + + err = rv.parseConfig(config) + if err != nil { + return nil, fmt.Errorf("error parsing builder config: %v", err) + } + + return rv, nil +} + +func (o *Builder) parseConfig(config map[string]interface{}) (err error) { + if v, ok := config["mergeMax"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("mergeMax parse err: %v", err) + } + if t > 0 { + o.mergeMax = t + } + } + + if v, ok := config["batchSize"]; ok { + var t int + if t, err = parseToInteger(v); err != nil { + return fmt.Errorf("batchSize parse err: %v", err) + } + if t > 0 { + o.batchSize = t + } + } + + if v, ok := config["internal"]; ok { + if vinternal, ok := v.(map[string][]byte); ok { + o.internal = vinternal + } + } + + forcedSegmentType, forcedSegmentVersion, err 
:= configForceSegmentTypeVersion(config) + if err != nil { + return err + } + if forcedSegmentType != "" && forcedSegmentVersion != 0 { + segPlugin, err := chooseSegmentPlugin(forcedSegmentType, + uint32(forcedSegmentVersion)) + if err != nil { + o.segPlugin = segPlugin + } + } + + return nil +} + +// Index will place the document into the index. +// It is invalid to index the same document multiple times. +func (o *Builder) Index(doc *document.Document) error { + o.m.Lock() + defer o.m.Unlock() + + o.batch.Update(doc) + + return o.maybeFlushBatchLOCKED(o.batchSize) +} + +func (o *Builder) maybeFlushBatchLOCKED(moreThan int) error { + if len(o.batch.IndexOps) >= moreThan { + defer o.batch.Reset() + return o.executeBatchLOCKED(o.batch) + } + return nil +} + +func (o *Builder) executeBatchLOCKED(batch *index.Batch) (err error) { + analysisResults := make([]*index.AnalysisResult, 0, len(batch.IndexOps)) + for _, doc := range batch.IndexOps { + if doc != nil { + // insert _id field + doc.AddField(document.NewTextFieldCustom("_id", nil, []byte(doc.ID), document.IndexField|document.StoreField, nil)) + // perform analysis directly + analysisResult := analyze(doc) + analysisResults = append(analysisResults, analysisResult) + } + } + + seg, _, err := o.segPlugin.New(analysisResults) + if err != nil { + return fmt.Errorf("error building segment base: %v", err) + } + + filename := zapFileName(o.segCount) + o.segCount++ + path := o.buildPath + string(os.PathSeparator) + filename + + if segUnpersisted, ok := seg.(segment.UnpersistedSegment); ok { + err = segUnpersisted.Persist(path) + if err != nil { + return fmt.Errorf("error persisting segment base to %s: %v", path, err) + } + + o.segPaths = append(o.segPaths, path) + return nil + } + + return fmt.Errorf("new segment does not implement unpersisted: %T", seg) +} + +func (o *Builder) doMerge() error { + // as long as we have more than 1 segment, keep merging + for len(o.segPaths) > 1 { + + // merge the next number of segments 
into one new one + // or, if there are fewer than remaining, merge them all + mergeCount := o.mergeMax + if mergeCount > len(o.segPaths) { + mergeCount = len(o.segPaths) + } + + mergePaths := o.segPaths[0:mergeCount] + o.segPaths = o.segPaths[mergeCount:] + + // open each of the segments to be merged + mergeSegs := make([]segment.Segment, 0, mergeCount) + + // closeOpenedSegs attempts to close all opened + // segments even if an error occurs, in which case + // the first error is returned + closeOpenedSegs := func() error { + var err error + for _, seg := range mergeSegs { + clErr := seg.Close() + if clErr != nil && err == nil { + err = clErr + } + } + return err + } + + for _, mergePath := range mergePaths { + seg, err := o.segPlugin.Open(mergePath) + if err != nil { + _ = closeOpenedSegs() + return fmt.Errorf("error opening segment (%s) for merge: %v", mergePath, err) + } + mergeSegs = append(mergeSegs, seg) + } + + // do the merge + mergedSegPath := o.buildPath + string(os.PathSeparator) + zapFileName(o.segCount) + drops := make([]*roaring.Bitmap, mergeCount) + _, _, err := o.segPlugin.Merge(mergeSegs, drops, mergedSegPath, nil, nil) + if err != nil { + _ = closeOpenedSegs() + return fmt.Errorf("error merging segments (%v): %v", mergePaths, err) + } + o.segCount++ + o.segPaths = append(o.segPaths, mergedSegPath) + + // close segments opened for merge + err = closeOpenedSegs() + if err != nil { + return fmt.Errorf("error closing opened segments: %v", err) + } + + // remove merged segments + for _, mergePath := range mergePaths { + err = os.RemoveAll(mergePath) + if err != nil { + return fmt.Errorf("error removing segment %s after merge: %v", mergePath, err) + } + } + } + + return nil +} + +func (o *Builder) Close() error { + o.m.Lock() + defer o.m.Unlock() + + // see if there is a partial batch + err := o.maybeFlushBatchLOCKED(1) + if err != nil { + return fmt.Errorf("error flushing batch before close: %v", err) + } + + // perform all the merging + err = 
o.doMerge() + if err != nil { + return fmt.Errorf("error while merging: %v", err) + } + + // ensure the store path exists + err = os.MkdirAll(o.path, 0700) + if err != nil { + return err + } + + // move final segment into place + // segment id 2 is chosen to match the behavior of a scorch + // index which indexes a single batch of data + finalSegPath := o.path + string(os.PathSeparator) + zapFileName(2) + err = os.Rename(o.segPaths[0], finalSegPath) + if err != nil { + return fmt.Errorf("error moving final segment into place: %v", err) + } + + // remove the buildPath, as it is no longer needed + err = os.RemoveAll(o.buildPath) + if err != nil { + return fmt.Errorf("error removing build path: %v", err) + } + + // prepare wrapping + seg, err := o.segPlugin.Open(finalSegPath) + if err != nil { + return fmt.Errorf("error opening final segment") + } + + // create a segment snapshot for this segment + ss := &SegmentSnapshot{ + segment: seg, + } + is := &IndexSnapshot{ + epoch: 3, // chosen to match scorch behavior when indexing a single batch + segment: []*SegmentSnapshot{ss}, + creator: "scorch-builder", + internal: o.internal, + } + + // create the root bolt + rootBoltPath := o.path + string(os.PathSeparator) + "root.bolt" + rootBolt, err := bolt.Open(rootBoltPath, 0600, nil) + if err != nil { + return err + } + + // start a write transaction + tx, err := rootBolt.Begin(true) + if err != nil { + return err + } + + // fill the root bolt with this fake index snapshot + _, _, err = prepareBoltSnapshot(is, tx, o.path, o.segPlugin) + if err != nil { + _ = tx.Rollback() + _ = rootBolt.Close() + return fmt.Errorf("error preparing bolt snapshot in root.bolt: %v", err) + } + + // commit bolt data + err = tx.Commit() + if err != nil { + _ = rootBolt.Close() + return fmt.Errorf("error committing bolt tx in root.bolt: %v", err) + } + + // close bolt + err = rootBolt.Close() + if err != nil { + return fmt.Errorf("error closing root.bolt: %v", err) + } + + // close final segment + 
err = seg.Close() + if err != nil { + return fmt.Errorf("error closing final segment: %v", err) + } + return nil +} diff --git a/index/scorch/builder_test.go b/index/scorch/builder_test.go new file mode 100644 index 000000000..1d0ce7160 --- /dev/null +++ b/index/scorch/builder_test.go @@ -0,0 +1,160 @@ +// Copyright (c) 2019 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "fmt" + "io/ioutil" + "os" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestBuilder(t *testing.T) { + tmpDir, err := ioutil.TempDir("", "scorch-builder-test") + if err != nil { + t.Fatal(err) + } + defer func() { + err = os.RemoveAll(tmpDir) + if err != nil { + t.Fatalf("error cleaning up test index: %v", err) + } + }() + options := map[string]interface{}{ + "path": tmpDir, + "batchSize": 2, + "mergeMax": 2, + } + b, err := NewBuilder(options) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 10; i++ { + doc := document.NewDocument(fmt.Sprintf("%d", i)) + doc.AddField(document.NewTextField("name", nil, []byte("hello"))) + err = b.Index(doc) + if err != nil { + t.Fatal(err) + } + } + + err = b.Close() + if err != nil { + t.Fatal(err) + } + + checkIndex(t, tmpDir, []byte("hello"), "name", 10) + +} + +func checkIndex(t *testing.T, path string, term []byte, field string, expectCount int) { + cfg := make(map[string]interface{}) + cfg["path"] = path + analysisQueue := 
index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err = idx.Close() + if err != nil { + t.Fatalf("error closing index: %v", err) + } + }() + + r, err := idx.Reader() + if err != nil { + t.Fatalf("error accessing index reader: %v", err) + } + defer func() { + err = r.Close() + if err != nil { + t.Fatalf("error closing reader: %v", err) + } + }() + + // check the count, expect 10 docs + count, err := r.DocCount() + if err != nil { + t.Errorf("error accessing index doc count: %v", err) + } else if count != uint64(expectCount) { + t.Errorf("expected %d docs, got %d", expectCount, count) + } + + // run a search for hello + tfr, err := r.TermFieldReader(term, field, false, false, false) + if err != nil { + t.Errorf("error accessing term field reader: %v", err) + } else { + var rows int + tfd, err := tfr.Next(nil) + for err == nil && tfd != nil { + rows++ + tfd, err = tfr.Next(nil) + } + if err != nil { + t.Errorf("error calling next on term field reader: %v", err) + } + if rows != expectCount { + t.Errorf("expected %d rows for term hello, field name, got %d", expectCount, rows) + } + } +} + +func TestBuilderFlushFinalBatch(t *testing.T) { + tmpDir, err := ioutil.TempDir("", "scorch-builder-test") + if err != nil { + t.Fatal(err) + } + defer func() { + err = os.RemoveAll(tmpDir) + if err != nil { + t.Fatalf("error cleaning up test index: %v", err) + } + }() + options := map[string]interface{}{ + "path": tmpDir, + "batchSize": 2, + "mergeMax": 2, + } + b, err := NewBuilder(options) + if err != nil { + t.Fatal(err) + } + + for i := 0; i < 9; i++ { + doc := document.NewDocument(fmt.Sprintf("%d", i)) + doc.AddField(document.NewTextField("name", nil, []byte("hello"))) + err = b.Index(doc) + if err != nil { + t.Fatal(err) + } + } + + err = b.Close() + if err != nil { + t.Fatal(err) + } + + checkIndex(t, tmpDir, 
[]byte("hello"), "name", 9) +} diff --git a/index/scorch/persister.go b/index/scorch/persister.go index ffa656693..ea3667bbf 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -428,55 +428,44 @@ func (s *Scorch) persistSnapshotMaybeMerge(snapshot *IndexSnapshot) ( return true, nil } -func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { - // start a write transaction - tx, err := s.rootBolt.Begin(true) - if err != nil { - return err - } - // defer rollback on error - defer func() { - if err != nil { - _ = tx.Rollback() - } - }() - +func prepareBoltSnapshot(snapshot *IndexSnapshot, tx *bolt.Tx, path string, + segPlugin segment.Plugin) ([]string, map[uint64]string, error) { snapshotsBucket, err := tx.CreateBucketIfNotExists(boltSnapshotsBucket) if err != nil { - return err + return nil, nil, err } newSnapshotKey := segment.EncodeUvarintAscending(nil, snapshot.epoch) snapshotBucket, err := snapshotsBucket.CreateBucketIfNotExists(newSnapshotKey) if err != nil { - return err + return nil, nil, err } // persist meta values metaBucket, err := snapshotBucket.CreateBucketIfNotExists(boltMetaDataKey) if err != nil { - return err + return nil, nil, err } - err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(s.segPlugin.Type())) + err = metaBucket.Put(boltMetaDataSegmentTypeKey, []byte(segPlugin.Type())) if err != nil { - return err + return nil, nil, err } buf := make([]byte, binary.MaxVarintLen32) - binary.BigEndian.PutUint32(buf, s.segPlugin.Version()) + binary.BigEndian.PutUint32(buf, segPlugin.Version()) err = metaBucket.Put(boltMetaDataSegmentVersionKey, buf) if err != nil { - return err + return nil, nil, err } // persist internal values internalBucket, err := snapshotBucket.CreateBucketIfNotExists(boltInternalKey) if err != nil { - return err + return nil, nil, err } // TODO optimize writing these in order? 
for k, v := range snapshot.internal { err = internalBucket.Put([]byte(k), v) if err != nil { - return err + return nil, nil, err } } @@ -488,49 +477,69 @@ func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { snapshotSegmentKey := segment.EncodeUvarintAscending(nil, segmentSnapshot.id) snapshotSegmentBucket, err := snapshotBucket.CreateBucketIfNotExists(snapshotSegmentKey) if err != nil { - return err + return nil, nil, err } switch seg := segmentSnapshot.segment.(type) { case segment.PersistedSegment: - path := seg.Path() - filename := strings.TrimPrefix(path, s.path+string(os.PathSeparator)) + segPath := seg.Path() + filename := strings.TrimPrefix(segPath, path+string(os.PathSeparator)) err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) if err != nil { - return err + return nil, nil, err } filenames = append(filenames, filename) case segment.UnpersistedSegment: // need to persist this to disk filename := zapFileName(segmentSnapshot.id) - path := s.path + string(os.PathSeparator) + filename + path := path + string(os.PathSeparator) + filename err = seg.Persist(path) if err != nil { - return fmt.Errorf("error persisting segment: %v", err) + return nil, nil, fmt.Errorf("error persisting segment: %v", err) } newSegmentPaths[segmentSnapshot.id] = path err = snapshotSegmentBucket.Put(boltPathKey, []byte(filename)) if err != nil { - return err + return nil, nil, err } filenames = append(filenames, filename) - default: - return fmt.Errorf("unknown segment type: %T", seg) + return nil, nil, fmt.Errorf("unknown segment type: %T", seg) } // store current deleted bits var roaringBuf bytes.Buffer if segmentSnapshot.deleted != nil { _, err = segmentSnapshot.deleted.WriteTo(&roaringBuf) if err != nil { - return fmt.Errorf("error persisting roaring bytes: %v", err) + return nil, nil, fmt.Errorf("error persisting roaring bytes: %v", err) } err = snapshotSegmentBucket.Put(boltDeletedKey, roaringBuf.Bytes()) if err != nil { - return err + return 
nil, nil, err } } } + return filenames, newSegmentPaths, nil +} + +func (s *Scorch) persistSnapshotDirect(snapshot *IndexSnapshot) (err error) { + // start a write transaction + tx, err := s.rootBolt.Begin(true) + if err != nil { + return err + } + // defer rollback on error + defer func() { + if err != nil { + _ = tx.Rollback() + } + }() + + filenames, newSegmentPaths, err := prepareBoltSnapshot(snapshot, tx, s.path, s.segPlugin) + if err != nil { + return err + } + // we need to swap in a new root only when we've persisted 1 or // more segments -- whereby the new root would have 1-for-1 // replacements of in-memory segments with file-based segments diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 698aaf16a..d1e1c6af5 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -104,15 +104,11 @@ func NewScorch(storeName string, segPlugin: defaultSegmentPlugin, } - // check if the caller has requested a specific segment type/version - forcedSegmentVersion, ok := config["forceSegmentVersion"].(int) - if ok { - forcedSegmentType, ok2 := config["forceSegmentType"].(string) - if !ok2 { - return nil, fmt.Errorf( - "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) - } - + forcedSegmentType, forcedSegmentVersion, err := configForceSegmentTypeVersion(config) + if err != nil { + return nil, err + } + if forcedSegmentType != "" && forcedSegmentVersion != 0 { err := rv.loadSegmentPlugin(forcedSegmentType, uint32(forcedSegmentVersion)) if err != nil { @@ -140,6 +136,22 @@ func NewScorch(storeName string, return rv, nil } +// configForceSegmentTypeVersion checks if the caller has requested a +// specific segment type/version +func configForceSegmentTypeVersion(config map[string]interface{}) (string, uint32, error) { + forcedSegmentVersion, ok := config["forceSegmentVersion"].(int) + if ok { + forcedSegmentType, ok2 := config["forceSegmentType"].(string) + if !ok2 { + return "", 0, fmt.Errorf( + 
"forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) + } + + return forcedSegmentType, uint32(forcedSegmentVersion), nil + } + return "", 0, nil +} + func (s *Scorch) paused() uint64 { s.pauseLock.Lock() pc := s.pauseCount @@ -567,6 +579,10 @@ func (s *Scorch) StatsMap() map[string]interface{} { } func (s *Scorch) Analyze(d *document.Document) *index.AnalysisResult { + return analyze(d) +} + +func analyze(d *document.Document) *index.AnalysisResult { rv := &index.AnalysisResult{ Document: d, Analyzed: make([]analysis.TokenFrequencies, len(d.Fields)+len(d.CompositeFields)), diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 01eda7fbd..6dfa3b282 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -60,18 +60,28 @@ func SupportedSegmentTypeVersions(typ string) (rv []uint32) { return rv } -func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, - forcedSegmentVersion uint32) error { +func chooseSegmentPlugin(forcedSegmentType string, + forcedSegmentVersion uint32) (segment.Plugin, error) { if versions, ok := supportedSegmentPlugins[forcedSegmentType]; ok { if segPlugin, ok := versions[uint32(forcedSegmentVersion)]; ok { - s.segPlugin = segPlugin - return nil + return segPlugin, nil } - return fmt.Errorf( + return nil, fmt.Errorf( "unsupported version %d for segment type: %s, supported: %v", forcedSegmentVersion, forcedSegmentType, SupportedSegmentTypeVersions(forcedSegmentType)) } - return fmt.Errorf("unsupported segment type: %s, supported: %v", + return nil, fmt.Errorf("unsupported segment type: %s, supported: %v", forcedSegmentType, SupportedSegmentTypes()) } + +func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, + forcedSegmentVersion uint32) error { + segPlugin, err := chooseSegmentPlugin(forcedSegmentType, + forcedSegmentVersion) + if err != nil { + return err + } + s.segPlugin = segPlugin + return nil +} From 472e3e19a293a902de8cc83236419fd38881594a 
Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Wed, 13 May 2020 11:50:19 +0530 Subject: [PATCH 684/728] introducing context argument to the ForceMerge for cancellation of the api. --- index/scorch/introducer.go | 6 +- index/scorch/merge.go | 130 +++++++++++++++++------------------- index/scorch/persister.go | 25 +++---- index/scorch/scorch.go | 5 +- index/scorch/scorch_test.go | 44 +++++++----- index_alias_impl.go | 14 ++-- 6 files changed, 107 insertions(+), 117 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 3325c9b17..64ca969bd 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -313,15 +313,11 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { defer func() { _ = root.DecRef() }() - if nextMerge.creator == "" { - nextMerge.creator = "introduceMerge" - } - newSnapshot := &IndexSnapshot{ parent: s, internal: root.internal, refs: 1, - creator: nextMerge.creator, + creator: "introduceMerge", } // iterate through current segments diff --git a/index/scorch/merge.go b/index/scorch/merge.go index c0464fba8..3eccee52e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -15,6 +15,7 @@ package scorch import ( + "context" "encoding/json" "fmt" "os" @@ -27,8 +28,6 @@ import ( "github.com/blevesearch/bleve/index/scorch/segment" ) -var numSnapShotsToKeepOverRuler = "introduceForceMerge" - func (s *Scorch) mergerLoop() { var lastEpochMergePlanned uint64 var ctrlMsg *mergerCtrl @@ -38,7 +37,9 @@ func (s *Scorch) mergerLoop() { s.asyncTasks.Done() return } - ctrlMsgDflt := &mergerCtrl{options: mergePlannerOptions, doneCh: nil} + ctrlMsgDflt := &mergerCtrl{ctx: context.Background(), + options: mergePlannerOptions, + doneCh: nil} OUTER: for { @@ -64,8 +65,8 @@ OUTER: startTime := time.Now() // lets get started - err := s.planMergeAtSnapshot(ourSnapshot, ctrlMsg.options, - ctrlMsg.creator, ctrlMsg.cancelCh) + err := s.planMergeAtSnapshot(ctrlMsg.ctx, ctrlMsg.options, + ourSnapshot) 
if err != nil { atomic.StoreUint64(&s.iStats.mergeEpoch, 0) if err == segment.ErrClosed { @@ -114,7 +115,7 @@ OUTER: case <-s.closeCh: break OUTER case s.persisterNotifier <- ew: - case ctrlMsg = <-s.mergerKickCh: + case ctrlMsg = <-s.forceMergeRequestCh: continue OUTER } @@ -123,7 +124,7 @@ OUTER: case <-s.closeCh: break OUTER case <-ew.notifyCh: - case ctrlMsg = <-s.mergerKickCh: + case ctrlMsg = <-s.forceMergeRequestCh: } } @@ -134,33 +135,15 @@ OUTER: } type mergerCtrl struct { - creator string - options *mergeplan.MergePlanOptions - doneCh chan struct{} - cancelCh chan struct{} -} - -// MergeRequest represents various control -// parameters for the ForceMerge API. -type MergeRequest struct { - // MergeOptions specify the merge policy applied during - // the forced merge cycles. This doesn't override the - // index's original merge policy. - MergeOptions *mergeplan.MergePlanOptions - - // OverrideNumSnapshotsToKeep specify whether to retain - // a number of older snapshots dictated by numSnapshotsToKeep - // during a forced merge cycle. Enabling this reduces the - // disk space requirements during a forced merge operation. - OverrideNumSnapshotsToKeep bool - - // CancelCh helps in cancelling an ongoing merge operation. - CancelCh chan struct{} + ctx context.Context + options *mergeplan.MergePlanOptions + doneCh chan struct{} } // ForceMerge helps users trigger a merge operation on // an online scorch index. 
-func (s *Scorch) ForceMerge(mr *MergeRequest) error { +func (s *Scorch) ForceMerge(ctx context.Context, + mo *mergeplan.MergePlanOptions) error { // check whether force merge is already under processing s.rootLock.Lock() if s.stats.TotFileMergeForceOpsStarted > @@ -172,23 +155,23 @@ func (s *Scorch) ForceMerge(mr *MergeRequest) error { s.stats.TotFileMergeForceOpsStarted++ s.rootLock.Unlock() - if mr.MergeOptions == nil { + if mo != nil { + err := mergeplan.ValidateMergePlannerOptions(mo) + if err != nil { + return err + } + } else { // assume the default single segment merge policy - mr.MergeOptions = &mergeplan.SingleSegmentMergePlanOptions - } - var ssCreator string - if mr.OverrideNumSnapshotsToKeep { - ssCreator = numSnapShotsToKeepOverRuler + mo = &mergeplan.SingleSegmentMergePlanOptions } - msg := &mergerCtrl{creator: ssCreator, - options: mr.MergeOptions, - doneCh: make(chan struct{}), - cancelCh: mr.CancelCh, + msg := &mergerCtrl{options: mo, + doneCh: make(chan struct{}), + ctx: ctx, } - // kick the merger workloop + // request the merger perform a force merge select { - case s.mergerKickCh <- msg: + case s.forceMergeRequestCh <- msg: case <-s.closeCh: return nil } @@ -225,9 +208,39 @@ func (s *Scorch) parseMergePlannerOptions() (*mergeplan.MergePlanOptions, return &mergePlannerOptions, nil } -func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, - options *mergeplan.MergePlanOptions, creator string, - cancelCh chan struct{}) error { +type closeChWrapper struct { + ch1 chan struct{} + ctx context.Context + closeCh chan struct{} +} + +func newCloseChWrapper(ch1 chan struct{}, + ctx context.Context) *closeChWrapper { + return &closeChWrapper{ch1: ch1, + ctx: ctx, + closeCh: make(chan struct{})} +} + +func (w *closeChWrapper) close() { + select { + case <-w.closeCh: + default: + close(w.closeCh) + } +} + +func (w *closeChWrapper) listen() { + select { + case <-w.ch1: + w.close() + case <-w.ctx.Done(): + w.close() + case <-w.closeCh: + } +} + 
+func (s *Scorch) planMergeAtSnapshot(ctx context.Context, + options *mergeplan.MergePlanOptions, ourSnapshot *IndexSnapshot) error { // build list of persisted segments in this snapshot var onlyPersistedSnapshots []mergeplan.Segment for _, segmentSnapshot := range ourSnapshot.segment { @@ -256,27 +269,10 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, // process tasks in serial for now var filenames []string - closeCh := make(chan struct{}) - cleanup := func() { - select { - case <-closeCh: - default: - close(closeCh) - } - } - defer cleanup() + cw := newCloseChWrapper(s.closeCh, ctx) + defer cw.close() - // cancel the merge operation on events like the index closure - // or upon a user cancel. - go func() { - select { - case <-s.closeCh: - cleanup() - case <-cancelCh: - cleanup() - case <-closeCh: - } - }() + go cw.listen() for _, task := range resultMergePlan.Tasks { if len(task.Segments) == 0 { @@ -323,7 +319,7 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, atomic.AddUint64(&s.stats.TotFileMergeZapBeg, 1) newDocNums, _, err := s.segPlugin.Merge(segmentsToMerge, docsToDrop, path, - closeCh, s) + cw.closeCh, s) atomic.AddUint64(&s.stats.TotFileMergeZapEnd, 1) fileMergeZapTime := uint64(time.Since(fileMergeZapStartTime)) @@ -361,7 +357,6 @@ func (s *Scorch) planMergeAtSnapshot(ourSnapshot *IndexSnapshot, oldNewDocNums: oldNewDocNums, new: seg, notify: make(chan *IndexSnapshot), - creator: creator, } // give it to the introducer @@ -406,7 +401,6 @@ type segmentMerge struct { oldNewDocNums map[uint64][]uint64 new segment.Segment notify chan *IndexSnapshot - creator string } // perform a merging of the given SegmentBase instances into a new, diff --git a/index/scorch/persister.go b/index/scorch/persister.go index aec1c0fea..ffa656693 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -209,7 +209,7 @@ OUTER: case s.introducerNotifier <- w: } - s.removeOldData(s.getNumSnapshotsToKeep(ourSnapshot)) // might 
as well cleanup while waiting + s.removeOldData() // might as well cleanup while waiting atomic.AddUint64(&s.stats.TotPersistLoopWait, 1) @@ -276,7 +276,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, // 1. Too many older snapshots awaiting the clean up. // 2. The merger could be lagging behind on merging the disk files. if numFilesOnDisk > uint64(po.PersisterNapUnderNumFiles) { - s.removeOldData(s.numSnapshotsToKeep) + s.removeOldData() numFilesOnDisk, _, _ = s.diskFileStats(nil) } @@ -780,8 +780,8 @@ func (s *Scorch) loadSegment(segmentBucket *bolt.Bucket) (*SegmentSnapshot, erro return rv, nil } -func (s *Scorch) removeOldData(numSnapshotsToKeep int) { - removed, err := s.removeOldBoltSnapshots(numSnapshotsToKeep) +func (s *Scorch) removeOldData() { + removed, err := s.removeOldBoltSnapshots() if err != nil { s.fireAsyncError(fmt.Errorf("got err removing old bolt snapshots: %v", err)) } @@ -798,31 +798,22 @@ func (s *Scorch) removeOldData(numSnapshotsToKeep int) { // rollback'ability. var NumSnapshotsToKeep = 1 -func (s *Scorch) getNumSnapshotsToKeep(ourSnapshot *IndexSnapshot) int { - if ourSnapshot != nil && - ourSnapshot.creator != numSnapShotsToKeepOverRuler { - return s.numSnapshotsToKeep - } - return 1 -} - // Removes enough snapshots from the rootBolt so that the // s.eligibleForRemoval stays under the NumSnapshotsToKeep policy. 
-func (s *Scorch) removeOldBoltSnapshots(numSnapshotsToKeep int) ( - numRemoved int, err error) { +func (s *Scorch) removeOldBoltSnapshots() (numRemoved int, err error) { persistedEpochs, err := s.RootBoltSnapshotEpochs() if err != nil { return 0, err } - if len(persistedEpochs) <= numSnapshotsToKeep { + if len(persistedEpochs) <= s.numSnapshotsToKeep { // we need to keep everything return 0, nil } // make a map of epochs to protect from deletion - protectedEpochs := make(map[uint64]struct{}, numSnapshotsToKeep) - for _, epoch := range persistedEpochs[0:numSnapshotsToKeep] { + protectedEpochs := make(map[uint64]struct{}, s.numSnapshotsToKeep) + for _, epoch := range persistedEpochs[0:s.numSnapshotsToKeep] { protectedEpochs[epoch] = struct{}{} } diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index affda93a3..06a5e6b12 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -77,7 +77,7 @@ type Scorch struct { pauseCount uint64 - mergerKickCh chan *mergerCtrl + forceMergeRequestCh chan *mergerCtrl segPlugin segment.Plugin } @@ -103,7 +103,7 @@ func NewScorch(storeName string, nextSnapshotEpoch: 1, closeCh: make(chan struct{}), ineligibleForRemoval: map[string]bool{}, - mergerKickCh: make(chan *mergerCtrl, 1), + forceMergeRequestCh: make(chan *mergerCtrl, 1), segPlugin: defaultSegmentPlugin, } @@ -244,6 +244,7 @@ func (s *Scorch) openBolt() error { s.introducerNotifier = make(chan *epochWatcher, 1) s.persisterNotifier = make(chan *epochWatcher, 1) s.closeCh = make(chan struct{}) + s.forceMergeRequestCh = make(chan *mergerCtrl, 1) if !s.readOnly && s.path != "" { err := s.removeOldZapFiles() // Before persister or merger create any new files. 
diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 2e26f6b24..5a9ef40cd 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -15,6 +15,7 @@ package scorch import ( + "context" "fmt" "log" "math/rand" @@ -2226,17 +2227,16 @@ func TestIndexForceMerge(t *testing.T) { t.Errorf("expected 10 root file segments, got: %d", nfs) } + ctx := context.Background() for { if atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) == 1 { break } - err := si.ForceMerge(&MergeRequest{ - MergeOptions: &mergeplan.MergePlanOptions{ - MaxSegmentsPerTier: 1, - MaxSegmentSize: 10000, - SegmentsPerMergeTask: 10, - FloorSegmentSize: 10000}, - OverrideNumSnapshotsToKeep: true}) + err := si.ForceMerge(ctx, &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 10000}) if err != nil { t.Errorf("ForceMerge failed, err: %v", err) } @@ -2247,6 +2247,17 @@ func TestIndexForceMerge(t *testing.T) { t.Errorf("expected a single root file segments, got: %d", atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot)) } + + // verify with an invalid merge plan + err = si.ForceMerge(ctx, &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 1 << 33, + SegmentsPerMergeTask: 10, + FloorSegmentSize: 10000}) + if err != mergeplan.ErrMaxSegmentSizeTooLarge { + t.Errorf("ForceMerge expected to fail with ErrMaxSegmentSizeTooLarge") + } + err = idx.Close() if err != nil { t.Fatal(err) @@ -2333,28 +2344,27 @@ func TestCancelIndexForceMerge(t *testing.T) { t.Errorf("expected 20 root file segments, got: %d", nfsr) } - cancelCh := make(chan struct{}) + ctx := context.Background() + ctx, cancel := context.WithCancel(ctx) + // cancel the force merge operation once the root has some new merge // introductions. ie if the root has lesser file segments than earlier. 
go func() { for { nval := atomic.LoadUint64(&si.stats.TotFileSegmentsAtRoot) if nval < nfsr { - close(cancelCh) + cancel() return } time.Sleep(time.Millisecond * 5) } }() - err = si.ForceMerge(&MergeRequest{ - MergeOptions: &mergeplan.MergePlanOptions{ - MaxSegmentsPerTier: 1, - MaxSegmentSize: 10000, - SegmentsPerMergeTask: 5, - FloorSegmentSize: 10000}, - OverrideNumSnapshotsToKeep: true, - CancelCh: cancelCh}) + err = si.ForceMerge(ctx, &mergeplan.MergePlanOptions{ + MaxSegmentsPerTier: 1, + MaxSegmentSize: 10000, + SegmentsPerMergeTask: 5, + FloorSegmentSize: 10000}) if err != nil { t.Errorf("ForceMerge failed, err: %v", err) } diff --git a/index_alias_impl.go b/index_alias_impl.go index 94eb50f4e..bd5fabc86 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -44,16 +44,14 @@ func NewIndexAlias(indexes ...Index) *indexAliasImpl { } } -// Indexes just returns the indexes included in the -// index alias at the moment in an unsafe way. -// Caller must be aware that the results will be -// inconsistent if there are concurrent Add/Remove -// operations on the alias. -func (i *indexAliasImpl) Indexes() []Index { +// VisitIndexes invokes the visit callback on every +// indexes included in the index alias. +func (i *indexAliasImpl) VisitIndexes(visit func(Index)) { i.mutex.RLock() - rv := i.indexes + for _, idx := range i.indexes { + visit(idx) + } i.mutex.RUnlock() - return rv } func (i *indexAliasImpl) isAliasToSingleIndex() error { From e65c4bb7d3d34c785f3fb110e451576d29d8b7dc Mon Sep 17 00:00:00 2001 From: Tyler Kovacs Date: Wed, 20 May 2020 09:21:56 -0700 Subject: [PATCH 685/728] use inline comparison operators instead of strings.Compare In the comments from strings.Compare: // Compare is included only for symmetry with package bytes. // It is usually clearer and always faster to use the built-in // string comparison operators ==, <, >, and so on. 
And another comment within the implementation: // NOTE(rsc): This function does NOT call the runtime cmpstring function, // because we do not want to provide any performance justification for // using strings.Compare. Basically no one should use strings.Compare. --- search/sort.go | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/search/sort.go b/search/sort.go index 6e4ed80fa..3dc118518 100644 --- a/search/sort.go +++ b/search/sort.go @@ -233,7 +233,11 @@ func (so SortOrder) Compare(cachedScoring, cachedDesc []bool, i, j *DocumentMatc } else { iVal := i.Sort[x] jVal := j.Sort[x] - c = strings.Compare(iVal, jVal) + if iVal < jVal { + c = -1 + } else if iVal > jVal { + c = 1 + } } if c == 0 { From 5b905c99f4350d29be5a5f9c5ba5becc9aecdcf0 Mon Sep 17 00:00:00 2001 From: Tyler Kovacs Date: Wed, 20 May 2020 13:11:11 -0700 Subject: [PATCH 686/728] Add SortImpl to SearchRequest This allows SearchRequests to specify the sort implementation (defaults to sort.Sort) to use when sorting hits. 
--- index_alias_impl.go | 6 +++--- index_impl.go | 3 +-- search.go | 23 +++++++++++++++++++++++ 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index bd5fabc86..7b8bf09b5 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -16,7 +16,6 @@ package bleve import ( "context" - "sort" "sync" "time" @@ -521,10 +520,11 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se } } + sortImpl := req.GetSortImpl() // sort all hits with the requested order if len(req.Sort) > 0 { sorter := newSearchHitSorter(req.Sort, sr.Hits) - sort.Sort(sorter) + sortImpl(sorter) } // now skip over the correct From @@ -549,7 +549,7 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, sr.Hits) - sort.Sort(mhs) + sortImpl(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/index_impl.go b/index_impl.go index 6324d960e..69eb44fae 100644 --- a/index_impl.go +++ b/index_impl.go @@ -19,7 +19,6 @@ import ( "encoding/json" "fmt" "os" - "sort" "sync" "sync/atomic" "time" @@ -579,7 +578,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, hits) - sort.Sort(mhs) + req.GetSortImpl()(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/search.go b/search.go index b337edc9e..c2758848d 100644 --- a/search.go +++ b/search.go @@ -18,6 +18,7 @@ import ( "encoding/json" "fmt" "reflect" + "sort" "time" "github.com/blevesearch/bleve/analysis" @@ -264,6 +265,7 @@ func (h *HighlightRequest) AddField(field string) { // Score controls the kind of scoring performed // SearchAfter supports deep paging by providing a minimum sort key // SearchBefore supports deep paging by providing a maximum sort key +// 
SortImpl specifies the sort implementation to use for sorting results. // // A special field named "*" can be used to return all fields. type SearchRequest struct { @@ -279,6 +281,8 @@ type SearchRequest struct { Score string `json:"score,omitempty"` SearchAfter []string `json:"search_after"` SearchBefore []string `json:"search_before"` + + SortImpl func(sort.Interface) `json:"-"` } func (r *SearchRequest) Validate() error { @@ -606,3 +610,22 @@ func MemoryNeededForSearchResult(req *SearchRequest) uint64 { return uint64(estimate) } + +// SetSortFunc sets the sort implementation to use when sorting hits. +// +// SearchRequests can specify a custom sort implementation to meet +// their needs. For instance, by specifying a parallel sort +// that uses all available cores. +func (r *SearchRequest) SetSortImpl(s func(sort.Interface)) { + r.SortImpl = s +} + +// GetSortFunc returns the sort implementation to use when sorting hits. +// Defaults to sort.Sort. +func (r *SearchRequest) GetSortImpl() func(data sort.Interface) { + if r.SortImpl != nil { + return r.SortImpl + } + + return sort.Sort +} From f1983f7cd7b3b0f1591ff47f9bb4b48f78493f42 Mon Sep 17 00:00:00 2001 From: Tyler Kovacs Date: Wed, 20 May 2020 14:39:10 -0700 Subject: [PATCH 687/728] rename SortImpl to SortFunc and rename getter --- index_alias_impl.go | 6 +++--- index_impl.go | 2 +- search.go | 16 ++++++++-------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/index_alias_impl.go b/index_alias_impl.go index 7b8bf09b5..5aa57d8ac 100644 --- a/index_alias_impl.go +++ b/index_alias_impl.go @@ -520,11 +520,11 @@ func MultiSearch(ctx context.Context, req *SearchRequest, indexes ...Index) (*Se } } - sortImpl := req.GetSortImpl() + sortFunc := req.SortFunc() // sort all hits with the requested order if len(req.Sort) > 0 { sorter := newSearchHitSorter(req.Sort, sr.Hits) - sortImpl(sorter) + sortFunc(sorter) } // now skip over the correct From @@ -549,7 +549,7 @@ func MultiSearch(ctx context.Context, 
req *SearchRequest, indexes ...Index) (*Se req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, sr.Hits) - sortImpl(mhs) + sortFunc(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/index_impl.go b/index_impl.go index 69eb44fae..629cc9b2f 100644 --- a/index_impl.go +++ b/index_impl.go @@ -578,7 +578,7 @@ func (i *indexImpl) SearchInContext(ctx context.Context, req *SearchRequest) (sr req.Sort.Reverse() // resort using the original order mhs := newSearchHitSorter(req.Sort, hits) - req.GetSortImpl()(mhs) + req.SortFunc()(mhs) // reset request req.SearchBefore = req.SearchAfter req.SearchAfter = nil diff --git a/search.go b/search.go index c2758848d..f67450779 100644 --- a/search.go +++ b/search.go @@ -265,7 +265,7 @@ func (h *HighlightRequest) AddField(field string) { // Score controls the kind of scoring performed // SearchAfter supports deep paging by providing a minimum sort key // SearchBefore supports deep paging by providing a maximum sort key -// SortImpl specifies the sort implementation to use for sorting results. +// sortFunc specifies the sort implementation to use for sorting results. // // A special field named "*" can be used to return all fields. type SearchRequest struct { @@ -282,7 +282,7 @@ type SearchRequest struct { SearchAfter []string `json:"search_after"` SearchBefore []string `json:"search_before"` - SortImpl func(sort.Interface) `json:"-"` + sortFunc func(sort.Interface) } func (r *SearchRequest) Validate() error { @@ -616,15 +616,15 @@ func MemoryNeededForSearchResult(req *SearchRequest) uint64 { // SearchRequests can specify a custom sort implementation to meet // their needs. For instance, by specifying a parallel sort // that uses all available cores. 
-func (r *SearchRequest) SetSortImpl(s func(sort.Interface)) { - r.SortImpl = s +func (r *SearchRequest) SetSortFunc(s func(sort.Interface)) { + r.sortFunc = s } -// GetSortFunc returns the sort implementation to use when sorting hits. +// SortFunc returns the sort implementation to use when sorting hits. // Defaults to sort.Sort. -func (r *SearchRequest) GetSortImpl() func(data sort.Interface) { - if r.SortImpl != nil { - return r.SortImpl +func (r *SearchRequest) SortFunc() func(data sort.Interface) { + if r.sortFunc != nil { + return r.sortFunc } return sort.Sort From 9f5cdcf2c9fb65c7c7ca69087769c005fd2079c7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 22 May 2020 11:11:48 -0400 Subject: [PATCH 688/728] speed up versus test by using larger batch (#1387) --- test/versus_test.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/test/versus_test.go b/test/versus_test.go index 20a85d161..4257a9849 100644 --- a/test/versus_test.go +++ b/test/versus_test.go @@ -53,7 +53,7 @@ func TestScorchVersusUpsideDownBoltAll(t *testing.T) { NumDocs: 1000, MaxWordsPerDoc: 20, NumWords: 10, - BatchSize: 10, + BatchSize: 1000, NumAttemptsPerSearch: 100, }).run(scorch.Name, boltdb.Name, upsidedown.Name, boltdb.Name, nil, nil) } @@ -275,7 +275,6 @@ func testVersusSearches(vt *VersusTest, searchTemplates []string, idxA, idxB ble if err != nil { t.Fatalf("could not parse search template: %s, err: %v", searchTemplate, err) } - for j := 0; j < vt.NumAttemptsPerSearch; j++ { vt.CurAttempt = j From a62b0bca7cfc82f26d4eb0f4b86080b8abc3f8d3 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 22 May 2020 11:12:36 -0400 Subject: [PATCH 689/728] update char filter table tests to use subtests (#1388) --- .../char/asciifolding/asciifolding_test.go | 11 ++- analysis/char/regexp/regexp_test.go | 75 ++++++------------- 2 files changed, 30 insertions(+), 56 deletions(-) diff --git a/analysis/char/asciifolding/asciifolding_test.go 
b/analysis/char/asciifolding/asciifolding_test.go index 216583d1e..d79542a8a 100644 --- a/analysis/char/asciifolding/asciifolding_test.go +++ b/analysis/char/asciifolding/asciifolding_test.go @@ -15,6 +15,7 @@ package asciifolding import ( + "fmt" "reflect" "testing" ) @@ -52,9 +53,11 @@ func TestAsciiFoldingFilter(t *testing.T) { for _, test := range tests { filter := New() - output := filter.Filter(test.input) - if !reflect.DeepEqual(output, test.output) { - t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input)) - } + t.Run(fmt.Sprintf("on %s", test.input), func(t *testing.T) { + output := filter.Filter(test.input) + if !reflect.DeepEqual(output, test.output) { + t.Errorf("\nExpected:\n`%s`\ngot:\n`%s`\n", string(test.output), string(output)) + } + }) } } diff --git a/analysis/char/regexp/regexp_test.go b/analysis/char/regexp/regexp_test.go index a3430555e..ff1b04b3b 100644 --- a/analysis/char/regexp/regexp_test.go +++ b/analysis/char/regexp/regexp_test.go @@ -15,64 +15,32 @@ package regexp import ( + "fmt" "reflect" "regexp" "testing" ) func TestRegexpCharFilter(t *testing.T) { - htmlTagPattern := `\s]+))?)+\s*|\s*)/?>` - htmlRegex := regexp.MustCompile(htmlTagPattern) - tests := []struct { - input []byte - output []byte - }{ - { - input: []byte(`test`), - output: []byte(` test `), - }, - } - - for _, test := range tests { - filter := New(htmlRegex, []byte{' '}) - output := filter.Filter(test.input) - if !reflect.DeepEqual(output, test.output) { - t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input)) - } - } -} - -func TestZeroWidthNonJoinerCharFilter(t *testing.T) { - zeroWidthNonJoinerPattern := `\x{200C}` - zeroWidthNonJoinerRegex := regexp.MustCompile(zeroWidthNonJoinerPattern) - - tests := []struct { - input []byte - output []byte - }{ - { - input: []byte("water\u200Cunder\u200Cthe\u200Cbridge"), - output: []byte("water under the bridge"), - }, - 
} - - for _, test := range tests { - filter := New(zeroWidthNonJoinerRegex, []byte{' '}) - output := filter.Filter(test.input) - if !reflect.DeepEqual(output, test.output) { - t.Errorf("Expected:\n`%s`\ngot:\n`%s`\nfor:\n`%s`\n", string(test.output), string(output), string(test.input)) - } - } -} - -func TestRegexpCustomReplace(t *testing.T) { tests := []struct { regexStr string replace []byte input []byte output []byte }{ + { + regexStr: `\s]+))?)+\s*|\s*)/?>`, + replace: []byte{' '}, + input: []byte(`test`), + output: []byte(` test `), + }, + { + regexStr: `\x{200C}`, + replace: []byte{' '}, + input: []byte("water\u200Cunder\u200Cthe\u200Cbridge"), + output: []byte("water under the bridge"), + }, { regexStr: `([a-z])\s+(\d)`, replace: []byte(`$1-$2`), @@ -105,13 +73,16 @@ func TestRegexpCustomReplace(t *testing.T) { }, } - for i := range tests { - regex := regexp.MustCompile(tests[i].regexStr) - filter := New(regex, tests[i].replace) + for _, test := range tests { + t.Run(fmt.Sprintf("match %s replace %s", test.regexStr, string(test.replace)), func(t *testing.T) { + regex := regexp.MustCompile(test.regexStr) + filter := New(regex, test.replace) + + output := filter.Filter(test.input) + if !reflect.DeepEqual(test.output, output) { + t.Errorf("Expected: `%s`, Got: `%s`\n", string(test.output), string(output)) + } + }) - output := filter.Filter(tests[i].input) - if !reflect.DeepEqual(tests[i].output, output) { - t.Errorf("[%d] Expected: `%s`, Got: `%s`\n", i, string(tests[i].output), string(output)) - } } } From c0444356976e14fb97636de3038ff86fa9351992 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 22 May 2020 11:12:54 -0400 Subject: [PATCH 690/728] update datetime table test to use subtests (#1389) --- analysis/datetime/flexible/flexible_test.go | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/analysis/datetime/flexible/flexible_test.go b/analysis/datetime/flexible/flexible_test.go index e97423b78..644d91eb0 100644 --- 
a/analysis/datetime/flexible/flexible_test.go +++ b/analysis/datetime/flexible/flexible_test.go @@ -76,14 +76,14 @@ func TestFlexibleDateTimeParser(t *testing.T) { }) for _, test := range tests { - actualTime, actualErr := dateOptionalTimeParser.ParseDateTime(test.input) - if actualErr != test.expectedError { - t.Errorf("expected error %#v, got %#v", test.expectedError, actualErr) - continue - } - if !reflect.DeepEqual(actualTime, test.expectedTime) { - t.Errorf("expected time %#v, got %#v", test.expectedTime, actualTime) - t.Errorf("expected location %#v,\n got %#v", test.expectedTime.Location(), actualTime.Location()) - } + t.Run(test.input, func(t *testing.T) { + actualTime, actualErr := dateOptionalTimeParser.ParseDateTime(test.input) + if actualErr != test.expectedError { + t.Fatalf("expected error %#v, got %#v", test.expectedError, actualErr) + } + if !reflect.DeepEqual(actualTime, test.expectedTime) { + t.Errorf("expected time %v, got %v", test.expectedTime, actualTime) + } + }) } } From 5426dde14309ecb3545ac12f62760cab0f4de126 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 22 May 2020 13:34:20 -0400 Subject: [PATCH 691/728] add support for zap v13 (#1403) --- go.mod | 5 +++-- index/scorch/segment_plugin.go | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index d38cf8f92..e9e27ba10 100644 --- a/go.mod +++ b/go.mod @@ -8,8 +8,9 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.7 - github.com/blevesearch/zap/v12 v12.0.7 + github.com/blevesearch/zap/v11 v11.0.8 + github.com/blevesearch/zap/v12 v12.0.8 + github.com/blevesearch/zap/v13 v13.0.0 github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.1 diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 6dfa3b282..f5c39e063 100644 --- 
a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -21,6 +21,7 @@ import ( zapv11 "github.com/blevesearch/zap/v11" zapv12 "github.com/blevesearch/zap/v12" + zapv13 "github.com/blevesearch/zap/v13" ) var supportedSegmentPlugins map[string]map[uint32]segment.Plugin @@ -28,6 +29,7 @@ var defaultSegmentPlugin segment.Plugin func init() { ResetPlugins() + RegisterPlugin(zapv13.Plugin(), false) RegisterPlugin(zapv12.Plugin(), false) RegisterPlugin(zapv11.Plugin(), true) } From f830931c0ffc2c80a20e02fdad4847c7d669423e Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Sat, 23 May 2020 17:37:05 -0400 Subject: [PATCH 692/728] allow integration tests to force scorch segment type/ver (#1402) --- test/integration_test.go | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/test/integration_test.go b/test/integration_test.go index ed59323ff..cdd50f03a 100644 --- a/test/integration_test.go +++ b/test/integration_test.go @@ -39,14 +39,20 @@ var keepIndex = flag.Bool("keepIndex", false, "keep the index after testing") var indexType = flag.String("indexType", bleve.Config.DefaultIndexType, "index type to build") var kvType = flag.String("kvType", bleve.Config.DefaultKVStore, "kv store type to build") +var segType = flag.String("segType", "", "force scorch segment type") +var segVer = flag.Int("segVer", 0, "force scorch segment version") func TestIntegration(t *testing.T) { flag.Parse() - bleve.Config.DefaultIndexType = *indexType - bleve.Config.DefaultKVStore = *kvType t.Logf("using index type %s and kv type %s", *indexType, *kvType) + if *segType != "" { + t.Logf("forcing segment type: %s", *segType) + } + if *segVer != 0 { + t.Logf("forcing segment version: %d", *segVer) + } var err error var datasetRegexp *regexp.Regexp @@ -194,7 +200,15 @@ func runTestDir(t *testing.T, dir, datasetName string) { func loadDataSet(t *testing.T, datasetName string, mapping mapping.IndexMappingImpl, path string) (bleve.Index, func(), error) { 
idxPath := fmt.Sprintf("test-%s.bleve", datasetName) - index, err := bleve.New(idxPath, &mapping) + cfg := map[string]interface{}{} + if *segType != "" { + cfg["forceSegmentType"] = *segType + } + if *segVer != 0 { + cfg["forceSegmentVersion"] = *segVer + } + + index, err := bleve.NewUsing(idxPath, &mapping, *indexType, *kvType, cfg) if err != nil { return nil, nil, fmt.Errorf("error creating new index: %v", err) } From 04c39ad184bb7dacaaf0dfc12357991897d14743 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 26 May 2020 11:25:09 -0400 Subject: [PATCH 693/728] allow segment type/version override to work (#1407) logic was reversed, preventing correct configuration from actually switching the segment plugin used --- index/scorch/builder.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/index/scorch/builder.go b/index/scorch/builder.go index cd400bd1d..1f4b41d63 100644 --- a/index/scorch/builder.go +++ b/index/scorch/builder.go @@ -107,8 +107,9 @@ func (o *Builder) parseConfig(config map[string]interface{}) (err error) { segPlugin, err := chooseSegmentPlugin(forcedSegmentType, uint32(forcedSegmentVersion)) if err != nil { - o.segPlugin = segPlugin + return err } + o.segPlugin = segPlugin } return nil From 0c90a2fc4e8ccb06077958f766135b04610adcce Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 26 May 2020 11:36:20 -0400 Subject: [PATCH 694/728] improve scorch config parsing of forced zap version (#1401) previous version did not correctly handle integer values stored in a float64 due to JSON parsing --- index/scorch/scorch.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index c33f760ea..6b5fd3923 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -142,17 +142,18 @@ func NewScorch(storeName string, // configForceSegmentTypeVersion checks if the caller has requested a // specific segment type/version func 
configForceSegmentTypeVersion(config map[string]interface{}) (string, uint32, error) { - forcedSegmentVersion, ok := config["forceSegmentVersion"].(int) - if ok { - forcedSegmentType, ok2 := config["forceSegmentType"].(string) - if !ok2 { - return "", 0, fmt.Errorf( - "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) - } + forcedSegmentVersion, err := parseToInteger(config["forceSegmentVersion"]) + if err != nil { + return "", 0, nil + } - return forcedSegmentType, uint32(forcedSegmentVersion), nil + forcedSegmentType, ok := config["forceSegmentType"].(string) + if !ok { + return "", 0, fmt.Errorf( + "forceSegmentVersion set to %d, must also specify forceSegmentType", forcedSegmentVersion) } - return "", 0, nil + + return forcedSegmentType, uint32(forcedSegmentVersion), nil } func (s *Scorch) paused() uint64 { From 2b80a2aedf6bb1746054c6771d0625866a0dcabe Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 26 May 2020 12:57:16 -0400 Subject: [PATCH 695/728] add support for zap v14 (#1408) bump all the zap versions --- go.mod | 7 ++++--- index/scorch/segment_plugin.go | 2 ++ 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index e9e27ba10..2b23a510b 100644 --- a/go.mod +++ b/go.mod @@ -8,9 +8,10 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.8 - github.com/blevesearch/zap/v12 v12.0.8 - github.com/blevesearch/zap/v13 v13.0.0 + github.com/blevesearch/zap/v11 v11.0.9 + github.com/blevesearch/zap/v12 v12.0.9 + github.com/blevesearch/zap/v13 v13.0.1 + github.com/blevesearch/zap/v14 v14.0.0 github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.1 diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index f5c39e063..4729e6bd6 100644 --- a/index/scorch/segment_plugin.go +++ 
b/index/scorch/segment_plugin.go @@ -22,6 +22,7 @@ import ( zapv11 "github.com/blevesearch/zap/v11" zapv12 "github.com/blevesearch/zap/v12" zapv13 "github.com/blevesearch/zap/v13" + zapv14 "github.com/blevesearch/zap/v14" ) var supportedSegmentPlugins map[string]map[uint32]segment.Plugin @@ -29,6 +30,7 @@ var defaultSegmentPlugin segment.Plugin func init() { ResetPlugins() + RegisterPlugin(zapv14.Plugin(), false) RegisterPlugin(zapv13.Plugin(), false) RegisterPlugin(zapv12.Plugin(), false) RegisterPlugin(zapv11.Plugin(), true) From 25a604dd310c0d9fd57216282640237a52c5dd87 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 8 Jun 2020 09:15:46 -0700 Subject: [PATCH 696/728] Log field name where available on tooManyClauses error --- search/searcher/search_disjunction.go | 6 +++--- search/searcher/search_disjunction_heap.go | 2 +- search/searcher/search_disjunction_slice.go | 2 +- search/searcher/search_fuzzy.go | 4 ++-- search/searcher/search_multi_term.go | 4 ++-- search/searcher/search_numeric_range.go | 2 +- search/searcher/search_regexp.go | 2 +- search/searcher/search_term_prefix.go | 2 +- 8 files changed, 12 insertions(+), 12 deletions(-) diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 6a296b68f..4ad33b6ff 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -103,7 +103,7 @@ func tooManyClauses(count int) bool { return false } -func tooManyClausesErr(count int) error { - return fmt.Errorf("TooManyClauses[%d > maxClauseCount, which is set to %d]", - count, DisjunctionMaxClauseCount) +func tooManyClausesErr(field string, count int) error { + return fmt.Errorf("TooManyClauses over field: `%s` [%d > maxClauseCount,"+ + " which is set to %d]", field, count, DisjunctionMaxClauseCount) } diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index ec133f1f8..00b09fcb9 100644 --- a/search/searcher/search_disjunction_heap.go 
+++ b/search/searcher/search_disjunction_heap.go @@ -62,7 +62,7 @@ func newDisjunctionHeapSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionHeapSearcher, error) { if limit && tooManyClauses(len(searchers)) { - return nil, tooManyClausesErr(len(searchers)) + return nil, tooManyClausesErr("", len(searchers)) } // build our searcher diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go index e47f39ad0..464878bc6 100644 --- a/search/searcher/search_disjunction_slice.go +++ b/search/searcher/search_disjunction_slice.go @@ -50,7 +50,7 @@ func newDisjunctionSliceSearcher(indexReader index.IndexReader, limit bool) ( *DisjunctionSliceSearcher, error) { if limit && tooManyClauses(len(qsearchers)) { - return nil, tooManyClausesErr(len(qsearchers)) + return nil, tooManyClausesErr("", len(qsearchers)) } // build the downstream searchers searchers := make(OrderedSearcherList, len(qsearchers)) diff --git a/search/searcher/search_fuzzy.go b/search/searcher/search_fuzzy.go index 8176e59b5..aca8a7d9f 100644 --- a/search/searcher/search_fuzzy.go +++ b/search/searcher/search_fuzzy.go @@ -75,7 +75,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, for err == nil && tfd != nil { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr(len(rv)) + return nil, tooManyClausesErr(field, len(rv)) } tfd, err = fieldDict.Next() } @@ -107,7 +107,7 @@ func findFuzzyCandidateTerms(indexReader index.IndexReader, term string, if !exceeded && ld <= fuzziness { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return nil, tooManyClausesErr(len(rv)) + return nil, tooManyClausesErr(field, len(rv)) } } tfd, err = fieldDict.Next() diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index c48366ee2..1c60d4a7a 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -23,7 +23,7 @@ func 
NewMultiTermSearcher(indexReader index.IndexReader, terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } qsearchers := make([]search.Searcher, len(terms)) @@ -51,7 +51,7 @@ func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } qsearchers := make([]search.Searcher, len(terms)) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index 83107f020..675f569d9 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -97,7 +97,7 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, } if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } return NewMultiTermSearcherBytes(indexReader, terms, field, boost, options, diff --git a/search/searcher/search_regexp.go b/search/searcher/search_regexp.go index 4def832c4..11a44f159 100644 --- a/search/searcher/search_regexp.go +++ b/search/searcher/search_regexp.go @@ -110,7 +110,7 @@ func findRegexpCandidateTerms(indexReader index.IndexReader, if matchPos != nil && matchPos[0] == 0 && matchPos[1] == len(tfd.Term) { rv = append(rv, tfd.Term) if tooManyClauses(len(rv)) { - return rv, tooManyClausesErr(len(rv)) + return rv, tooManyClausesErr(field, len(rv)) } } tfd, err = fieldDict.Next() diff --git a/search/searcher/search_term_prefix.go b/search/searcher/search_term_prefix.go index b5af4631f..2a8f22cff 100644 --- a/search/searcher/search_term_prefix.go +++ b/search/searcher/search_term_prefix.go @@ -38,7 +38,7 @@ 
func NewTermPrefixSearcher(indexReader index.IndexReader, prefix string, for err == nil && tfd != nil { terms = append(terms, tfd.Term) if tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(len(terms)) + return nil, tooManyClausesErr(field, len(terms)) } tfd, err = fieldDict.Next() } From 4aea3d9e665c34c08d063bf956d91ada3b441ad7 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 18 Jun 2020 15:11:15 +0530 Subject: [PATCH 697/728] MB-40007 -Obsolete segment file leaks during merge introductions During merge segment introduction, if the segment contents becomes totally obsolete, then the introducer skips it's introduction to root. But it wasn't handling the clean up ceremonies for the segment like Closing and cleaning the entries from the ineligibleToRemove file list. --- index/scorch/introducer.go | 8 ++++++-- index/scorch/merge.go | 38 +++++++++++++++++++++++++++++--------- index/scorch/stats.go | 7 ++++--- 3 files changed, 39 insertions(+), 14 deletions(-) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index 64ca969bd..bcb04f87a 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -383,6 +383,7 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } } } + var skipped bool // In case where all the docs in the newly merged segment getting // deleted by the time we reach here, can skip the introduction. 
if nextMerge.new != nil && @@ -405,6 +406,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { docsToPersistCount += nextMerge.new.Count() - newSegmentDeleted.GetCardinality() memSegments++ } + } else { + skipped = true + atomic.AddUint64(&s.stats.TotFileMergeIntroductionsObsoleted, 1) } atomic.StoreUint64(&s.stats.TotItemsToPersist, docsToPersistCount) @@ -429,8 +433,8 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } // notify requester that we incorporated this - nextMerge.notify <- newSnapshot - close(nextMerge.notify) + nextMerge.notifyCh <- ¬ify{iss: newSnapshot, skipped: skipped} + close(nextMerge.notifyCh) } func isMemorySegment(s *SegmentSnapshot) bool { diff --git a/index/scorch/merge.go b/index/scorch/merge.go index 3eccee52e..e88023a3e 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -310,8 +310,9 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, var oldNewDocNums map[uint64][]uint64 var seg segment.Segment + var filename string if len(segmentsToMerge) > 0 { - filename := zapFileName(newSegmentID) + filename = zapFileName(newSegmentID) s.markIneligibleForRemoval(filename) path := s.path + string(os.PathSeparator) + filename @@ -356,7 +357,7 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, old: oldMap, oldNewDocNums: oldNewDocNums, new: seg, - notify: make(chan *IndexSnapshot), + notifyCh: make(chan *notify), } // give it to the introducer @@ -371,15 +372,20 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, introStartTime := time.Now() // it is safe to blockingly wait for the merge introduction // here as the introducer is bound to handle the notify channel. 
- newSnapshot := <-sm.notify + notify := <-sm.notifyCh introTime := uint64(time.Since(introStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapIntroductionTime, introTime) if atomic.LoadUint64(&s.stats.MaxFileMergeZapIntroductionTime) < introTime { atomic.StoreUint64(&s.stats.MaxFileMergeZapIntroductionTime, introTime) } atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) - if newSnapshot != nil { - _ = newSnapshot.DecRef() + if notify != nil && notify.iss != nil { + _ = notify.iss.DecRef() + if notify.skipped { + // decrement the ref counts on skipping introduction. + s.unmarkIneligibleForRemoval(filename) + _ = seg.DecRef() + } } atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) @@ -395,12 +401,17 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return nil } +type notify struct { + iss *IndexSnapshot + skipped bool +} + type segmentMerge struct { id uint64 old map[uint64]*SegmentSnapshot oldNewDocNums map[uint64][]uint64 new segment.Segment - notify chan *IndexSnapshot + notifyCh chan *notify } // perform a merging of the given SegmentBase instances into a new, @@ -450,7 +461,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), new: seg, - notify: make(chan *IndexSnapshot), + notifyCh: make(chan *notify), } for i, idx := range sbsIndexes { @@ -467,11 +478,20 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, } // blockingly wait for the introduction to complete - newSnapshot := <-sm.notify - if newSnapshot != nil { + var newSnapshot *IndexSnapshot + notify := <-sm.notifyCh + if notify != nil && notify.iss != nil { + newSnapshot = notify.iss atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) + if notify.skipped { + // decrement the ref counts on skipping introduction. 
+ _ = newSnapshot.DecRef() + _ = seg.DecRef() + newSnapshot = nil + } } + return newSnapshot, newSegmentID, nil } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 2900ac0df..9d7fbc0e2 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -108,9 +108,10 @@ type Stats struct { TotFileMergeZapIntroductionTime uint64 MaxFileMergeZapIntroductionTime uint64 - TotFileMergeIntroductions uint64 - TotFileMergeIntroductionsDone uint64 - TotFileMergeIntroductionsSkipped uint64 + TotFileMergeIntroductions uint64 + TotFileMergeIntroductionsDone uint64 + TotFileMergeIntroductionsSkipped uint64 + TotFileMergeIntroductionsObsoleted uint64 CurFilesIneligibleForRemoval uint64 TotSnapshotsRemovedFromMetaStore uint64 From dcafaff05395de11df95fb74f7f6a75c645a1b49 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 18 Jun 2020 10:49:33 -0400 Subject: [PATCH 698/728] fix stats handling of term field reader advancing backwards (#1416) The current implementation creates a new term field reader, which replaces the original one, this causes the stats tracking start/finished to get out of sync. This fixes the issue by recording the original term field reader as finished when replacing with the new one. 
fixes #1415 --- index/scorch/scorch_test.go | 85 ++++++++++++++++++++++++++++++ index/scorch/snapshot_index_tfr.go | 2 + 2 files changed, 87 insertions(+) diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index 5a9ef40cd..e22fafa27 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -2380,3 +2380,88 @@ func TestCancelIndexForceMerge(t *testing.T) { t.Fatal(err) } } + +func TestIndexSeekBackwardsStats(t *testing.T) { + cfg := CreateConfig("TestIndexOpenReopen") + err := InitTest(cfg) + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(cfg) + if err != nil { + t.Log(err) + } + }() + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, cfg, analysisQueue) + if err != nil { + t.Fatal(err) + } + err = idx.Open() + if err != nil { + t.Errorf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // insert a doc + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("cat"))) + err = idx.Update(doc) + if err != nil { + t.Fatalf("error updating index: %v", err) + } + + // insert another doc + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("cat"))) + err = idx.Update(doc) + if err != nil { + t.Fatalf("error updating index: %v", err) + } + + reader, err := idx.Reader() + if err != nil { + t.Fatalf("error getting index reader: %v", err) + } + defer reader.Close() + + tfr, err := reader.TermFieldReader([]byte("cat"), "name", false, false, false) + if err != nil { + t.Fatalf("error getting term field readyer for name/cat: %v", err) + } + + tfdFirst, err := tfr.Next(nil) + if err != nil { + t.Fatalf("error getting first tfd: %v", err) + } + + _, err = tfr.Next(nil) + if err != nil { + t.Fatalf("error getting second tfd: %v", err) + } + + // seek backwards to the first + _, err = tfr.Advance(tfdFirst.ID, nil) + if err != nil { + 
t.Fatalf("error adancing backwards: %v", err) + } + + err = tfr.Close() + if err != nil { + t.Fatalf("error closing term field reader: %v", err) + } + + + if idx.(*Scorch).stats.TotTermSearchersStarted != idx.(*Scorch).stats.TotTermSearchersFinished { + t.Errorf("expected term searchers started %d to equal term searchers finished %d", + idx.(*Scorch).stats.TotTermSearchersStarted, + idx.(*Scorch).stats.TotTermSearchersFinished) + } +} diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 5d56f1944..61537fc4f 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -133,6 +133,8 @@ func (i *IndexSnapshotTermFieldReader) Advance(ID index.IndexInternalID, preAllo if err != nil { return nil, err } + // close the current term field reader before replacing it with a new one + _ = i.Close() *i = *(i2.(*IndexSnapshotTermFieldReader)) } num, err := docInternalToNumber(ID) From 1314e3722897b3939bf97cbad4140a4c6dcaf0eb Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 19 Jun 2020 14:45:49 +0530 Subject: [PATCH 699/728] Switching to atomic ops instead of locks for events --- index/scorch/persister.go | 4 ++-- index/scorch/scorch.go | 29 ++++++----------------------- index/scorch/stats.go | 3 +++ 3 files changed, 11 insertions(+), 25 deletions(-) diff --git a/index/scorch/persister.go b/index/scorch/persister.go index ea3667bbf..498378a4f 100644 --- a/index/scorch/persister.go +++ b/index/scorch/persister.go @@ -256,7 +256,7 @@ func (s *Scorch) pausePersisterForMergerCatchUp(lastPersistedEpoch uint64, // for sufficient in-memory segments to pile up for the next // memory merge cum persist loop. 
if numFilesOnDisk < uint64(po.PersisterNapUnderNumFiles) && - po.PersisterNapTimeMSec > 0 && s.paused() == 0 { + po.PersisterNapTimeMSec > 0 && s.NumEventsBlocking() == 0 { select { case <-s.closeCh: case <-time.After(time.Millisecond * time.Duration(po.PersisterNapTimeMSec)): @@ -333,7 +333,7 @@ func (s *Scorch) persistSnapshot(snapshot *IndexSnapshot, // Perform in-memory segment merging only when the memory pressure is // below the configured threshold, else the persister performs the // direct persistence of segments. - if s.paused() < po.MemoryPressurePauseThreshold { + if s.NumEventsBlocking() < po.MemoryPressurePauseThreshold { persisted, err := s.persistSnapshotMaybeMerge(snapshot) if err != nil { return err diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index 6b5fd3923..ba98a460d 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -73,10 +73,6 @@ type Scorch struct { onEvent func(event Event) onAsyncError func(err error) - pauseLock sync.RWMutex - - pauseCount uint64 - forceMergeRequestCh chan *mergerCtrl segPlugin segment.Plugin @@ -156,30 +152,17 @@ func configForceSegmentTypeVersion(config map[string]interface{}) (string, uint3 return forcedSegmentType, uint32(forcedSegmentVersion), nil } -func (s *Scorch) paused() uint64 { - s.pauseLock.Lock() - pc := s.pauseCount - s.pauseLock.Unlock() - return pc -} - -func (s *Scorch) incrPause() { - s.pauseLock.Lock() - s.pauseCount++ - s.pauseLock.Unlock() -} - -func (s *Scorch) decrPause() { - s.pauseLock.Lock() - s.pauseCount-- - s.pauseLock.Unlock() +func (s *Scorch) NumEventsBlocking() uint64 { + eventsCompleted := atomic.LoadUint64(&s.stats.TotEventTriggerCompleted) + eventsStarted := atomic.LoadUint64(&s.stats.TotEventTriggerStarted) + return eventsStarted - eventsCompleted } func (s *Scorch) fireEvent(kind EventKind, dur time.Duration) { if s.onEvent != nil { - s.incrPause() + atomic.AddUint64(&s.stats.TotEventTriggerStarted, 1) s.onEvent(Event{Kind: kind, Scorch: s, Duration: 
dur}) - s.decrPause() + atomic.AddUint64(&s.stats.TotEventTriggerCompleted, 1) } } diff --git a/index/scorch/stats.go b/index/scorch/stats.go index 2900ac0df..5795870f2 100644 --- a/index/scorch/stats.go +++ b/index/scorch/stats.go @@ -47,6 +47,9 @@ type Stats struct { TotTermSearchersStarted uint64 TotTermSearchersFinished uint64 + TotEventTriggerStarted uint64 + TotEventTriggerCompleted uint64 + TotIntroduceLoop uint64 TotIntroduceSegmentBeg uint64 TotIntroduceSegmentEnd uint64 From 57aabdae99de69996720121be8cbd6b340a3e42a Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Fri, 19 Jun 2020 13:52:58 +0530 Subject: [PATCH 700/728] adding UT and minor refactoring --- index/scorch/event.go | 8 ++ index/scorch/introducer.go | 4 +- index/scorch/merge.go | 40 +++++----- index/scorch/merge_test.go | 154 +++++++++++++++++++++++++++++++++++++ 4 files changed, 187 insertions(+), 19 deletions(-) create mode 100644 index/scorch/merge_test.go diff --git a/index/scorch/event.go b/index/scorch/event.go index dd79d6d06..8f3fc1914 100644 --- a/index/scorch/event.go +++ b/index/scorch/event.go @@ -54,3 +54,11 @@ var EventKindBatchIntroductionStart = EventKind(5) // EventKindBatchIntroduction is fired when Batch() completes. var EventKindBatchIntroduction = EventKind(6) + +// EventKindMergeTaskIntroductionStart is fired when the merger is about to +// start the introduction of merged segment from a single merge task. +var EventKindMergeTaskIntroductionStart = EventKind(7) + +// EventKindMergeTaskIntroduction is fired when the merger has completed +// the introduction of merged segment from a single merge task. 
+var EventKindMergeTaskIntroduction = EventKind(8) diff --git a/index/scorch/introducer.go b/index/scorch/introducer.go index bcb04f87a..7770c41c5 100644 --- a/index/scorch/introducer.go +++ b/index/scorch/introducer.go @@ -433,7 +433,9 @@ func (s *Scorch) introduceMerge(nextMerge *segmentMerge) { } // notify requester that we incorporated this - nextMerge.notifyCh <- ¬ify{iss: newSnapshot, skipped: skipped} + nextMerge.notifyCh <- &mergeTaskIntroStatus{ + indexSnapshot: newSnapshot, + skipped: skipped} close(nextMerge.notifyCh) } diff --git a/index/scorch/merge.go b/index/scorch/merge.go index e88023a3e..56c0953f4 100644 --- a/index/scorch/merge.go +++ b/index/scorch/merge.go @@ -357,9 +357,11 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, old: oldMap, oldNewDocNums: oldNewDocNums, new: seg, - notifyCh: make(chan *notify), + notifyCh: make(chan *mergeTaskIntroStatus), } + s.fireEvent(EventKindMergeTaskIntroductionStart, 0) + // give it to the introducer select { case <-s.closeCh: @@ -372,23 +374,25 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, introStartTime := time.Now() // it is safe to blockingly wait for the merge introduction // here as the introducer is bound to handle the notify channel. - notify := <-sm.notifyCh + introStatus := <-sm.notifyCh introTime := uint64(time.Since(introStartTime)) atomic.AddUint64(&s.stats.TotFileMergeZapIntroductionTime, introTime) if atomic.LoadUint64(&s.stats.MaxFileMergeZapIntroductionTime) < introTime { atomic.StoreUint64(&s.stats.MaxFileMergeZapIntroductionTime, introTime) } atomic.AddUint64(&s.stats.TotFileMergeIntroductionsDone, 1) - if notify != nil && notify.iss != nil { - _ = notify.iss.DecRef() - if notify.skipped { - // decrement the ref counts on skipping introduction. + if introStatus != nil && introStatus.indexSnapshot != nil { + _ = introStatus.indexSnapshot.DecRef() + if introStatus.skipped { + // close the segment on skipping introduction. 
s.unmarkIneligibleForRemoval(filename) - _ = seg.DecRef() + _ = seg.Close() } } atomic.AddUint64(&s.stats.TotFileMergePlanTasksDone, 1) + + s.fireEvent(EventKindMergeTaskIntroduction, 0) } // once all the newly merged segment introductions are done, @@ -401,9 +405,9 @@ func (s *Scorch) planMergeAtSnapshot(ctx context.Context, return nil } -type notify struct { - iss *IndexSnapshot - skipped bool +type mergeTaskIntroStatus struct { + indexSnapshot *IndexSnapshot + skipped bool } type segmentMerge struct { @@ -411,7 +415,7 @@ type segmentMerge struct { old map[uint64]*SegmentSnapshot oldNewDocNums map[uint64][]uint64 new segment.Segment - notifyCh chan *notify + notifyCh chan *mergeTaskIntroStatus } // perform a merging of the given SegmentBase instances into a new, @@ -461,7 +465,7 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, old: make(map[uint64]*SegmentSnapshot), oldNewDocNums: make(map[uint64][]uint64), new: seg, - notifyCh: make(chan *notify), + notifyCh: make(chan *mergeTaskIntroStatus), } for i, idx := range sbsIndexes { @@ -479,15 +483,15 @@ func (s *Scorch) mergeSegmentBases(snapshot *IndexSnapshot, // blockingly wait for the introduction to complete var newSnapshot *IndexSnapshot - notify := <-sm.notifyCh - if notify != nil && notify.iss != nil { - newSnapshot = notify.iss + introStatus := <-sm.notifyCh + if introStatus != nil && introStatus.indexSnapshot != nil { + newSnapshot = introStatus.indexSnapshot atomic.AddUint64(&s.stats.TotMemMergeSegments, uint64(len(sbs))) atomic.AddUint64(&s.stats.TotMemMergeDone, 1) - if notify.skipped { - // decrement the ref counts on skipping introduction. + if introStatus.skipped { + // close the segment on skipping introduction. 
_ = newSnapshot.DecRef() - _ = seg.DecRef() + _ = seg.Close() newSnapshot = nil } } diff --git a/index/scorch/merge_test.go b/index/scorch/merge_test.go new file mode 100644 index 000000000..41f6892e7 --- /dev/null +++ b/index/scorch/merge_test.go @@ -0,0 +1,154 @@ +// Copyright (c) 2020 Couchbase, Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package scorch + +import ( + "sync" + "sync/atomic" + "testing" + + "github.com/blevesearch/bleve/document" + "github.com/blevesearch/bleve/index" +) + +func TestObsoleteSegmentMergeIntroduction(t *testing.T) { + testConfig := CreateConfig("TestObsoleteSegmentMergeIntroduction") + err := InitTest(testConfig) + if err != nil { + t.Fatal(err) + } + defer func() { + err := DestroyTest(testConfig) + if err != nil { + t.Fatal(err) + } + }() + + var introComplete, mergeIntroStart, mergeIntroComplete sync.WaitGroup + introComplete.Add(1) + mergeIntroStart.Add(1) + mergeIntroComplete.Add(1) + var segIntroCompleted int + RegistryEventCallbacks["test"] = func(e Event) { + if e.Kind == EventKindBatchIntroduction { + segIntroCompleted++ + if segIntroCompleted == 3 { + // all 3 segments introduced + introComplete.Done() + } + } else if e.Kind == EventKindMergeTaskIntroductionStart { + // signal the start of merge task introduction so that + // we can introduce a new batch which obsoletes the + // merged segment's contents. 
+ mergeIntroStart.Done() + // hold the merge task introduction until the merged segment contents + // are obsoleted with the next batch/segment introduction. + introComplete.Wait() + } else if e.Kind == EventKindMergeTaskIntroduction { + // signal the completion of the merge task introduction. + mergeIntroComplete.Done() + + } + } + + ourConfig := make(map[string]interface{}, len(testConfig)) + for k, v := range testConfig { + ourConfig[k] = v + } + ourConfig["eventCallbackName"] = "test" + + analysisQueue := index.NewAnalysisQueue(1) + idx, err := NewScorch(Name, ourConfig, analysisQueue) + if err != nil { + t.Fatal(err) + } + + err = idx.Open() + if err != nil { + t.Fatalf("error opening index: %v", err) + } + defer func() { + err := idx.Close() + if err != nil { + t.Fatal(err) + } + }() + + // first introduce two documents over two batches. + batch := index.NewBatch() + doc := document.NewDocument("1") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3"))) + batch.Update(doc) + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + batch.Reset() + doc = document.NewDocument("2") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test2updated"))) + batch.Update(doc) + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + // wait until the merger trying to introduce the new merged segment. + mergeIntroStart.Wait() + + // execute another batch which obsoletes the contents of the new merged + // segment awaiting introduction. + batch.Reset() + batch.Delete("1") + batch.Delete("2") + doc = document.NewDocument("3") + doc.AddField(document.NewTextField("name", []uint64{}, []byte("test3updated"))) + batch.Update(doc) + err = idx.Batch(batch) + if err != nil { + t.Error(err) + } + + // wait until the merge task introduction complete. 
+ mergeIntroComplete.Wait() + + idxr, err := idx.Reader() + if err != nil { + t.Error(err) + } + + numSegments := len(idxr.(*IndexSnapshot).segment) + if numSegments != 1 { + t.Errorf("expected one segment at the root, got: %d", numSegments) + } + + skipIntroCount := atomic.LoadUint64(&idxr.(*IndexSnapshot).parent.stats.TotFileMergeIntroductionsObsoleted) + if skipIntroCount != 1 { + t.Errorf("expected one obsolete merge segment skipping the introduction, got: %d", skipIntroCount) + } + + docCount, err := idxr.DocCount() + if err != nil { + t.Fatal(err) + } + if docCount != 1 { + t.Errorf("Expected document count to be %d got %d", 1, docCount) + } + + err = idxr.Close() + if err != nil { + t.Fatal(err) + } +} From 9da414702d919c6f24afc87ce8cbf8cf91abad8b Mon Sep 17 00:00:00 2001 From: ITmeze Date: Fri, 19 Jun 2020 21:16:42 +0200 Subject: [PATCH 701/728] Fix for import paths in function documentation Documentation for AddCustomAnalyzer is using non existing import paths --- mapping/index.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mapping/index.go b/mapping/index.go index 602764cbb..21ca5cce3 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -101,26 +101,26 @@ func (im *IndexMappingImpl) AddCustomTokenFilter(name string, config map[string] // returned analyzer is registered in the IndexMapping. // // bleve comes with predefined analyzers, like -// github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer. They are +// github.com/blevesearch/bleve/analysis/analyzer/custom. They are // available only if their package is imported by client code. 
To achieve this, // use their metadata to fill configuration entries: // // import ( -// "github.com/blevesearch/bleve/analysis/analyzers/custom_analyzer" -// "github.com/blevesearch/bleve/analysis/char_filters/html_char_filter" -// "github.com/blevesearch/bleve/analysis/token_filters/lower_case_filter" -// "github.com/blevesearch/bleve/analysis/tokenizers/unicode" +// "github.com/blevesearch/bleve/analysis/analyzer/custom" +// "github.com/blevesearch/bleve/analysis/char/html" +// "github.com/blevesearch/bleve/analysis/token/lowercase" +// "github.com/blevesearch/bleve/analysis/tokenizer/unicode" // ) // // m := bleve.NewIndexMapping() // err := m.AddCustomAnalyzer("html", map[string]interface{}{ -// "type": custom_analyzer.Name, +// "type": custom.Name, // "char_filters": []string{ -// html_char_filter.Name, +// html.Name, // }, // "tokenizer": unicode.Name, // "token_filters": []string{ -// lower_case_filter.Name, +// lowercase.Name, // ... // }, // }) From 1f15e1d641e8ec16df952423b14bd899a3b53b06 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Thu, 13 Aug 2020 11:23:28 -0400 Subject: [PATCH 702/728] remove TooManyClauses limitation when optimizable (#1426) Observation: When a search does not perform scoring and does not need location information, disjunction queries are optimized by directly OR'ing the underlying bitset. This avoids all the usual multi-iterator disjunction logic. However, in it's traditional form, we still have a TooManyClauses limit, and this makes sense as all the underlying iterators are still in memory at one time. Observation: The MultiTerm search is unique in that we have a flat list of terms that are used to build the disjunction. This is significant because it means we can ensure that all the underlying searchers are optimizable. By combining these two observations we can introduce a new mode of operation for the MultiTerm search. 
When it does not perform scoring and does not need location information, we can do a new optimization where we create smaller batches of disjunctions which are immediately optimizable into a single term searcher. By repeating this process across all terms, we end up with the correct searcher, and we never had more than the batch size iterators built in memory at one time. UnadornedPostingsIteratorBitmap was refactored to also implement OptimizablePostingsIterator, this allows us to keep the in-progress final iterator in each batch, simplifying the logic. A new optimization mode "disjunction:unadorned-force" was introduced. It behaves exacdtly the same as "disjunction:unadorned" only it always performs the optimization without regard for the cardinality of the underlying iterators. --- index/scorch/optimize.go | 21 ++- index/scorch/segment/unadorned.go | 13 ++ search/searcher/search_disjunction.go | 8 +- search/searcher/search_multi_term.go | 180 ++++++++++++++++++---- search/searcher/search_term_range_test.go | 83 ++++++++++ 5 files changed, 269 insertions(+), 36 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index b9cb9228a..f23fdfb8a 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -16,10 +16,10 @@ package scorch import ( "fmt" - "github.com/RoaringBitmap/roaring" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/index/scorch/segment" + "sync/atomic" ) var OptimizeConjunction = true @@ -37,7 +37,11 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, } if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { - return s.optimizeDisjunctionUnadorned(octx) + return s.optimizeDisjunctionUnadorned(octx, OptimizeDisjunctionUnadornedMinChildCardinality) + } + + if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned-force" { + return s.optimizeDisjunctionUnadorned(octx, 0) } return octx, nil @@ -265,6 +269,7 @@ OUTER: oTFR.iterators[i] = 
segment.NewUnadornedPostingsIteratorFromBitmap(bm) } + atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1)) return oTFR, nil } @@ -275,9 +280,12 @@ OUTER: // term-vectors are not required, and instead only the internal-id's // are needed. func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( - octx index.OptimizableContext) (index.OptimizableContext, error) { + octx index.OptimizableContext, minChildCardinality uint64) (index.OptimizableContext, error) { if octx == nil { - octx = &OptimizeTFRDisjunctionUnadorned{snapshot: s.snapshot} + octx = &OptimizeTFRDisjunctionUnadorned{ + snapshot: s.snapshot, + minChildCardinality: minChildCardinality, + } } o, ok := octx.(*OptimizeTFRDisjunctionUnadorned) @@ -298,6 +306,8 @@ type OptimizeTFRDisjunctionUnadorned struct { snapshot *IndexSnapshot tfrs []*IndexSnapshotTermFieldReader + + minChildCardinality uint64 } var OptimizeTFRDisjunctionUnadornedTerm = []byte("") @@ -332,7 +342,7 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro // Heuristic to skip the optimization if all the constituent // bitmaps are too small, where the processing & resource // overhead to create the OR'ed bitmap outweighs the benefit. 
- if cMax < OptimizeDisjunctionUnadornedMinChildCardinality { + if cMax < o.minChildCardinality { return nil, nil } } @@ -392,5 +402,6 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro oTFR.iterators[i] = segment.NewUnadornedPostingsIteratorFromBitmap(bm) } + atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1)) return oTFR, nil } diff --git a/index/scorch/segment/unadorned.go b/index/scorch/segment/unadorned.go index 9a4d6c76c..56e752348 100644 --- a/index/scorch/segment/unadorned.go +++ b/index/scorch/segment/unadorned.go @@ -72,6 +72,19 @@ func (i *UnadornedPostingsIteratorBitmap) Size() int { return reflectStaticSizeUnadornedPostingsIteratorBitmap } +func (i *UnadornedPostingsIteratorBitmap) ActualBitmap() *roaring.Bitmap { + return i.actualBM +} + +func (i *UnadornedPostingsIteratorBitmap) DocNum1Hit() (uint64, bool) { + return 0, false +} + +func (i *UnadornedPostingsIteratorBitmap) ReplaceActual(actual *roaring.Bitmap) { + i.actualBM = actual + i.actual = actual.Iterator() +} + func NewUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) PostingsIterator { return &UnadornedPostingsIteratorBitmap{ actualBM: bm, diff --git a/search/searcher/search_disjunction.go b/search/searcher/search_disjunction.go index 4ad33b6ff..f47da27c4 100644 --- a/search/searcher/search_disjunction.go +++ b/search/searcher/search_disjunction.go @@ -16,7 +16,6 @@ package searcher import ( "fmt" - "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -37,6 +36,11 @@ func NewDisjunctionSearcher(indexReader index.IndexReader, return newDisjunctionSearcher(indexReader, qsearchers, min, options, true) } +func optionsDisjunctionOptimizable(options search.SearcherOptions) bool { + rv := options.Score == "none" && !options.IncludeTermVectors + return rv +} + func newDisjunctionSearcher(indexReader index.IndexReader, qsearchers []search.Searcher, min float64, options search.SearcherOptions, limit bool) 
(search.Searcher, error) { @@ -44,7 +48,7 @@ func newDisjunctionSearcher(indexReader index.IndexReader, // do not need extra information like freq-norm's or term vectors // and the requested min is simple if len(qsearchers) > 1 && min <= 1 && - options.Score == "none" && !options.IncludeTermVectors { + optionsDisjunctionOptimizable(options) { rv, err := optimizeCompositeSearcher("disjunction:unadorned", indexReader, qsearchers, options) if err != nil || rv != nil { diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index 1c60d4a7a..85aacc176 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -15,6 +15,7 @@ package searcher import ( + "fmt" "github.com/blevesearch/bleve/index" "github.com/blevesearch/bleve/search" ) @@ -22,10 +23,113 @@ import ( func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, field string, boost float64, options search.SearcherOptions, limit bool) ( search.Searcher, error) { - if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(field, len(terms)) + + if tooManyClauses(len(terms)) { + if optionsDisjunctionOptimizable(options) { + return optimizeMultiTermSearcher(indexReader, terms, field, boost, options) + } + if limit { + return nil, tooManyClausesErr(field, len(terms)) + } + } + + qsearchers, err := makeBatchSearchers(indexReader, terms, field, boost, options) + if err != nil { + return nil, err } + // build disjunction searcher of these ranges + return newMultiTermSearcherInternal(indexReader, qsearchers, field, boost, + options, limit) +} + +func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, + field string, boost float64, options search.SearcherOptions, limit bool) ( + search.Searcher, error) { + + if tooManyClauses(len(terms)) { + if optionsDisjunctionOptimizable(options) { + return optimizeMultiTermSearcherBytes(indexReader, terms, field, boost, options) + } + + if limit { + return nil, 
tooManyClausesErr(field, len(terms)) + } + } + + qsearchers, err := makeBatchSearchersBytes(indexReader, terms, field, boost, options) + if err != nil { + return nil, err + } + + // build disjunction searcher of these ranges + return newMultiTermSearcherInternal(indexReader, qsearchers, field, boost, + options, limit) +} + +func newMultiTermSearcherInternal(indexReader index.IndexReader, + searchers []search.Searcher, field string, boost float64, + options search.SearcherOptions, limit bool) ( + search.Searcher, error) { + + // build disjunction searcher of these ranges + searcher, err := newDisjunctionSearcher(indexReader, searchers, 0, options, + limit) + if err != nil { + for _, s := range searchers { + _ = s.Close() + } + return nil, err + } + + return searcher, nil +} + +func optimizeMultiTermSearcher(indexReader index.IndexReader, terms []string, + field string, boost float64, options search.SearcherOptions) ( + search.Searcher, error) { + var finalSearcher search.Searcher + for len(terms) > 0 { + var batchTerms []string + if len(terms) > DisjunctionMaxClauseCount { + batchTerms = terms[:DisjunctionMaxClauseCount] + terms = terms[DisjunctionMaxClauseCount:] + } else { + batchTerms = terms + terms = nil + } + batch, err := makeBatchSearchers(indexReader, batchTerms, field, boost, options) + if err != nil { + return nil, err + } + if finalSearcher != nil { + batch = append(batch, finalSearcher) + } + cleanup := func() { + for _, searcher := range batch { + if searcher != nil { + _ = searcher.Close() + } + } + } + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + indexReader, batch, options) + // all searchers in batch should be closed, regardless of error or optimization failure + // either we're returning, or continuing and only finalSearcher is needed for next loop + cleanup() + if err != nil { + return nil, err + } + if finalSearcher == nil { + return nil, fmt.Errorf("unable to optimize") + } + } + return finalSearcher, nil +} + 
+func makeBatchSearchers(indexReader index.IndexReader, terms []string, field string, + boost float64, options search.SearcherOptions) ([]search.Searcher, error) { + qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { for _, searcher := range qsearchers { @@ -42,17 +146,54 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, return nil, err } } - // build disjunction searcher of these ranges - return newMultiTermSearcherBytes(indexReader, qsearchers, field, boost, - options, limit) + return qsearchers, nil } -func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, - field string, boost float64, options search.SearcherOptions, limit bool) ( +func optimizeMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, + field string, boost float64, options search.SearcherOptions) ( search.Searcher, error) { - if limit && tooManyClauses(len(terms)) { - return nil, tooManyClausesErr(field, len(terms)) + + var finalSearcher search.Searcher + for len(terms) > 0 { + var batchTerms [][]byte + if len(terms) > DisjunctionMaxClauseCount { + batchTerms = terms[:DisjunctionMaxClauseCount] + terms = terms[DisjunctionMaxClauseCount:] + } else { + batchTerms = terms + terms = nil + } + batch, err := makeBatchSearchersBytes(indexReader, batchTerms, field, boost, options) + if err != nil { + return nil, err + } + if finalSearcher != nil { + batch = append(batch, finalSearcher) + } + cleanup := func() { + for _, searcher := range batch { + if searcher != nil { + _ = searcher.Close() + } + } + } + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + indexReader, batch, options) + // all searchers in batch should be closed, regardless of error or optimization failure + // either we're returning, or continuing and only finalSearcher is needed for next loop + cleanup() + if err != nil { + return nil, err + } + if finalSearcher == nil { + return nil, fmt.Errorf("unable to optimize") + } 
} + return finalSearcher, nil +} + +func makeBatchSearchersBytes(indexReader index.IndexReader, terms [][]byte, field string, + boost float64, options search.SearcherOptions) ([]search.Searcher, error) { qsearchers := make([]search.Searcher, len(terms)) qsearchersClose := func() { @@ -70,24 +211,5 @@ func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, return nil, err } } - return newMultiTermSearcherBytes(indexReader, qsearchers, field, boost, - options, limit) -} - -func newMultiTermSearcherBytes(indexReader index.IndexReader, - searchers []search.Searcher, field string, boost float64, - options search.SearcherOptions, limit bool) ( - search.Searcher, error) { - - // build disjunction searcher of these ranges - searcher, err := newDisjunctionSearcher(indexReader, searchers, 0, options, - limit) - if err != nil { - for _, s := range searchers { - _ = s.Close() - } - return nil, err - } - - return searcher, nil + return qsearchers, nil } diff --git a/search/searcher/search_term_range_test.go b/search/searcher/search_term_range_test.go index cd4e89114..ec8935c3e 100644 --- a/search/searcher/search_term_range_test.go +++ b/search/searcher/search_term_range_test.go @@ -15,7 +15,11 @@ package searcher import ( + "github.com/blevesearch/bleve/index/scorch" + "io/ioutil" + "os" "reflect" + "sort" "testing" "github.com/blevesearch/bleve/search" @@ -199,3 +203,82 @@ func TestTermRangeSearch(t *testing.T) { } } + +func TestTermRangeSearchTooManyTerms(t *testing.T) { + dir, _ := ioutil.TempDir("", "scorchTwoDoc") + defer func() { + _ = os.RemoveAll(dir) + }() + + scorchIndex := initTwoDocScorch(dir) + + // use lower limit for this test + origLimit := DisjunctionMaxClauseCount + DisjunctionMaxClauseCount = 2 + defer func() { + DisjunctionMaxClauseCount = origLimit + }() + + scorchReader, err := scorchIndex.Reader() + if err != nil { + t.Error(err) + } + defer func() { + err := scorchReader.Close() + if err != nil { + t.Fatal(err) + } + }() + + var 
want = []string{"1", "3", "4", "5"} + var truth = true + searcher, err := NewTermRangeSearcher(scorchReader, []byte("bobert"), []byte("ravi"), + &truth, &truth, "name", 1.0, search.SearcherOptions{Score: "none", IncludeTermVectors: false}) + if err != nil { + t.Fatal(err) + } + + var got []string + ctx := &search.SearchContext{ + DocumentMatchPool: search.NewDocumentMatchPool( + searcher.DocumentMatchPoolSize(), 0), + } + next, err := searcher.Next(ctx) + i := 0 + for err == nil && next != nil { + extId, err := scorchReader.ExternalID(next.IndexInternalID) + if err != nil { + t.Fatal(err) + } + got = append(got, extId) + ctx.DocumentMatchPool.Put(next) + next, err = searcher.Next(ctx) + i++ + } + if err != nil { + t.Fatalf("error iterating searcher: %v", err) + } + err = searcher.Close() + if err != nil { + t.Fatal(err) + } + + // check that the expected number of term searchers were started + // 6 = 4 original terms, 1 optimized after first round, then final searcher + // from the last round + statsMap := scorchIndex.(*scorch.Scorch).StatsMap() + if statsMap["term_searchers_started"].(uint64) != 6 { + t.Errorf("expected 6 term searchers started, got %d", statsMap["term_searchers_started"]) + } + // check that all started searchers were closed + if statsMap["term_searchers_started"] != statsMap["term_searchers_finished"] { + t.Errorf("expected all term searchers closed, %d started %d closed", + statsMap["term_searchers_started"], statsMap["term_searchers_finished"]) + } + + sort.Strings(got) + if !reflect.DeepEqual(got, want) { + t.Errorf("expected: %#v, got %#v", want, got) + } + +} \ No newline at end of file From 079fce84252a18eb11cbcf5a2270ff8a38e40551 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 17 Aug 2020 20:30:15 +0530 Subject: [PATCH 703/728] Fix the incorrect behaviour for Composite queries with Score:none (#1439) * Fix the incorrect behaviour for Composite queries with Score:none Currently when one of the constituent query is not 
optimizable, it just returns the incoming octx without any modifications.This causes the optimization to proceed incorrectly as it misses the non optimisable child searcher. With the fix, when a sub query is not optimizable it returns a nil context to stop the parent's optimizations. This would prevent the parent to stop the optimized path and take the slower path with score. * adding UTs --- index/scorch/optimize.go | 2 +- index_test.go | 98 +++++++++++++++++++++ search/searcher/search_disjunction_heap.go | 2 +- search/searcher/search_disjunction_slice.go | 2 +- search/searcher/search_term.go | 2 +- 5 files changed, 102 insertions(+), 4 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index f23fdfb8a..6711603d5 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -44,7 +44,7 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, return s.optimizeDisjunctionUnadorned(octx, 0) } - return octx, nil + return nil, nil } var OptimizeDisjunctionUnadornedMinChildCardinality = uint64(256) diff --git a/index_test.go b/index_test.go index 1662c60e3..c535456b0 100644 --- a/index_test.go +++ b/index_test.go @@ -2133,3 +2133,101 @@ func testBatchRaceBug1149(t *testing.T, i Index) { } b.Reset() } + +func TestOptimisedConjunctionSearchHits(t *testing.T) { + defer func() { + err := os.RemoveAll("testidx") + if err != nil { + t.Fatal(err) + } + }() + + idx, err := NewUsing("testidx", NewIndexMapping(), "scorch", "scorch", nil) + if err != nil { + t.Fatal(err) + } + doca := map[string]interface{}{ + "country": "united", + "name": "Mercure Hotel", + "directions": "B560 and B56 Follow signs to the M56", + } + docb := map[string]interface{}{ + "country": "united", + "name": "Mercure Altrincham Bowdon Hotel", + "directions": "A570 and A57 Follow signs to the M56 Manchester Airport", + } + docc := map[string]interface{}{ + "country": "india united", + "name": "Sonoma Hotel", + "directions": "Northwest", + } + docd := 
map[string]interface{}{ + "country": "United Kingdom", + "name": "Cresta Court Hotel", + "directions": "junction of A560 and A56", + } + + b := idx.NewBatch() + err = b.Index("a", doca) + if err != nil { + t.Error(err) + } + err = b.Index("b", docb) + if err != nil { + t.Error(err) + } + err = b.Index("c", docc) + if err != nil { + t.Error(err) + } + err = b.Index("d", docd) + if err != nil { + t.Error(err) + } + // execute the batch + err = idx.Batch(b) + if err != nil { + log.Fatal(err) + } + + mq := NewMatchQuery("united") + mq.SetField("country") + + cq := NewConjunctionQuery(mq) + + mq1 := NewMatchQuery("hotel") + mq1.SetField("name") + cq.AddQuery(mq1) + + mq2 := NewMatchQuery("56") + mq2.SetField("directions") + mq2.SetFuzziness(1) + cq.AddQuery(mq2) + + req := NewSearchRequest(cq) + req.Score = "none" + + res, err := idx.Search(req) + if err != nil { + t.Fatal(err) + } + hitsWithOutScore := res.Total + + req = NewSearchRequest(cq) + req.Score = "" + + res, err = idx.Search(req) + if err != nil { + t.Fatal(err) + } + hitsWithScore := res.Total + + if hitsWithOutScore != hitsWithScore { + t.Errorf("expected %d hits without score, got %d", hitsWithScore, hitsWithOutScore) + } + + err = idx.Close() + if err != nil { + t.Fatal(err) + } +} diff --git a/search/searcher/search_disjunction_heap.go b/search/searcher/search_disjunction_heap.go index 00b09fcb9..7f0a5a00e 100644 --- a/search/searcher/search_disjunction_heap.go +++ b/search/searcher/search_disjunction_heap.go @@ -310,7 +310,7 @@ func (s *DisjunctionHeapSearcher) Optimize(kind string, octx index.OptimizableCo } } - return octx, nil + return nil, nil } // heap impl diff --git a/search/searcher/search_disjunction_slice.go b/search/searcher/search_disjunction_slice.go index 464878bc6..dc566ade5 100644 --- a/search/searcher/search_disjunction_slice.go +++ b/search/searcher/search_disjunction_slice.go @@ -294,5 +294,5 @@ func (s *DisjunctionSliceSearcher) Optimize(kind string, octx index.OptimizableC } } - 
return octx, nil + return nil, nil } diff --git a/search/searcher/search_term.go b/search/searcher/search_term.go index c1af74c76..e07d25333 100644 --- a/search/searcher/search_term.go +++ b/search/searcher/search_term.go @@ -137,5 +137,5 @@ func (s *TermSearcher) Optimize(kind string, octx index.OptimizableContext) ( return o.Optimize(kind, octx) } - return octx, nil + return nil, nil } From f5339fb83361137667c2e1df635d52b3f4eca917 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 17 Aug 2020 14:57:05 -0600 Subject: [PATCH 704/728] Fix mishandled error within IndexSnapshot's newDocIDReader + Fixes https://github.com/blevesearch/bleve/issues/1442 + Also removing unnecessary go.mod entry for ghistogram --- go.mod | 1 - index/scorch/snapshot_index.go | 9 ++++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 2b23a510b..09bf53735 100644 --- a/go.mod +++ b/go.mod @@ -12,7 +12,6 @@ require ( github.com/blevesearch/zap/v12 v12.0.9 github.com/blevesearch/zap/v13 v13.0.1 github.com/blevesearch/zap/v14 v14.0.0 - github.com/couchbase/ghistogram v0.1.0 // indirect github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.1 github.com/golang/protobuf v1.3.2 diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 47cc809b2..53eb3ed35 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -303,9 +303,12 @@ func (i *IndexSnapshot) newDocIDReader(results chan *asynchSegmentResult) (index var err error for count := 0; count < len(i.segment); count++ { asr := <-results - if asr.err != nil && err != nil { - err = asr.err - } else { + if asr.err != nil { + if err == nil { + // returns the first error encountered + err = asr.err + } + } else if err == nil { rv.iterators[asr.index] = asr.docs.Iterator() } } From 3d6bf42ddd39e24e417d753d18697a5263313355 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Tue, 18 Aug 2020 15:16:03 -0600 Subject: [PATCH 705/728] MB-40916: Do not 
recycle unadorned term field readers for optimized conjunctions/disjunctions (#1438) MB-40916: Do not recycle optimized unadorned term field readers Do not recycle an optimized unadorned term field reader (used for ConjunctionUnadorned or DisjunctionUnadorned), during when a fresh roaring.Bitmap is built by AND-ing or OR-ing individual bitmaps - after which we'll need to release them for GC. --- index/scorch/optimize.go | 45 ++++++++++++----------- index/scorch/scorch_test.go | 1 - index/scorch/segment/unadorned.go | 15 ++++---- index/scorch/segment_plugin.go | 14 +++---- index/scorch/snapshot_index.go | 12 +++++- index/scorch/snapshot_index_tfr.go | 1 + search/searcher/search_multi_term.go | 8 ++-- search/searcher/search_term_range_test.go | 3 +- 8 files changed, 55 insertions(+), 44 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index 6711603d5..b3d736f71 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -165,16 +165,8 @@ func (o *OptimizeTFRConjunctionUnadorned) Finish() (rv index.Optimized, err erro // We use an artificial term and field because the optimized // termFieldReader can represent multiple terms and fields. - oTFR := &IndexSnapshotTermFieldReader{ - term: OptimizeTFRConjunctionUnadornedTerm, - field: OptimizeTFRConjunctionUnadornedField, - snapshot: o.snapshot, - iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), - segmentOffset: 0, - includeFreq: false, - includeNorm: false, - includeTermVectors: false, - } + oTFR := o.snapshot.unadornedTermFieldReader( + OptimizeTFRConjunctionUnadornedTerm, OptimizeTFRConjunctionUnadornedField) var actualBMs []*roaring.Bitmap // Collected from regular posting lists. 
@@ -283,7 +275,7 @@ func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( octx index.OptimizableContext, minChildCardinality uint64) (index.OptimizableContext, error) { if octx == nil { octx = &OptimizeTFRDisjunctionUnadorned{ - snapshot: s.snapshot, + snapshot: s.snapshot, minChildCardinality: minChildCardinality, } } @@ -349,16 +341,8 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro // We use an artificial term and field because the optimized // termFieldReader can represent multiple terms and fields. - oTFR := &IndexSnapshotTermFieldReader{ - term: OptimizeTFRDisjunctionUnadornedTerm, - field: OptimizeTFRDisjunctionUnadornedField, - snapshot: o.snapshot, - iterators: make([]segment.PostingsIterator, len(o.snapshot.segment)), - segmentOffset: 0, - includeFreq: false, - includeNorm: false, - includeTermVectors: false, - } + oTFR := o.snapshot.unadornedTermFieldReader( + OptimizeTFRDisjunctionUnadornedTerm, OptimizeTFRDisjunctionUnadornedField) var docNums []uint32 // Collected docNum's from 1-hit posting lists. var actualBMs []*roaring.Bitmap // Collected from regular posting lists. 
@@ -405,3 +389,22 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro atomic.AddUint64(&o.snapshot.parent.stats.TotTermSearchersStarted, uint64(1)) return oTFR, nil } + +// ---------------------------------------------------------------- + +func (i *IndexSnapshot) unadornedTermFieldReader( + term []byte, field string) *IndexSnapshotTermFieldReader { + // This IndexSnapshotTermFieldReader will not be recycled, more + // conversation here: https://github.com/blevesearch/bleve/pull/1438 + return &IndexSnapshotTermFieldReader{ + term: term, + field: field, + snapshot: i, + iterators: make([]segment.PostingsIterator, len(i.segment)), + segmentOffset: 0, + includeFreq: false, + includeNorm: false, + includeTermVectors: false, + recycle: false, + } +} diff --git a/index/scorch/scorch_test.go b/index/scorch/scorch_test.go index e22fafa27..5f0ec1b75 100644 --- a/index/scorch/scorch_test.go +++ b/index/scorch/scorch_test.go @@ -2458,7 +2458,6 @@ func TestIndexSeekBackwardsStats(t *testing.T) { t.Fatalf("error closing term field reader: %v", err) } - if idx.(*Scorch).stats.TotTermSearchersStarted != idx.(*Scorch).stats.TotTermSearchersFinished { t.Errorf("expected term searchers started %d to equal term searchers finished %d", idx.(*Scorch).stats.TotTermSearchersStarted, diff --git a/index/scorch/segment/unadorned.go b/index/scorch/segment/unadorned.go index 56e752348..db06562df 100644 --- a/index/scorch/segment/unadorned.go +++ b/index/scorch/segment/unadorned.go @@ -24,7 +24,6 @@ var reflectStaticSizeUnadornedPostingsIteratorBitmap int var reflectStaticSizeUnadornedPostingsIterator1Hit int var reflectStaticSizeUnadornedPosting int - func init() { var pib UnadornedPostingsIteratorBitmap reflectStaticSizeUnadornedPostingsIteratorBitmap = int(reflect.TypeOf(pib).Size()) @@ -34,7 +33,7 @@ func init() { reflectStaticSizeUnadornedPosting = int(reflect.TypeOf(up).Size()) } -type UnadornedPostingsIteratorBitmap struct{ +type 
UnadornedPostingsIteratorBitmap struct { actual roaring.IntPeekable actualBM *roaring.Bitmap } @@ -72,15 +71,15 @@ func (i *UnadornedPostingsIteratorBitmap) Size() int { return reflectStaticSizeUnadornedPostingsIteratorBitmap } -func (i *UnadornedPostingsIteratorBitmap) ActualBitmap() *roaring.Bitmap { +func (i *UnadornedPostingsIteratorBitmap) ActualBitmap() *roaring.Bitmap { return i.actualBM } -func (i *UnadornedPostingsIteratorBitmap) DocNum1Hit() (uint64, bool) { +func (i *UnadornedPostingsIteratorBitmap) DocNum1Hit() (uint64, bool) { return 0, false } -func (i *UnadornedPostingsIteratorBitmap) ReplaceActual(actual *roaring.Bitmap) { +func (i *UnadornedPostingsIteratorBitmap) ReplaceActual(actual *roaring.Bitmap) { i.actualBM = actual i.actual = actual.Iterator() } @@ -88,13 +87,13 @@ func (i *UnadornedPostingsIteratorBitmap) ReplaceActual(actual *roaring.Bitmap) func NewUnadornedPostingsIteratorFromBitmap(bm *roaring.Bitmap) PostingsIterator { return &UnadornedPostingsIteratorBitmap{ actualBM: bm, - actual: bm.Iterator(), + actual: bm.Iterator(), } } const docNum1HitFinished = math.MaxUint64 -type UnadornedPostingsIterator1Hit struct{ +type UnadornedPostingsIterator1Hit struct { docNum uint64 } @@ -158,4 +157,4 @@ func (p UnadornedPosting) Locations() []Location { func (p UnadornedPosting) Size() int { return reflectStaticSizeUnadornedPosting -} \ No newline at end of file +} diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index 4729e6bd6..b830b2c05 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -81,11 +81,11 @@ func chooseSegmentPlugin(forcedSegmentType string, func (s *Scorch) loadSegmentPlugin(forcedSegmentType string, forcedSegmentVersion uint32) error { - segPlugin, err := chooseSegmentPlugin(forcedSegmentType, - forcedSegmentVersion) - if err != nil { - return err - } - s.segPlugin = segPlugin - return nil + segPlugin, err := chooseSegmentPlugin(forcedSegmentType, + forcedSegmentVersion) 
+ if err != nil { + return err + } + s.segPlugin = segPlugin + return nil } diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 53eb3ed35..9d17bcb2c 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -514,10 +514,20 @@ func (i *IndexSnapshot) allocTermFieldReaderDicts(field string) (tfr *IndexSnaps } } i.m2.Unlock() - return &IndexSnapshotTermFieldReader{} + return &IndexSnapshotTermFieldReader{ + recycle: true, + } } func (i *IndexSnapshot) recycleTermFieldReader(tfr *IndexSnapshotTermFieldReader) { + if !tfr.recycle { + // Do not recycle an optimized unadorned term field reader (used for + // ConjunctionUnadorned or DisjunctionUnadorned), during when a fresh + // roaring.Bitmap is built by AND-ing or OR-ing individual bitmaps, + // and we'll need to release them for GC. (See MB-40916) + return + } + i.parent.rootLock.RLock() obsolete := i.parent.root != i i.parent.rootLock.RUnlock() diff --git a/index/scorch/snapshot_index_tfr.go b/index/scorch/snapshot_index_tfr.go index 61537fc4f..239f68fbe 100644 --- a/index/scorch/snapshot_index_tfr.go +++ b/index/scorch/snapshot_index_tfr.go @@ -45,6 +45,7 @@ type IndexSnapshotTermFieldReader struct { includeTermVectors bool currPosting segment.Posting currID index.IndexInternalID + recycle bool } func (i *IndexSnapshotTermFieldReader) Size() int { diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index 85aacc176..06a265c7e 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -33,7 +33,7 @@ func NewMultiTermSearcher(indexReader index.IndexReader, terms []string, } } - qsearchers, err := makeBatchSearchers(indexReader, terms, field, boost, options) + qsearchers, err := makeBatchSearchers(indexReader, terms, field, boost, options) if err != nil { return nil, err } @@ -57,7 +57,7 @@ func NewMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byte, } } - qsearchers, err := 
makeBatchSearchersBytes(indexReader, terms, field, boost, options) + qsearchers, err := makeBatchSearchersBytes(indexReader, terms, field, boost, options) if err != nil { return nil, err } @@ -112,7 +112,7 @@ func optimizeMultiTermSearcher(indexReader index.IndexReader, terms []string, } } } - finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", indexReader, batch, options) // all searchers in batch should be closed, regardless of error or optimization failure // either we're returning, or continuing and only finalSearcher is needed for next loop @@ -177,7 +177,7 @@ func optimizeMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byt } } } - finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", indexReader, batch, options) // all searchers in batch should be closed, regardless of error or optimization failure // either we're returning, or continuing and only finalSearcher is needed for next loop diff --git a/search/searcher/search_term_range_test.go b/search/searcher/search_term_range_test.go index ec8935c3e..22cf065d2 100644 --- a/search/searcher/search_term_range_test.go +++ b/search/searcher/search_term_range_test.go @@ -280,5 +280,4 @@ func TestTermRangeSearchTooManyTerms(t *testing.T) { if !reflect.DeepEqual(got, want) { t.Errorf("expected: %#v, got %#v", want, got) } - -} \ No newline at end of file +} From 6c7808ca5341bf5fb1a17f14b3fb1bb5c3061727 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 19 Aug 2020 09:08:32 -0600 Subject: [PATCH 706/728] Fix mishandled error within NewNumericRangeSearcher (#1445) + Return the error upon failure to close the field dictionary obtained from the IndexReader. 
+ Fixes: https://github.com/blevesearch/bleve/issues/1444 --- search/searcher/search_numeric_range.go | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/search/searcher/search_numeric_range.go b/search/searcher/search_numeric_range.go index 675f569d9..48d6226e1 100644 --- a/search/searcher/search_numeric_range.go +++ b/search/searcher/search_numeric_range.go @@ -74,9 +74,8 @@ func NewNumericRangeSearcher(indexReader index.IndexReader, terms := termRanges.Enumerate(isIndexed) if fieldDict != nil { if fd, ok := fieldDict.(index.FieldDict); ok { - cerr := fd.Close() - if cerr != nil { - err = cerr + if err = fd.Close(); err != nil { + return nil, err } } } From ed463b9d02851c11c84603f30febb5a3efdc7318 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Wed, 19 Aug 2020 12:09:49 -0600 Subject: [PATCH 707/728] MB-40962: Dropping the heuristic to skip disjunction optimizations (#1446) + Getting rid of the minChildCardinality heuristic for optimized unadorned disjunctions where we previously used to skip the optimization in case of low-cardinality bitmaps. + Removing this heuristic showed tremendous improvement for certain types of queries which involved a conjunction of disjunctions of fuzzy queries. + However, here's something worth noting from the commit that introduced this heuristic .. https://github.com/blevesearch/bleve/commit/58e6641d13844e212687bd35823914c5e26b04dd ``` Regarding perf microbenchmarks (bleve-query) on a 200K en-wiki docs scorch index... for a high number of high-frequency terms... - wildcard search on "th*" (~31K hits)... before the change, with normal scoring - ~4.7 q/sec w/ NoScore:true - ~5.1 q/sec w/ NoScore:true & unadorned disj. optimization - ~11.6 q/sec for a low number of high-frequency terms... - query-string search on "http www com" (~25K hits)... before the change, with normal scoring - ~190 q/sec w/ NoScore:true - ~260 q/sec w/ NoScore:true & unadorned disj. 
optimization - ~415 q/sec for a low number of low-frequency terms... - query-string search on "marty shoch" (207 hits)... before the change, with normal scoring - ~15.0K q/sec w/ NoScore:true - ~21.3K q/sec w/ NoScore:true & unadorned disj. optimization - ~16.1K q/sec ``` --- index/scorch/optimize.go | 20 +++----------------- search/searcher/search_multi_term.go | 4 ++-- 2 files changed, 5 insertions(+), 19 deletions(-) diff --git a/index/scorch/optimize.go b/index/scorch/optimize.go index b3d736f71..658354cd7 100644 --- a/index/scorch/optimize.go +++ b/index/scorch/optimize.go @@ -37,11 +37,7 @@ func (s *IndexSnapshotTermFieldReader) Optimize(kind string, } if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned" { - return s.optimizeDisjunctionUnadorned(octx, OptimizeDisjunctionUnadornedMinChildCardinality) - } - - if OptimizeDisjunctionUnadorned && kind == "disjunction:unadorned-force" { - return s.optimizeDisjunctionUnadorned(octx, 0) + return s.optimizeDisjunctionUnadorned(octx) } return nil, nil @@ -272,11 +268,10 @@ OUTER: // term-vectors are not required, and instead only the internal-id's // are needed. 
func (s *IndexSnapshotTermFieldReader) optimizeDisjunctionUnadorned( - octx index.OptimizableContext, minChildCardinality uint64) (index.OptimizableContext, error) { + octx index.OptimizableContext) (index.OptimizableContext, error) { if octx == nil { octx = &OptimizeTFRDisjunctionUnadorned{ - snapshot: s.snapshot, - minChildCardinality: minChildCardinality, + snapshot: s.snapshot, } } @@ -298,8 +293,6 @@ type OptimizeTFRDisjunctionUnadorned struct { snapshot *IndexSnapshot tfrs []*IndexSnapshotTermFieldReader - - minChildCardinality uint64 } var OptimizeTFRDisjunctionUnadornedTerm = []byte("") @@ -330,13 +323,6 @@ func (o *OptimizeTFRDisjunctionUnadorned) Finish() (rv index.Optimized, err erro } } } - - // Heuristic to skip the optimization if all the constituent - // bitmaps are too small, where the processing & resource - // overhead to create the OR'ed bitmap outweighs the benefit. - if cMax < o.minChildCardinality { - return nil, nil - } } // We use an artificial term and field because the optimized diff --git a/search/searcher/search_multi_term.go b/search/searcher/search_multi_term.go index 06a265c7e..70a2fa38c 100644 --- a/search/searcher/search_multi_term.go +++ b/search/searcher/search_multi_term.go @@ -112,7 +112,7 @@ func optimizeMultiTermSearcher(indexReader index.IndexReader, terms []string, } } } - finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned", indexReader, batch, options) // all searchers in batch should be closed, regardless of error or optimization failure // either we're returning, or continuing and only finalSearcher is needed for next loop @@ -177,7 +177,7 @@ func optimizeMultiTermSearcherBytes(indexReader index.IndexReader, terms [][]byt } } } - finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned-force", + finalSearcher, err = optimizeCompositeSearcher("disjunction:unadorned", indexReader, batch, options) // all searchers 
in batch should be closed, regardless of error or optimization failure // either we're returning, or continuing and only finalSearcher is needed for next loop From 9a99dfbb79795c6bd5f0dad12242d572f6dc20f4 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Thu, 20 Aug 2020 16:46:06 +0530 Subject: [PATCH 708/728] MB-40730 - Sort by field returns incorrect results Fixing the filterTermsByType method by resetting the terms only when valid prefix coded, zero shifted values are found to guard against any inadvertent overrides of the original field values. --- search/sort.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/search/sort.go b/search/sort.go index 3dc118518..dca422ebd 100644 --- a/search/sort.go +++ b/search/sort.go @@ -427,7 +427,8 @@ func (s *SortField) filterTermsByType(terms [][]byte) [][]byte { allTermsPrefixCoded = false } } - if allTermsPrefixCoded { + // reset the terms only when valid zero shift terms are found. + if allTermsPrefixCoded && len(termsWithShiftZero) > 0 { terms = termsWithShiftZero s.tmp = termsWithShiftZero[:0] } From 5ef9653c4e13f3ff8f48368c816752b404ac8706 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 24 Aug 2020 10:44:16 -0400 Subject: [PATCH 709/728] fix ineffectual assignment (#1450) Intention was for calcBudget local var to either use the CalcBudget provided in the Options, or fall back to the default mergeplan.CalcBudget method. However, after assigning the correct impl to the calcBudget local var, the code always invoked mergeplan.CalcBudget anwyay. This change allows the CalcBudget option to work as intended. 
--- index/scorch/mergeplan/merge_plan.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/index/scorch/mergeplan/merge_plan.go b/index/scorch/mergeplan/merge_plan.go index e02923cc1..752350662 100644 --- a/index/scorch/mergeplan/merge_plan.go +++ b/index/scorch/mergeplan/merge_plan.go @@ -184,7 +184,7 @@ func plan(segmentsIn []Segment, o *MergePlanOptions) (*MergePlan, error) { calcBudget = CalcBudget } - budgetNumSegments := CalcBudget(eligiblesLiveSize, minLiveSize, o) + budgetNumSegments := calcBudget(eligiblesLiveSize, minLiveSize, o) scoreSegments := o.ScoreSegments if scoreSegments == nil { From abc1d043a690fcf392cf4b107fcb80915a58f401 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 24 Aug 2020 08:44:32 -0600 Subject: [PATCH 710/728] Update vellum version to v1.0.2 (#1451) Diff since v1.0.1: * 8f70ead Marty Schoch | fix bug in iterator next optimization * 915a44a Marty Schoch | update README with badge showing github CI * 2dd513f Marty Schoch | add github workflow to run tests --- go.mod | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go.mod b/go.mod index 09bf53735..de5fe8425 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/blevesearch/zap/v13 v13.0.1 github.com/blevesearch/zap/v14 v14.0.0 github.com/couchbase/moss v0.1.0 - github.com/couchbase/vellum v1.0.1 + github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 github.com/kljensen/snowball v0.6.0 github.com/rcrowley/go-metrics v0.0.0-20190826022208-cac0b30c2563 From d553d2c9f1613896f411d642a2cfc4b19e42ec9c Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Mon, 24 Aug 2020 08:46:18 -0600 Subject: [PATCH 711/728] Upgrading go.mod tags for etcd-io/bbolt, RoaringBitmap/roaring (#1422) * Upgrading etcd-io/bbolt's tag to v1.3.5 This fixes issues related to bad usage of the unsafe library. * Update RoaringBitmap/roaring's tag to v0.4.23 This fixes issues related to bad usage of the unsafe library. 
https://github.com/RoaringBitmap/roaring/commit/9f3bcbda4913333a8445291f5055bb5b2619feb4 --- go.mod | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/go.mod b/go.mod index de5fe8425..758d17d51 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module github.com/blevesearch/bleve go 1.13 require ( - github.com/RoaringBitmap/roaring v0.4.21 + github.com/RoaringBitmap/roaring v0.4.23 github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 @@ -21,6 +21,6 @@ require ( github.com/steveyen/gtreap v0.1.0 github.com/syndtr/goleveldb v1.0.0 github.com/willf/bitset v1.1.10 - go.etcd.io/bbolt v1.3.4 + go.etcd.io/bbolt v1.3.5 golang.org/x/text v0.3.0 ) From b7678e38dcb6c73c16183c9a37ed9fb14b426c45 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 24 Aug 2020 10:48:06 -0400 Subject: [PATCH 712/728] simplify compute geo range (#1447) * simplify compute geo range attempt to go back to a simpler compute geo range but not give up the perf improvements * refactor again to consolidate arguments I went back to the other point Steve had made about how many arguments we passed around, and observed that many of them were never altered. So, I converted that into a struct, and made the functions methods on that struct. This helped performance get back to match or beat Steve's version. 
* avoid an extra bit shift op review feedback from Abhinav --- search/searcher/search_geoboundingbox.go | 184 +++++++++++++---------- 1 file changed, 101 insertions(+), 83 deletions(-) diff --git a/search/searcher/search_geoboundingbox.go b/search/searcher/search_geoboundingbox.go index c4b8af927..76157f01a 100644 --- a/search/searcher/search_geoboundingbox.go +++ b/search/searcher/search_geoboundingbox.go @@ -24,7 +24,7 @@ import ( type filterFunc func(key []byte) bool -var GeoBitsShift1 = (geo.GeoBits << 1) +var GeoBitsShift1 = geo.GeoBits << 1 var GeoBitsShift1Minus1 = GeoBitsShift1 - 1 func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, @@ -100,30 +100,42 @@ func NewGeoBoundingBoxSearcher(indexReader index.IndexReader, minLon, minLat, var geoMaxShift = document.GeoPrecisionStep * 4 var geoDetailLevel = ((geo.GeoBits << 1) - geoMaxShift) / 2 +type closeFunc func() error func ComputeGeoRange(term uint64, shift uint, sminLon, sminLat, smaxLon, smaxLat float64, checkBoundaries bool, indexReader index.IndexReader, field string) ( onBoundary [][]byte, notOnBoundary [][]byte, err error) { - preallocBytesLen := 32 - preallocBytes := make([]byte, preallocBytesLen) - makePrefixCoded := func(in int64, shift uint) (rv numeric.PrefixCoded) { - if len(preallocBytes) <= 0 { - preallocBytesLen = preallocBytesLen * 2 - preallocBytes = make([]byte, preallocBytesLen) - } - - rv, preallocBytes, err = - numeric.NewPrefixCodedInt64Prealloc(in, shift, preallocBytes) + isIndexed, closeF, err := buildIsIndexedFunc(indexReader, field) + if closeF != nil { + defer func() { + cerr := closeF() + if cerr != nil { + err = cerr + } + }() + } - return rv + grc := &geoRangeCompute{ + preallocBytesLen: 32, + preallocBytes: make([]byte, 32), + sminLon: sminLon, + sminLat: sminLat, + smaxLon: smaxLon, + smaxLat: smaxLat, + checkBoundaries: checkBoundaries, + isIndexed: isIndexed, } - var fieldDict index.FieldDictContains - var isIndexed filterFunc + 
grc.computeGeoRange(term, shift) + + return grc.onBoundary, grc.notOnBoundary, nil +} + +func buildIsIndexedFunc(indexReader index.IndexReader, field string) (isIndexed filterFunc, closeF closeFunc, err error) { if irr, ok := indexReader.(index.IndexReaderContains); ok { - fieldDict, err = irr.FieldDictContains(field) + fieldDict, err := irr.FieldDictContains(field) if err != nil { return nil, nil, err } @@ -132,22 +144,18 @@ func ComputeGeoRange(term uint64, shift uint, found, err := fieldDict.Contains(term) return err == nil && found } - } - defer func() { - if fieldDict != nil { + closeF = func() error { if fd, ok := fieldDict.(index.FieldDict); ok { - cerr := fd.Close() - if cerr != nil { - err = cerr + err := fd.Close() + if err != nil { + return err } } + return nil } - }() - - if isIndexed == nil { + } else if indexReader != nil { isIndexed = func(term []byte) bool { - if indexReader != nil { reader, err := indexReader.TermFieldReader(term, field, false, false, false) if err != nil || reader == nil { return false @@ -157,68 +165,15 @@ func ComputeGeoRange(term uint64, shift uint, return false } _ = reader.Close() - } - return true + return true } - } - var computeGeoRange func(term uint64, shift uint) // declare for recursion - - relateAndRecurse := func(start, end uint64, res, level uint) { - minLon := geo.MortonUnhashLon(start) - minLat := geo.MortonUnhashLat(start) - maxLon := geo.MortonUnhashLon(end) - maxLat := geo.MortonUnhashLat(end) - - within := res%document.GeoPrecisionStep == 0 && - geo.RectWithin(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) - if within || (level == geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat)) { - codedTerm := makePrefixCoded(int64(start), res) - if isIndexed(codedTerm) { - if !within && checkBoundaries { - onBoundary = append(onBoundary, codedTerm) - } else { - notOnBoundary = append(notOnBoundary, codedTerm) - } - } - } else if level < 
geoDetailLevel && - geo.RectIntersects(minLon, minLat, maxLon, maxLat, - sminLon, sminLat, smaxLon, smaxLat) { - computeGeoRange(start, res-1) + } else { + isIndexed = func([]byte) bool { + return true } } - - computeGeoRange = func(term uint64, shift uint) { - if err != nil { - return - } - - split := term | uint64(0x1)<> 1 - - relateAndRecurse(term, lowerMax, shift, level) - relateAndRecurse(split, upperMax, shift, level) - } - - computeGeoRange(term, shift) - - if err != nil { - return nil, nil, err - } - - return onBoundary, notOnBoundary, err + return isIndexed, closeF, err } func buildRectFilter(dvReader index.DocValueReader, field string, @@ -252,3 +207,66 @@ func buildRectFilter(dvReader index.DocValueReader, field string, return false } } + +type geoRangeCompute struct { + preallocBytesLen int + preallocBytes []byte + sminLon, sminLat, smaxLon, smaxLat float64 + checkBoundaries bool + onBoundary, notOnBoundary [][]byte + isIndexed func(term []byte) bool +} + +func (grc *geoRangeCompute) makePrefixCoded(in int64, shift uint) (rv numeric.PrefixCoded) { + if len(grc.preallocBytes) <= 0 { + grc.preallocBytesLen = grc.preallocBytesLen * 2 + grc.preallocBytes = make([]byte, grc.preallocBytesLen) + } + + rv, grc.preallocBytes, _ = + numeric.NewPrefixCodedInt64Prealloc(in, shift, grc.preallocBytes) + + return rv +} + +func (grc *geoRangeCompute) computeGeoRange(term uint64, shift uint) { + split := term | uint64(0x1)<> 1 + + within := res%document.GeoPrecisionStep == 0 && + geo.RectWithin(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat) + if within || (level == geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat)) { + codedTerm := grc.makePrefixCoded(int64(start), res) + if grc.isIndexed(codedTerm) { + if !within && grc.checkBoundaries { + grc.onBoundary = append(grc.onBoundary, codedTerm) + } else { + grc.notOnBoundary = append(grc.notOnBoundary, 
codedTerm) + } + } + } else if level < geoDetailLevel && + geo.RectIntersects(minLon, minLat, maxLon, maxLat, + grc.sminLon, grc.sminLat, grc.smaxLon, grc.smaxLat) { + grc.computeGeoRange(start, res-1) + } +} \ No newline at end of file From a929f94722d22f3a80da56a7519c4e5de1c58fa4 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Mon, 24 Aug 2020 14:06:09 -0400 Subject: [PATCH 713/728] bump bleve versions for next bleve release (#1454) zap v11 v11.0.10 zap v12 v12.0.10 zap v13 v13.0.2 zap v14 v14.0.1 --- go.mod | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 758d17d51..9a12454fd 100644 --- a/go.mod +++ b/go.mod @@ -8,10 +8,10 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.9 - github.com/blevesearch/zap/v12 v12.0.9 - github.com/blevesearch/zap/v13 v13.0.1 - github.com/blevesearch/zap/v14 v14.0.0 + github.com/blevesearch/zap/v11 v11.0.10 + github.com/blevesearch/zap/v12 v12.0.10 + github.com/blevesearch/zap/v13 v13.0.2 + github.com/blevesearch/zap/v14 v14.0.1 github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 From c997b4a57648e06d8c588d93e279562279f74959 Mon Sep 17 00:00:00 2001 From: Tuomas Salo Date: Tue, 22 Sep 2020 15:23:34 +0300 Subject: [PATCH 714/728] fix tests on go 1.15 (#1466) --- mapping/mapping_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mapping/mapping_test.go b/mapping/mapping_test.go index 2b6870966..1c9ced58a 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -959,7 +959,7 @@ func TestMappingForGeo(t *testing.T) { expect = append(expect, []float64{-71.34, 41.12}) for i, geopoint := range geopoints { - doc := document.NewDocument(string(i)) + doc := document.NewDocument(fmt.Sprint(i)) err := mapping.MapDocument(doc, geopoint) if err != nil { t.Fatal(err) From 
99e9e901c29699c3f99bc0b2c5480b74d3e27535 Mon Sep 17 00:00:00 2001 From: Sreekanth Sivasankaran Date: Mon, 28 Sep 2020 19:53:47 +0530 Subject: [PATCH 715/728] New stats for reclaimable disk space from scorch index (#1470) * New stats for reclaimable disk space from scorch index Adding a new stats `num_bytes_used_disk_by_root_reclaimable` which helps to approximate the amount of reclaimable disk space from a scorch index. This could turn out to be a useful insight into knowing the amount of disk space wasted due to tombstoned/obsoleted contents across the segments. * refactoring to reduce the lock time for capturing the disk stats. --- index/scorch/scorch.go | 22 +++++++++++----------- index/scorch/snapshot_index.go | 27 +++++++++++++++++++++++++++ 2 files changed, 38 insertions(+), 11 deletions(-) diff --git a/index/scorch/scorch.go b/index/scorch/scorch.go index ba98a460d..fccff67ab 100644 --- a/index/scorch/scorch.go +++ b/index/scorch/scorch.go @@ -515,21 +515,17 @@ func (s *Scorch) diskFileStats(rootSegmentPaths map[string]struct{}) (uint64, return numFilesOnDisk, numBytesUsedDisk, numBytesOnDiskByRoot } -func (s *Scorch) rootDiskSegmentsPaths() map[string]struct{} { - rv := make(map[string]struct{}, len(s.root.segment)) - for _, segmentSnapshot := range s.root.segment { - if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { - rv[seg.Path()] = struct{}{} - } - } - return rv -} - func (s *Scorch) StatsMap() map[string]interface{} { m := s.stats.ToMap() + indexSnapshot := s.currentSnapshot() + defer func() { + _ = indexSnapshot.Close() + }() + + rootSegPaths := indexSnapshot.diskSegmentsPaths() + s.rootLock.RLock() - rootSegPaths := s.rootDiskSegmentsPaths() m["CurFilesIneligibleForRemoval"] = uint64(len(s.ineligibleForRemoval)) s.rootLock.RUnlock() @@ -556,6 +552,10 @@ func (s *Scorch) StatsMap() map[string]interface{} { m["num_bytes_used_disk"] = numBytesUsedDisk // total disk bytes by the latest root index, exclusive of older snapshots 
m["num_bytes_used_disk_by_root"] = numBytesOnDiskByRoot + // num_bytes_used_disk_by_root_reclaimable is an approximation about the + // reclaimable disk space in an index. (eg: from a full compaction) + m["num_bytes_used_disk_by_root_reclaimable"] = uint64(float64(numBytesOnDiskByRoot) * + indexSnapshot.reClaimableDocsRatio()) m["num_files_on_disk"] = numFilesOnDisk m["num_root_memorysegments"] = m["TotMemorySegmentsAtRoot"] m["num_root_filesegments"] = m["TotFileSegmentsAtRoot"] diff --git a/index/scorch/snapshot_index.go b/index/scorch/snapshot_index.go index 9d17bcb2c..61204ebbc 100644 --- a/index/scorch/snapshot_index.go +++ b/index/scorch/snapshot_index.go @@ -708,6 +708,33 @@ func (i *IndexSnapshot) DumpFields() chan interface{} { return rv } +func (i *IndexSnapshot) diskSegmentsPaths() map[string]struct{} { + rv := make(map[string]struct{}, len(i.segment)) + for _, segmentSnapshot := range i.segment { + if seg, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { + rv[seg.Path()] = struct{}{} + } + } + return rv +} + +// reClaimableDocsRatio gives a ratio about the obsoleted or +// reclaimable documents present in a given index snapshot. +func (i *IndexSnapshot) reClaimableDocsRatio() float64 { + var totalCount, liveCount uint64 + for _, segmentSnapshot := range i.segment { + if _, ok := segmentSnapshot.segment.(segment.PersistedSegment); ok { + totalCount += uint64(segmentSnapshot.FullSize()) + liveCount += uint64(segmentSnapshot.Count()) + } + } + + if totalCount > 0 { + return float64(totalCount-liveCount) / float64(totalCount) + } + return 0 +} + // subtractStrings returns set a minus elements of set b. 
func subtractStrings(a, b []string) []string { if len(b) == 0 { From 0197e8ddfac6af208d0cf6b58d3b7b5f863b08a7 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 30 Sep 2020 10:00:03 -0400 Subject: [PATCH 716/728] an optimization fix test was no longer helping (#1459) it was determined that this test was no longer testing what it was designed to test, due to other changes. this fix now correctly verifies the fix again by disabling an optimization that interferes with the test --- index_test.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/index_test.go b/index_test.go index c535456b0..50684420d 100644 --- a/index_test.go +++ b/index_test.go @@ -2135,6 +2135,11 @@ func testBatchRaceBug1149(t *testing.T, i Index) { } func TestOptimisedConjunctionSearchHits(t *testing.T) { + scorch.OptimizeDisjunctionUnadorned = false + defer func() { + scorch.OptimizeDisjunctionUnadorned = true + }() + defer func() { err := os.RemoveAll("testidx") if err != nil { From 7a377b75a2295f20568129a81dadd53efd013762 Mon Sep 17 00:00:00 2001 From: pavelbazika Date: Tue, 6 Oct 2020 17:22:43 +0200 Subject: [PATCH 717/728] Index is always left closed when opening process failed (#1479) Co-authored-by: Pavel Bazika --- index_impl.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/index_impl.go b/index_impl.go index 629cc9b2f..0520fe43b 100644 --- a/index_impl.go +++ b/index_impl.go @@ -112,6 +112,11 @@ func newIndexUsing(path string, mapping mapping.IndexMapping, indexType string, } return nil, err } + defer func(rv *indexImpl) { + if !rv.open { + rv.i.Close() + } + }(&rv) // now persist the mapping mappingBytes, err := json.Marshal(mapping) @@ -177,6 +182,11 @@ func openIndexUsing(path string, runtimeConfig map[string]interface{}) (rv *inde } return nil, err } + defer func(rv *indexImpl) { + if !rv.open { + rv.i.Close() + } + }(rv) // now load the mapping indexReader, err := rv.i.Reader() From bffe7ea0d6b1d48bfac69661b69a60cd1fda9da4 Mon Sep 17 00:00:00 2001 From: Marty 
Schoch Date: Tue, 6 Oct 2020 11:23:28 -0400 Subject: [PATCH 718/728] add go 1.15.x to the testing matrix (#1468) --- .github/workflows/tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bcafb812b..f57aa35ec 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -8,7 +8,7 @@ jobs: test: strategy: matrix: - go-version: [1.13.x, 1.14.x] + go-version: [1.13.x, 1.14.x, 1.15.x] platform: [ubuntu-latest, macos-latest, windows-latest] runs-on: ${{ matrix.platform }} steps: From 3bb3c7b4883f9cc7d46616a01a31c145105126ad Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 6 Oct 2020 20:51:08 +0300 Subject: [PATCH 719/728] Shutdown analysis queue (#1414) --- config.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/config.go b/config.go index 482efb408..99f2e081f 100644 --- a/config.go +++ b/config.go @@ -43,9 +43,16 @@ type configuration struct { } func (c *configuration) SetAnalysisQueueSize(n int) { + if c.analysisQueue != nil { + c.analysisQueue.Close() + } c.analysisQueue = index.NewAnalysisQueue(n) } +func (c *configuration) Shutdown() { + c.SetAnalysisQueueSize(0) +} + func newConfiguration() *configuration { return &configuration{ Cache: registry.NewCache(), From cc29456f6e8b72c06776eb63e7ac3b66c98e505e Mon Sep 17 00:00:00 2001 From: pavelbazika Date: Tue, 6 Oct 2020 20:40:04 +0200 Subject: [PATCH 720/728] Fixed ASCII folding of \u24A2 (#1434) * Fixed ASCII folding of \u24A2 * add unit test * fix comment Co-authored-by: Pavel Bazika Co-authored-by: Marty Schoch --- analysis/char/asciifolding/asciifolding.go | 2 +- analysis/char/asciifolding/asciifolding_test.go | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/analysis/char/asciifolding/asciifolding.go b/analysis/char/asciifolding/asciifolding.go index 469102e2c..facce07e1 100644 --- a/analysis/char/asciifolding/asciifolding.go +++ b/analysis/char/asciifolding/asciifolding.go @@ 
-874,7 +874,7 @@ func foldToASCII(input []rune, inputPos int, output []rune, outputPos int, lengt outputPos++ case '\u24A2': // ⒢ [PARENTHESIZED LATIN SMALL LETTER G] - output = output[:(len(output) + 1)] + output = output[:(len(output) + 2)] output[outputPos] = '(' outputPos++ output[outputPos] = 'g' diff --git a/analysis/char/asciifolding/asciifolding_test.go b/analysis/char/asciifolding/asciifolding_test.go index d79542a8a..3eafd8cdd 100644 --- a/analysis/char/asciifolding/asciifolding_test.go +++ b/analysis/char/asciifolding/asciifolding_test.go @@ -48,6 +48,10 @@ func TestAsciiFoldingFilter(t *testing.T) { // apples from https://issues.couchbase.com/browse/MB-33486 input: []byte(`Ápple Àpple Äpple Âpple Ãpple Åpple`), output: []byte(`Apple Apple Apple Apple Apple Apple`), + }, { + // Fix ASCII folding of \u24A2 + input: []byte(`⒢`), + output: []byte(`(g)`), }, } From b9b775998330bf9efdc8748021a80e38fc1e250c Mon Sep 17 00:00:00 2001 From: Tuomas Salo Date: Tue, 6 Oct 2020 21:43:59 +0300 Subject: [PATCH 721/728] escape the input text in HTML highlighter when adding tags, fixes #1464 (#1465) --- search/highlight/format/html/html.go | 10 ++++++---- test/tests/basic/data/b.json | 2 +- test/tests/basic/searches.json | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/search/highlight/format/html/html.go b/search/highlight/format/html/html.go index 8154e790b..259a03795 100644 --- a/search/highlight/format/html/html.go +++ b/search/highlight/format/html/html.go @@ -15,6 +15,8 @@ package html import ( + "html" + "github.com/blevesearch/bleve/registry" "github.com/blevesearch/bleve/search/highlight" ) @@ -54,18 +56,18 @@ func (a *FragmentFormatter) Format(f *highlight.Fragment, orderedTermLocations h break } // add the stuff before this location - rv += string(f.Orig[curr:termLocation.Start]) - // add the color + rv += html.EscapeString(string(f.Orig[curr:termLocation.Start])) + // start the tag rv += a.before // add the term itself rv += 
string(f.Orig[termLocation.Start:termLocation.End]) - // reset the color + // end the tag rv += a.after // update current curr = termLocation.End } // add any remaining text after the last token - rv += string(f.Orig[curr:f.End]) + rv += html.EscapeString(string(f.Orig[curr:f.End])) return rv } diff --git a/test/tests/basic/data/b.json b/test/tests/basic/data/b.json index 58118a55c..069727276 100644 --- a/test/tests/basic/data/b.json +++ b/test/tests/basic/data/b.json @@ -1,6 +1,6 @@ { "id": "b", - "name": "steve has a long name", + "name": "steve has long & complicated name", "age": 27, "birthday": "2001-09-09T01:46:40Z", "title": "missess" diff --git a/test/tests/basic/searches.json b/test/tests/basic/searches.json index 8b6206c32..7ddfce375 100644 --- a/test/tests/basic/searches.json +++ b/test/tests/basic/searches.json @@ -42,7 +42,7 @@ "size": 10, "sort": ["-_score", "_id"], "query": { - "match_phrase": "long name" + "match_phrase": "steve has" } }, "result": { @@ -385,7 +385,7 @@ { "id": "b", "fragments": { - "name": ["steve has a long name"] + "name": ["steve has <a> long & complicated name"] } } ] @@ -409,7 +409,7 @@ { "id": "b", "fragments": { - "name": ["steve has a long name"] + "name": ["steve has <a> long & complicated name"] } } ] @@ -485,7 +485,7 @@ { "id": "b", "fragments": { - "name": ["steve has a long name"], + "name": ["steve has <a> long & complicated name"], "title": ["missess"] } } From 7efdbbb66a9e0cf40e7fe8ec4d9e194ec54d94dd Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 6 Oct 2020 15:34:03 -0400 Subject: [PATCH 722/728] add support for zap v15 and update zap version (#1482) * add support for zap v15 and update zap version --- go.mod | 9 +++++---- index/scorch/segment_plugin.go | 2 ++ 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/go.mod b/go.mod index 9a12454fd..615430132 100644 --- a/go.mod +++ b/go.mod @@ -8,10 +8,11 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 
github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.10 - github.com/blevesearch/zap/v12 v12.0.10 - github.com/blevesearch/zap/v13 v13.0.2 - github.com/blevesearch/zap/v14 v14.0.1 + github.com/blevesearch/zap/v11 v11.0.11 + github.com/blevesearch/zap/v12 v12.0.11 + github.com/blevesearch/zap/v13 v13.0.3 + github.com/blevesearch/zap/v14 v14.0.2 + github.com/blevesearch/zap/v15 v15.0.0 github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 diff --git a/index/scorch/segment_plugin.go b/index/scorch/segment_plugin.go index b830b2c05..2f7db48b3 100644 --- a/index/scorch/segment_plugin.go +++ b/index/scorch/segment_plugin.go @@ -23,6 +23,7 @@ import ( zapv12 "github.com/blevesearch/zap/v12" zapv13 "github.com/blevesearch/zap/v13" zapv14 "github.com/blevesearch/zap/v14" + zapv15 "github.com/blevesearch/zap/v15" ) var supportedSegmentPlugins map[string]map[uint32]segment.Plugin @@ -30,6 +31,7 @@ var defaultSegmentPlugin segment.Plugin func init() { ResetPlugins() + RegisterPlugin(zapv15.Plugin(), false) RegisterPlugin(zapv14.Plugin(), false) RegisterPlugin(zapv13.Plugin(), false) RegisterPlugin(zapv12.Plugin(), false) From 7e8fd30c725926ae0a70df5e26ee9dac5c6176b0 Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 6 Oct 2020 16:31:53 -0400 Subject: [PATCH 723/728] prepare bleve v1.0.12 (#1483) --- go.mod | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index 615430132..fe5d25a35 100644 --- a/go.mod +++ b/go.mod @@ -8,11 +8,11 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.11 - github.com/blevesearch/zap/v12 v12.0.11 - github.com/blevesearch/zap/v13 v13.0.3 - github.com/blevesearch/zap/v14 v14.0.2 - github.com/blevesearch/zap/v15 v15.0.0 + github.com/blevesearch/zap/v11 v11.0.12 + github.com/blevesearch/zap/v12 v12.0.12 + 
github.com/blevesearch/zap/v13 v13.0.4 + github.com/blevesearch/zap/v14 v14.0.3 + github.com/blevesearch/zap/v15 v15.0.1 github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 From c67bc71f5a3f50c9365be68eb07cbdc3ee43f507 Mon Sep 17 00:00:00 2001 From: Abhinav Dangeti Date: Fri, 9 Oct 2020 09:18:38 -0600 Subject: [PATCH 724/728] Fix DocValuesDynamic's json-ability within IndexMappingImpl (#1485) + DocValuesDynamic defaults to "true" if unset. + Now, because of the "omitempty" json setting for the attribute, we incorrectly flip a "false" setting for the attribute (which is omitted as it's the zero value for boolean) to "true" (it's default). + Dropping the omitempty setting for this attribute corrects the behavior to this: - when unset, DocValuesDynamic will default to true (like before) - when set, DocValuesDynamic will take that value and will retain the value until an index definition update for the setting. + Fixes https://github.com/blevesearch/bleve/issues/1484 (where the scenario is highlighted better) --- index_test.go | 37 +++++++++++++++++++++++++++++++++++++ mapping/index.go | 2 +- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/index_test.go b/index_test.go index 50684420d..394e73083 100644 --- a/index_test.go +++ b/index_test.go @@ -16,6 +16,7 @@ package bleve import ( "context" + "encoding/json" "fmt" "io/ioutil" "log" @@ -2236,3 +2237,39 @@ func TestOptimisedConjunctionSearchHits(t *testing.T) { t.Fatal(err) } } + +func TestIndexMappingDocValuesDynamic(t *testing.T) { + im := NewIndexMapping() + // DocValuesDynamic's default is true + // Now explicitly set it to false + im.DocValuesDynamic = false + + // Next, retrieve the JSON dump of the index mapping + var data []byte + data, err = json.Marshal(im) + if err != nil { + t.Fatal(err) + } + + // Now, edit an unrelated setting in the index mapping + var m map[string]interface{} + err = json.Unmarshal(data, &m) + if err != nil { + t.Fatal(err) 
+ } + m["index_dynamic"] = false + data, err = json.Marshal(m) + if err != nil { + t.Fatal(err) + } + + // Unmarshal back the changes into the index mapping struct + if err = im.UnmarshalJSON(data); err != nil { + t.Fatal(err) + } + + // Expect DocValuesDynamic to remain false! + if im.DocValuesDynamic { + t.Fatalf("Expected DocValuesDynamic to remain false after the index mapping edit") + } +} diff --git a/mapping/index.go b/mapping/index.go index 21ca5cce3..319ba949c 100644 --- a/mapping/index.go +++ b/mapping/index.go @@ -50,7 +50,7 @@ type IndexMappingImpl struct { DefaultField string `json:"default_field"` StoreDynamic bool `json:"store_dynamic"` IndexDynamic bool `json:"index_dynamic"` - DocValuesDynamic bool `json:"docvalues_dynamic,omitempty"` + DocValuesDynamic bool `json:"docvalues_dynamic"` CustomAnalysis *customAnalysis `json:"analysis,omitempty"` cache *registry.Cache } From e24b3afc4804a779eff58954221cd52f70215f09 Mon Sep 17 00:00:00 2001 From: Eric Lindsey Date: Wed, 14 Oct 2020 11:13:14 -0400 Subject: [PATCH 725/728] Remove default from text description (#1489) Default is added to the output automatically by the flags package, so having it in the text as well is redundant. 
--- cmd/bleve/cmd/bulk.go | 4 ++-- cmd/bleve/cmd/check.go | 4 ++-- cmd/bleve/cmd/index.go | 6 +++--- cmd/bleve/cmd/query.go | 16 ++++++++-------- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/cmd/bleve/cmd/bulk.go b/cmd/bleve/cmd/bulk.go index c3c90e73a..3f374e769 100644 --- a/cmd/bleve/cmd/bulk.go +++ b/cmd/bleve/cmd/bulk.go @@ -112,6 +112,6 @@ func randomString(n int) string { func init() { RootCmd.AddCommand(bulkCmd) - bulkCmd.Flags().IntVarP(&batchSize, "batch", "b", 1000, "Batch size for loading, default 1000.") - bulkCmd.Flags().BoolVarP(&parseJSON, "json", "j", true, "Parse the contents as JSON, defaults true.") + bulkCmd.Flags().IntVarP(&batchSize, "batch", "b", 1000, "Batch size for loading.") + bulkCmd.Flags().BoolVarP(&parseJSON, "json", "j", true, "Parse the contents as JSON.") } diff --git a/cmd/bleve/cmd/check.go b/cmd/bleve/cmd/check.go index a71def801..370ece983 100644 --- a/cmd/bleve/cmd/check.go +++ b/cmd/bleve/cmd/check.go @@ -126,6 +126,6 @@ func getDictionary(index bleve.Index, field string) (map[string]uint64, error) { func init() { RootCmd.AddCommand(checkCmd) - checkCmd.Flags().StringVarP(&checkFieldName, "field", "f", "", "Restrict check to the specified field name, by default check all fields.") - checkCmd.Flags().IntVarP(&checkCount, "count", "c", 100, "Check this many terms, default 100.") + checkCmd.Flags().StringVarP(&checkFieldName, "field", "f", "", "Restrict check to the specified field name.") + checkCmd.Flags().IntVarP(&checkCount, "count", "c", 100, "Check this many terms.") } diff --git a/cmd/bleve/cmd/index.go b/cmd/bleve/cmd/index.go index c8925b49b..96b6067a9 100644 --- a/cmd/bleve/cmd/index.go +++ b/cmd/bleve/cmd/index.go @@ -111,7 +111,7 @@ func getAllFiles(args []string, rv chan file) { func init() { RootCmd.AddCommand(indexCmd) - indexCmd.Flags().BoolVarP(&keepDir, "keepDir", "d", false, "Keep the directory in the document id, defaults false.") - indexCmd.Flags().BoolVarP(&keepExt, "keepExt", "x", false, 
"Keep the extension in the document id, defaults false.") - indexCmd.Flags().BoolVarP(&parseJSON, "json", "j", true, "Parse the contents as JSON, defaults true.") + indexCmd.Flags().BoolVarP(&keepDir, "keepDir", "d", false, "Keep the directory in the document id.") + indexCmd.Flags().BoolVarP(&keepExt, "keepExt", "x", false, "Keep the extension in the document id.") + indexCmd.Flags().BoolVarP(&parseJSON, "json", "j", true, "Parse the contents as JSON.") } diff --git a/cmd/bleve/cmd/query.go b/cmd/bleve/cmd/query.go index e0babc88f..6e1bd0943 100644 --- a/cmd/bleve/cmd/query.go +++ b/cmd/bleve/cmd/query.go @@ -89,13 +89,13 @@ func buildQuery(args []string) query.Query { func init() { RootCmd.AddCommand(queryCmd) - queryCmd.Flags().IntVarP(&repeat, "repeat", "r", 1, "Repeat the query this many times, default 1.") - queryCmd.Flags().IntVarP(&limit, "limit", "l", 10, "Limit number of results returned, default 10.") - queryCmd.Flags().IntVarP(&skip, "skip", "s", 0, "Skip the first N results, default 0.") - queryCmd.Flags().BoolVarP(&explain, "explain", "x", false, "Explain the result scoring, default false.") - queryCmd.Flags().BoolVar(&highlight, "highlight", true, "Highlight matching text in results, default true.") - queryCmd.Flags().BoolVar(&fields, "fields", false, "Load stored fields, default false.") - queryCmd.Flags().StringVarP(&qtype, "type", "t", "query_string", "Type of query to run, defaults to 'query_string'") - queryCmd.Flags().StringVarP(&qfield, "field", "f", "", "Restrict query to field, by default no restriction, not applicable to query_string queries.") + queryCmd.Flags().IntVarP(&repeat, "repeat", "r", 1, "Repeat the query this many times.") + queryCmd.Flags().IntVarP(&limit, "limit", "l", 10, "Limit number of results returned.") + queryCmd.Flags().IntVarP(&skip, "skip", "s", 0, "Skip the first N results.") + queryCmd.Flags().BoolVarP(&explain, "explain", "x", false, "Explain the result scoring.") + queryCmd.Flags().BoolVar(&highlight, "highlight", 
true, "Highlight matching text in results.") + queryCmd.Flags().BoolVar(&fields, "fields", false, "Load stored fields.") + queryCmd.Flags().StringVarP(&qtype, "type", "t", "query_string", "Type of query to run.") + queryCmd.Flags().StringVarP(&qfield, "field", "f", "", "Restrict query to field, not applicable to query_string queries.") queryCmd.Flags().StringVarP(&sortby, "sort-by", "b", "", "Sort by field.") } From 4b6bddf7bce80d69649c2bdfb4180906a9df395a Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Wed, 11 Nov 2020 13:49:17 -0500 Subject: [PATCH 726/728] fix analyzer lookup when field name contains dot (#1496) At query time, we sometimes attempt to lookup the correct analyzer for a field. When doing so, the "." character can be interpretted as a path separater. Doing this can result in lookup failure in cases where the user specified a field name which also contains a dot. This fix addresses the case where the path contains a dot, but the next element does not match a mapping at this level. Previously this ended all looukp in this mapping, now we attempt to match the remaining path as a whole, without splitting on the dot. This fix is intended to fix most common cases where a user has given a field a name with a dot. However, ambigutities between custom field names containing dots, and actual mapping paths can still happen, and must be manually avoided. 
--- mapping/document.go | 41 ++++++++++++++++++++++------------------- mapping/mapping_test.go | 20 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 19 deletions(-) diff --git a/mapping/document.go b/mapping/document.go index 355a602e5..dd42fab96 100644 --- a/mapping/document.go +++ b/mapping/document.go @@ -106,28 +106,31 @@ func (dm *DocumentMapping) fieldDescribedByPath(path string) *FieldMapping { return subDocMapping.fieldDescribedByPath(encodePath(pathElements[1:])) } } - } else { - // just 1 path elememnt - // first look for property name with empty field - for propName, subDocMapping := range dm.Properties { - if propName == pathElements[0] { - // found property name match, now look at its fields - for _, field := range subDocMapping.Fields { - if field.Name == "" || field.Name == pathElements[0] { - // match - return field - } + } + + // either the path just had one element + // or it had multiple, but no match for the first element at this level + // look for match with full path + + // first look for property name with empty field + for propName, subDocMapping := range dm.Properties { + if propName == path { + // found property name match, now look at its fields + for _, field := range subDocMapping.Fields { + if field.Name == "" || field.Name == path { + // match + return field } } } - // next, walk the properties again, looking for field overriding the name - for propName, subDocMapping := range dm.Properties { - if propName != pathElements[0] { - // property name isn't a match, but field name could override it - for _, field := range subDocMapping.Fields { - if field.Name == pathElements[0] { - return field - } + } + // next, walk the properties again, looking for field overriding the name + for propName, subDocMapping := range dm.Properties { + if propName != path { + // property name isn't a match, but field name could override it + for _, field := range subDocMapping.Fields { + if field.Name == path { + return field } } } diff --git 
a/mapping/mapping_test.go b/mapping/mapping_test.go index 1c9ced58a..564fbb212 100644 --- a/mapping/mapping_test.go +++ b/mapping/mapping_test.go @@ -1151,3 +1151,23 @@ func TestDefaultAnalyzerInheritance(t *testing.T) { t.Fatalf("Expected analyzer: xyz to be inherited by field, but got: '%v'", analyzer) } } + +func TestWrongAnalyzerSearchableAs(t *testing.T) { + fieldMapping := NewTextFieldMapping() + fieldMapping.Name = "geo.accuracy" + fieldMapping.Analyzer = "xyz" + + nestedMapping := NewDocumentMapping() + nestedMapping.AddFieldMappingsAt("accuracy", fieldMapping) + + docMapping := NewDocumentMapping() + docMapping.AddSubDocumentMapping("geo", nestedMapping) + + indexMapping := NewIndexMapping() + indexMapping.AddDocumentMapping("brewery", docMapping) + + analyzerName := indexMapping.AnalyzerNameForPath("geo.geo.accuracy") + if analyzerName != "xyz" { + t.Errorf("expected analyzer name `xyz`, got `%s`", analyzerName) + } +} From c3457bb70309d45b3c5daef87d06f6a0b3d0813c Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Fri, 20 Nov 2020 15:56:01 -0500 Subject: [PATCH 727/728] prepare to support zap v15.0.2 (#1499) --- go.mod | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/go.mod b/go.mod index fe5d25a35..10f4c5711 100644 --- a/go.mod +++ b/go.mod @@ -8,11 +8,11 @@ require ( github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.12 - github.com/blevesearch/zap/v12 v12.0.12 - github.com/blevesearch/zap/v13 v13.0.4 - github.com/blevesearch/zap/v14 v14.0.3 - github.com/blevesearch/zap/v15 v15.0.1 + github.com/blevesearch/zap/v11 v11.0.13 + github.com/blevesearch/zap/v12 v12.0.13 + github.com/blevesearch/zap/v13 v13.0.5 + github.com/blevesearch/zap/v14 v14.0.4 + github.com/blevesearch/zap/v15 v15.0.2 github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2 From 
d6a3fe28350d693afd14d58fe9f03e70ec693eca Mon Sep 17 00:00:00 2001 From: Marty Schoch Date: Tue, 8 Dec 2020 12:04:30 -0500 Subject: [PATCH 728/728] prepare for blevex go.mod (#1519) --- go.mod | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 10f4c5711..6300aea17 100644 --- a/go.mod +++ b/go.mod @@ -4,15 +4,15 @@ go 1.13 require ( github.com/RoaringBitmap/roaring v0.4.23 - github.com/blevesearch/blevex v0.0.0-20190916190636-152f0fe5c040 + github.com/blevesearch/blevex v1.0.0 github.com/blevesearch/go-porterstemmer v1.0.3 github.com/blevesearch/segment v0.9.0 github.com/blevesearch/snowballstem v0.9.0 - github.com/blevesearch/zap/v11 v11.0.13 - github.com/blevesearch/zap/v12 v12.0.13 - github.com/blevesearch/zap/v13 v13.0.5 - github.com/blevesearch/zap/v14 v14.0.4 - github.com/blevesearch/zap/v15 v15.0.2 + github.com/blevesearch/zap/v11 v11.0.14 + github.com/blevesearch/zap/v12 v12.0.14 + github.com/blevesearch/zap/v13 v13.0.6 + github.com/blevesearch/zap/v14 v14.0.5 + github.com/blevesearch/zap/v15 v15.0.3 github.com/couchbase/moss v0.1.0 github.com/couchbase/vellum v1.0.2 github.com/golang/protobuf v1.3.2