Skip to content

Commit

Permalink
MB-31210: Support Fuzzy MatchPhrase, MultiPhrase and Phrase queries (#…
Browse files Browse the repository at this point in the history
…1847)

* Support setting of a fuzziness attribute for MatchPhrase, MultiPhrase and Phrase queries 

* Add unit tests

* Minor fixes
  • Loading branch information
CascadingRadium committed Aug 29, 2023
1 parent 911fa7f commit d162c05
Show file tree
Hide file tree
Showing 8 changed files with 211 additions and 53 deletions.
6 changes: 6 additions & 0 deletions search/query/match_phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ type MatchPhraseQuery struct {
FieldVal string `json:"field,omitempty"`
Analyzer string `json:"analyzer,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
}

// NewMatchPhraseQuery creates a new Query object
Expand Down Expand Up @@ -58,6 +59,10 @@ func (q *MatchPhraseQuery) SetField(f string) {
q.FieldVal = f
}

// SetFuzziness stores f as the fuzziness of this match phrase query.
// Searcher forwards the stored value to the phrase query it builds from
// the analyzed tokens.
func (q *MatchPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// Field returns the field name stored in FieldVal.
func (q *MatchPhraseQuery) Field() string {
return q.FieldVal
}
Expand All @@ -84,6 +89,7 @@ func (q *MatchPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m
phrase := tokenStreamToPhrase(tokens)
phraseQuery := NewMultiPhraseQuery(phrase, field)
phraseQuery.SetBoost(q.BoostVal.Value())
phraseQuery.SetFuzziness(q.Fuzziness)
return phraseQuery.Searcher(ctx, i, m, options)
}
noneQuery := NewMatchNoneQuery()
Expand Down
14 changes: 10 additions & 4 deletions search/query/multi_phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ import (
)

type MultiPhraseQuery struct {
Terms [][]string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Terms [][]string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
}

// NewMultiPhraseQuery creates a new Query for finding
Expand All @@ -47,6 +48,10 @@ func NewMultiPhraseQuery(terms [][]string, field string) *MultiPhraseQuery {
}
}

// SetFuzziness stores f as the fuzziness of this multi-phrase query.
// Searcher hands the stored value to searcher.NewMultiPhraseSearcher.
func (q *MultiPhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

func (q *MultiPhraseQuery) SetBoost(b float64) {
boost := Boost(b)
q.BoostVal = &boost
Expand All @@ -57,7 +62,7 @@ func (q *MultiPhraseQuery) Boost() float64 {
}

func (q *MultiPhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Field, options)
return searcher.NewMultiPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
}

func (q *MultiPhraseQuery) Validate() error {
Expand All @@ -77,5 +82,6 @@ func (q *MultiPhraseQuery) UnmarshalJSON(data []byte) error {
q.Terms = tmp.Terms
q.Field = tmp.Field
q.BoostVal = tmp.BoostVal
q.Fuzziness = tmp.Fuzziness
return nil
}
14 changes: 10 additions & 4 deletions search/query/phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,10 @@ import (
)

type PhraseQuery struct {
Terms []string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Terms []string `json:"terms"`
Field string `json:"field,omitempty"`
BoostVal *Boost `json:"boost,omitempty"`
Fuzziness int `json:"fuzziness"`
}

// NewPhraseQuery creates a new Query for finding
Expand All @@ -49,12 +50,16 @@ func (q *PhraseQuery) SetBoost(b float64) {
q.BoostVal = &boost
}

// SetFuzziness stores f as the fuzziness of this phrase query.
// Searcher hands the stored value to searcher.NewPhraseSearcher.
func (q *PhraseQuery) SetFuzziness(f int) {
q.Fuzziness = f
}

// Boost returns the boost of this query as reported by BoostVal.Value().
func (q *PhraseQuery) Boost() float64 {
return q.BoostVal.Value()
}

func (q *PhraseQuery) Searcher(ctx context.Context, i index.IndexReader, m mapping.IndexMapping, options search.SearcherOptions) (search.Searcher, error) {
return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Field, options)
return searcher.NewPhraseSearcher(ctx, i, q.Terms, q.Fuzziness, q.Field, q.BoostVal.Value(), options)
}

func (q *PhraseQuery) Validate() error {
Expand All @@ -74,5 +79,6 @@ func (q *PhraseQuery) UnmarshalJSON(data []byte) error {
q.Terms = tmp.Terms
q.Field = tmp.Field
q.BoostVal = tmp.BoostVal
q.Fuzziness = tmp.Fuzziness
return nil
}
48 changes: 24 additions & 24 deletions search/query/query.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,25 +72,18 @@ func ParseQuery(input []byte) (Query, error) {
if err != nil {
return nil, err
}
_, isMatchQuery := tmp["match"]
_, hasFuzziness := tmp["fuzziness"]
if hasFuzziness && !isMatchQuery {
_, isMatchQuery := tmp["match"]
_, isMatchPhraseQuery := tmp["match_phrase"]
_, hasTerms := tmp["terms"]
if hasFuzziness && !isMatchQuery && !isMatchPhraseQuery && !hasTerms {
var rv FuzzyQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, isTermQuery := tmp["term"]
if isTermQuery {
var rv TermQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
if isMatchQuery {
var rv MatchQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -99,7 +92,6 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, isMatchPhraseQuery := tmp["match_phrase"]
if isMatchPhraseQuery {
var rv MatchPhraseQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -108,18 +100,6 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, hasMust := tmp["must"]
_, hasShould := tmp["should"]
_, hasMustNot := tmp["must_not"]
if hasMust || hasShould || hasMustNot {
var rv BooleanQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasTerms := tmp["terms"]
if hasTerms {
var rv PhraseQuery
err := json.Unmarshal(input, &rv)
Expand All @@ -134,6 +114,26 @@ func ParseQuery(input []byte) (Query, error) {
}
return &rv, nil
}
_, isTermQuery := tmp["term"]
if isTermQuery {
var rv TermQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasMust := tmp["must"]
_, hasShould := tmp["should"]
_, hasMustNot := tmp["must_not"]
if hasMust || hasShould || hasMustNot {
var rv BooleanQuery
err := json.Unmarshal(input, &rv)
if err != nil {
return nil, err
}
return &rv, nil
}
_, hasConjuncts := tmp["conjuncts"]
if hasConjuncts {
var rv ConjunctionQuery
Expand Down
4 changes: 4 additions & 0 deletions search/searcher/search_fuzzy.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ func NewFuzzySearcher(ctx context.Context, indexReader index.IndexReader, term s
if ctx != nil {
reportIOStats(ctx, dictBytesRead)
search.RecordSearchCost(ctx, search.AddM, dictBytesRead)
fuzzyTermMatches := ctx.Value(search.FuzzyMatchPhraseKey)
if fuzzyTermMatches != nil {
fuzzyTermMatches.(map[string][]string)[term] = candidates
}
}

return NewMultiTermSearcher(ctx, indexReader, candidates, field,
Expand Down
96 changes: 77 additions & 19 deletions search/searcher/search_phrase.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ type PhraseSearcher struct {
paths []phrasePath
locations []search.Location
initialized bool
// map a term to a list of fuzzy terms that match it
fuzzyTermMatches map[string][]string
}

func (s *PhraseSearcher) Size() int {
Expand All @@ -64,22 +66,42 @@ func (s *PhraseSearcher) Size() int {
return sizeInBytes
}

func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
func NewPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms []string,
fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {

// turn flat terms []string into [][]string
mterms := make([][]string, len(terms))
for i, term := range terms {
mterms[i] = []string{term}
}
return NewMultiPhraseSearcher(ctx, indexReader, mterms, field, options)
return NewMultiPhraseSearcher(ctx, indexReader, mterms, fuzziness, field, boost, options)
}

func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string, field string, options search.SearcherOptions) (*PhraseSearcher, error) {
func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader, terms [][]string,
fuzziness int, field string, boost float64, options search.SearcherOptions) (*PhraseSearcher, error) {

options.IncludeTermVectors = true
var termPositionSearchers []search.Searcher
var err error
var ts search.Searcher
var fuzzyTermMatches map[string][]string
if fuzziness > 0 {
fuzzyTermMatches = make(map[string][]string)
ctx = context.WithValue(ctx, search.FuzzyMatchPhraseKey, fuzzyTermMatches)
}
// in case of fuzzy multi-phrase, phrase and match-phrase queries we hardcode the
// prefix length to 0, as setting a per word matching prefix length would not
// make sense from a user perspective.
for _, termPos := range terms {
if len(termPos) == 1 && termPos[0] != "" {
// single term
ts, err := NewTermSearcher(ctx, indexReader, termPos[0], field, 1.0, options)
if fuzziness > 0 {
// fuzzy
ts, err = NewFuzzySearcher(ctx, indexReader, termPos[0], 0, fuzziness, field, boost, options)
} else {
// non-fuzzy
ts, err = NewTermSearcher(ctx, indexReader, termPos[0], field, boost, options)
}
if err != nil {
// close any searchers already opened
for _, ts := range termPositionSearchers {
Expand All @@ -95,7 +117,13 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader,
if term == "" {
continue
}
ts, err := NewTermSearcher(ctx, indexReader, term, field, 1.0, options)
if fuzziness > 0 {
// fuzzy
ts, err = NewFuzzySearcher(ctx, indexReader, term, 0, fuzziness, field, boost, options)
} else {
// non-fuzzy
ts, err = NewTermSearcher(ctx, indexReader, term, field, boost, options)
}
if err != nil {
// close any searchers already opened
for _, ts := range termPositionSearchers {
Expand Down Expand Up @@ -128,8 +156,9 @@ func NewMultiPhraseSearcher(ctx context.Context, indexReader index.IndexReader,

// build our searcher
rv := PhraseSearcher{
mustSearcher: mustSearcher,
terms: terms,
mustSearcher: mustSearcher,
terms: terms,
fuzzyTermMatches: fuzzyTermMatches,
}
rv.computeQueryNorm()
return &rv, nil
Expand Down Expand Up @@ -213,7 +242,7 @@ func (s *PhraseSearcher) Next(ctx *search.SearchContext) (*search.DocumentMatch,

// checkCurrMustMatch is solely concerned with determining if the DocumentMatch
// pointed to by s.currMust (which satisfies the pre-condition searcher)
// also satisfies the phase constraints. if so, it returns a DocumentMatch
// also satisfies the phrase constraints. if so, it returns a DocumentMatch
// for this document, otherwise nil
func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.DocumentMatch {
s.locations = s.currMust.Complete(s.locations)
Expand Down Expand Up @@ -244,7 +273,7 @@ func (s *PhraseSearcher) checkCurrMustMatch(ctx *search.SearchContext) *search.D

// checkCurrMustMatchField is solely concerned with determining if one
// particular field within the currMust DocumentMatch Locations
// satisfies the phase constraints (possibly more than once). if so,
// satisfies the phrase constraints (possibly more than once). if so,
// the matching field term locations are appended to the provided
// slice
func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
Expand All @@ -253,7 +282,21 @@ func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
if s.path == nil {
s.path = make(phrasePath, 0, len(s.terms))
}
s.paths = findPhrasePaths(0, nil, s.terms, tlm, s.path[:0], 0, s.paths[:0])
var tlmPtr *search.TermLocationMap = &tlm
if s.fuzzyTermMatches != nil {
// if fuzzy search, we need to expand the tlm to include all the fuzzy matches
// Example - term is "foo" and fuzzy matches are "foo", "fool", "food"
// the non expanded tlm will be:
// foo -> Locations[foo]
// fool -> Locations[fool]
// food -> Locations[food]
// the expanded tlm will be:
// foo -> [Locations[foo], Locations[fool], Locations[food]]
expandedTlm := make(search.TermLocationMap)
s.expandFuzzyMatches(tlm, expandedTlm)
tlmPtr = &expandedTlm
}
s.paths = findPhrasePaths(0, nil, s.terms, *tlmPtr, s.path[:0], 0, s.paths[:0])
for _, p := range s.paths {
for _, pp := range p {
ftls = append(ftls, search.FieldTermLocation{
Expand All @@ -271,6 +314,16 @@ func (s *PhraseSearcher) checkCurrMustMatchField(ctx *search.SearchContext,
return ftls
}

// expandFuzzyMatches fills expandedTlm so that every query term recorded in
// s.fuzzyTermMatches maps to the locations of the term itself followed by the
// locations of each of its fuzzy candidates found in tlm.
//
// tlm is only read. expandedTlm is written to; existing entries for the same
// query terms are overwritten.
func (s *PhraseSearcher) expandFuzzyMatches(tlm search.TermLocationMap, expandedTlm search.TermLocationMap) {
	for term, fuzzyMatches := range s.fuzzyTermMatches {
		locations := tlm[term]
		// Clip the capacity so the appends below always copy into a fresh
		// backing array instead of writing into spare capacity owned by tlm.
		locations = locations[:len(locations):len(locations)]
		for _, fuzzyMatch := range fuzzyMatches {
			// The candidate list may contain the query term itself; its
			// locations are already present, so appending them again would
			// yield duplicate phrase paths.
			if fuzzyMatch == term {
				continue
			}
			locations = append(locations, tlm[fuzzyMatch]...)
		}
		expandedTlm[term] = locations
	}
}

type phrasePart struct {
term string
loc *search.Location
Expand Down Expand Up @@ -300,26 +353,31 @@ func (p phrasePath) String() string {
return rv
}

// findPhrasePaths is a function to identify phase matches from a set
// findPhrasePaths is a function to identify phrase matches from a set
// of known term locations. it is recursive so care must be taken with
// arguments and return values.
//
// prevPos - the previous location, 0 on first invocation
//
// ap - array positions of the first candidate phrase part to
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
// which further recursive phrase parts must match,
// nil on initial invocation or when there are no array positions
//
// phraseTerms - slice containing the phrase terms,
// may contain empty string as placeholder (don't care)
// may contain empty string as placeholder (don't care)
//
// tlm - the Term Location Map containing all relevant term locations
//
// p - the current path being explored (appended to in recursive calls)
// this is the primary state being built during the traversal
// this is the primary state being built during the traversal
//
// remainingSlop - amount of sloppiness that's allowed, which is the
// sum of the editDistances from each matching phrase part,
// where 0 means no sloppiness allowed (all editDistances must be 0),
// decremented during recursion
// sum of the editDistances from each matching phrase part, where 0 means no
// sloppiness allowed (all editDistances must be 0), decremented during recursion
//
// rv - the final result being appended to by all the recursive calls
//
// returns slice of paths, or nil if invocation did not find any successul paths
// returns slice of paths, or nil if invocation did not find any successful paths
func findPhrasePaths(prevPos uint64, ap search.ArrayPositions, phraseTerms [][]string,
tlm search.TermLocationMap, p phrasePath, remainingSlop int, rv []phrasePath) []phrasePath {
// no more terms
Expand Down
Loading

0 comments on commit d162c05

Please sign in to comment.