Skip to content

Commit

Permalink
Normalize on TrieMap creation #416 (#430)
Browse files Browse the repository at this point in the history
* Adding input normalization to TrieMap building in CommonRules #416

* Remove input sequence creation from Input and Term #416

* Removing ignoreCase config and defaulting to LOWERCASE lookup #416

* Removed obsolete TODO
  • Loading branch information
renekrie authored Feb 1, 2023
1 parent d2e717a commit 9b9f2df
Show file tree
Hide file tree
Showing 29 changed files with 594 additions and 405 deletions.
35 changes: 0 additions & 35 deletions querqy-core/src/main/java/querqy/model/Input.java
Original file line number Diff line number Diff line change
Expand Up @@ -107,42 +107,7 @@ public void applyInstructions(final Instructions instructions, final RulesCollec
builder.addRule(this, instructions);
}

/**
 * Produces the char sequences represented by the terms of this input.
 *
 * @param lowerCaseValues forwarded unchanged to {@code getCharSequences} of every input term
 * @return the char sequences of the only term if there is exactly one input term; otherwise the
 *         sequences that {@code collectTails} builds by combining one char sequence per term
 */
public List<ComparableCharSequence> getInputSequences(final boolean lowerCaseValues) {

    // Exactly one term: nothing to combine, hand back its sequences directly.
    if (inputTerms.size() == 1) {
        return inputTerms.get(0).getCharSequences(lowerCaseValues);
    }

    // One slot per term (in term order), each slot holding that term's alternative sequences.
    final LinkedList<List<ComparableCharSequence>> termSlots = new LinkedList<>();
    inputTerms.forEach(term -> termSlots.add(term.getCharSequences(lowerCaseValues)));

    final List<ComparableCharSequence> combinations = new LinkedList<>();
    collectTails(new LinkedList<>(), termSlots, combinations);
    return combinations;

}

/**
 * Recursively expands the slots into every combination that picks one char sequence per slot and adds
 * each combination to {@code result} as a {@link CompoundCharSequence} with a single space as separator.
 *
 * @param prefix    the sequences already chosen from the preceding slots
 * @param tailSlots the slots that still have to contribute a sequence; the first slot is consumed here
 * @param result    receives the completed combinations
 */
void collectTails(final List<ComparableCharSequence> prefix, List<List<ComparableCharSequence>> tailSlots,
                  final List<ComparableCharSequence> result) {

    final boolean isLastSlot = tailSlots.size() == 1;

    // Only needed (and only computed) when more slots follow the current one.
    final List<List<ComparableCharSequence>> remainingSlots = isLastSlot
            ? null
            : tailSlots.subList(1, tailSlots.size());

    for (final ComparableCharSequence candidate : tailSlots.get(0)) {
        // Each branch gets its own copy so that sibling branches do not see each other's choices.
        final List<ComparableCharSequence> extended = new LinkedList<>(prefix);
        extended.add(candidate);

        if (isLastSlot) {
            result.add(new CompoundCharSequence(" ", extended));
        } else {
            collectTails(extended, remainingSlots, result);
        }
    }
}

}

Expand Down
5 changes: 0 additions & 5 deletions querqy-core/src/main/java/querqy/model/Term.java
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,6 @@ public ComparableCharSequence subSequence(final int start, final int end) {
return value.subSequence(start, end);
}

/**
 * Returns the value of this term, prefixed with {@code field} and a {@code ":"} separator if a field
 * is set.
 *
 * @param lowerCaseValue if true, this term is wrapped in a {@link LowerCaseCharSequence}; otherwise
 *                       {@code value} is used as is
 * @return the (optionally wrapped) value, or a {@link CompoundCharSequence} of field and value
 */
public ComparableCharSequence toCharSequenceWithField(final boolean lowerCaseValue) {
    final ComparableCharSequence valueToUse;
    if (lowerCaseValue) {
        valueToUse = new LowerCaseCharSequence(this);
    } else {
        valueToUse = value;
    }

    if (field == null) {
        return valueToUse;
    }
    return new CompoundCharSequence(":", field, valueToUse);
}

@Override
public int hashCode() {
final int prime = 31;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ public class SimpleCommonRulesRewriterFactory extends RewriterFactory {
* @param boostMethod The {@link querqy.rewrite.commonrules.model.BoostInstruction.BoostMethod} to use when creating
* {@link querqy.rewrite.commonrules.model.BoostInstruction}s
* @param querqyParserFactory A parser for the right-hand side of rules
* @param ignoreCase Iff true, rule input matching is case insensitive.
* @param selectionStrategyFactories A mapping between names of rule selection strategies and their factories.
* @param defaultSelectionStrategyFactory The default {@link SelectionStrategyFactory} to be used if no strategy is
* specified as a request parameter
Expand All @@ -85,7 +84,6 @@ public SimpleCommonRulesRewriterFactory(final String rewriterId,
final boolean allowBooleanInput,
final BoostMethod boostMethod,
final QuerqyParserFactory querqyParserFactory,
final boolean ignoreCase,
final Map<String, SelectionStrategyFactory> selectionStrategyFactories,
final SelectionStrategyFactory defaultSelectionStrategyFactory,
final boolean buildTermCache,
Expand Down Expand Up @@ -122,7 +120,7 @@ public SimpleCommonRulesRewriterFactory(final String rewriterId,
.querqyParserFactory(querqyParserFactory)
.allowedInstructionTypes(ALLOWED_TYPES)
.build())
.rulesCollectionBuilder(new TrieMapRulesCollectionBuilder(ignoreCase, lookupPreprocessor))
.rulesCollectionBuilder(new TrieMapRulesCollectionBuilder(lookupPreprocessor))
.build();

final RulesParser rulesParser = RulesParserFactory.textParser(config);
Expand All @@ -131,7 +129,6 @@ public SimpleCommonRulesRewriterFactory(final String rewriterId,
trieMapLookupQueryVisitorFactory = TrieMapLookupQueryVisitorFactory.of(
trieMap,
LookupConfig.builder()
.ignoreCase(ignoreCase)
.hasBoundaries(true)
.preprocessor(lookupPreprocessor)
.build()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,22 @@

import java.util.*;

import querqy.CompoundCharSequence;
import querqy.model.ExpandedQuery;
import querqy.rewrite.QueryRewriter;
import querqy.rewrite.SearchEngineRequestAdapter;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessor;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessorFactory;

/**
* @author René Kriegler, @renekrie
*
*/
public class DeleteInstruction implements Instruction {

private static final LookupPreprocessor LOWERCASE_PREPROCESSOR = LookupPreprocessorFactory.lowercase();
private static final InputSequenceNormalizer LOWERCASE_INPUT_NORMALIZER = new InputSequenceNormalizer(
LOWERCASE_PREPROCESSOR);

protected final List<? extends Term> termsToDelete;
protected final Set<CharSequence> charSequencesToDelete;
Expand All @@ -29,26 +36,22 @@ public DeleteInstruction(final List<? extends Term> termsToDelete) {
this(termsToDelete, InstructionDescription.empty());
}

public DeleteInstruction(final List<? extends Term> termsToDelete, final InstructionDescription instructionDescription) {
public DeleteInstruction(final List<? extends Term> termsToDelete,
final InstructionDescription instructionDescription) {
this.termsToDelete = termsToDelete;
charSequencesToDelete = new HashSet<>();
final List<PrefixTerm> prefixes = new ArrayList<>();
for (Term term : termsToDelete) {
for (final Term term : termsToDelete) {
if (term instanceof PrefixTerm) {
prefixes.add((PrefixTerm) term);
} else {
charSequencesToDelete.addAll(term.getCharSequences(true));
charSequencesToDelete.addAll(LOWERCASE_INPUT_NORMALIZER.getTermCharSequences(term));
}
}
prefixesToDeleted = prefixes.isEmpty() ? null : prefixes;
this.instructionDescription = instructionDescription;
}

/**
 * Returns the terms that this instruction deletes.
 *
 * @return the list held in {@code termsToDelete} (the same list instance, not a copy)
 */
public List<? extends Term> getTermsToDelete() {
return termsToDelete;
}


/* (non-Javadoc)
* @see querqy.rewrite.commonrules.model.Instruction#apply(querqy.rewrite.commonrules.model.PositionSequence, querqy.rewrite.commonrules.model.TermMatches, int, int, querqy.model.ExpandedQuery, java.util.Map)
*/
Expand All @@ -72,7 +75,12 @@ public boolean isToBeDeleted(final querqy.model.Term term) {
}
}
}
return charSequencesToDelete.contains(term.toCharSequenceWithField(true));
return charSequencesToDelete.contains(getLookupCharSequence(term));
}

/**
 * Builds the char sequence under which a query term is looked up in {@code charSequencesToDelete}.
 *
 * @param term the query term to look up
 * @return the term as processed by {@code LOWERCASE_PREPROCESSOR}, prefixed with the term's field and
 *         a {@code ":"} separator if the term has a field
 */
private CharSequence getLookupCharSequence(final querqy.model.Term term) {
    final CharSequence processed = LOWERCASE_PREPROCESSOR.process(term);
    final String field = term.getField();
    if (field == null) {
        return processed;
    }
    return new CompoundCharSequence(":", field, processed);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
package querqy.rewrite.commonrules.model;

import querqy.CompoundCharSequence;
import querqy.model.Input;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessor;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessorFactory;

import java.util.ArrayList;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

import static querqy.rewrite.commonrules.model.TrieMapRulesCollection.*;

/**
 * Creates the char sequences for the input side of a rule: term values are passed through a
 * {@link LookupPreprocessor}, field names are prepended where a term has them, the sequences of several
 * terms are combined into space-separated sequences, and boundary markers are added where the input
 * requires them.
 */
public class InputSequenceNormalizer {

    /** The only sequence for an input without terms: two boundary markers separated by a space. */
    private static final List<CharSequence> SPACE = Collections.singletonList(
            new CompoundCharSequence(" ", BOUNDARY_WORD, BOUNDARY_WORD));

    /** Applied to prefix terms whenever the configured preprocessor is not the identity preprocessor. */
    private static final LookupPreprocessor LOWER_CASE_PREPROCESSOR = LookupPreprocessorFactory.lowercase();

    private final LookupPreprocessor lookupPreprocessor;
    private final boolean isIdentityPreprocessor;

    /**
     * @param lookupPreprocessor the preprocessor to apply to the value of every non-prefix term
     */
    public InputSequenceNormalizer(final LookupPreprocessor lookupPreprocessor) {
        this.lookupPreprocessor = lookupPreprocessor;
        // Reference comparison: only the instance handed out by the factory counts as identity.
        isIdentityPreprocessor = lookupPreprocessor == LookupPreprocessorFactory.identity();
    }

    /**
     * Returns all normalized char sequences for the given input.
     *
     * @param input the input to normalize
     * @return for an input without terms, the boundary-only sequence; otherwise the sequences of the
     *         input terms (combined across terms if there is more than one), each with boundary markers
     *         applied as required by the input
     * @throws IllegalArgumentException if the input has no terms and does not require both boundaries
     */
    public List<CharSequence> getNormalizedInputSequences(final Input.SimpleInput input) {
        final List<Term> inputTerms = input.getInputTerms();
        if (inputTerms.isEmpty()) {
            return spaceInput(input.isRequiresLeftBoundary(), input.isRequiresRightBoundary());
        }

        final List<CharSequence> seqs = inputTerms.size() == 1
                ? getTermCharSequences(inputTerms.get(0))
                : getTermCharSequences(inputTerms);

        return seqs.stream()
                .map(seq -> applyBoundaries(seq, input.isRequiresLeftBoundary(), input.isRequiresRightBoundary()))
                .collect(Collectors.toList());

    }

    /**
     * Adds the boundary marker on the left and/or right of a sequence, separated by a space.
     *
     * @param seq                   the sequence to wrap
     * @param requiresLeftBoundary  whether a boundary marker must precede the sequence
     * @param requiresRightBoundary whether a boundary marker must follow the sequence
     * @return the wrapped sequence, or {@code seq} itself if no boundary is required
     */
    protected CharSequence applyBoundaries(final CharSequence seq, final boolean requiresLeftBoundary,
                                           final boolean requiresRightBoundary) {
        if (requiresLeftBoundary && requiresRightBoundary) {
            return new CompoundCharSequence(" ", BOUNDARY_WORD, seq, BOUNDARY_WORD);
        }
        if (requiresLeftBoundary) {
            return new CompoundCharSequence(" ", BOUNDARY_WORD, seq);
        }
        if (requiresRightBoundary) {
            return new CompoundCharSequence(" ", seq, BOUNDARY_WORD);
        }
        return seq;
    }

    /**
     * Returns the sequences for an input that has no terms.
     *
     * @param isLeftBoundaryRequired  whether the input requires a left boundary
     * @param isRightBoundaryRequired whether the input requires a right boundary
     * @return the shared singleton list holding the boundary-only sequence
     * @throws IllegalArgumentException unless both boundaries are required
     */
    protected List<CharSequence> spaceInput(final boolean isLeftBoundaryRequired,
                                            final boolean isRightBoundaryRequired) {
        if (!(isLeftBoundaryRequired && isRightBoundaryRequired)) {
            throw new IllegalArgumentException("Empty input!");
        }

        return SPACE;
    }

    /**
     * Returns the char sequences for a single term.
     *
     * @param term the term to process
     * @return a singleton list with the processed value if the term has no field names; otherwise one
     *         sequence per field name, consisting of the field name, {@link Term#FIELD_CHAR} and the
     *         processed value
     */
    protected List<CharSequence> getTermCharSequences(final Term term) {

        // Prefix terms bypass the configured preprocessor, see getPrefixCharSequence.
        final CharSequence value = term instanceof PrefixTerm ? getPrefixCharSequence((PrefixTerm) term)
                : lookupPreprocessor.process(term);

        if (!term.hasFieldNames()) {
            return Collections.singletonList(value);
        }

        return term.getFieldNames().stream().map(name -> new CompoundCharSequence(Term.FIELD_CHAR, name, value))
                .collect(Collectors.toList());

    }

    /**
     * Returns the char sequences for a sequence of terms: every combination that takes one char sequence
     * per term, in term order, joined by a single space.
     *
     * @param terms the terms to combine; must contain at least one term
     * @return the combined sequences
     */
    protected List<CharSequence> getTermCharSequences(final List<Term> terms) {

        // Random-access list: collectTails reads the head and takes sub-list views at every level.
        final List<List<CharSequence>> slots = new ArrayList<>(terms.size());

        for (final Term inputTerm : terms) {
            slots.add(getTermCharSequences(inputTerm));
        }

        final List<CharSequence> seqs = new ArrayList<>();
        collectTails(new ArrayList<>(), slots, seqs);

        return seqs;

    }

    /**
     * Recursively expands the slots into every combination that picks one sequence per slot and adds
     * each combination to {@code result} as a space-separated {@link CompoundCharSequence}.
     *
     * @param prefix    the sequences already chosen from the preceding slots
     * @param tailSlots the slots that still have to contribute a sequence; must not be empty
     * @param result    receives the completed combinations
     */
    void collectTails(final List<CharSequence> prefix, List<List<CharSequence>> tailSlots,
                      final List<CharSequence> result) {
        if (tailSlots.size() == 1) {
            for (final CharSequence sequence : tailSlots.get(0)) {
                // Fresh list per combination: it is handed over to the CompoundCharSequence.
                final List<CharSequence> combined = new ArrayList<>(prefix.size() + 1);
                combined.addAll(prefix);
                combined.add(sequence);
                result.add(new CompoundCharSequence(" ", combined));
            }
        } else {

            final List<List<CharSequence>> newTail = tailSlots.subList(1, tailSlots.size());
            for (final CharSequence sequence : tailSlots.get(0)) {
                // Copy so that sibling branches do not see each other's choices.
                final List<CharSequence> newPrefix = new ArrayList<>(prefix.size() + 1);
                newPrefix.addAll(prefix);
                newPrefix.add(sequence);
                collectTails(newPrefix, newTail, result);
            }
        }
    }

    /**
     * Get the char sequence for a {@link PrefixTerm}. Returns the sequence verbatim if we are using an identity
     * LookupPreprocessor and the lower-cased sequence otherwise. This means that stemming etc. will not be applied
     * to a prefix term.
     *
     * @param term The prefix term to process.
     * @return The char sequence for the prefix term
     */
    protected CharSequence getPrefixCharSequence(final PrefixTerm term) {
        return isIdentityPreprocessor ? term : LOWER_CASE_PREPROCESSOR.process(term);
    }

}
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,11 @@

import querqy.ComparableCharSequence;
import querqy.CompoundCharSequence;
import querqy.LowerCaseCharSequence;
import querqy.SimpleComparableCharSequence;

public class Term implements ComparableCharSequence {

public final String FIELD_CHAR = ":";
public static final String FIELD_CHAR = ":";
protected final char[] value;
protected final int start;
protected final int length;
Expand Down Expand Up @@ -228,30 +227,14 @@ public ComparableCharSequence subSequence(final int start, final int end) {
return new SimpleComparableCharSequence(value, this.start + start, end - start);
}


/**
 * Returns the char sequences of this term: the plain value if the term has no field names, otherwise
 * one sequence per field name made of the field name, {@code FIELD_CHAR} and the value.
 *
 * @param lowerCaseValue if true, the value is wrapped in a {@link LowerCaseCharSequence}
 * @return a new list holding the sequences
 */
public List<ComparableCharSequence> getCharSequences(final boolean lowerCaseValue) {

    final SimpleComparableCharSequence rawValue = new SimpleComparableCharSequence(value, start, length);

    final ComparableCharSequence valueSequence;
    if (lowerCaseValue) {
        valueSequence = new LowerCaseCharSequence(rawValue);
    } else {
        valueSequence = rawValue;
    }

    final List<ComparableCharSequence> result = new LinkedList<>();

    if (fieldNames != null) {
        for (final String fieldName : fieldNames) {
            result.add(new CompoundCharSequence(FIELD_CHAR, fieldName, valueSequence));
        }
    } else {
        result.add(valueSequence);
    }

    return result;
}

/**
 * Returns the field names of this term.
 *
 * @return the value of {@code fieldNames}; this is null if the term has no field names
 */
public List<String> getFieldNames() {
return fieldNames;
}

/**
 * Tells whether field names are set for this term.
 *
 * @return true iff {@code fieldNames} is not null (an empty list still counts as set)
 */
public boolean hasFieldNames() {
return fieldNames != null;
}

/**
 * Returns the placeholders of this term.
 *
 * @return the value of {@code placeHolders} (the same list instance, not a copy)
 */
// NOTE(review): whether placeHolders can be null is not visible from here - confirm before dereferencing.
public LinkedList<PlaceHolder> getPlaceHolders() {
return placeHolders;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
import querqy.model.InputSequenceElement;
import querqy.model.Term;
import querqy.rewrite.commonrules.select.TopRewritingActionCollector;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessor;
import querqy.rewrite.lookup.preprocessing.LookupPreprocessorFactory;
import querqy.trie.State;
import querqy.trie.States;
import querqy.trie.TrieMap;
Expand All @@ -26,14 +28,15 @@ public class TrieMapRulesCollection implements RulesCollection {
public static final String BOUNDARY_WORD = "\u0002";

final TrieMap<InstructionsSupplier> trieMap;
final boolean ignoreCase;
private final LookupPreprocessor lookupPreprocessor;

public TrieMapRulesCollection(final TrieMap<InstructionsSupplier> trieMap, final boolean ignoreCase) {
public TrieMapRulesCollection(final TrieMap<InstructionsSupplier> trieMap,
final LookupPreprocessor lookupPreprocessor) {
if (trieMap == null) {
throw new IllegalArgumentException("trieMap must not be null");
}
this.trieMap = trieMap;
this.ignoreCase = ignoreCase;
this.lookupPreprocessor = lookupPreprocessor;
}

/* (non-Javadoc)
Expand All @@ -57,7 +60,7 @@ public void collectRewriteActions(final PositionSequence<InputSequenceElement> s
.filter(Term.class::isInstance)
.map(Term.class::cast).forEach(term -> {

final States<InstructionsSupplier> states = trieMap.get(term.toCharSequenceWithField(ignoreCase));
final States<InstructionsSupplier> states = trieMap.get(createLookupCharSequence(term));

final State<InstructionsSupplier> stateExactMatch = states.getStateForCompleteSequence();
if (stateExactMatch.isFinal() && stateExactMatch.value != null) {
Expand Down Expand Up @@ -101,7 +104,7 @@ public void collectRewriteActions(final PositionSequence<InputSequenceElement> s

final CharSequence charSequenceForLookup;
if (isTerm) {
charSequenceForLookup = ((Term) element).toCharSequenceWithField(ignoreCase);
charSequenceForLookup = createLookupCharSequence((Term) element);
} else if (element instanceof InputBoundary) {
charSequenceForLookup = BOUNDARY_WORD;
} else {
Expand Down Expand Up @@ -245,6 +248,12 @@ public Set<Instruction> getInstructions() {
return result;
}

/**
 * Builds the char sequence under which a query term is looked up in the trie map.
 *
 * @param term the query term
 * @return the term as processed by {@code lookupPreprocessor}, prefixed with the term's field and a
 *         {@code ":"} separator if the term has a field
 */
private CharSequence createLookupCharSequence(final Term term) {
    final CharSequence processed = lookupPreprocessor.process(term);
    final String fieldName = term.getField();
    if (fieldName == null) {
        return processed;
    }
    return new CompoundCharSequence(":", fieldName, processed);
}

public static class Prefix<T> {
final State<T> stateInfo;
final List<TermMatch> matches;
Expand Down
Loading

0 comments on commit 9b9f2df

Please sign in to comment.