-
Notifications
You must be signed in to change notification settings - Fork 25k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Index phrases #30450
Index phrases #30450
Changes from 10 commits
e39d396
e931107
3719e30
4cea300
efd612a
b591fc4
aca2b7e
910f0c1
e0fe29d
c156cbd
007ee3d
66b1e48
4d6cb66
9eb8a6d
69cf210
0deebb6
6cfa4b1
6208c31
1d7852e
289186e
518e280
8dd5cd5
dab97ad
b2e732a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -96,6 +96,12 @@ The following parameters are accepted by `text` fields: | |
the expense of a larger index. Accepts an | ||
<<index-prefix-config,`index-prefix configuration block`>> | ||
|
||
<<index-phrases,`index_phrases`>>:: | ||
|
||
If enabled, two-term word combinations ('shingles') are indexed into a separate | ||
field. This allows phrase queries to run more efficiently, at the expense | ||
of a larger index. Accepts `true` or `false` (default). | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe we should document that this feature works better when stop words are not removed? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. +1 |
||
|
||
<<norms,`norms`>>:: | ||
|
||
Whether field-length should be taken into account when scoring queries. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -80,7 +80,7 @@ private ICUTokenizerConfig getIcuConfig(Environment env, Settings settings) { | |
if (tailored.isEmpty()) { | ||
return null; | ||
} else { | ||
final BreakIterator breakers[] = new BreakIterator[UScript.CODE_LIMIT]; | ||
final RuleBasedBreakIterator breakers[] = new RuleBasedBreakIterator[UScript.CODE_LIMIT]; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This change is already in master. |
||
for (Map.Entry<Integer, String> entry : tailored.entrySet()) { | ||
int code = entry.getKey(); | ||
String resourcePath = entry.getValue(); | ||
|
@@ -105,7 +105,7 @@ public RuleBasedBreakIterator getBreakIterator(int script) { | |
} | ||
|
||
//parse a single RBBi rule file | ||
private BreakIterator parseRules(String filename, Environment env) throws IOException { | ||
private RuleBasedBreakIterator parseRules(String filename, Environment env) throws IOException { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Same here. |
||
|
||
final Path path = env.configFile().resolve(filename); | ||
String rules = Files.readAllLines(path) | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
--- | ||
"search with indexed phrases": | ||
- skip: | ||
version: " - 6.99.99" | ||
reason: index_phrase is only available as of 7.0.0 | ||
- do: | ||
indices.create: | ||
index: test | ||
body: | ||
mappings: | ||
test: | ||
properties: | ||
text: | ||
type: text | ||
index_phrases: true | ||
|
||
- do: | ||
index: | ||
index: test | ||
type: test | ||
id: 1 | ||
body: { text: "peter piper picked a peck of pickled peppers" } | ||
|
||
- do: | ||
indices.refresh: | ||
index: [test] | ||
|
||
- do: | ||
search: | ||
index: test | ||
body: | ||
query: | ||
match_phrase: | ||
text: | ||
query: "peter piper" | ||
|
||
- match: {hits.total: 1} | ||
|
||
- do: | ||
search: | ||
index: test | ||
q: '"peter piper"~1' | ||
df: text | ||
|
||
- match: {hits.total: 1} | ||
|
||
- do: | ||
search: | ||
index: test | ||
body: | ||
query: | ||
match_phrase: | ||
text: "peter piper picked" | ||
|
||
- match: {hits.total: 1} | ||
|
||
- do: | ||
search: | ||
index: test | ||
body: | ||
query: | ||
match_phrase: | ||
text: "piper" | ||
|
||
- match: {hits.total: 1} | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -43,6 +43,7 @@ | |
import org.elasticsearch.index.query.QueryRewriteContext; | ||
import org.elasticsearch.index.query.QueryShardContext; | ||
import org.elasticsearch.index.query.QueryShardException; | ||
import org.elasticsearch.index.search.MatchQuery; | ||
import org.elasticsearch.index.similarity.SimilarityProvider; | ||
import org.elasticsearch.search.DocValueFormat; | ||
import org.joda.time.DateTimeZone; | ||
|
@@ -360,6 +361,10 @@ public Query nullValueQuery() { | |
|
||
public abstract Query existsQuery(QueryShardContext context); | ||
|
||
public MatchQuery matchQuery(QueryShardContext context, String analyzer, int slop) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can this be changed to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. or even further: There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. +1, I think the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @jimczi I would be cautious about that. This optimization should return the same hits as a phrase query on the original field. On the regular field, a query for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks for clarifying. I agree this is trappy so +1 to disable this optim when gaps are detected and to add documentation about this limitation. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've changed the signature to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We might not want the latter, since the phrase query on the regular field would match everything that has Even though this might better fit the original intention of the user, I'd like this feature to be as transparent as possible and not change the set of matches for a given query? |
||
throw new QueryShardException(context, "Can only use match queries on keyword and text fields - not on [" + name + "] which is of type [" + typeName() + "]"); | ||
} | ||
|
||
/** | ||
* An enum used to describe the relation between the range of terms in a | ||
* shard when compared with a query range | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
As a non-native speaker, it feels weird that
`index_prefix`
is singular while `index_phrases`
is plural?