Skip to content

Commit

Permalink
Allow plugins to register pre-configured tokenizers (#24751)
Browse files Browse the repository at this point in the history
Allows plugins to register pre-configured tokenizers. Many
of the decisions are the same as those in #24223 and #24572.
This only migrates the lowercase tokenizer, but I figure that
is a good start because it proves out the features.
  • Loading branch information
nik9000 authored May 19, 2017
1 parent ae73670 commit b9ea579
Show file tree
Hide file tree
Showing 14 changed files with 557 additions and 323 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -74,14 +74,15 @@ public AnalysisRegistry(Environment environment,
Map<String, AnalysisProvider<TokenizerFactory>> tokenizers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> analyzers,
Map<String, AnalysisProvider<AnalyzerProvider<?>>> normalizers,
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
this.environment = environment;
this.charFilters = unmodifiableMap(charFilters);
this.tokenFilters = unmodifiableMap(tokenFilters);
this.tokenizers = unmodifiableMap(tokenizers);
this.analyzers = unmodifiableMap(analyzers);
this.normalizers = unmodifiableMap(normalizers);
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters);
prebuiltAnalysis = new PrebuiltAnalysis(preConfiguredTokenFilters, preConfiguredTokenizers);
}

/**
Expand Down Expand Up @@ -169,12 +170,12 @@ public Map<String, TokenFilterFactory> buildTokenFilterFactories(IndexSettings i
*/
tokenFilters.put("synonym", requiresAnalysisSettings((is, env, name, settings) -> new SynonymTokenFilterFactory(is, env, this, name, settings)));
tokenFilters.put("synonym_graph", requiresAnalysisSettings((is, env, name, settings) -> new SynonymGraphTokenFilterFactory(is, env, this, name, settings)));
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.tokenFilterFactories);
return buildMapping(Component.FILTER, indexSettings, tokenFiltersSettings, Collections.unmodifiableMap(tokenFilters), prebuiltAnalysis.preConfiguredTokenFilters);
}

public Map<String, TokenizerFactory> buildTokenizerFactories(IndexSettings indexSettings) throws IOException {
final Map<String, Settings> tokenizersSettings = indexSettings.getSettings().getGroups(INDEX_ANALYSIS_TOKENIZER);
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.tokenizerFactories);
return buildMapping(Component.TOKENIZER, indexSettings, tokenizersSettings, tokenizers, prebuiltAnalysis.preConfiguredTokenizers);
}

public Map<String, CharFilterFactory> buildCharFilterFactories(IndexSettings indexSettings) throws IOException {
Expand Down Expand Up @@ -394,31 +395,22 @@ private <T> AnalysisProvider<T> getAnalysisProvider(Component component, Map<Str
private static class PrebuiltAnalysis implements Closeable {

final Map<String, AnalysisModule.AnalysisProvider<AnalyzerProvider<?>>> analyzerProviderFactories;
final Map<String, AnalysisModule.AnalysisProvider<TokenizerFactory>> tokenizerFactories;
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> tokenFilterFactories;
final Map<String, ? extends AnalysisProvider<TokenFilterFactory>> preConfiguredTokenFilters;
final Map<String, ? extends AnalysisProvider<TokenizerFactory>> preConfiguredTokenizers;
final Map<String, AnalysisModule.AnalysisProvider<CharFilterFactory>> charFilterFactories;

private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters) {
private PrebuiltAnalysis(
Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters,
Map<String, PreConfiguredTokenizer> preConfiguredTokenizers) {
Map<String, PreBuiltAnalyzerProviderFactory> analyzerProviderFactories = new HashMap<>();
Map<String, PreBuiltTokenizerFactoryFactory> tokenizerFactories = new HashMap<>();
Map<String, PreBuiltCharFilterFactoryFactory> charFilterFactories = new HashMap<>();

// Analyzers
for (PreBuiltAnalyzers preBuiltAnalyzerEnum : PreBuiltAnalyzers.values()) {
String name = preBuiltAnalyzerEnum.name().toLowerCase(Locale.ROOT);
analyzerProviderFactories.put(name, new PreBuiltAnalyzerProviderFactory(name, AnalyzerScope.INDICES, preBuiltAnalyzerEnum.getAnalyzer(Version.CURRENT)));
}

// Tokenizers
for (PreBuiltTokenizers preBuiltTokenizer : PreBuiltTokenizers.values()) {
String name = preBuiltTokenizer.name().toLowerCase(Locale.ROOT);
tokenizerFactories.put(name, new PreBuiltTokenizerFactoryFactory(preBuiltTokenizer.getTokenizerFactory(Version.CURRENT)));
}

// Tokenizer aliases
tokenizerFactories.put("nGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.NGRAM.getTokenizerFactory(Version.CURRENT)));
tokenizerFactories.put("edgeNGram", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.EDGE_NGRAM.getTokenizerFactory(Version.CURRENT)));
tokenizerFactories.put("PathHierarchy", new PreBuiltTokenizerFactoryFactory(PreBuiltTokenizers.PATH_HIERARCHY.getTokenizerFactory(Version.CURRENT)));

// Char Filters
for (PreBuiltCharFilters preBuiltCharFilter : PreBuiltCharFilters.values()) {
String name = preBuiltCharFilter.name().toLowerCase(Locale.ROOT);
Expand All @@ -429,20 +421,20 @@ private PrebuiltAnalysis(Map<String, PreConfiguredTokenFilter> preConfiguredToke

this.analyzerProviderFactories = Collections.unmodifiableMap(analyzerProviderFactories);
this.charFilterFactories = Collections.unmodifiableMap(charFilterFactories);
this.tokenizerFactories = Collections.unmodifiableMap(tokenizerFactories);
tokenFilterFactories = preConfiguredTokenFilters;
this.preConfiguredTokenFilters = preConfiguredTokenFilters;
this.preConfiguredTokenizers = preConfiguredTokenizers;
}

public AnalysisModule.AnalysisProvider<CharFilterFactory> getCharFilterFactory(String name) {
return charFilterFactories.get(name);
}

public AnalysisModule.AnalysisProvider<TokenFilterFactory> getTokenFilterFactory(String name) {
return tokenFilterFactories.get(name);
return preConfiguredTokenFilters.get(name);
}

public AnalysisModule.AnalysisProvider<TokenizerFactory> getTokenizerFactory(String name) {
return tokenizerFactories.get(name);
return preConfiguredTokenizers.get(name);
}

public AnalysisModule.AnalysisProvider<AnalyzerProvider<?>> getAnalyzerProvider(String name) {
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.index.analysis;

import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;

import java.io.IOException;

/**
 * Base class for pre-configured analysis components. It holds the component's
 * API name and a per-{@link Version} cache of built instances, and leaves the
 * actual construction to subclasses through {@link #create(Version)}.
 *
 * @param <T> the type of analysis component that is built and cached
 */
public abstract class PreConfiguredAnalysisComponent<T> implements AnalysisModule.AnalysisProvider<T> {
    private final String name;
    private final PreBuiltCacheFactory.PreBuiltCache<T> cache;

    /**
     * @param name  the name returned by {@link #getName()}
     * @param cache the strategy handed to {@link PreBuiltCacheFactory#getCache} to obtain the backing cache
     */
    protected PreConfiguredAnalysisComponent(String name, PreBuiltCacheFactory.CachingStrategy cache) {
        this.name = name;
        this.cache = PreBuiltCacheFactory.getCache(cache);
    }

    /**
     * Returns the component for the version read from {@code settings} via
     * {@link Version#indexCreated(Settings)}. The cache is consulted first; when it has
     * no entry, the component is built with {@link #create(Version)} and stored.
     * The {@code indexSettings}, {@code environment} and {@code name} arguments are not
     * consulted by this implementation.
     */
    @Override
    public T get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
        final Version indexVersion = Version.indexCreated(settings);
        // Lookup, creation and insertion all happen while holding this instance's monitor.
        synchronized (this) {
            final T cached = cache.get(indexVersion);
            if (cached != null) {
                return cached;
            }
            final T built = create(indexVersion);
            cache.put(indexVersion, built);
            return built;
        }
    }

    /**
     * The name of the analysis component in the API.
     */
    public String getName() {
        return name;
    }

    /**
     * Builds the component for the given version. Called from
     * {@link #get(IndexSettings, Environment, String, Settings)} when the cache has no entry.
     */
    protected abstract T create(Version version);
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,21 +22,16 @@
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.Version;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.env.Environment;
import org.elasticsearch.index.IndexSettings;
import org.elasticsearch.indices.analysis.AnalysisModule;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory;
import org.elasticsearch.indices.analysis.PreBuiltCacheFactory.CachingStrategy;

import java.io.IOException;
import java.util.function.BiFunction;
import java.util.function.Function;

/**
* Provides pre-configured, shared {@link TokenFilter}s.
*/
public final class PreConfiguredTokenFilter implements AnalysisModule.AnalysisProvider<TokenFilterFactory> {
public final class PreConfiguredTokenFilter extends PreConfiguredAnalysisComponent<TokenFilterFactory> {
/**
* Create a pre-configured token filter that may not vary at all.
*/
Expand All @@ -60,35 +55,19 @@ public static PreConfiguredTokenFilter luceneVersion(String name, boolean useFil
*/
public static PreConfiguredTokenFilter elasticsearchVersion(String name, boolean useFilterForMultitermQueries,
BiFunction<TokenStream, org.elasticsearch.Version, TokenStream> create) {
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH,
(tokenStream, version) -> create.apply(tokenStream, version));
return new PreConfiguredTokenFilter(name, useFilterForMultitermQueries, CachingStrategy.ELASTICSEARCH, create);
}

private final String name;
private final boolean useFilterForMultitermQueries;
private final PreBuiltCacheFactory.PreBuiltCache<TokenFilterFactory> cache;
private final BiFunction<TokenStream, Version, TokenStream> create;

private PreConfiguredTokenFilter(String name, boolean useFilterForMultitermQueries,
PreBuiltCacheFactory.CachingStrategy cache, BiFunction<TokenStream, Version, TokenStream> create) {
this.name = name;
super(name, cache);
this.useFilterForMultitermQueries = useFilterForMultitermQueries;
this.cache = PreBuiltCacheFactory.getCache(cache);
this.create = create;
}

@Override
public TokenFilterFactory get(IndexSettings indexSettings, Environment environment, String name, Settings settings) throws IOException {
return getTokenFilterFactory(Version.indexCreated(settings));
}

/**
* The name of the {@link TokenFilter} in the API.
*/
public String getName() {
return name;
}

/**
* Can this {@link TokenFilter} be used in multi-term queries?
*/
Expand All @@ -98,42 +77,36 @@ public boolean shouldUseFilterForMultitermQueries() {

private interface MultiTermAwareTokenFilterFactory extends TokenFilterFactory, MultiTermAwareComponent {}

private synchronized TokenFilterFactory getTokenFilterFactory(final Version version) {
TokenFilterFactory factory = cache.get(version);
if (factory == null) {
if (useFilterForMultitermQueries) {
factory = new MultiTermAwareTokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}

@Override
public Object getMultiTermComponent() {
return this;
}
};
} else {
factory = new TokenFilterFactory() {
@Override
public String name() {
return name;
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}
};
}
cache.put(version, factory);
@Override
protected TokenFilterFactory create(Version version) {
if (useFilterForMultitermQueries) {
return new MultiTermAwareTokenFilterFactory() {
@Override
public String name() {
return getName();
}

@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}

@Override
public Object getMultiTermComponent() {
return this;
}
};
}
return new TokenFilterFactory() {
@Override
public String name() {
return getName();
}

return factory;
@Override
public TokenStream create(TokenStream tokenStream) {
return create.apply(tokenStream, version);
}
};
}
}
Loading

0 comments on commit b9ea579

Please sign in to comment.