Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding access to noSubMatches and noOverlappingMatches in Hyphenation… #13895

1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Add took time to request nodes stats ([#15054](https://github.com/opensearch-project/OpenSearch/pull/15054))
- [Workload Management] QueryGroup resource tracking framework changes ([#13897](https://github.com/opensearch-project/OpenSearch/pull/13897))
- Add slice execution listeners to SearchOperationListener interface ([#15153](https://github.com/opensearch-project/OpenSearch/pull/15153))
- Adding access to noSubMatches and noOverlappingMatches in Hyphenation ([#13895](https://github.com/opensearch-project/OpenSearch/pull/13895))

### Dependencies
- Bump `netty` from 4.1.111.Final to 4.1.112.Final ([#15081](https://github.com/opensearch-project/OpenSearch/pull/15081))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,16 @@
*/
public class HyphenationCompoundWordTokenFilterFactory extends AbstractCompoundWordTokenFilterFactory {

private final boolean noSubMatches;
private final boolean noOverlappingMatches;
private final HyphenationTree hyphenationTree;

HyphenationCompoundWordTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
super(indexSettings, env, name, settings);

noSubMatches = settings.getAsBoolean("no_sub_matches", false);
noOverlappingMatches = settings.getAsBoolean("no_overlapping_matches", false);
msfroh marked this conversation as resolved.
Show resolved Hide resolved

String hyphenationPatternsPath = settings.get("hyphenation_patterns_path", null);
if (hyphenationPatternsPath == null) {
throw new IllegalArgumentException("hyphenation_patterns_path is a required setting.");
Expand All @@ -85,7 +90,9 @@ public TokenStream create(TokenStream tokenStream) {
minWordSize,
minSubwordSize,
maxSubwordSize,
onlyLongestMatch
onlyLongestMatch,
noSubMatches,
noOverlappingMatches
);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.junit.Before;
import org.opensearch.Version;
import org.opensearch.cluster.metadata.IndexMetadata;
import org.opensearch.common.settings.Settings;
Expand All @@ -52,6 +53,9 @@
import org.hamcrest.MatcherAssert;

import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
Expand All @@ -63,17 +67,25 @@
import static org.hamcrest.Matchers.instanceOf;

public class CompoundAnalysisTests extends OpenSearchTestCase {

Settings[] settingsArr;

@Before
public void initialize() throws IOException {
this.settingsArr = getSettingsArr();
}
hasnain2808 marked this conversation as resolved.
Show resolved Hide resolved

public void testDefaultsCompoundAnalysis() throws Exception {
Settings settings = getJsonSettings();
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
for (Settings settings : this.settingsArr) {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
TokenFilterFactory filterFactory = analysisModule.getAnalysisRegistry().buildTokenFilterFactories(idxSettings).get("dict_dec");
MatcherAssert.assertThat(filterFactory, instanceOf(DictionaryCompoundWordTokenFilterFactory.class));
}
}

public void testDictionaryDecompounder() throws Exception {
Settings[] settingsArr = new Settings[] { getJsonSettings(), getYamlSettings() };
for (Settings settings : settingsArr) {
for (Settings settings : this.settingsArr) {
List<String> terms = analyze(settings, "decompoundingAnalyzer", "donaudampfschiff spargelcremesuppe");
MatcherAssert.assertThat(terms.size(), equalTo(8));
MatcherAssert.assertThat(
Expand All @@ -83,6 +95,26 @@ public void testDictionaryDecompounder() throws Exception {
}
}

// Hyphenation Decompounder tests mimic the behavior of lucene tests
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
public void testHyphenationDecompounder() throws Exception {
for (Settings settings : this.settingsArr) {
List<String> terms = analyze(settings, "hyphenationAnalyzer", "min veninde som er lidt af en læsehest");
MatcherAssert.assertThat(terms.size(), equalTo(10));
MatcherAssert.assertThat(terms, hasItems("min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest"));
}
}

// Hyphenation Decompounder tests mimic the behavior of lucene tests
// lucene/analysis/common/src/test/org/apache/lucene/analysis/compound/TestHyphenationCompoundWordTokenFilterFactory.java
public void testHyphenationDecompounderNoSubMatches() throws Exception {
for (Settings settings : this.settingsArr) {
List<String> terms = analyze(settings, "hyphenationAnalyzerNoSubMatches", "basketballkurv");
MatcherAssert.assertThat(terms.size(), equalTo(3));
MatcherAssert.assertThat(terms, hasItems("basketballkurv", "basketball", "kurv"));
}
}

private List<String> analyze(Settings settings, String analyzerName, String text) throws IOException {
IndexSettings idxSettings = IndexSettingsModule.newIndexSettings("test", settings);
AnalysisModule analysisModule = createAnalysisModule(settings);
Expand Down Expand Up @@ -111,21 +143,38 @@ public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
}));
}

private Settings getJsonSettings() throws IOException {
private Path createAndGetHomeDirectory() throws IOException {
return createTempDir();
}

private void copyHyphenationPatternsFile(Path home) throws IOException {
InputStream hyphenation_patterns_path = getClass().getResourceAsStream("da_UTF8.xml");
Path config = home.resolve("config");
Files.createDirectory(config);
Files.copy(hyphenation_patterns_path, config.resolve("da_UTF8.xml"));
}

private Settings getJsonSettings(Path home) throws IOException {
String json = "/org/opensearch/analysis/common/test1.json";
return Settings.builder()
.loadFromStream(json, getClass().getResourceAsStream(json), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build();
}

private Settings getYamlSettings() throws IOException {
private Settings getYamlSettings(Path home) throws IOException {
String yaml = "/org/opensearch/analysis/common/test1.yml";
return Settings.builder()
.loadFromStream(yaml, getClass().getResourceAsStream(yaml), false)
.put(IndexMetadata.SETTING_VERSION_CREATED, Version.CURRENT)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.put(Environment.PATH_HOME_SETTING.getKey(), home.toString())
.build();
}

private Settings[] getSettingsArr() throws IOException {
Path home = createAndGetHomeDirectory();
hasnain2808 marked this conversation as resolved.
Show resolved Hide resolved
copyHyphenationPatternsFile(home);
return new Settings[] { getJsonSettings(home), getYamlSettings(home) };
}
}
Loading
Loading