fix: MinHash token filter parameters not working #15233

Merged on Aug 16, 2024 (3 commits)
CHANGELOG.md (3 changes: 2 additions & 1 deletion)
@@ -33,7 +33,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Bump `com.azure:azure-core` from 1.49.1 to 1.51.0 ([#15111](https://github.com/opensearch-project/OpenSearch/pull/15111))
- Bump `org.xerial.snappy:snappy-java` from 1.1.10.5 to 1.1.10.6 ([#15207](https://github.com/opensearch-project/OpenSearch/pull/15207))
- Bump `com.azure:azure-xml` from 1.0.0 to 1.1.0 ([#15206](https://github.com/opensearch-project/OpenSearch/pull/15206))
- Bump `reactor` from 3.5.19 to 3.5.20 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))
- Bump `reactor-netty` from 1.1.21 to 1.1.22 ([#15262](https://github.com/opensearch-project/OpenSearch/pull/15262))

### Changed
@@ -52,6 +52,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Fix array_index_out_of_bounds_exception when indexing documents with field name containing only dot ([#15126](https://github.com/opensearch-project/OpenSearch/pull/15126))
- Fixed array field name omission in flat_object function for nested JSON ([#13620](https://github.com/opensearch-project/OpenSearch/pull/13620))
- Fix range aggregation optimization ignoring top level queries ([#15194](https://github.com/opensearch-project/OpenSearch/pull/15194))
- Fix incorrect parameter names in MinHash token filter configuration handling ([#15233](https://github.com/opensearch-project/OpenSearch/pull/15233))

### Security

@@ -65,10 +65,10 @@ private Map<String, String> convertSettings(Settings settings) {
if (settings.hasValue("hash_count")) {
settingMap.put("hashCount", settings.get("hash_count"));
}
if (settings.hasValue("bucketCount")) {
if (settings.hasValue("bucket_count")) {
settingMap.put("bucketCount", settings.get("bucket_count"));
}
if (settings.hasValue("hashSetSize")) {
if (settings.hasValue("hash_set_size")) {
settingMap.put("hashSetSize", settings.get("hash_set_size"));
}
if (settings.hasValue("with_rotation")) {
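
For context, a minimal sketch of the corrected method follows, reconstructed from the hunk above; the body of the with_rotation branch is cut off in this diff, so its line here is an assumption that mirrors the other branches. The root cause: OpenSearch analysis settings are written in snake_case, while the keys Lucene's MinHashFilterFactory consumes are camelCase. The pre-fix code tested for the camelCase names ("bucketCount", "hashSetSize"), which never occur in user-supplied settings, so bucket_count and hash_set_size were silently ignored and the defaults always applied.

private Map<String, String> convertSettings(Settings settings) {
Map<String, String> settingMap = new HashMap<>();
if (settings.hasValue("hash_count")) {
settingMap.put("hashCount", settings.get("hash_count"));
}
// The fix: check the snake_case name users actually configure,
// then store the value under the camelCase key Lucene expects.
if (settings.hasValue("bucket_count")) {
settingMap.put("bucketCount", settings.get("bucket_count"));
}
if (settings.hasValue("hash_set_size")) {
settingMap.put("hashSetSize", settings.get("hash_set_size"));
}
if (settings.hasValue("with_rotation")) {
settingMap.put("withRotation", settings.get("with_rotation")); // assumed; branch body not shown in the hunk
}
return settingMap;
}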
@@ -50,14 +50,10 @@ public void testDefault() throws IOException {
int default_bucket_size = 512;
int default_hash_set_size = 1;
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
-OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
-    settings,
-    new CommonAnalysisModulePlugin()
-);
+OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
String source = "the quick brown fox";
-Tokenizer tokenizer = new WhitespaceTokenizer();
-tokenizer.setReader(new StringReader(source));
+Tokenizer tokenizer = getTokenizer(source);

// with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
// have enough tokens to fill all the buckets, we still expect 512 tokens.
@@ -73,17 +69,63 @@ public void testSettings() throws IOException {
.put("index.analysis.filter.test_min_hash.with_rotation", false)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
-OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
-    settings,
-    new CommonAnalysisModulePlugin()
-);
+OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
String source = "sushi";
-Tokenizer tokenizer = new WhitespaceTokenizer();
-tokenizer.setReader(new StringReader(source));
+Tokenizer tokenizer = getTokenizer(source);

// despite the fact that bucket_count is 2 and hash_set_size is 1,
// because with_rotation is false, we only expect 1 token here.
assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}

public void testBucketCountSetting() throws IOException {
// Correct case with "bucket_count"
Settings settingsWithBucketCount = Settings.builder()
.put("index.analysis.filter.test_min_hash.type", "min_hash")
.put("index.analysis.filter.test_min_hash.hash_count", "1")
.put("index.analysis.filter.test_min_hash.bucket_count", "3")
.put("index.analysis.filter.test_min_hash.hash_set_size", "1")
.put("index.analysis.filter.test_min_hash.with_rotation", false)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();

OpenSearchTestCase.TestAnalysis analysisWithBucketCount = getTestAnalysisFromSettings(settingsWithBucketCount);

TokenFilterFactory tokenFilterWithBucketCount = analysisWithBucketCount.tokenFilter.get("test_min_hash");
String sourceWithBucketCount = "salmon avocado roll uramaki";
Tokenizer tokenizerWithBucketCount = getTokenizer(sourceWithBucketCount);
// Expect 3 tokens due to bucket_count being set to 3
assertStreamHasNumberOfTokens(tokenFilterWithBucketCount.create(tokenizerWithBucketCount), 3);
}

public void testHashSetSizeSetting() throws IOException {
// Correct case with "hash_set_size"
Settings settingsWithHashSetSize = Settings.builder()
.put("index.analysis.filter.test_min_hash.type", "min_hash")
.put("index.analysis.filter.test_min_hash.hash_count", "1")
.put("index.analysis.filter.test_min_hash.bucket_count", "1")
.put("index.analysis.filter.test_min_hash.hash_set_size", "2")
.put("index.analysis.filter.test_min_hash.with_rotation", false)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();

OpenSearchTestCase.TestAnalysis analysisWithHashSetSize = getTestAnalysisFromSettings(settingsWithHashSetSize);

TokenFilterFactory tokenFilterWithHashSetSize = analysisWithHashSetSize.tokenFilter.get("test_min_hash");
String sourceWithHashSetSize = "salmon avocado roll uramaki";
Tokenizer tokenizerWithHashSetSize = getTokenizer(sourceWithHashSetSize);
// Expect 2 tokens due to hash_set_size being set to 2 and bucket_count being 1
assertStreamHasNumberOfTokens(tokenFilterWithHashSetSize.create(tokenizerWithHashSetSize), 2);
}

private static OpenSearchTestCase.TestAnalysis getTestAnalysisFromSettings(Settings settings) throws IOException {
return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisModulePlugin());
}

private static Tokenizer getTokenizer(String source) {
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
return tokenizer;
}
}
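
As a usage note: each of bucket_count buckets keeps up to hash_set_size of the lowest hash values, so a token stream yields at most bucket_count * hash_set_size MinHash tokens, and with_rotation=true fills otherwise-empty buckets so that maximum is always reached (hence the 512 tokens expected in testDefault). A hypothetical configuration sketch, with the filter name my_min_hash chosen for illustration; after this fix all four snake_case parameters take effect:

// Hypothetical index settings; the values shown mirror the defaults
// exercised in testDefault above.
Settings indexSettings = Settings.builder()
.put("index.analysis.filter.my_min_hash.type", "min_hash")
.put("index.analysis.filter.my_min_hash.hash_count", "1")
.put("index.analysis.filter.my_min_hash.bucket_count", "512") // default 512
.put("index.analysis.filter.my_min_hash.hash_set_size", "1") // default 1
.put("index.analysis.filter.my_min_hash.with_rotation", "true") // default true
.build();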