Skip to content

Commit

Permalink
fix: minhash configuration
Browse files Browse the repository at this point in the history
  • Loading branch information
mridehalgh committed Aug 13, 2024
1 parent 54c13a6 commit 83afab7
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 14 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -65,10 +65,10 @@ private Map<String, String> convertSettings(Settings settings) {
if (settings.hasValue("hash_count")) {
settingMap.put("hashCount", settings.get("hash_count"));
}
if (settings.hasValue("bucketCount")) {
if (settings.hasValue("bucket_count")) {
settingMap.put("bucketCount", settings.get("bucket_count"));
}
if (settings.hasValue("hashSetSize")) {
if (settings.hasValue("hash_set_size")) {
settingMap.put("hashSetSize", settings.get("hash_set_size"));
}
if (settings.hasValue("with_rotation")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,14 +50,10 @@ public void testDefault() throws IOException {
int default_bucket_size = 512;
int default_hash_set_size = 1;
Settings settings = Settings.builder().put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString()).build();
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
settings,
new CommonAnalysisModulePlugin()
);
OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("min_hash");
String source = "the quick brown fox";
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
Tokenizer tokenizer = getTokenizer(source);

// with_rotation is true by default, and hash_set_size is 1, so even though the source doesn't
// have enough tokens to fill all the buckets, we still expect 512 tokens.
Expand All @@ -73,17 +69,66 @@ public void testSettings() throws IOException {
.put("index.analysis.filter.test_min_hash.with_rotation", false)
.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
.build();
OpenSearchTestCase.TestAnalysis analysis = AnalysisTestsHelper.createTestAnalysisFromSettings(
settings,
new CommonAnalysisModulePlugin()
);
OpenSearchTestCase.TestAnalysis analysis = getTestAnalysisFromSettings(settings);
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("test_min_hash");
String source = "sushi";
Tokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader(source));
Tokenizer tokenizer = getTokenizer(source);

// despite the fact that bucket_count is 2 and hash_set_size is 1,
// because with_rotation is false, we only expect 1 token here.
assertStreamHasNumberOfTokens(tokenFilter.create(tokenizer), 1);
}

/**
 * Verifies that the snake_case {@code bucket_count} setting is picked up by the
 * {@code min_hash} filter: with one hash, three buckets, a hash set size of one and
 * rotation disabled, the four-word input is expected to produce exactly three tokens.
 */
public void testBucketCountSetting() throws IOException {
    // Configure a custom min_hash filter whose only non-trivial dimension is bucket_count.
    Settings.Builder builder = Settings.builder();
    builder.put("index.analysis.filter.test_min_hash.type", "min_hash");
    builder.put("index.analysis.filter.test_min_hash.hash_count", "1");
    builder.put("index.analysis.filter.test_min_hash.bucket_count", "3");
    builder.put("index.analysis.filter.test_min_hash.hash_set_size", "1");
    builder.put("index.analysis.filter.test_min_hash.with_rotation", false);
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());

    TokenFilterFactory minHash = getTestAnalysisFromSettings(builder.build()).tokenFilter.get("test_min_hash");

    // bucket_count is 3, so three tokens are expected for this input.
    assertStreamHasNumberOfTokens(minHash.create(getTokenizer("salmon avocado roll uramaki")), 3);
}

/**
 * Verifies that the snake_case {@code hash_set_size} setting is picked up by the
 * {@code min_hash} filter: with one hash, a single bucket, a hash set size of two and
 * rotation disabled, the four-word input is expected to produce exactly two tokens.
 */
public void testHashSetSizeSetting() throws IOException {
    // Configure a custom min_hash filter whose only non-trivial dimension is hash_set_size.
    Settings.Builder builder = Settings.builder();
    builder.put("index.analysis.filter.test_min_hash.type", "min_hash");
    builder.put("index.analysis.filter.test_min_hash.hash_count", "1");
    builder.put("index.analysis.filter.test_min_hash.bucket_count", "1");
    builder.put("index.analysis.filter.test_min_hash.hash_set_size", "2");
    builder.put("index.analysis.filter.test_min_hash.with_rotation", false);
    builder.put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString());

    TokenFilterFactory minHash = getTestAnalysisFromSettings(builder.build()).tokenFilter.get("test_min_hash");

    // hash_set_size is 2 and bucket_count is 1, so two tokens are expected for this input.
    assertStreamHasNumberOfTokens(minHash.create(getTokenizer("salmon avocado roll uramaki")), 2);
}

/**
 * Creates the test analysis for the given settings by delegating to
 * {@link AnalysisTestsHelper#createTestAnalysisFromSettings}, passing a fresh
 * {@link CommonAnalysisModulePlugin}.
 *
 * @param settings the node/index settings describing the analysis components under test
 * @return the analysis built by the helper from {@code settings}
 * @throws IOException propagated from the underlying helper
 */
private static OpenSearchTestCase.TestAnalysis getTestAnalysisFromSettings(Settings settings) throws IOException {
    return AnalysisTestsHelper.createTestAnalysisFromSettings(settings, new CommonAnalysisModulePlugin());
}

/**
 * Creates a {@link WhitespaceTokenizer} whose reader is set to the given text.
 *
 * @param source the text the tokenizer should read
 * @return a new tokenizer with its reader set to {@code source}
 */
private static Tokenizer getTokenizer(String source) {
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader(source));
    return tokenizer;
}
}

0 comments on commit 83afab7

Please sign in to comment.