Skip to content

Commit

Permalink
OAK-11197: improve statistical facets by introducing pre-computed random values (apache#1791)
Browse files Browse the repository at this point in the history

* OAK-11197: improve statistical facets by introducing pre-computed random values

* OAK-11197: use short for random values to reduce disk footprint

* OAK-11197: fix issue in ElasticStatisticalFacetAsyncProvider

* OAK-11197: (minor) fix indentation

* OAK-11197: replaced sha-256 random value generation with murmur3

* OAK-11197: use murmur3 impl from apache commons + seed from index def
  • Loading branch information
fabriziofortino authored Oct 16, 2024
1 parent 8e72b23 commit 1122bd7
Show file tree
Hide file tree
Showing 6 changed files with 29 additions and 9 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,11 @@ public class ElasticIndexDefinition extends IndexDefinition {
*/
public static final String DYNAMIC_BOOST_FULLTEXT = ":dynamic-boost-ft";

/**
* Name of the internal field that stores a precomputed pseudo-random value based on the path of the document.
* The value is a seeded 32-bit MurmurHash3 of the UTF-8 encoded path (see ElasticDocument). The field is mapped as
* a non-indexed integer with doc values and is used as the sort key when sampling documents for statistical facets.
*/
public static final String PATH_RANDOM_VALUE = ":path-random-value";

/**
* Dynamic properties are fields that are not explicitly defined in the index mapping and are added on the fly when a document is indexed.
* Examples: aggregations with relative nodes, regex properties (to be supported), etc.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,15 @@
import com.fasterxml.jackson.annotation.JsonAnyGetter;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import org.apache.commons.codec.digest.MurmurHash3;
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.spi.binary.BlobByteSource;

import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
Expand All @@ -40,6 +42,8 @@ public class ElasticDocument {

@JsonProperty(FieldNames.PATH)
public final String path;
@JsonProperty(ElasticIndexDefinition.PATH_RANDOM_VALUE)
public final int pathRandomValue;
@JsonProperty(FieldNames.FULLTEXT)
public final Set<String> fulltext;
@JsonProperty(FieldNames.SUGGEST)
Expand All @@ -57,7 +61,13 @@ public class ElasticDocument {
private final List<Map<String, Object>> dynamicProperties;

/**
* Creates a document for the given path, delegating to the seeded constructor with a seed of 0
* for the path random value.
*
* @param path the path of the document
*/
ElasticDocument(String path) {
this(path, 0);
}

ElasticDocument(String path, int seed) {
this.path = path;
byte[] pathBytes = path.getBytes(StandardCharsets.UTF_8);
this.pathRandomValue = MurmurHash3.hash32x86(pathBytes, 0, pathBytes.length, seed);
this.fulltext = new LinkedHashSet<>();
this.suggest = new LinkedHashSet<>();
this.spellcheck = new LinkedHashSet<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.jackrabbit.oak.api.Blob;
import org.apache.jackrabbit.oak.api.PropertyState;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.Aggregate;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
Expand Down Expand Up @@ -48,7 +49,7 @@ public ElasticDocumentMaker(@Nullable FulltextBinaryTextExtractor textExtractor,

@Override
protected ElasticDocument initDoc() {
// NOTE(review): this is a diff view without +/- markers; the hunk header (-48,7 +49,7) implies the
// next line is the removed (pre-change) statement, which created the document without a seed.
return new ElasticDocument(path);
// Replacement (post-change) statement: the seed is read from the index definition node state via getLong
// and cast to int to match the int seed parameter of the ElasticDocument constructor.
return new ElasticDocument(path, (int) definition.getDefinitionNodeState().getLong(ElasticIndexDefinition.PROP_INDEX_NAME_SEED));
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,8 @@ private static ObjectBuilder<TypeMapping> loadMappings(@NotNull TypeMapping.Buil
private static void mapInternalProperties(@NotNull TypeMapping.Builder builder) {
builder.properties(FieldNames.PATH,
b1 -> b1.keyword(builder3 -> builder3))
.properties(ElasticIndexDefinition.PATH_RANDOM_VALUE,
b1 -> b1.integer(b2 -> b2.docValues(true).index(false)))
.properties(FieldNames.ANCESTORS,
b1 -> b1.text(
b2 -> b2.analyzer("ancestor_analyzer")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

import co.elastic.clients.elasticsearch._types.aggregations.Aggregate;
import co.elastic.clients.elasticsearch._types.aggregations.StringTermsBucket;
import co.elastic.clients.elasticsearch._types.query_dsl.BoolQuery;
import co.elastic.clients.elasticsearch._types.mapping.FieldType;
import co.elastic.clients.elasticsearch._types.query_dsl.Query;
import co.elastic.clients.elasticsearch.core.SearchRequest;
import co.elastic.clients.elasticsearch.core.SearchResponse;
Expand Down Expand Up @@ -74,17 +74,19 @@ public class ElasticStatisticalFacetAsyncProvider implements ElasticFacetProvide
this.isAccessible = isAccessible;
this.facetFields = elasticRequestHandler.facetFields().collect(Collectors.toSet());

BoolQuery.Builder builder = elasticRequestHandler.baseQueryBuilder();
builder.should(sb -> sb.functionScore(fsb ->
fsb.functions(f -> f.randomScore(rsb -> rsb.seed("" + randomSeed).field(FieldNames.PATH)))
));

SearchRequest searchRequest = SearchRequest.of(srb -> srb.index(indexDefinition.getIndexAlias())
.trackTotalHits(thb -> thb.enabled(true))
.source(SourceConfig.of(scf -> scf.filter(ff -> ff.includes(FieldNames.PATH).includes(new ArrayList<>(facetFields)))))
.query(Query.of(qb -> qb.bool(builder.build())))
.query(Query.of(qb -> qb.bool(elasticRequestHandler.baseQueryBuilder().build())))
.aggregations(elasticRequestHandler.aggregations())
.size(sampleSize)
.sort(s ->
s.field(fs -> fs.field(
ElasticIndexDefinition.PATH_RANDOM_VALUE)
// this will handle the case when the field is not present in the index
.unmappedType(FieldType.Integer)
)
)
);

LOG.trace("Kicking search query with random sampling {}", searchRequest);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -443,6 +443,6 @@ public static IndexingMode from(String indexingMode) {
* needed from an outside process that does not have visibility to the specific index module.
*/
Map<String, String> INDEX_VERSION_BY_TYPE = Map.of(
// NOTE(review): diff view without +/- markers; the hunk header (-443,6 +443,6) implies the next line
// is the removed (pre-change) entry.
"elasticsearch", "1.1.0"
// Replacement (post-change) entry: the elasticsearch index version is bumped to 1.2.0 by this commit.
"elasticsearch", "1.2.0"
);
}

0 comments on commit 1122bd7

Please sign in to comment.