From 0d23900bd8aafd06b8c2281fb367f812e5bb9128 Mon Sep 17 00:00:00 2001
From: Nuno Santos
Date: Mon, 12 Aug 2024 11:11:47 +0200
Subject: [PATCH] OAK-11006 - indexing-job: better logging of text extraction and indexing statistics (#1629)

---
 .../oak/plugins/blob/FileCache.java           | 57 ++++++------
 .../oak/plugins/index/FormattingUtils.java    | 16 ++++
 .../plugins/index/FormattingUtilsTest.java    | 18 ++++
 .../indexer/document/CompositeIndexer.java    |  7 ++
 .../document/DocumentStoreIndexerBase.java    |  1 +
 .../document/IndexerStatisticsTracker.java    | 90 +++++++++++++++++++
 .../indexer/document/NodeStateIndexer.java    |  2 +
 .../indexer/document/ElasticIndexer.java      | 18 +++-
 .../index/indexer/document/LuceneIndexer.java | 18 ++++
 .../index/search/ExtractedTextCache.java      | 28 +++---
 .../binary/FulltextBinaryTextExtractor.java   | 63 +++++++------
 .../spi/binary/TextExtractionStats.java       | 68 ++++++++++----
 12 files changed, 290 insertions(+), 96 deletions(-)
 create mode 100644 oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/IndexerStatisticsTracker.java

diff --git a/oak-blob-plugins/src/main/java/org/apache/jackrabbit/oak/plugins/blob/FileCache.java b/oak-blob-plugins/src/main/java/org/apache/jackrabbit/oak/plugins/blob/FileCache.java
index f5db2f8dd33..88db2ebd5fa 100644
--- a/oak-blob-plugins/src/main/java/org/apache/jackrabbit/oak/plugins/blob/FileCache.java
+++ b/oak-blob-plugins/src/main/java/org/apache/jackrabbit/oak/plugins/blob/FileCache.java
@@ -39,12 +39,10 @@
 import org.apache.jackrabbit.guava.common.cache.RemovalCause;
 import org.apache.jackrabbit.guava.common.cache.Weigher;
 import org.apache.jackrabbit.oak.cache.CacheLIRS;
-import org.apache.jackrabbit.oak.cache.CacheLIRS.EvictionCallback;
 import org.apache.jackrabbit.oak.cache.CacheStats;
 import org.apache.jackrabbit.oak.commons.StringUtils;
 import org.apache.jackrabbit.oak.commons.concurrent.ExecutorCloser;
 import org.apache.jackrabbit.oak.commons.io.FileTreeTraverser;
-import org.jetbrains.annotations.NotNull;
 import org.jetbrains.annotations.Nullable;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -86,18 +84,14 @@ public class FileCache extends AbstractCache implements Closeable
     /**
      * Convert the size calculation to KB to support max file size of 2 TB
      */
-    private static final Weigher weigher = new Weigher() {
-        @Override public int weigh(String key, File value) {
-            // convert to number of 4 KB blocks
-            return Math.round(value.length() / (4 * 1024));
-        }};
+    private static final Weigher weigher = (key, value) -> {
+        // convert to number of 4 KB blocks
+        return Math.round(value.length() / (4 * 1024));
+    };

     //Rough estimate of the in-memory key, value pair
-    private static final Weigher memWeigher = new Weigher() {
-        @Override public int weigh(String key, File value) {
-            return (StringUtils.estimateMemoryUsage(key) +
-                StringUtils.estimateMemoryUsage(value.getAbsolutePath()) + 48);
-        }};
+    private static final Weigher memWeigher = (key, value) -> (StringUtils.estimateMemoryUsage(key) +
+            StringUtils.estimateMemoryUsage(value.getAbsolutePath()) + 48);

     private FileCache(long maxSize /* bytes */, File root,
         final CacheLoader loader, @Nullable final ExecutorService executor) {
@@ -108,8 +102,9 @@ private FileCache(long maxSize /* bytes */, File root,
         // convert to number of 4 KB blocks
         long size = Math.round(maxSize / (1024L * 4));

-        cacheLoader = new CacheLoader() {
-            @Override public File load(String key) throws Exception {
+        cacheLoader = new CacheLoader<>() {
+            @Override
+            public File load(String key) throws Exception {
                 // Fetch from local cache directory and if not found load from backend
                 File cachedFile = DataStoreCacheUtils.getFile(key, cacheRoot);
                 if (cachedFile.exists()) {
@@ -117,6 +112,7 @@ private FileCache(long maxSize /* bytes */, File root,
                 } else {
                     InputStream is = null;
                     boolean threw = true;
+                    long startNanos = System.nanoTime();
                     try {
                         is = loader.load(key);
                         copyInputStreamToFile(is, cachedFile);
@@ -127,6 +123,9 @@ private FileCache(long maxSize /* bytes */, File root,
                     } finally {
                         Closeables.close(is, threw);
                     }
+                    if (LOG.isDebugEnabled()) {
+                        LOG.debug("Loaded file: {} in {}", key, (System.nanoTime() - startNanos) / 1_000_000);
+                    }
                     return cachedFile;
                 }
             }
@@ -137,21 +136,17 @@ private FileCache(long maxSize /* bytes */, File root,
             .recordStats()
             .weigher(weigher)
             .segmentCount(SEGMENT_COUNT)
-            .evictionCallback(new EvictionCallback() {
-                @Override
-                public void evicted(@NotNull String key, @Nullable File cachedFile,
-                    @NotNull RemovalCause cause) {
-                    try {
-                        if (cachedFile != null && cachedFile.exists()
-                            && cause != RemovalCause.REPLACED) {
-                            DataStoreCacheUtils.recursiveDelete(cachedFile, cacheRoot);
-                            LOG.info("File [{}] evicted with reason [{}]", cachedFile, cause
-                                .toString());
-                        }
-                    } catch (IOException e) {
-                        LOG.info("Cached file deletion failed after eviction", e);
+            .evictionCallback((key, cachedFile, cause) -> {
+                try {
+                    if (cachedFile != null && cachedFile.exists()
+                        && cause != RemovalCause.REPLACED) {
+                        DataStoreCacheUtils.recursiveDelete(cachedFile, cacheRoot);
+                        LOG.info("File [{}] evicted with reason [{}]", cachedFile, cause);
                     }
-                }})
+                } catch (IOException e) {
+                    LOG.info("Cached file deletion failed after eviction", e);
+                }
+            })
             .build();

         this.cacheStats =
@@ -177,7 +172,7 @@ public static FileCache build(long maxSize /* bytes */, File root,
         }
         return new FileCache() {
-            private final Cache cache = new CacheLIRS<>(0);
             @Override
             public void put(String key, File file) {
             }
@@ -190,7 +185,7 @@ public static FileCache build(long maxSize /* bytes */, File root,
                 return null;
             }

-            @Override public File get(String key) throws IOException {
+            @Override public File get(String key) {
                 return null;
             }
@@ -291,7 +286,7 @@ public void close() {
     /**
      * Called to initialize the in-memory cache from the fs folder
     */
-    private class CacheBuildJob implements Callable {
+    private class CacheBuildJob implements Callable<Integer> {
        @Override
        public Integer call() {
            Stopwatch watch = Stopwatch.createStarted();
diff --git a/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtils.java b/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtils.java
index a885ce4de72..13451aaf5ca 100644
--- a/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtils.java
+++ b/oak-core/src/main/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtils.java
@@ -21,6 +21,14 @@
 import java.util.concurrent.TimeUnit;

 public class FormattingUtils {
+    public static String formatNanosToSeconds(long nanos) {
+        return formatToSeconds(nanos/1_000_000_000);
+    }
+
+    public static String formatMillisToSeconds(long millis) {
+        return formatToSeconds(millis/1000);
+    }
+
     public static String formatToSeconds(Stopwatch stopwatch) {
         return formatToSeconds(stopwatch.elapsed(TimeUnit.SECONDS));
     }
@@ -44,4 +52,12 @@ public static String formatToMillis(Stopwatch stopwatch) {
         String sign = millis < 0 ? "-" : "";
"-" : ""; return String.format("%s%02d:%02d:%02d.%03d", sign, hoursPart, minutesPart, secondsPart, millisPart); } + + public static double safeComputePercentage(long numerator, long denominator) { + return denominator == 0 ? -1 : (double) numerator / denominator * 100; + } + + public static double safeComputeAverage(long totalTime, long numberOfEvents) { + return numberOfEvents == 0 ? -1 : ((double)totalTime / numberOfEvents); + } } diff --git a/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtilsTest.java b/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtilsTest.java index eb494dbb455..74155188d8c 100644 --- a/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtilsTest.java +++ b/oak-core/src/test/java/org/apache/jackrabbit/oak/plugins/index/FormattingUtilsTest.java @@ -85,4 +85,22 @@ private void testFormatToMillis(String expected, long nanos) { ticker.set(nanos); assertEquals(expected, FormattingUtils.formatToMillis(sw)); } + + @Test + public void testSafeComputePercentage() { + assertEquals(50.0, FormattingUtils.safeComputePercentage(50, 100), 0.001); + assertEquals(0.0, FormattingUtils.safeComputePercentage(0, 100), 0.001); + assertEquals(-1.0, FormattingUtils.safeComputePercentage(50, 0), 0.001); + assertEquals(100.0, FormattingUtils.safeComputePercentage(100, 100), 0.001); + assertEquals(33.333, FormattingUtils.safeComputePercentage(1, 3), 0.001); + } + + @Test + public void testSafeComputeAverage() { + assertEquals(50.0, FormattingUtils.safeComputeAverage(100, 2), 0.001); + assertEquals(0.0, FormattingUtils.safeComputeAverage(0, 100), 0.001); + assertEquals(-1.0, FormattingUtils.safeComputeAverage(100, 0), 0.001); + assertEquals(100.0, FormattingUtils.safeComputeAverage(100, 1), 0.001); + assertEquals(33.333, FormattingUtils.safeComputeAverage(100, 3), 0.001); + } } \ No newline at end of file diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/CompositeIndexer.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/CompositeIndexer.java index 71513ddc2a2..476c9753ae7 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/CompositeIndexer.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/CompositeIndexer.java @@ -41,6 +41,13 @@ public boolean isEmpty() { return indexers.isEmpty(); } + @Override + public void onIndexingStarting() { + for (NodeStateIndexer indexer : indexers) { + indexer.onIndexingStarting(); + } + } + @Override public boolean shouldInclude(String path) { return indexers.stream().anyMatch(indexer -> indexer.shouldInclude(path)); diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/DocumentStoreIndexerBase.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/DocumentStoreIndexerBase.java index b805a27227e..57676d1a24f 100644 --- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/DocumentStoreIndexerBase.java +++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/DocumentStoreIndexerBase.java @@ -358,6 +358,7 @@ public void reindex() throws CommitFailedException, IOException { } else if (flatFileStores.size() == 1) { FlatFileStore flatFileStore = flatFileStores.get(0); TopKSlowestPaths slowestTopKElements = new TopKSlowestPaths(TOP_SLOWEST_PATHS_TO_LOG); + indexer.onIndexingStarting(); long entryStart = 
             for (NodeStateEntry entry : flatFileStore) {
                 reportDocumentRead(entry.getPath(), progressReporter);
diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/IndexerStatisticsTracker.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/IndexerStatisticsTracker.java
new file mode 100644
index 00000000000..4508e1d2d50
--- /dev/null
+++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/IndexerStatisticsTracker.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document;
+
+import org.apache.jackrabbit.oak.plugins.index.FormattingUtils;
+import org.slf4j.Logger;
+
+public final class IndexerStatisticsTracker {
+    private static final int SLOW_DOCUMENT_LOG_THRESHOLD = Integer.getInteger("oak.indexer.slowDocumentLogThreshold", 1000);
+
+    private final Logger logger;
+
+    // Timestamp of when indexing started.
+    private long startIndexingNanos = 0;
+    // Time spent indexing entries. Should be almost the same as totalMakeDocumentTimeNanos+totalWriteTimeNanos
+    private long totalIndexingTimeNanos = 0;
+    // Time generating the Lucene document.
+    private long totalMakeDocumentTimeNanos = 0;
+    // Time writing the Lucene document to disk.
+    private long totalWriteTimeNanos = 0;
+    private long nodesIndexed = 0;
+
+    // Timestamp of when the current entry started being indexed
+    private long startEntryNanos = 0;
+    // Timestamp of when the current entry finished the makeDocument phase.
+    private long endEntryMakeDocumentNanos = 0;
+
+    public IndexerStatisticsTracker(Logger logger) {
+        this.logger = logger;
+    }
+
+    public void onIndexingStarting() {
+        this.startIndexingNanos = System.nanoTime();
+    }
+
+    public void onEntryStart() {
+        startEntryNanos = System.nanoTime();
+    }
+
+    public void onEntryEndMakeDocument() {
+        endEntryMakeDocumentNanos = System.nanoTime();
+    }
+
+    public void onEntryEnd(String entryPath) {
+        long endEntryWriteNanos = System.nanoTime();
+        nodesIndexed++;
+        long entryIndexingTimeNanos = endEntryWriteNanos - startEntryNanos;
+        long entryMakeDocumentTimeNanos = endEntryMakeDocumentNanos - startEntryNanos;
+        long entryWriteTimeNanos = endEntryWriteNanos - endEntryMakeDocumentNanos;
+        totalIndexingTimeNanos += entryIndexingTimeNanos;
+        totalMakeDocumentTimeNanos += entryMakeDocumentTimeNanos;
+        totalWriteTimeNanos += entryWriteTimeNanos;
+        if (entryIndexingTimeNanos >= (long) SLOW_DOCUMENT_LOG_THRESHOLD * 1_000_000) {
+            logger.info("Slow document: {}. Times: total={}ms, makeDocument={}ms, writeToIndex={}ms",
+                entryPath, entryIndexingTimeNanos / 1_000_000, entryMakeDocumentTimeNanos / 1_000_000, entryWriteTimeNanos / 1_000_000);
+        }
+        startEntryNanos = 0;
+        endEntryMakeDocumentNanos = 0;
+    }
+
+    public String formatStats() {
+        long endTimeNanos = System.nanoTime();
+        long totalTimeNanos = endTimeNanos - startIndexingNanos;
+        long avgTimePerDocumentMicros = Math.round(FormattingUtils.safeComputeAverage(totalIndexingTimeNanos / 1000, nodesIndexed));
+        double percentageIndexing = FormattingUtils.safeComputePercentage(totalIndexingTimeNanos, totalTimeNanos);
+        double percentageMakingDocument = FormattingUtils.safeComputePercentage(totalMakeDocumentTimeNanos, totalIndexingTimeNanos);
+        double percentageWritingToIndex = FormattingUtils.safeComputePercentage(totalWriteTimeNanos, totalIndexingTimeNanos);
+        return String.format("Indexed %d nodes in %s. Avg per node: %d microseconds. indexingTime: %s (%2.1f%% of total time). Breakup of indexing time: makeDocument: %s (%2.1f%%), writeIndex: %s (%2.1f%%)",
+            nodesIndexed, FormattingUtils.formatNanosToSeconds(totalTimeNanos), avgTimePerDocumentMicros,
+            FormattingUtils.formatNanosToSeconds(totalIndexingTimeNanos), percentageIndexing,
+            FormattingUtils.formatNanosToSeconds(totalMakeDocumentTimeNanos), percentageMakingDocument,
+            FormattingUtils.formatNanosToSeconds(totalWriteTimeNanos), percentageWritingToIndex);
+    }
+}
\ No newline at end of file
diff --git a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/NodeStateIndexer.java b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/NodeStateIndexer.java
index d1ee473ddbe..41835411ab7 100644
--- a/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/NodeStateIndexer.java
+++ b/oak-run-commons/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/NodeStateIndexer.java
@@ -28,6 +28,8 @@
 public interface NodeStateIndexer extends Closeable{

+    default void onIndexingStarting() {}
+
     boolean shouldInclude(String path);

     boolean shouldInclude(NodeDocument doc);
diff --git a/oak-run-elastic/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/ElasticIndexer.java b/oak-run-elastic/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/ElasticIndexer.java
index 56af3e85d3d..cb881a84e05 100644
--- a/oak-run-elastic/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/ElasticIndexer.java
+++ b/oak-run-elastic/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/ElasticIndexer.java
@@ -33,6 +33,8 @@
 import org.apache.jackrabbit.oak.spi.commit.CommitInfo;
 import org.apache.jackrabbit.oak.spi.filter.PathFilter;
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 import java.io.IOException;
 import java.util.Set;
@@ -43,6 +45,7 @@
  NodeStateIndexer for Elastic. Indexes entries from a given nodestate.
  */
 public class ElasticIndexer implements NodeStateIndexer {
+    private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexer.class);
     private final IndexDefinition definition;
     private final FulltextBinaryTextExtractor binaryTextExtractor;
@@ -51,6 +54,7 @@ public class ElasticIndexer implements NodeStateIndexer {
     private final FulltextIndexWriter indexWriter;
     private final ElasticIndexEditorProvider elasticIndexEditorProvider;
     private final IndexHelper indexHelper;
+    private final IndexerStatisticsTracker indexerStatisticsTracker = new IndexerStatisticsTracker(LOG);

     public ElasticIndexer(IndexDefinition definition, FulltextBinaryTextExtractor binaryTextExtractor,
                           NodeBuilder definitionBuilder, IndexingProgressReporter progressReporter,
@@ -64,6 +68,12 @@ public ElasticIndexer(IndexDefinition definition, FulltextBinaryTextExtractor bi
         this.indexHelper = indexHelper;
     }

+    @Override
+    public void onIndexingStarting() {
+        indexerStatisticsTracker.onIndexingStarting();
+        binaryTextExtractor.resetStartTime();
+    }
+
     @Override
     public boolean shouldInclude(String path) {
         return getFilterResult(path) != PathFilter.Result.EXCLUDE;
@@ -77,7 +87,7 @@ public boolean shouldInclude(NodeDocument doc) {
     public void provisionIndex() {
         FulltextIndexEditor editor = (FulltextIndexEditor) elasticIndexEditorProvider.getIndexEditor(
-                TYPE_ELASTICSEARCH, definitionBuilder, indexHelper.getNodeStore().getRoot(), new ReportingCallback(definition.getIndexPath(),false));
+                TYPE_ELASTICSEARCH, definitionBuilder, indexHelper.getNodeStore().getRoot(), new ReportingCallback(definition.getIndexPath(), false));
         editor.getContext().enableReindexMode();
     }
@@ -92,12 +102,14 @@ public boolean index(NodeStateEntry entry) throws IOException, CommitFailedExcep
         if (indexingRule == null) {
             return false;
         }
+        indexerStatisticsTracker.onEntryStart();

         ElasticDocumentMaker maker = newDocumentMaker(indexingRule, entry.getPath());
         ElasticDocument doc = maker.makeDocument(entry.getNodeState());
-
+        indexerStatisticsTracker.onEntryEndMakeDocument();
         if (doc != null) {
             writeToIndex(doc, entry.getPath());
+            indexerStatisticsTracker.onEntryEnd(entry.getPath());
             progressReporter.indexUpdate(definition.getIndexPath());
             return true;
         }
@@ -116,6 +128,8 @@ public Set getRelativeIndexedNodeNames() {

     @Override
     public void close() throws IOException {
+        LOG.info("Statistics: {}", indexerStatisticsTracker.formatStats());
+        binaryTextExtractor.logStats();
         indexWriter.close(System.currentTimeMillis());
     }
diff --git a/oak-run/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/LuceneIndexer.java b/oak-run/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/LuceneIndexer.java
index b80d980ff58..b3d0819e9ed 100644
--- a/oak-run/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/LuceneIndexer.java
+++ b/oak-run/src/main/java/org/apache/jackrabbit/oak/index/indexer/document/LuceneIndexer.java
@@ -35,8 +35,12 @@
 import org.apache.jackrabbit.oak.spi.state.NodeBuilder;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.facet.FacetsConfig;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;

 public class LuceneIndexer implements NodeStateIndexer, FacetsConfigProvider {
+    private static final Logger LOG = LoggerFactory.getLogger(LuceneIndexer.class);
+
     private final IndexDefinition definition;
     private final FulltextBinaryTextExtractor binaryTextExtractor;
     private final NodeBuilder definitionBuilder;
@@ -44,6 +48,8 @@ public class LuceneIndexer implements NodeStateIndexer, FacetsConfigProvider {
     private final IndexingProgressReporter progressReporter;
     private FacetsConfig facetsConfig;

+    private final IndexerStatisticsTracker indexerStatisticsTracker = new IndexerStatisticsTracker(LOG);
+
     public LuceneIndexer(IndexDefinition definition, LuceneIndexWriter indexWriter, NodeBuilder builder,
                          FulltextBinaryTextExtractor binaryTextExtractor, IndexingProgressReporter progressReporter) {
@@ -54,6 +60,12 @@ public LuceneIndexer(IndexDefinition definition, LuceneIndexWriter indexWriter,
         this.progressReporter = progressReporter;
     }

+    @Override
+    public void onIndexingStarting() {
+        indexerStatisticsTracker.onIndexingStarting();
+        binaryTextExtractor.resetStartTime();
+    }
+
     @Override
     public boolean shouldInclude(String path) {
         return getFilterResult(path) != PathFilter.Result.EXCLUDE;
@@ -77,10 +89,14 @@ public boolean index(NodeStateEntry entry) throws IOException, CommitFailedExcep
             return false;
         }

+        indexerStatisticsTracker.onEntryStart();
+
         LuceneDocumentMaker maker = newDocumentMaker(indexingRule, entry.getPath());
         Document doc = maker.makeDocument(entry.getNodeState());
+        indexerStatisticsTracker.onEntryEndMakeDocument();
         if (doc != null) {
             writeToIndex(doc, entry.getPath());
+            indexerStatisticsTracker.onEntryEnd(entry.getPath());
             progressReporter.indexUpdate(definition.getIndexPath());
             return true;
         }
@@ -100,6 +116,8 @@ public Set getRelativeIndexedNodeNames() {

     @Override
     public void close() throws IOException {
+        LOG.info("Statistics: {}", indexerStatisticsTracker.formatStats());
+        binaryTextExtractor.logStats();
         indexWriter.close(System.currentTimeMillis());
     }
diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java
index 84ab00038ca..be75b64f514 100644
--- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java
+++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/ExtractedTextCache.java
@@ -91,7 +91,7 @@ public class ExtractedTextCache {
     private final boolean alwaysUsePreExtractedCache;
     private volatile ExecutorService executorService;
     private volatile int timeoutCount;
-    private long extractionTimeoutMillis = EXTRACTION_TIMEOUT_SECONDS * 1000;
+    private long extractionTimeoutMillis = EXTRACTION_TIMEOUT_SECONDS * 1000L;

     public ExtractedTextCache(long maxWeight, long expiryTimeInSecs){
         this(maxWeight, expiryTimeInSecs, false, null);
@@ -305,17 +305,14 @@ public void close() {
     }

     public void process(String name, Callable callable) throws Throwable {
-        Callable callable2 = new Callable() {
-            @Override
-            public Void call() throws Exception {
-                Thread t = Thread.currentThread();
-                String oldThreadName = t.getName();
-                t.setName(oldThreadName + ": " + name);
-                try {
-                    return callable.call();
-                } finally {
-                    Thread.currentThread().setName(oldThreadName);
-                }
+        Callable callable2 = () -> {
+            Thread t = Thread.currentThread();
+            String oldThreadName = t.getName();
+            t.setName(oldThreadName + ": " + name);
+            try {
+                return callable.call();
+            } finally {
+                Thread.currentThread().setName(oldThreadName);
             }
         };
         try {
@@ -353,12 +350,7 @@ private synchronized void createExecutor() {
             60L, TimeUnit.SECONDS, new LinkedBlockingQueue<>(), new ThreadFactory() {
             private final AtomicInteger counter = new AtomicInteger();
-            private final Thread.UncaughtExceptionHandler handler = new Thread.UncaughtExceptionHandler() {
-                @Override
-                public void uncaughtException(Thread t, Throwable e) {
-                    log.warn("Error occurred in asynchronous processing ", e);
-                }
-            };
+            private final Thread.UncaughtExceptionHandler handler = (t, e) -> log.warn("Error occurred in asynchronous processing ", e);
log.warn("Error occurred in asynchronous processing ", e); - } - }; + private final Thread.UncaughtExceptionHandler handler = (t, e) -> log.warn("Error occurred in asynchronous processing ", e); @Override public Thread newThread(@NotNull Runnable r) { Thread thread = new Thread(r, createName()); diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java index 84f81a6a2d8..34a6f47b5c4 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/FulltextBinaryTextExtractor.java @@ -24,6 +24,7 @@ import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; +import org.apache.jackrabbit.oak.cache.CacheStats; import org.apache.jackrabbit.oak.commons.io.LazyInputStream; import org.apache.jackrabbit.oak.plugins.index.fulltext.ExtractedText; import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache; @@ -86,13 +87,23 @@ public FulltextBinaryTextExtractor(ExtractedTextCache extractedTextCache, IndexD this.reindex = reindex; } - public void done(boolean reindex) { + public void resetStartTime() { + textExtractionStats.reset(); + } + + public void done(boolean reindex){ textExtractionStats.log(reindex); textExtractionStats.collectStats(extractedTextCache); } - public List newBinary( - PropertyState property, NodeState state, String path) { + public void logStats() { + log.info("[{}] Text extraction statistics: {}", getIndexName(), textExtractionStats.formatStats()); + CacheStats cacheStats = extractedTextCache.getCacheStats(); + log.info("[{}] Text extraction cache statistics: {}", + getIndexName(), cacheStats == null ? "N/A" : cacheStats.cacheInfoAsString()); + } + + public List newBinary(PropertyState property, NodeState state, String path) { List values = new ArrayList<>(); Metadata metadata = new Metadata(); @@ -101,8 +112,7 @@ public List newBinary( type = definition.getTikaMappedMimeType(type); if (type == null || !isSupportedMediaType(type)) { - log.trace( - "[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]", + log.trace("[{}] Ignoring binary content for node {} due to unsupported (or null) jcr:mimeType [{}]", getIndexName(), path, type); return values; } @@ -143,20 +153,22 @@ private String parseStringValue(Blob v, Metadata metadata, String path, String p return text; } - private String parseStringValue0(Blob v, Metadata metadata, String path) { + private String parseStringValue0(Blob blob, Metadata metadata, String path) { WriteOutContentHandler handler = new WriteOutContentHandler(definition.getMaxExtractLength()); - long start = System.currentTimeMillis(); long bytesRead = 0; - long length = v.length(); + long blobLength = blob.length(); if (log.isDebugEnabled()) { - log.debug("Extracting {}, {} bytes, id {}", path, length, v.getContentIdentity()); + log.debug("Extracting {}. 
Length: {}, reference: {}", path, blobLength, blob.getReference()); } + textExtractionStats.startExtraction(); try { - CountingInputStream stream = new CountingInputStream(new LazyInputStream(v::getNewStream)); + CountingInputStream stream = new CountingInputStream(new LazyInputStream(blob::getNewStream)); try { - if (length > SMALL_BINARY) { - String name = "Extracting " + path + ", " + length + " bytes"; - extractedTextCache.process(name, () -> { + if (blobLength > SMALL_BINARY) { + // Extracting can take a long time, so if a binary is large enough, delegate extraction to the + // ExtractedTextCache#process, which may execute the extraction with a timeout (depends on configuration). + String threadName = "Extracting " + path + ", " + blobLength + " bytes"; + extractedTextCache.process(threadName, () -> { getParser().parse(stream, handler, metadata, new ParseContext()); return null; }); @@ -177,14 +189,13 @@ private String parseStringValue0(Blob v, Metadata metadata, String path) { String indexName = getIndexName(); log.info(format, indexName, path); log.debug(format, indexName, path, e); - extractedTextCache.put(v, ExtractedText.ERROR); + extractedTextCache.put(blob, ExtractedText.ERROR); return TEXT_EXTRACTION_ERROR; } catch (TimeoutException t) { - log.warn( - "[{}] Failed to extract text from a binary property due to timeout: {}.", + log.warn("[{}] Failed to extract text from a binary property due to timeout: {}.", getIndexName(), path); - extractedTextCache.put(v, ExtractedText.ERROR); - extractedTextCache.putTimeout(v, ExtractedText.ERROR); + extractedTextCache.put(blob, ExtractedText.ERROR); + extractedTextCache.putTimeout(blob, ExtractedText.ERROR); return TEXT_EXTRACTION_ERROR; } catch (Throwable t) { // Capture and report any other full text extraction problems. @@ -193,9 +204,9 @@ private String parseStringValue0(Blob v, Metadata metadata, String path) { String format = "[{}] Failed to extract text from a binary property: {}. " + "This is quite common, and usually nothing to worry about."; String indexName = getIndexName(); - log.info(format, indexName, path); + log.info(format + " Error: " + t, indexName, path); log.debug(format, indexName, path, t); - extractedTextCache.put(v, ExtractedText.ERROR); + extractedTextCache.put(blob, ExtractedText.ERROR); return TEXT_EXTRACTION_ERROR; } else { log.debug("Extracted text size exceeded configured limit({})", definition.getMaxExtractLength()); @@ -203,22 +214,16 @@ private String parseStringValue0(Blob v, Metadata metadata, String path) { } String result = handler.toString(); if (bytesRead > 0) { - long time = System.currentTimeMillis() - start; int len = result.length(); - recordTextExtractionStats(time, bytesRead, len); + long extractionTimeMillis = textExtractionStats.finishExtraction(bytesRead, len); if (log.isDebugEnabled()) { - log.debug("Extracting {} took {} ms, {} bytes read, {} text size", - path, time, bytesRead, len); + log.debug("Extracted {}. 
             }
         }
         extractedTextCache.put(blob, new ExtractedText(ExtractedText.ExtractionResult.SUCCESS, result));
         return result;
     }

-    private void recordTextExtractionStats(long timeInMillis, long bytesRead, int textLength) {
-        textExtractionStats.addStats(timeInMillis, bytesRead, textLength);
-    }
-
     private String getIndexName() {
         return definition.getIndexName();
     }
diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java
index 9bf23c3f2a3..52a114c1b88 100644
--- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java
+++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/spi/binary/TextExtractionStats.java
@@ -21,28 +21,40 @@
 import java.util.concurrent.TimeUnit;

+import org.apache.jackrabbit.oak.plugins.index.FormattingUtils;
 import org.apache.jackrabbit.oak.plugins.index.search.ExtractedTextCache;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

 import static org.apache.jackrabbit.oak.commons.IOUtils.humanReadableByteCount;

-class TextExtractionStats {
+final class TextExtractionStats {
     private static final Logger log = LoggerFactory.getLogger(TextExtractionStats.class);
     /**
      * Log stats only if time spent is more than 1 min
      */
     private static final long LOGGING_THRESHOLD = TimeUnit.MINUTES.toMillis(1);
-    private int count;
+
+
+    private int numberOfExtractions;
     private long totalBytesRead;
-    private long totalTime;
-    private long totalTextLength;
+    private long totalExtractedTextLength;
+    private long totalExtractionTimeNanos;
+    private long currentExtractionStartNanos = 0;
+    private long startTimeNanos = System.nanoTime();

-    public void addStats(long timeInMillis, long bytesRead, int textLength) {
-        count++;
-        totalBytesRead += bytesRead;
-        totalTime += timeInMillis;
-        totalTextLength += textLength;
+    public void reset() {
+        log.info("Resetting statistics");
+        this.numberOfExtractions = 0;
+        this.totalBytesRead = 0;
+        this.totalExtractedTextLength = 0;
+        this.totalExtractionTimeNanos = 0;
+        this.currentExtractionStartNanos = 0;
+        this.startTimeNanos = System.nanoTime();
+    }
+
+    public void startExtraction() {
+        currentExtractionStartNanos = System.nanoTime();
     }

     public void log(boolean reindex) {
@@ -53,25 +65,49 @@ public void log(boolean reindex) {
         }
     }

-    public void collectStats(ExtractedTextCache cache){
-        cache.addStats(count, totalTime, totalBytesRead, totalTextLength);
+    public long finishExtraction(long bytesRead, int extractedTextLength) {
+        long elapsedNanos = System.nanoTime() - currentExtractionStartNanos;
+        numberOfExtractions++;
+        totalBytesRead += bytesRead;
+        totalExtractedTextLength += extractedTextLength;
+        totalExtractionTimeNanos += elapsedNanos;
+        return elapsedNanos / 1_000_000;
+    }
+
+    public void collectStats(ExtractedTextCache cache) {
+        cache.addStats(numberOfExtractions, totalExtractionTimeNanos / 1_000_000, totalBytesRead, totalExtractedTextLength);
     }

     private boolean isTakingLotsOfTime() {
-        return totalTime > LOGGING_THRESHOLD;
+        return totalExtractionTimeNanos > LOGGING_THRESHOLD * 1_000_000;
     }

     private boolean anyParsingDone() {
-        return count > 0;
+        return numberOfExtractions > 0;
     }

     @Override
     public String toString() {
         return String.format(" %d (Time Taken %s, Bytes Read %s, Extracted text size %s)",
-            count,
-            timeInWords(totalTime),
+            numberOfExtractions,
+            timeInWords(totalExtractionTimeNanos),
             humanReadableByteCount(totalBytesRead),
-            humanReadableByteCount(totalTextLength));
+            humanReadableByteCount(totalExtractedTextLength));
     }
+
+    public String formatStats() {
+        long timeSinceStartNanos = System.nanoTime() - startTimeNanos;
+        double timeExtractingPercentage = FormattingUtils.safeComputePercentage(totalExtractionTimeNanos, timeSinceStartNanos);
+        long avgExtractionTimeMillis = Math.round(FormattingUtils.safeComputeAverage(totalExtractionTimeNanos / 1_000_000, numberOfExtractions));
+
+        return String.format("{extractions: %d, timeExtracting: %s (%2.1f%%), totalTime: %s, " +
+                "avgExtractionTime: %s ms, bytesRead: %s, extractedTextSize: %s}",
+            numberOfExtractions,
+            FormattingUtils.formatNanosToSeconds(totalExtractionTimeNanos),
+            timeExtractingPercentage,
+            FormattingUtils.formatNanosToSeconds(timeSinceStartNanos),
+            avgExtractionTimeMillis,
+            humanReadableByteCount(totalBytesRead), humanReadableByteCount(totalExtractedTextLength));
+    }

     private static String timeInWords(long millis) {