From 5f98d674c74ab5d24195993734bb054840906b47 Mon Sep 17 00:00:00 2001 From: Raghuvansh Raj Date: Tue, 11 Jul 2023 18:06:01 +0530 Subject: [PATCH 1/7] [Remote Store] Add multipart upload integration for translog and segment files (#7119) Signed-off-by: Raghuvansh Raj --- .../common/io/InputStreamContainer.java | 11 +- .../RemoteStoreBaseIntegTestCase.java | 4 +- .../RemoteStoreMultipartFileCorruptionIT.java | 111 +++++++++ .../multipart/RemoteStoreMultipartIT.java | 38 +++ .../multipart/mocks/MockFsBlobStore.java | 36 +++ .../multipart/mocks/MockFsRepository.java | 46 ++++ .../mocks/MockFsRepositoryPlugin.java | 38 +++ .../mocks/MockFsVerifyingBlobContainer.java | 120 +++++++++ .../VerifyingMultiStreamBlobContainer.java | 34 +++ .../transfer/RemoteTransferContainer.java | 2 +- .../org/opensearch/common/util/ByteUtils.java | 10 + .../shard/RemoteStoreRefreshListener.java | 227 ++++++++---------- .../index/store/RemoteDirectory.java | 4 + .../store/RemoteSegmentStoreDirectory.java | 127 ++++++++++ .../ChecksumCombinationException.java | 2 +- .../opensearch/index/translog/Checkpoint.java | 2 +- .../TranslogCheckedContainer.java | 17 +- .../index/translog/TranslogHeader.java | 12 +- .../index/translog/TranslogReader.java | 37 ++- .../index/translog/TranslogWriter.java | 25 +- .../index/translog/checked/package-info.java | 10 - .../transfer/BlobStoreTransferService.java | 98 +++++++- .../index/translog/transfer/FileSnapshot.java | 17 +- .../translog/transfer/TransferService.java | 28 ++- .../TranslogCheckpointTransferSnapshot.java | 10 +- .../transfer/TranslogTransferManager.java | 15 +- .../org/opensearch/threadpool/ThreadPool.java | 2 +- .../RemoteTransferContainerTests.java | 41 ++++ .../RemoteSegmentStoreDirectoryTests.java | 86 +++++++ .../index/store/TestUploadListener.java | 43 ++++ ...oreTransferServiceMockRepositoryTests.java | 189 +++++++++++++++ .../BlobStoreTransferServiceTests.java | 22 +- .../translog/transfer/FileSnapshotTests.java | 6 +- .../transfer/FileTransferTrackerTests.java | 12 +- .../TranslogTransferManagerTests.java | 31 ++- 35 files changed, 1293 insertions(+), 220 deletions(-) rename {server => libs/common}/src/main/java/org/opensearch/common/io/InputStreamContainer.java (85%) create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartFileCorruptionIT.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartIT.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsBlobStore.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepository.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepositoryPlugin.java create mode 100644 server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsVerifyingBlobContainer.java create mode 100644 server/src/main/java/org/opensearch/common/blobstore/VerifyingMultiStreamBlobContainer.java rename server/src/main/java/org/opensearch/index/translog/{checked => }/TranslogCheckedContainer.java (74%) delete mode 100644 server/src/main/java/org/opensearch/index/translog/checked/package-info.java create mode 100644 server/src/test/java/org/opensearch/index/store/TestUploadListener.java create mode 100644 server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceMockRepositoryTests.java diff --git 
a/server/src/main/java/org/opensearch/common/io/InputStreamContainer.java b/libs/common/src/main/java/org/opensearch/common/io/InputStreamContainer.java similarity index 85% rename from server/src/main/java/org/opensearch/common/io/InputStreamContainer.java rename to libs/common/src/main/java/org/opensearch/common/io/InputStreamContainer.java index ce5dcff9f5349..eb8a4e1382497 100644 --- a/server/src/main/java/org/opensearch/common/io/InputStreamContainer.java +++ b/libs/common/src/main/java/org/opensearch/common/io/InputStreamContainer.java @@ -19,6 +19,7 @@ public class InputStreamContainer { private final InputStream inputStream; private final long contentLength; + private final long offset; /** * Construct a new stream object @@ -26,9 +27,10 @@ public class InputStreamContainer { * @param inputStream The input stream that is to be encapsulated * @param contentLength The total content length that is to be read from the stream */ - public InputStreamContainer(InputStream inputStream, long contentLength) { + public InputStreamContainer(InputStream inputStream, long contentLength, long offset) { this.inputStream = inputStream; this.contentLength = contentLength; + this.offset = offset; } /** @@ -44,4 +46,11 @@ public InputStream getInputStream() { public long getContentLength() { return contentLength; } + + /** + * @return offset of the source content. + */ + public long getOffset() { + return offset; + } } diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java index 2b3fcadfc645e..10f01749ab4c5 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreBaseIntegTestCase.java @@ -102,9 +102,7 @@ protected void putRepository(Path path) { protected void setupRepo() { internalCluster().startClusterManagerOnlyNode(); absolutePath = randomRepoPath().toAbsolutePath(); - assertAcked( - clusterAdmin().preparePutRepository(REPOSITORY_NAME).setType("fs").setSettings(Settings.builder().put("location", absolutePath)) - ); + putRepository(absolutePath); } @After diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartFileCorruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartFileCorruptionIT.java new file mode 100644 index 0000000000000..8f375ca6e2b01 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartFileCorruptionIT.java @@ -0,0 +1,111 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
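A quick illustration of the `InputStreamContainer` change above: each part stream now carries the offset of its slice within the blob, so a consumer can position every part independently during a multipart upload. A minimal sketch of slicing a blob into such parts (the byte-array source, part size, and class name below are illustrative, not from this patch):

```java
import org.opensearch.common.io.InputStreamContainer;

import java.io.ByteArrayInputStream;
import java.io.InputStream;

public class PartSlicingDemo {
    public static void main(String[] args) {
        byte[] blob = new byte[100];      // stand-in for a segment or translog file
        long partSize = 30;
        int numberOfParts = (int) ((blob.length + partSize - 1) / partSize);
        for (int part = 0; part < numberOfParts; part++) {
            long offset = part * partSize;
            long length = Math.min(partSize, blob.length - offset);
            InputStream in = new ByteArrayInputStream(blob, (int) offset, (int) length);
            // The container now records where its bytes belong in the target blob.
            InputStreamContainer container = new InputStreamContainer(in, length, offset);
            System.out.println("part " + part + ": offset=" + container.getOffset() + ", length=" + container.getContentLength());
        }
    }
}
```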
+ */ + +package org.opensearch.remotestore.multipart; + +import org.junit.After; +import org.junit.Before; +import org.opensearch.action.index.IndexResponse; +import org.opensearch.action.support.IndicesOptions; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.UUIDs; +import org.opensearch.common.settings.Settings; +import org.opensearch.common.util.FeatureFlags; +import org.opensearch.index.IndexModule; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.plugins.Plugin; +import org.opensearch.remotestore.multipart.mocks.MockFsRepository; +import org.opensearch.remotestore.multipart.mocks.MockFsRepositoryPlugin; +import org.opensearch.test.OpenSearchIntegTestCase; + +import java.nio.file.Path; +import java.util.Collection; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; + +public class RemoteStoreMultipartFileCorruptionIT extends OpenSearchIntegTestCase { + + protected static final String REPOSITORY_NAME = "test-remote-store-repo"; + private static final String INDEX_NAME = "remote-store-test-idx-1"; + + @Override + protected Collection<Class<? extends Plugin>> nodePlugins() { + return Stream.concat(super.nodePlugins().stream(), Stream.of(MockFsRepositoryPlugin.class)).collect(Collectors.toList()); + } + + @Override + protected Settings featureFlagSettings() { + return Settings.builder().put(super.featureFlagSettings()).put(FeatureFlags.REMOTE_STORE, "true").build(); + } + + @Before + public void setup() { + internalCluster().startClusterManagerOnlyNode(); + Path absolutePath = randomRepoPath().toAbsolutePath(); + putRepository(absolutePath); + } + + protected void putRepository(Path path) { + assertAcked( + clusterAdmin().preparePutRepository(REPOSITORY_NAME) + .setType(MockFsRepositoryPlugin.TYPE) + .setSettings( + Settings.builder() + .put("location", path) + // custom setting for MockFsRepositoryPlugin + .put(MockFsRepository.TRIGGER_DATA_INTEGRITY_FAILURE.getKey(), true) + ) + ); + } + + @After + public void teardown() { + assertAcked(clusterAdmin().prepareDeleteRepository(REPOSITORY_NAME)); + } + + protected Settings remoteStoreIndexSettings() { + return Settings.builder() + .put(super.indexSettings()) + .put("index.refresh_interval", "300s") + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0) + .put(IndexModule.INDEX_QUERY_CACHE_ENABLED_SETTING.getKey(), false) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT) + .put(IndexMetadata.SETTING_REMOTE_STORE_ENABLED, true) + .put(IndexMetadata.SETTING_REMOTE_STORE_REPOSITORY, REPOSITORY_NAME) + .build(); + } + + private IndexResponse indexSingleDoc() { + return client().prepareIndex(INDEX_NAME) + .setId(UUIDs.randomBase64UUID()) + .setSource(randomAlphaOfLength(5), randomAlphaOfLength(5)) + .get(); + } + + public void testLocalFileCorruptionDuringUpload() { + internalCluster().startDataOnlyNodes(1); + createIndex(INDEX_NAME, remoteStoreIndexSettings()); + ensureYellowAndNoInitializingShards(INDEX_NAME); + ensureGreen(INDEX_NAME); + + indexSingleDoc(); + + client().admin() + .indices() + .prepareRefresh(INDEX_NAME) + .setIndicesOptions(IndicesOptions.STRICT_EXPAND_OPEN_HIDDEN_FORBID_CLOSED) + .execute() + .actionGet(); + + // ensure the cluster turns red, i.e. the shard has failed and is unassigned + ensureRed(INDEX_NAME); + } +} diff --git
a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartIT.java new file mode 100644 index 0000000000000..a523d5c0f5470 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/RemoteStoreMultipartIT.java @@ -0,0 +1,38 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.remotestore.multipart; + +import org.opensearch.common.settings.Settings; +import org.opensearch.plugins.Plugin; +import org.opensearch.remotestore.RemoteStoreIT; +import org.opensearch.remotestore.multipart.mocks.MockFsRepositoryPlugin; + +import java.nio.file.Path; +import java.util.Collection; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; + +public class RemoteStoreMultipartIT extends RemoteStoreIT { + + @Override + protected Collection> nodePlugins() { + return Stream.concat(super.nodePlugins().stream(), Stream.of(MockFsRepositoryPlugin.class)).collect(Collectors.toList()); + } + + @Override + protected void putRepository(Path path) { + assertAcked( + clusterAdmin().preparePutRepository(REPOSITORY_NAME) + .setType(MockFsRepositoryPlugin.TYPE) + .setSettings(Settings.builder().put("location", path)) + ); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsBlobStore.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsBlobStore.java new file mode 100644 index 0000000000000..f1d9fbba84528 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsBlobStore.java @@ -0,0 +1,36 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.remotestore.multipart.mocks; + +import org.opensearch.OpenSearchException; +import org.opensearch.common.blobstore.BlobContainer; +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.fs.FsBlobStore; + +import java.io.IOException; +import java.nio.file.Path; + +public class MockFsBlobStore extends FsBlobStore { + + private final boolean triggerDataIntegrityFailure; + + public MockFsBlobStore(int bufferSizeInBytes, Path path, boolean readonly, boolean triggerDataIntegrityFailure) throws IOException { + super(bufferSizeInBytes, path, readonly); + this.triggerDataIntegrityFailure = triggerDataIntegrityFailure; + } + + @Override + public BlobContainer blobContainer(BlobPath path) { + try { + return new MockFsVerifyingBlobContainer(this, path, buildAndCreate(path), triggerDataIntegrityFailure); + } catch (IOException ex) { + throw new OpenSearchException("failed to create blob container", ex); + } + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepository.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepository.java new file mode 100644 index 0000000000000..15a9853477081 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepository.java @@ -0,0 +1,46 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.remotestore.multipart.mocks; + +import org.opensearch.cluster.metadata.RepositoryMetadata; +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.blobstore.fs.FsBlobStore; +import org.opensearch.common.settings.Setting; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.env.Environment; +import org.opensearch.indices.recovery.RecoverySettings; +import org.opensearch.repositories.fs.FsRepository; + +public class MockFsRepository extends FsRepository { + + public static Setting TRIGGER_DATA_INTEGRITY_FAILURE = Setting.boolSetting( + "mock_fs_repository.trigger_data_integrity_failure", + false + ); + + private final boolean triggerDataIntegrityFailure; + + public MockFsRepository( + RepositoryMetadata metadata, + Environment environment, + NamedXContentRegistry namedXContentRegistry, + ClusterService clusterService, + RecoverySettings recoverySettings + ) { + super(metadata, environment, namedXContentRegistry, clusterService, recoverySettings); + triggerDataIntegrityFailure = TRIGGER_DATA_INTEGRITY_FAILURE.get(metadata.settings()); + } + + @Override + protected BlobStore createBlobStore() throws Exception { + FsBlobStore fsBlobStore = (FsBlobStore) super.createBlobStore(); + return new MockFsBlobStore(fsBlobStore.bufferSizeInBytes(), fsBlobStore.path(), isReadOnly(), triggerDataIntegrityFailure); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepositoryPlugin.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepositoryPlugin.java new file mode 100644 index 0000000000000..ffd53adf4e29e --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsRepositoryPlugin.java @@ -0,0 +1,38 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The 
OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.remotestore.multipart.mocks; + +import org.opensearch.cluster.service.ClusterService; +import org.opensearch.core.xcontent.NamedXContentRegistry; +import org.opensearch.env.Environment; +import org.opensearch.indices.recovery.RecoverySettings; +import org.opensearch.plugins.Plugin; +import org.opensearch.plugins.RepositoryPlugin; +import org.opensearch.repositories.Repository; + +import java.util.Collections; +import java.util.Map; + +public class MockFsRepositoryPlugin extends Plugin implements RepositoryPlugin { + + public static final String TYPE = "fs_multipart_repository"; + + @Override + public Map getRepositories( + Environment env, + NamedXContentRegistry namedXContentRegistry, + ClusterService clusterService, + RecoverySettings recoverySettings + ) { + return Collections.singletonMap( + "fs_multipart_repository", + metadata -> new MockFsRepository(metadata, env, namedXContentRegistry, clusterService, recoverySettings) + ); + } +} diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsVerifyingBlobContainer.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsVerifyingBlobContainer.java new file mode 100644 index 0000000000000..8f2814eb7c4c4 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/multipart/mocks/MockFsVerifyingBlobContainer.java @@ -0,0 +1,120 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.remotestore.multipart.mocks; + +import org.apache.lucene.index.CorruptIndexException; +import org.opensearch.action.ActionListener; +import org.opensearch.common.blobstore.VerifyingMultiStreamBlobContainer; +import org.opensearch.common.io.InputStreamContainer; +import org.opensearch.common.StreamContext; +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.fs.FsBlobContainer; +import org.opensearch.common.blobstore.fs.FsBlobStore; +import org.opensearch.common.blobstore.stream.write.WriteContext; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicLong; + +public class MockFsVerifyingBlobContainer extends FsBlobContainer implements VerifyingMultiStreamBlobContainer { + + private static final int TRANSFER_TIMEOUT_MILLIS = 30000; + + private final boolean triggerDataIntegrityFailure; + + public MockFsVerifyingBlobContainer(FsBlobStore blobStore, BlobPath blobPath, Path path, boolean triggerDataIntegrityFailure) { + super(blobStore, blobPath, path); + this.triggerDataIntegrityFailure = triggerDataIntegrityFailure; + } + + @Override + public void asyncBlobUpload(WriteContext writeContext, ActionListener completionListener) throws IOException { + + int nParts = 10; + long partSize = writeContext.getFileSize() / nParts; + StreamContext streamContext = writeContext.getStreamProvider(partSize); + final Path file = path.resolve(writeContext.getFileName()); + byte[] buffer = new byte[(int) writeContext.getFileSize()]; + AtomicLong totalContentRead = new AtomicLong(); + CountDownLatch latch = new CountDownLatch(streamContext.getNumberOfParts()); + for (int partIdx = 0; partIdx < streamContext.getNumberOfParts(); partIdx++) { + int finalPartIdx = partIdx; + Thread thread = new Thread(() -> { + try { + InputStreamContainer inputStreamContainer = streamContext.provideStream(finalPartIdx); + InputStream inputStream = inputStreamContainer.getInputStream(); + long remainingContentLength = inputStreamContainer.getContentLength(); + long offset = partSize * finalPartIdx; + while (remainingContentLength > 0) { + int readContentLength = inputStream.read(buffer, (int) offset, (int) remainingContentLength); + totalContentRead.addAndGet(readContentLength); + remainingContentLength -= readContentLength; + offset += readContentLength; + } + inputStream.close(); + } catch (IOException e) { + completionListener.onFailure(e); + } finally { + latch.countDown(); + } + }); + thread.start(); + } + try { + if (!latch.await(TRANSFER_TIMEOUT_MILLIS, TimeUnit.MILLISECONDS)) { + throw new IOException("Timed out waiting for file transfer to complete for " + writeContext.getFileName()); + } + } catch (InterruptedException e) { + throw new IOException("Await interrupted on CountDownLatch, transfer failed for " + writeContext.getFileName()); + } + try (OutputStream outputStream = Files.newOutputStream(file, StandardOpenOption.CREATE_NEW)) { + outputStream.write(buffer); + } + if (writeContext.getFileSize() != totalContentRead.get()) { + throw new IOException( + "Incorrect content length read for file " + + writeContext.getFileName() + + ", actual file size: " + + writeContext.getFileSize() + + ", bytes read: " + + totalContentRead.get() + ); + } + + try { + // bulks need to succeed for segment files to be 
generated + if (isSegmentFile(writeContext.getFileName()) && triggerDataIntegrityFailure) { + completionListener.onFailure( + new RuntimeException( + new CorruptIndexException( + "Data integrity check failure for file: " + writeContext.getFileName(), + writeContext.getFileName() + ) + ) + ); + } else { + writeContext.getUploadFinalizer().accept(true); + completionListener.onResponse(null); + } + } catch (Exception e) { + completionListener.onFailure(e); + } + + } + + private boolean isSegmentFile(String filename) { + return !filename.endsWith(".tlog") && !filename.endsWith(".ckp"); + } +} diff --git a/server/src/main/java/org/opensearch/common/blobstore/VerifyingMultiStreamBlobContainer.java b/server/src/main/java/org/opensearch/common/blobstore/VerifyingMultiStreamBlobContainer.java new file mode 100644 index 0000000000000..0dfcc5c50e4b1 --- /dev/null +++ b/server/src/main/java/org/opensearch/common/blobstore/VerifyingMultiStreamBlobContainer.java @@ -0,0 +1,34 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.common.blobstore; + +import org.opensearch.action.ActionListener; +import org.opensearch.common.blobstore.stream.write.WriteContext; + +import java.io.IOException; + +/** + * An extension of {@link BlobContainer} that adds {@link VerifyingMultiStreamBlobContainer#asyncBlobUpload} to allow + * multipart uploads and performs integrity checks on transferred files + * + * @opensearch.internal + */ +public interface VerifyingMultiStreamBlobContainer extends BlobContainer { + + /** + * Reads blob content from multiple streams, each from a specific part of the file, which is provided by the + * StreamContextSupplier in the WriteContext passed to this method. An {@link IOException} is thrown if reading + * any of the input streams fails, or writing to the target blob fails + * + * @param writeContext A WriteContext object encapsulating all information needed to perform the upload + * @param completionListener Listener on which upload events should be published. 
+ * @throws IOException if any of the input streams could not be read, or the target blob could not be written to + */ + void asyncBlobUpload(WriteContext writeContext, ActionListener<Void> completionListener) throws IOException; +} diff --git a/server/src/main/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainer.java b/server/src/main/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainer.java index 7864c3ab5c794..ca744efae902d 100644 --- a/server/src/main/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainer.java +++ b/server/src/main/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainer.java @@ -160,7 +160,7 @@ private LocalStreamSupplier getMultipartStreamSupplier( : offsetRangeInputStream; Objects.requireNonNull(inputStreams.get())[streamIdx] = inputStream; - return new InputStreamContainer(inputStream, size); + return new InputStreamContainer(inputStream, size, position); } catch (IOException e) { log.error("Failed to create input stream", e); throw e; diff --git a/server/src/main/java/org/opensearch/common/util/ByteUtils.java b/server/src/main/java/org/opensearch/common/util/ByteUtils.java index 36ae3b1f5bcaa..8c7665d991751 100644 --- a/server/src/main/java/org/opensearch/common/util/ByteUtils.java +++ b/server/src/main/java/org/opensearch/common/util/ByteUtils.java @@ -61,6 +61,16 @@ public static void writeLongLE(long l, byte[] arr, int offset) { assert l == 0; } + /** Convert long to a byte array in big-endian format */ + public static byte[] toByteArrayBE(long l) { + byte[] result = new byte[8]; + for (int i = 7; i >= 0; i--) { + result[i] = (byte) (l & 0xffL); + l >>= 8; + } + return result; + } + /** Read a long in little-endian format. */ public static long readLongLE(byte[] arr, int offset) { long l = arr[offset++] & 0xFFL; diff --git a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java index 46d52bc8ca5df..e087bbb265727 100644 --- a/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/RemoteStoreRefreshListener.java @@ -11,6 +11,7 @@ import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexFileNames; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.search.ReferenceManager; @@ -18,11 +19,14 @@ import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; +import org.opensearch.action.ActionListener; +import org.opensearch.action.LatchedActionListener; import org.opensearch.action.bulk.BackoffPolicy; -import org.opensearch.common.CheckedFunction; +import org.opensearch.action.support.GroupedActionListener; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.logging.Loggers; import org.opensearch.common.unit.TimeValue; +import org.opensearch.common.util.UploadListener; import org.opensearch.common.util.concurrent.ConcurrentCollections; import org.opensearch.index.engine.EngineException; import org.opensearch.index.engine.InternalEngine; @@ -46,6 +50,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.CountDownLatch; import java.util.concurrent.ExecutionException; import
java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicBoolean; @@ -110,7 +115,7 @@ public final class RemoteStoreRefreshListener implements ReferenceManager.Refres private final SegmentReplicationCheckpointPublisher checkpointPublisher; - private final FileUploader fileUploader; + private final UploadListener statsListener; public RemoteStoreRefreshListener( IndexShard indexShard, @@ -137,7 +142,7 @@ public RemoteStoreRefreshListener( this.segmentTracker = segmentTracker; resetBackOffDelayIterator(); this.checkpointPublisher = checkpointPublisher; - this.fileUploader = new FileUploader(new UploadTracker() { + this.statsListener = new UploadListener() { @Override public void beforeUpload(String file) { // Start tracking the upload bytes started @@ -156,7 +161,7 @@ public void onFailure(String file) { // Track upload failure segmentTracker.addUploadBytesFailed(latestFileNameSizeOnLocalMap.get(file)); } - }, remoteDirectory, storeDirectory, this::getChecksumOfLocalFile, logger); + }; } @Override @@ -190,7 +195,7 @@ private synchronized void syncSegments(boolean isRetry) { long refreshTimeMs = segmentTracker.getLocalRefreshTimeMs(), refreshClockTimeMs = segmentTracker.getLocalRefreshClockTimeMs(); long refreshSeqNo = segmentTracker.getLocalRefreshSeqNo(); long bytesBeforeUpload = segmentTracker.getUploadBytesSucceeded(), startTimeInNS = System.nanoTime(); - boolean shouldRetry = true; + try { if (this.primaryTerm != indexShard.getOperationPrimaryTerm()) { @@ -242,18 +247,51 @@ private synchronized void syncSegments(boolean isRetry) { // Create a map of file name to size and update the refresh segment tracker updateLocalSizeMapAndTracker(localSegmentsPostRefresh); + CountDownLatch latch = new CountDownLatch(1); + ActionListener segmentUploadsCompletedListener = new LatchedActionListener<>(new ActionListener<>() { + @Override + public void onResponse(Void unused) { + boolean shouldRetry = true; + try { + // Start metadata file upload + uploadMetadata(localSegmentsPostRefresh, segmentInfos); + clearStaleFilesFromLocalSegmentChecksumMap(localSegmentsPostRefresh); + onSuccessfulSegmentsSync( + refreshTimeMs, + refreshClockTimeMs, + refreshSeqNo, + lastRefreshedCheckpoint, + checkpoint + ); + // At this point since we have uploaded new segments, segment infos and segment metadata file, + // along with marking minSeqNoToKeep, upload has succeeded completely. + shouldRetry = false; + } catch (Exception e) { + // We don't want to fail refresh if upload of new segments fails. The missed segments will be re-tried + // in the next refresh. This should not affect durability of the indexed data after remote trans-log + // integration. 
+ logger.warn("Exception in post new segment upload actions", e); + } finally { + doComplete(shouldRetry); + } + } + + @Override + public void onFailure(Exception e) { + logger.warn("Exception while uploading new segments to the remote segment store", e); + doComplete(true); + } + + private void doComplete(boolean shouldRetry) { + // Update the segment tracker with the final upload status as seen at the end + updateFinalUploadStatusInSegmentTracker(shouldRetry == false, bytesBeforeUpload, startTimeInNS); + afterSegmentsSync(isRetry, shouldRetry); + } + }, latch); // Start the segments files upload - boolean newSegmentsUploadStatus = uploadNewSegments(localSegmentsPostRefresh); - if (newSegmentsUploadStatus) { - // Start metadata file upload - uploadMetadata(localSegmentsPostRefresh, segmentInfos); - clearStaleFilesFromLocalSegmentChecksumMap(localSegmentsPostRefresh); - onSuccessfulSegmentsSync(refreshTimeMs, refreshClockTimeMs, refreshSeqNo, lastRefreshedCheckpoint, checkpoint); - // At this point since we have uploaded new segments, segment infos and segment metadata file, - // along with marking minSeqNoToKeep, upload has succeeded completely. - shouldRetry = false; - } + uploadNewSegments(localSegmentsPostRefresh, segmentUploadsCompletedListener); + latch.await(); } } catch (EngineException e) { logger.warn("Exception while reading SegmentInfosSnapshot", e); @@ -265,11 +303,7 @@ private synchronized void syncSegments(boolean isRetry) { } } catch (Throwable t) { logger.error("Exception in RemoteStoreRefreshListener.afterRefresh()", t); - } finally { - // Update the segment tracker with the final upload status as seen at the end - updateFinalUploadStatusInSegmentTracker(shouldRetry == false, bytesBeforeUpload, startTimeInNS); } - afterSegmentsSync(isRetry, shouldRetry); } /** @@ -378,17 +412,50 @@ void uploadMetadata(Collection localSegmentsPostRefresh, SegmentInfos se } } - private boolean uploadNewSegments(Collection localSegmentsPostRefresh) throws IOException { - AtomicBoolean uploadSuccess = new AtomicBoolean(true); - localSegmentsPostRefresh.forEach(file -> { - try { - fileUploader.uploadFile(file); - } catch (IOException e) { - uploadSuccess.set(false); - logger.warn(() -> new ParameterizedMessage("Exception while uploading file {} to the remote segment store", file), e); - } - }); - return uploadSuccess.get(); + private void uploadNewSegments(Collection localSegmentsPostRefresh, ActionListener listener) { + Collection filteredFiles = localSegmentsPostRefresh.stream().filter(file -> !skipUpload(file)).collect(Collectors.toList()); + if (filteredFiles.size() == 0) { + listener.onResponse(null); + return; + } + + ActionListener> mappedListener = ActionListener.map(listener, resp -> null); + GroupedActionListener batchUploadListener = new GroupedActionListener<>(mappedListener, filteredFiles.size()); + + for (String src : filteredFiles) { + ActionListener aggregatedListener = ActionListener.wrap(resp -> { + statsListener.onSuccess(src); + batchUploadListener.onResponse(resp); + }, ex -> { + logger.warn(() -> new ParameterizedMessage("Exception: [{}] while uploading segment files", ex), ex); + if (ex instanceof CorruptIndexException) { + indexShard.failShard(ex.getMessage(), ex); + } + statsListener.onFailure(src); + batchUploadListener.onFailure(ex); + }); + statsListener.beforeUpload(src); + remoteDirectory.copyFrom(storeDirectory, src, IOContext.DEFAULT, aggregatedListener); + } + } + + /** + * Whether to upload a file or not depending on whether file is in excluded list or has 
been already uploaded. + * + * @param file that needs to be uploaded. + * @return true if the upload has to be skipped for the file. + */ + private boolean skipUpload(String file) { + try { + // Exclude files that are already uploaded and the exclude files to come up with the list of files to be uploaded. + return EXCLUDE_FILES.contains(file) || remoteDirectory.containsFile(file, getChecksumOfLocalFile(file)); + } catch (IOException e) { + logger.error( + "Exception while reading checksum of local segment file: {}, ignoring the exception and re-uploading the file", + file + ); + } + return false; } private String getChecksumOfLocalFile(String file) throws IOException { @@ -462,104 +529,4 @@ private void updateFinalUploadStatusInSegmentTracker(boolean uploadStatus, long segmentTracker.incrementTotalUploadsFailed(); } } - - /** - * This class is a wrapper over the copying of file from local to remote store allowing to decorate the actual copy - * method along with adding hooks of code that can be run before, on success and on failure. - * - * @opensearch.internal - */ - private static class FileUploader { - - private final Logger logger; - - private final UploadTracker uploadTracker; - - private final RemoteSegmentStoreDirectory remoteDirectory; - - private final Directory storeDirectory; - - private final CheckedFunction checksumProvider; - - public FileUploader( - UploadTracker uploadTracker, - RemoteSegmentStoreDirectory remoteDirectory, - Directory storeDirectory, - CheckedFunction checksumProvider, - Logger logger - ) { - this.uploadTracker = uploadTracker; - this.remoteDirectory = remoteDirectory; - this.storeDirectory = storeDirectory; - this.checksumProvider = checksumProvider; - this.logger = logger; - } - - /** - * Calling this method will lead to before getting executed and then the actual upload. Based on the upload status, - * the onSuccess or onFailure method gets invoked. - * - * @param file the file which is to be uploaded. - * @throws IOException is thrown if the upload fails. - */ - private void uploadFile(String file) throws IOException { - if (skipUpload(file)) { - return; - } - uploadTracker.beforeUpload(file); - boolean success = false; - try { - performUpload(file); - uploadTracker.onSuccess(file); - success = true; - } finally { - if (!success) { - uploadTracker.onFailure(file); - } - } - } - - /** - * Whether to upload a file or not depending on whether file is in excluded list or has been already uploaded. - * - * @param file that needs to be uploaded. - * @return true if the upload has to be skipped for the file. - */ - private boolean skipUpload(String file) { - try { - // Exclude files that are already uploaded and the exclude files to come up with the list of files to be uploaded. - return EXCLUDE_FILES.contains(file) || remoteDirectory.containsFile(file, checksumProvider.apply(file)); - } catch (IOException e) { - logger.error( - "Exception while reading checksum of local segment file: {}, ignoring the exception and re-uploading the file", - file - ); - } - return false; - } - - /** - * This method does the actual upload. - * - * @param file that needs to be uploaded. - * @throws IOException is thrown if the upload fails. - */ - private void performUpload(String file) throws IOException { - remoteDirectory.copyFrom(storeDirectory, file, file, IOContext.DEFAULT); - } - } - - /** - * A tracker class that is fed to FileUploader. 
- * - * @opensearch.internal - */ - interface UploadTracker { - - void beforeUpload(String file); - - void onSuccess(String file); - - void onFailure(String file); - } } diff --git a/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java index 8782808c070ab..f7fe7ca62e6ba 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteDirectory.java @@ -45,6 +45,10 @@ public class RemoteDirectory extends Directory { protected final BlobContainer blobContainer; + public BlobContainer getBlobContainer() { + return blobContainer; + } + public RemoteDirectory(BlobContainer blobContainer) { this.blobContainer = blobContainer; } diff --git a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java index e7602203440d2..395ecba442e86 100644 --- a/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java +++ b/server/src/main/java/org/opensearch/index/store/RemoteSegmentStoreDirectory.java @@ -8,9 +8,12 @@ package org.opensearch.index.store; +import com.jcraft.jzlib.JZlib; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.SegmentInfos; import org.apache.lucene.store.ByteBuffersDataOutput; import org.apache.lucene.store.ByteBuffersIndexOutput; @@ -19,10 +22,20 @@ import org.apache.lucene.store.IOContext; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; +import org.opensearch.ExceptionsHelper; +import org.opensearch.action.ActionListener; import org.opensearch.common.UUIDs; +import org.opensearch.common.blobstore.VerifyingMultiStreamBlobContainer; +import org.opensearch.common.blobstore.exception.CorruptFileException; +import org.opensearch.common.blobstore.stream.write.WriteContext; +import org.opensearch.common.blobstore.stream.write.WritePriority; +import org.opensearch.common.blobstore.transfer.RemoteTransferContainer; +import org.opensearch.common.blobstore.transfer.stream.OffsetRangeIndexInputStream; import org.opensearch.common.io.VersionedCodecStreamWrapper; import org.opensearch.common.lucene.store.ByteArrayIndexInput; +import org.opensearch.common.util.ByteUtils; import org.opensearch.index.remote.RemoteStoreUtils; +import org.opensearch.index.store.exception.ChecksumCombinationException; import org.opensearch.index.store.lockmanager.FileLockInfo; import org.opensearch.index.store.lockmanager.RemoteStoreCommitLevelLockManager; import org.opensearch.index.store.lockmanager.RemoteStoreLockManager; @@ -44,6 +57,7 @@ import java.util.concurrent.atomic.AtomicBoolean; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; +import java.util.zip.CRC32; /** * A RemoteDirectory extension for remote segment store. We need to make sure we don't overwrite a segment file once uploaded. 
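For context on the `uploadNewSegments` change above: per-file uploads are fanned out asynchronously, and a `GroupedActionListener` completes the batch listener once every file has reported. A minimal sketch of that pattern, with `uploadBatch` and `uploadOneFile` as hypothetical stand-ins for the listener wiring and `remoteDirectory.copyFrom(...)` respectively (neither name is part of this patch):

```java
import org.opensearch.action.ActionListener;
import org.opensearch.action.support.GroupedActionListener;

import java.util.Collection;

public class BatchUploadSketch {

    void uploadBatch(Collection<String> files, ActionListener<Void> listener) {
        if (files.isEmpty()) {
            listener.onResponse(null);
            return;
        }
        // Collapse the collection of per-file results into a single Void response.
        ActionListener<Collection<Void>> mapped = ActionListener.map(listener, resp -> null);
        GroupedActionListener<Void> grouped = new GroupedActionListener<>(mapped, files.size());
        for (String file : files) {
            // Each file reports into the grouped listener; the first failure fails the batch.
            uploadOneFile(file, ActionListener.wrap(grouped::onResponse, grouped::onFailure));
        }
    }

    // Hypothetical stand-in for the async remoteDirectory.copyFrom(...) call.
    void uploadOneFile(String file, ActionListener<Void> listener) {
        listener.onResponse(null);
    }
}
```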
@@ -62,6 +76,11 @@ public final class RemoteSegmentStoreDirectory extends FilterDirectory implement */ public static final String SEGMENT_NAME_UUID_SEPARATOR = "__"; + /** + * Number of bytes in the segment file to store checksum + */ + private static final int SEGMENT_CHECKSUM_BYTES = 8; + /** * remoteDataDirectory is used to store segment files at path: cluster_UUID/index_UUID/shardId/segments/data */ @@ -349,6 +368,89 @@ public IndexInput openInput(String name, IOContext context) throws IOException { } } + /** + * Copies a file from the source directory to a remote based on multi-stream upload support. + * If vendor plugin supports uploading multiple parts in parallel, BlobContainer#writeBlobByStreams + * will be used, else, the legacy {@link RemoteSegmentStoreDirectory#copyFrom(Directory, String, String, IOContext)} + * will be called. + * + * @param from The directory for the file to be uploaded + * @param src File to be uploaded + * @param context IOContext to be used to open IndexInput of file during remote upload + * @param listener Listener to handle upload callback events + */ + public void copyFrom(Directory from, String src, IOContext context, ActionListener listener) { + if (remoteDataDirectory.getBlobContainer() instanceof VerifyingMultiStreamBlobContainer) { + try { + String remoteFilename = getNewRemoteSegmentFilename(src); + uploadBlob(from, src, remoteFilename, context, listener); + } catch (Exception e) { + listener.onFailure(e); + } + } else { + try { + copyFrom(from, src, src, context); + listener.onResponse(null); + } catch (Exception e) { + logger.warn(() -> new ParameterizedMessage("Exception while uploading file {} to the remote segment store", src), e); + listener.onFailure(e); + } + } + } + + private void uploadBlob(Directory from, String src, String remoteFileName, IOContext ioContext, ActionListener listener) + throws Exception { + long expectedChecksum = calculateChecksumOfChecksum(from, src); + long contentLength; + try (IndexInput indexInput = from.openInput(src, ioContext)) { + contentLength = indexInput.length(); + } + RemoteTransferContainer remoteTransferContainer = new RemoteTransferContainer( + src, + remoteFileName, + contentLength, + true, + WritePriority.NORMAL, + (size, position) -> new OffsetRangeIndexInputStream(from.openInput(src, ioContext), size, position), + expectedChecksum, + remoteDataDirectory.getBlobContainer() instanceof VerifyingMultiStreamBlobContainer + ); + ActionListener completionListener = ActionListener.wrap(resp -> { + try { + postUpload(from, src, remoteFileName, getChecksumOfLocalFile(from, src)); + listener.onResponse(null); + } catch (Exception e) { + logger.error(() -> new ParameterizedMessage("Exception in segment postUpload for file [{}]", src), e); + listener.onFailure(e); + } + }, ex -> { + logger.error(() -> new ParameterizedMessage("Failed to upload blob {}", src), ex); + IOException corruptIndexException = ExceptionsHelper.unwrapCorruption(ex); + if (corruptIndexException != null) { + listener.onFailure(corruptIndexException); + return; + } + Throwable throwable = ExceptionsHelper.unwrap(ex, CorruptFileException.class); + if (throwable != null) { + CorruptFileException corruptFileException = (CorruptFileException) throwable; + listener.onFailure(new CorruptIndexException(corruptFileException.getMessage(), corruptFileException.getFileName())); + return; + } + listener.onFailure(ex); + }); + + completionListener = ActionListener.runBefore(completionListener, () -> { + try { + remoteTransferContainer.close(); + } catch 
(Exception e) { + logger.warn("Error occurred while closing streams", e); + } + }); + + WriteContext writeContext = remoteTransferContainer.createWriteContext(); + ((VerifyingMultiStreamBlobContainer) remoteDataDirectory.getBlobContainer()).asyncBlobUpload(writeContext, completionListener); + } + /** * This acquires a lock on a given commit by creating a lock file in lock directory using {@code FileLockInfo} * @param primaryTerm Primary Term of index at the time of commit. @@ -425,6 +527,10 @@ public void copyFrom(Directory from, String src, String dest, IOContext context, String remoteFilename; remoteFilename = getNewRemoteSegmentFilename(dest); remoteDataDirectory.copyFrom(from, src, remoteFilename, context); + postUpload(from, src, remoteFilename, checksum); + } + + private void postUpload(Directory from, String src, String remoteFilename, String checksum) throws IOException { UploadedSegmentMetadata segmentMetadata = new UploadedSegmentMetadata(src, remoteFilename, checksum, from.fileLength(src)); segmentsUploadedToRemoteStore.put(src, segmentMetadata); } @@ -528,6 +634,27 @@ private String getChecksumOfLocalFile(Directory directory, String file) throws I } } + private long calculateChecksumOfChecksum(Directory directory, String file) throws IOException { + try (IndexInput indexInput = directory.openInput(file, IOContext.DEFAULT)) { + long storedChecksum = CodecUtil.retrieveChecksum(indexInput); + CRC32 checksumOfChecksum = new CRC32(); + checksumOfChecksum.update(ByteUtils.toByteArrayBE(storedChecksum)); + try { + return JZlib.crc32_combine(storedChecksum, checksumOfChecksum.getValue(), SEGMENT_CHECKSUM_BYTES); + } catch (Exception e) { + throw new ChecksumCombinationException( + "Potentially corrupted file: Checksum combination failed while combining stored checksum " + + "and calculated checksum of stored checksum in segment file: " + + file + + ", directory: " + + directory, + file, + e + ); + } + } + } + private String getExistingRemoteFilename(String localFilename) { if (segmentsUploadedToRemoteStore.containsKey(localFilename)) { return segmentsUploadedToRemoteStore.get(localFilename).uploadedFilename; diff --git a/server/src/main/java/org/opensearch/index/store/exception/ChecksumCombinationException.java b/server/src/main/java/org/opensearch/index/store/exception/ChecksumCombinationException.java index a355473aa2afd..d8e1739fbaa9d 100644 --- a/server/src/main/java/org/opensearch/index/store/exception/ChecksumCombinationException.java +++ b/server/src/main/java/org/opensearch/index/store/exception/ChecksumCombinationException.java @@ -11,7 +11,7 @@ import org.apache.lucene.index.CorruptIndexException; /** - * Exception is raised when combination to two crc checksums fail. + * Exception is raised when combination of two CRC checksums fails.
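A worked check of the `calculateChecksumOfChecksum` logic above, for readers unfamiliar with CRC combination: a Lucene file ends with its body checksum stored as an 8-byte big-endian footer (which is what `ByteUtils.toByteArrayBE` reproduces), so the CRC32 of the entire file can be derived from the stored checksum alone via `JZlib.crc32_combine`, without re-reading the file. A self-contained sketch under that assumption (the sample body bytes are illustrative; jzlib is the same dependency the patch itself uses):

```java
import com.jcraft.jzlib.JZlib;

import java.nio.ByteBuffer;
import java.util.zip.CRC32;

public class CrcCombineDemo {
    public static void main(String[] args) {
        byte[] body = "segment file body".getBytes();

        // Stands in for CodecUtil.retrieveChecksum: CRC32 of everything
        // in the file before the trailing 8 checksum bytes.
        CRC32 bodyCrc = new CRC32();
        bodyCrc.update(body);
        long storedChecksum = bodyCrc.getValue();

        // Lucene writes that checksum as an 8-byte big-endian footer.
        byte[] footer = ByteBuffer.allocate(8).putLong(storedChecksum).array();
        CRC32 checksumOfChecksum = new CRC32();
        checksumOfChecksum.update(footer);

        // CRC32 of (body + footer), computed without touching the body again.
        long combined = JZlib.crc32_combine(storedChecksum, checksumOfChecksum.getValue(), 8);

        CRC32 wholeFile = new CRC32();
        wholeFile.update(body);
        wholeFile.update(footer);
        System.out.println(combined == wholeFile.getValue()); // true
    }
}
```

This is why the expected checksum handed to the blob store covers the full file content, enabling end-to-end integrity verification of the multipart upload.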
* * @opensearch.internal */ diff --git a/server/src/main/java/org/opensearch/index/translog/Checkpoint.java b/server/src/main/java/org/opensearch/index/translog/Checkpoint.java index 56de7e5daf55f..a9f905f52bc3a 100644 --- a/server/src/main/java/org/opensearch/index/translog/Checkpoint.java +++ b/server/src/main/java/org/opensearch/index/translog/Checkpoint.java @@ -233,7 +233,7 @@ public static void write(FileChannel fileChannel, Path checkpointFile, Checkpoin } } - private static byte[] createCheckpointBytes(Path checkpointFile, Checkpoint checkpoint) throws IOException { + public static byte[] createCheckpointBytes(Path checkpointFile, Checkpoint checkpoint) throws IOException { final ByteArrayOutputStream byteOutputStream = new ByteArrayOutputStream(V4_FILE_SIZE) { @Override public synchronized byte[] toByteArray() { diff --git a/server/src/main/java/org/opensearch/index/translog/checked/TranslogCheckedContainer.java b/server/src/main/java/org/opensearch/index/translog/TranslogCheckedContainer.java similarity index 74% rename from server/src/main/java/org/opensearch/index/translog/checked/TranslogCheckedContainer.java rename to server/src/main/java/org/opensearch/index/translog/TranslogCheckedContainer.java index b90794e29d2b1..7e2a38559166f 100644 --- a/server/src/main/java/org/opensearch/index/translog/checked/TranslogCheckedContainer.java +++ b/server/src/main/java/org/opensearch/index/translog/TranslogCheckedContainer.java @@ -6,13 +6,10 @@ * compatible open source license. */ -package org.opensearch.index.translog.checked; +package org.opensearch.index.translog; -import org.opensearch.common.io.Channels; import org.opensearch.common.util.concurrent.ReleasableLock; -import java.io.IOException; -import java.nio.channels.FileChannel; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.ReentrantLock; import java.util.zip.CRC32; @@ -28,21 +25,15 @@ public class TranslogCheckedContainer { private final Checksum checksum; private final AtomicLong contentLength; private final ReleasableLock updateLock = new ReleasableLock(new ReentrantLock()); - private final String file; /** - * Creates TranslogCheckedContainer from provided channel. + * Create TranslogCheckedContainer from provided bytes * - * @param channel {@link FileChannel} to read from - * @param offset offset of channel from which bytes are to be read. - * @param len Length of bytes to be read. 
+ * @param bytes The byte array to read from */ - public TranslogCheckedContainer(FileChannel channel, int offset, int len, String file) throws IOException { + public TranslogCheckedContainer(byte[] bytes) { this.checksum = new CRC32(); this.contentLength = new AtomicLong(); - this.file = file; - - byte[] bytes = Channels.readFromFileChannel(channel, offset, len); updateFromBytes(bytes, 0, bytes.length); } diff --git a/server/src/main/java/org/opensearch/index/translog/TranslogHeader.java b/server/src/main/java/org/opensearch/index/translog/TranslogHeader.java index af6ebcf7b7c66..8067cccb772a2 100644 --- a/server/src/main/java/org/opensearch/index/translog/TranslogHeader.java +++ b/server/src/main/java/org/opensearch/index/translog/TranslogHeader.java @@ -46,6 +46,7 @@ import java.io.EOFException; import java.io.IOException; +import java.io.OutputStream; import java.nio.channels.FileChannel; import java.nio.file.Path; @@ -213,12 +214,10 @@ private static void tryReportOldVersionError(final Path path, final FileChannel /** * Writes this header with the latest format into the file channel */ - void write(final FileChannel channel, boolean fsync) throws IOException { + void write(final OutputStream outputStream) throws IOException { // This output is intentionally not closed because closing it will close the FileChannel. @SuppressWarnings({ "IOResourceOpenedButNotSafelyClosed", "resource" }) - final BufferedChecksumStreamOutput out = new BufferedChecksumStreamOutput( - new OutputStreamStreamOutput(java.nio.channels.Channels.newOutputStream(channel)) - ); + final BufferedChecksumStreamOutput out = new BufferedChecksumStreamOutput(new OutputStreamStreamOutput(outputStream)); CodecUtil.writeHeader(new OutputStreamDataOutput(out), TRANSLOG_CODEC, CURRENT_VERSION); // Write uuid final BytesRef uuid = new BytesRef(translogUUID); @@ -229,6 +228,11 @@ void write(final FileChannel channel, boolean fsync) throws IOException { // Checksum header out.writeInt((int) out.getChecksum()); out.flush(); + } + + void write(final FileChannel channel, boolean fsync) throws IOException { + OutputStream outputStream = java.nio.channels.Channels.newOutputStream(channel); + write(outputStream); if (fsync == true) { channel.force(true); } diff --git a/server/src/main/java/org/opensearch/index/translog/TranslogReader.java b/server/src/main/java/org/opensearch/index/translog/TranslogReader.java index c4a4fb7a460a0..9ea3328587645 100644 --- a/server/src/main/java/org/opensearch/index/translog/TranslogReader.java +++ b/server/src/main/java/org/opensearch/index/translog/TranslogReader.java @@ -33,6 +33,7 @@ package org.opensearch.index.translog; import org.apache.lucene.store.AlreadyClosedException; +import org.opensearch.common.Nullable; import org.opensearch.common.io.Channels; import org.opensearch.common.util.io.IOUtils; import org.opensearch.index.seqno.SequenceNumbers; @@ -59,6 +60,11 @@ public class TranslogReader extends BaseTranslogReader implements Closeable { private final Checkpoint checkpoint; protected final AtomicBoolean closed = new AtomicBoolean(false); + @Nullable + private final Long translogChecksum; + @Nullable + private final Long checkpointChecksum; + /** * Create a translog writer against the specified translog file channel. 
* @@ -67,11 +73,34 @@ public class TranslogReader extends BaseTranslogReader implements Closeable { * @param path the path to the translog * @param header the header of the translog file */ - TranslogReader(final Checkpoint checkpoint, final FileChannel channel, final Path path, final TranslogHeader header) { + TranslogReader( + final Checkpoint checkpoint, + final FileChannel channel, + final Path path, + final TranslogHeader header, + final Long translogChecksum + ) throws IOException { super(checkpoint.generation, channel, path, header); this.length = checkpoint.offset; this.totalOperations = checkpoint.numOps; this.checkpoint = checkpoint; + this.translogChecksum = translogChecksum; + this.checkpointChecksum = (translogChecksum != null) ? calculateCheckpointChecksum(checkpoint, path) : null; + } + + private static Long calculateCheckpointChecksum(Checkpoint checkpoint, Path path) throws IOException { + TranslogCheckedContainer checkpointCheckedContainer = new TranslogCheckedContainer( + Checkpoint.createCheckpointBytes(path.getParent().resolve(Translog.CHECKPOINT_FILE_NAME), checkpoint) + ); + return checkpointCheckedContainer.getChecksum(); + } + + public Long getTranslogChecksum() { + return translogChecksum; + } + + public Long getCheckpointChecksum() { + return checkpointChecksum; } /** @@ -87,7 +116,7 @@ public class TranslogReader extends BaseTranslogReader implements Closeable { public static TranslogReader open(final FileChannel channel, final Path path, final Checkpoint checkpoint, final String translogUUID) throws IOException { final TranslogHeader header = TranslogHeader.read(translogUUID, path, channel); - return new TranslogReader(checkpoint, channel, path, header); + return new TranslogReader(checkpoint, channel, path, header, null); } /** @@ -115,9 +144,9 @@ TranslogReader closeIntoTrimmedReader(long aboveSeqNo, ChannelFactory channelFac IOUtils.fsync(checkpointFile.getParent(), true); - newReader = new TranslogReader(newCheckpoint, channel, path, header); + newReader = new TranslogReader(newCheckpoint, channel, path, header, translogChecksum); } else { - newReader = new TranslogReader(checkpoint, channel, path, header); + newReader = new TranslogReader(checkpoint, channel, path, header, translogChecksum); } toCloseOnFailure = null; return newReader; diff --git a/server/src/main/java/org/opensearch/index/translog/TranslogWriter.java b/server/src/main/java/org/opensearch/index/translog/TranslogWriter.java index e19aece60adc0..e7b08b1dda3d2 100644 --- a/server/src/main/java/org/opensearch/index/translog/TranslogWriter.java +++ b/server/src/main/java/org/opensearch/index/translog/TranslogWriter.java @@ -37,6 +37,7 @@ import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.util.BytesRef; import org.apache.lucene.util.BytesRefIterator; +import org.opensearch.common.Nullable; import org.opensearch.common.SuppressForbidden; import org.opensearch.common.bytes.BytesArray; import org.opensearch.common.bytes.BytesReference; @@ -54,6 +55,7 @@ import org.opensearch.index.seqno.SequenceNumbers; import org.opensearch.index.shard.ShardId; +import java.io.ByteArrayOutputStream; import java.io.Closeable; import java.io.IOException; import java.nio.ByteBuffer; @@ -110,6 +112,9 @@ public class TranslogWriter extends BaseTranslogReader implements Closeable { private final Map> seenSequenceNumbers; + @Nullable + private final TranslogCheckedContainer translogCheckedContainer; + private final Boolean remoteTranslogEnabled; private TranslogWriter( @@ -126,6 +131,7 @@ 
private TranslogWriter( final TragicExceptionHolder tragedy, final LongConsumer persistedSequenceNumberConsumer, final BigArrays bigArrays, + TranslogCheckedContainer translogCheckedContainer, Boolean remoteTranslogEnabled ) throws IOException { super(initialCheckpoint.generation, channel, path, header); @@ -151,6 +157,7 @@ private TranslogWriter( this.bigArrays = bigArrays; this.seenSequenceNumbers = Assertions.ENABLED ? new HashMap<>() : null; this.tragedy = tragedy; + this.translogCheckedContainer = translogCheckedContainer; this.remoteTranslogEnabled = remoteTranslogEnabled; } @@ -179,6 +186,12 @@ public static TranslogWriter create( checkpointChannel = channelFactory.open(checkpointFile, StandardOpenOption.WRITE); final TranslogHeader header = new TranslogHeader(translogUUID, primaryTerm); header.write(channel, !Boolean.TRUE.equals(remoteTranslogEnabled)); + TranslogCheckedContainer translogCheckedContainer = null; + if (Boolean.TRUE.equals(remoteTranslogEnabled)) { + ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream(); + header.write(byteArrayOutputStream); + translogCheckedContainer = new TranslogCheckedContainer(byteArrayOutputStream.toByteArray()); + } final Checkpoint checkpoint = Checkpoint.emptyTranslogCheckpoint( header.sizeInBytes(), fileGeneration, @@ -214,6 +227,7 @@ public static TranslogWriter create( tragedy, persistedSequenceNumberConsumer, bigArrays, + translogCheckedContainer, remoteTranslogEnabled ); } catch (Exception exception) { @@ -438,7 +452,13 @@ public TranslogReader closeIntoReader() throws IOException { closeWithTragicEvent(ex); throw ex; } - return new TranslogReader(getLastSyncedCheckpoint(), channel, path, header); + return new TranslogReader( + getLastSyncedCheckpoint(), + channel, + path, + header, + (translogCheckedContainer != null) ? translogCheckedContainer.getChecksum() : null + ); } else { throw new AlreadyClosedException( "translog [" + getGeneration() + "] is already closed (path [" + path + "]", @@ -571,6 +591,9 @@ private void writeAndReleaseOps(ReleasableBytesReference toWrite) throws IOExcep while (currentBytesConsumed != current.length) { int nBytesToWrite = Math.min(current.length - currentBytesConsumed, ioBuffer.remaining()); ioBuffer.put(current.bytes, current.offset + currentBytesConsumed, nBytesToWrite); + if (translogCheckedContainer != null) { + translogCheckedContainer.updateFromBytes(current.bytes, current.offset + currentBytesConsumed, nBytesToWrite); + } currentBytesConsumed += nBytesToWrite; if (ioBuffer.hasRemaining() == false) { ioBuffer.flip(); diff --git a/server/src/main/java/org/opensearch/index/translog/checked/package-info.java b/server/src/main/java/org/opensearch/index/translog/checked/package-info.java deleted file mode 100644 index ddb235fdbedce..0000000000000 --- a/server/src/main/java/org/opensearch/index/translog/checked/package-info.java +++ /dev/null @@ -1,10 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. 
- */ - -/** Contains checksum related utilities for translog files */ -package org.opensearch.index.translog.checked; diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/BlobStoreTransferService.java b/server/src/main/java/org/opensearch/index/translog/transfer/BlobStoreTransferService.java index d9feb1a832681..974e8af42b939 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/BlobStoreTransferService.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/BlobStoreTransferService.java @@ -16,12 +16,22 @@ import org.opensearch.common.blobstore.BlobMetadata; import org.opensearch.common.blobstore.BlobPath; import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.blobstore.VerifyingMultiStreamBlobContainer; +import org.opensearch.common.blobstore.stream.write.WriteContext; +import org.opensearch.common.blobstore.stream.write.WritePriority; +import org.opensearch.common.blobstore.transfer.RemoteTransferContainer; +import org.opensearch.common.blobstore.transfer.stream.OffsetRangeFileInputStream; +import org.opensearch.index.translog.ChannelFactory; import org.opensearch.index.translog.transfer.FileSnapshot.TransferFileSnapshot; import org.opensearch.threadpool.ThreadPool; import java.io.IOException; import java.io.InputStream; +import java.nio.channels.FileChannel; +import java.nio.file.StandardOpenOption; import java.util.List; +import java.util.Map; +import java.util.Objects; import java.util.Set; import static org.opensearch.common.blobstore.BlobContainer.BlobNameSortOrder.LEXICOGRAPHIC; @@ -44,18 +54,18 @@ public BlobStoreTransferService(BlobStore blobStore, ThreadPool threadPool) { } @Override - public void uploadBlobAsync( - String threadpoolName, + public void uploadBlob( + String threadPoolName, final TransferFileSnapshot fileSnapshot, Iterable<String> remoteTransferPath, - ActionListener<TransferFileSnapshot> listener + ActionListener<TransferFileSnapshot> listener, + WritePriority writePriority ) { assert remoteTransferPath instanceof BlobPath; BlobPath blobPath = (BlobPath) remoteTransferPath; - threadPool.executor(threadpoolName).execute(ActionRunnable.wrap(listener, l -> { - try (InputStream inputStream = fileSnapshot.inputStream()) { - blobStore.blobContainer(blobPath) - .writeBlobAtomic(fileSnapshot.getName(), inputStream, fileSnapshot.getContentLength(), true); + threadPool.executor(threadPoolName).execute(ActionRunnable.wrap(listener, l -> { + try { + uploadBlob(fileSnapshot, blobPath, writePriority); l.onResponse(fileSnapshot); } catch (Exception e) { logger.error(() -> new ParameterizedMessage("Failed to upload blob {}", fileSnapshot.getName()), e); @@ -65,14 +75,84 @@ public void uploadBlobAsync( } @Override - public void uploadBlob(final TransferFileSnapshot fileSnapshot, Iterable<String> remoteTransferPath) throws IOException { - assert remoteTransferPath instanceof BlobPath; + public void uploadBlob(final TransferFileSnapshot fileSnapshot, Iterable<String> remoteTransferPath, WritePriority writePriority) + throws IOException { BlobPath blobPath = (BlobPath) remoteTransferPath; try (InputStream inputStream = fileSnapshot.inputStream()) { blobStore.blobContainer(blobPath).writeBlobAtomic(fileSnapshot.getName(), inputStream, fileSnapshot.getContentLength(), true); } } + @Override + public void uploadBlobs( + Set<TransferFileSnapshot> fileSnapshots, + final Map<Long, BlobPath> blobPaths, + ActionListener<TransferFileSnapshot> listener, + WritePriority writePriority + ) { + fileSnapshots.forEach(fileSnapshot -> { + BlobPath blobPath = blobPaths.get(fileSnapshot.getPrimaryTerm()); + if (!(blobStore.blobContainer(blobPath) 
instanceof VerifyingMultiStreamBlobContainer)) { + uploadBlob(ThreadPool.Names.TRANSLOG_TRANSFER, fileSnapshot, blobPath, listener, writePriority); + } else { + uploadBlob(fileSnapshot, listener, blobPath, writePriority); + } + }); + + } + + private void uploadBlob( + TransferFileSnapshot fileSnapshot, + ActionListener listener, + BlobPath blobPath, + WritePriority writePriority + ) { + + try { + ChannelFactory channelFactory = FileChannel::open; + long contentLength; + try (FileChannel channel = channelFactory.open(fileSnapshot.getPath(), StandardOpenOption.READ)) { + contentLength = channel.size(); + } + RemoteTransferContainer remoteTransferContainer = new RemoteTransferContainer( + fileSnapshot.getName(), + fileSnapshot.getName(), + contentLength, + true, + writePriority, + (size, position) -> new OffsetRangeFileInputStream(fileSnapshot.getPath(), size, position), + Objects.requireNonNull(fileSnapshot.getChecksum()), + blobStore.blobContainer(blobPath) instanceof VerifyingMultiStreamBlobContainer + ); + ActionListener completionListener = ActionListener.wrap(resp -> listener.onResponse(fileSnapshot), ex -> { + logger.error(() -> new ParameterizedMessage("Failed to upload blob {}", fileSnapshot.getName()), ex); + listener.onFailure(new FileTransferException(fileSnapshot, ex)); + }); + + completionListener = ActionListener.runBefore(completionListener, () -> { + try { + remoteTransferContainer.close(); + } catch (Exception e) { + logger.warn("Error occurred while closing streams", e); + } + }); + + WriteContext writeContext = remoteTransferContainer.createWriteContext(); + ((VerifyingMultiStreamBlobContainer) blobStore.blobContainer(blobPath)).asyncBlobUpload(writeContext, completionListener); + + } catch (Exception e) { + logger.error(() -> new ParameterizedMessage("Failed to upload blob {}", fileSnapshot.getName()), e); + listener.onFailure(new FileTransferException(fileSnapshot, e)); + } finally { + try { + fileSnapshot.close(); + } catch (IOException e) { + logger.warn("Error while closing TransferFileSnapshot", e); + } + } + + } + @Override public InputStream downloadBlob(Iterable path, String fileName) throws IOException { return blobStore.blobContainer((BlobPath) path).readBlob(fileName); diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/FileSnapshot.java b/server/src/main/java/org/opensearch/index/translog/transfer/FileSnapshot.java index 239ef7c3c9300..dcec94edd694f 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/FileSnapshot.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/FileSnapshot.java @@ -107,10 +107,12 @@ public void close() throws IOException { public static class TransferFileSnapshot extends FileSnapshot { private final long primaryTerm; + private Long checksum; - public TransferFileSnapshot(Path path, long primaryTerm) throws IOException { + public TransferFileSnapshot(Path path, long primaryTerm, Long checksum) throws IOException { super(path); this.primaryTerm = primaryTerm; + this.checksum = checksum; } public TransferFileSnapshot(String name, byte[] content, long primaryTerm) throws IOException { @@ -118,6 +120,10 @@ public TransferFileSnapshot(String name, byte[] content, long primaryTerm) throw this.primaryTerm = primaryTerm; } + public Long getChecksum() { + return checksum; + } + public long getPrimaryTerm() { return primaryTerm; } @@ -148,8 +154,8 @@ public static final class TranslogFileSnapshot extends TransferFileSnapshot { private final long generation; - public 
TranslogFileSnapshot(long primaryTerm, long generation, Path path) throws IOException { - super(path, primaryTerm); + public TranslogFileSnapshot(long primaryTerm, long generation, Path path, Long checksum) throws IOException { + super(path, primaryTerm, checksum); this.generation = generation; } @@ -185,8 +191,9 @@ public static final class CheckpointFileSnapshot extends TransferFileSnapshot { private final long minTranslogGeneration; - public CheckpointFileSnapshot(long primaryTerm, long generation, long minTranslogGeneration, Path path) throws IOException { - super(path, primaryTerm); + public CheckpointFileSnapshot(long primaryTerm, long generation, long minTranslogGeneration, Path path, Long checksum) + throws IOException { + super(path, primaryTerm, checksum); this.minTranslogGeneration = minTranslogGeneration; this.generation = generation; } diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TransferService.java b/server/src/main/java/org/opensearch/index/translog/transfer/TransferService.java index 0e6496042e3d8..a240fd38cda11 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TransferService.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TransferService.java @@ -10,11 +10,14 @@ import org.opensearch.action.ActionListener; import org.opensearch.common.blobstore.BlobMetadata; +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.stream.write.WritePriority; import org.opensearch.index.translog.transfer.FileSnapshot.TransferFileSnapshot; import java.io.IOException; import java.io.InputStream; import java.util.List; +import java.util.Map; import java.util.Set; /** @@ -26,25 +29,40 @@ public interface TransferService { /** * Uploads the {@link TransferFileSnapshot} async, once the upload is complete the callback is invoked - * @param threadpoolName threadpool type which will be used to upload blobs asynchronously + * @param threadPoolName threadpool type which will be used to upload blobs asynchronously * @param fileSnapshot the file snapshot to upload * @param remotePath the remote path where upload should be made * @param listener the callback to be invoked once upload completes successfully/fails */ - void uploadBlobAsync( - String threadpoolName, + void uploadBlob( + String threadPoolName, final TransferFileSnapshot fileSnapshot, Iterable remotePath, - ActionListener listener + ActionListener listener, + WritePriority writePriority ); + /** + * Uploads multiple {@link TransferFileSnapshot}, once the upload is complete the callback is invoked + * @param fileSnapshots the file snapshots to upload + * @param blobPaths Primary term to {@link BlobPath} map + * @param listener the callback to be invoked once uploads complete successfully/fail + */ + void uploadBlobs( + Set fileSnapshots, + final Map blobPaths, + ActionListener listener, + WritePriority writePriority + ) throws Exception; + /** * Uploads the {@link TransferFileSnapshot} blob * @param fileSnapshot the file snapshot to upload * @param remotePath the remote path where upload should be made + * @param writePriority Priority by which content needs to be written. 
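// ----------------------------------------------------------------------------------------
// Editor's sketch (not part of the patch): how a caller is expected to drive the new
// uploadBlobs contract declared above. The type parameters were lost in extraction; this
// assumes Set<TransferFileSnapshot> snapshots, a Map<Long, BlobPath> keyed by primary
// term, and an ActionListener<TransferFileSnapshot> invoked once per file, matching the
// TranslogTransferManager change further down. All names below are illustrative.
// ----------------------------------------------------------------------------------------

void uploadAll(TransferService transferService, Set<TransferFileSnapshot> snapshots, BlobPath basePath) throws Exception {
    Map<Long, BlobPath> pathsByPrimaryTerm = new HashMap<>();
    for (TransferFileSnapshot snapshot : snapshots) {
        pathsByPrimaryTerm.put(snapshot.getPrimaryTerm(), basePath.add(String.valueOf(snapshot.getPrimaryTerm())));
    }
    // One latch count per file: the service resolves the listener once per snapshot.
    CountDownLatch latch = new CountDownLatch(snapshots.size());
    List<Exception> failures = Collections.synchronizedList(new ArrayList<>());
    ActionListener<TransferFileSnapshot> perFileListener = new LatchedActionListener<>(
        ActionListener.wrap(
            uploaded -> {},     // e.g. mark success in a FileTransferTracker
            failures::add       // surfaced as FileTransferException by the service
        ),
        latch
    );
    transferService.uploadBlobs(snapshots, pathsByPrimaryTerm, perFileListener, WritePriority.HIGH);
    latch.await(30, TimeUnit.SECONDS); // bound the wait, as TranslogTransferManager does
}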
* @throws IOException the exception while transferring the data */ - void uploadBlob(final TransferFileSnapshot fileSnapshot, Iterable remotePath) throws IOException; + void uploadBlob(final TransferFileSnapshot fileSnapshot, Iterable remotePath, WritePriority writePriority) throws IOException; void deleteBlobs(Iterable path, List fileNames) throws IOException; diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogCheckpointTransferSnapshot.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogCheckpointTransferSnapshot.java index b34c2282e874f..10dec13c81e1a 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogCheckpointTransferSnapshot.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogCheckpointTransferSnapshot.java @@ -145,8 +145,14 @@ public TranslogCheckpointTransferSnapshot build() throws IOException { Path checkpointPath = location.resolve(checkpointGenFileNameMapper.apply(readerGeneration)); generations.add(readerGeneration); translogTransferSnapshot.add( - new TranslogFileSnapshot(readerPrimaryTerm, readerGeneration, translogPath), - new CheckpointFileSnapshot(readerPrimaryTerm, checkpointGeneration, minTranslogGeneration, checkpointPath) + new TranslogFileSnapshot(readerPrimaryTerm, readerGeneration, translogPath, reader.getTranslogChecksum()), + new CheckpointFileSnapshot( + readerPrimaryTerm, + checkpointGeneration, + minTranslogGeneration, + checkpointPath, + reader.getCheckpointChecksum() + ) ); if (readerGeneration > highestGeneration) { highestGeneration = readerGeneration; diff --git a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java index 54140226e3744..0c63a7ffe4cce 100644 --- a/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java +++ b/server/src/main/java/org/opensearch/index/translog/transfer/TranslogTransferManager.java @@ -17,6 +17,7 @@ import org.opensearch.common.SetOnce; import org.opensearch.common.blobstore.BlobMetadata; import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.stream.write.WritePriority; import org.opensearch.common.bytes.BytesReference; import org.opensearch.common.io.VersionedCodecStreamWrapper; import org.opensearch.common.io.stream.BytesStreamOutput; @@ -119,14 +120,16 @@ public boolean transferSnapshot(TransferSnapshot transferSnapshot, TranslogTrans }), latch ); + Map blobPathMap = new HashMap<>(); toUpload.forEach( - fileSnapshot -> transferService.uploadBlobAsync( - ThreadPool.Names.TRANSLOG_TRANSFER, - fileSnapshot, - remoteDataTransferPath.add(String.valueOf(fileSnapshot.getPrimaryTerm())), - latchedActionListener + fileSnapshot -> blobPathMap.put( + fileSnapshot.getPrimaryTerm(), + remoteDataTransferPath.add(String.valueOf(fileSnapshot.getPrimaryTerm())) ) ); + + transferService.uploadBlobs(toUpload, blobPathMap, latchedActionListener, WritePriority.HIGH); + try { if (latch.await(TRANSFER_TIMEOUT_IN_MILLIS, TimeUnit.MILLISECONDS) == false) { Exception ex = new TimeoutException("Timed out waiting for transfer of snapshot " + transferSnapshot + " to complete"); @@ -139,7 +142,7 @@ public boolean transferSnapshot(TransferSnapshot transferSnapshot, TranslogTrans throw ex; } if (exceptionList.isEmpty()) { - transferService.uploadBlob(prepareMetadata(transferSnapshot), remoteMetadataTransferPath); + 
transferService.uploadBlob(prepareMetadata(transferSnapshot), remoteMetadataTransferPath, WritePriority.HIGH); translogTransferListener.onUploadComplete(transferSnapshot); return true; } else { diff --git a/server/src/main/java/org/opensearch/threadpool/ThreadPool.java b/server/src/main/java/org/opensearch/threadpool/ThreadPool.java index ebc68c288e25a..d9f73a9b41658 100644 --- a/server/src/main/java/org/opensearch/threadpool/ThreadPool.java +++ b/server/src/main/java/org/opensearch/threadpool/ThreadPool.java @@ -46,10 +46,10 @@ import org.opensearch.common.unit.TimeValue; import org.opensearch.common.util.FeatureFlags; import org.opensearch.common.util.concurrent.OpenSearchExecutors; -import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.common.util.concurrent.OpenSearchThreadPoolExecutor; import org.opensearch.common.util.concurrent.ThreadContext; import org.opensearch.common.util.concurrent.XRejectedExecutionHandler; +import org.opensearch.core.concurrency.OpenSearchRejectedExecutionException; import org.opensearch.core.xcontent.ToXContentFragment; import org.opensearch.core.xcontent.XContentBuilder; import org.opensearch.node.Node; diff --git a/server/src/test/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainerTests.java b/server/src/test/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainerTests.java index 1ebec46042247..48940a0d401fd 100644 --- a/server/src/test/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainerTests.java +++ b/server/src/test/java/org/opensearch/common/blobstore/transfer/RemoteTransferContainerTests.java @@ -9,6 +9,7 @@ package org.opensearch.common.blobstore.transfer; import org.junit.Before; +import org.opensearch.common.blobstore.stream.write.WriteContext; import org.opensearch.common.io.InputStreamContainer; import org.opensearch.common.StreamContext; import org.opensearch.common.blobstore.stream.write.WritePriority; @@ -21,6 +22,7 @@ import java.nio.file.Files; import java.nio.file.Path; import java.nio.file.StandardOpenOption; +import java.util.UUID; public class RemoteTransferContainerTests extends OpenSearchTestCase { @@ -140,6 +142,45 @@ public void testTypeOfProvidedStreamsAllCases() throws IOException { testTypeOfProvidedStreams(false); } + public void testCreateWriteContextAllCases() throws IOException { + testCreateWriteContext(true); + testCreateWriteContext(false); + } + + private void testCreateWriteContext(boolean doRemoteDataIntegrityCheck) throws IOException { + String remoteFileName = testFile.getFileName().toString() + UUID.randomUUID(); + Long expectedChecksum = randomLong(); + try ( + RemoteTransferContainer remoteTransferContainer = new RemoteTransferContainer( + testFile.getFileName().toString(), + remoteFileName, + TEST_FILE_SIZE_BYTES, + true, + WritePriority.HIGH, + new RemoteTransferContainer.OffsetRangeInputStreamSupplier() { + @Override + public OffsetRangeInputStream get(long size, long position) throws IOException { + return new OffsetRangeFileInputStream(testFile, size, position); + } + }, + expectedChecksum, + doRemoteDataIntegrityCheck + ) + ) { + WriteContext writeContext = remoteTransferContainer.createWriteContext(); + assertEquals(remoteFileName, writeContext.getFileName()); + assertTrue(writeContext.isFailIfAlreadyExists()); + assertEquals(TEST_FILE_SIZE_BYTES, writeContext.getFileSize()); + assertEquals(WritePriority.HIGH, writeContext.getWritePriority()); + assertEquals(doRemoteDataIntegrityCheck, 
writeContext.doRemoteDataIntegrityCheck()); + if (doRemoteDataIntegrityCheck) { + assertEquals(expectedChecksum, writeContext.getExpectedChecksum()); + } else { + assertNull(writeContext.getExpectedChecksum()); + } + } + } + private void testTypeOfProvidedStreams(boolean isRemoteDataIntegritySupported) throws IOException { try ( RemoteTransferContainer remoteTransferContainer = new RemoteTransferContainer( diff --git a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java index c37893877253e..ea092fffa3a9a 100644 --- a/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java +++ b/server/src/test/java/org/opensearch/index/store/RemoteSegmentStoreDirectoryTests.java @@ -23,8 +23,12 @@ import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.After; import org.junit.Before; +import org.mockito.Mockito; +import org.opensearch.action.ActionListener; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.UUIDs; +import org.opensearch.common.blobstore.VerifyingMultiStreamBlobContainer; +import org.opensearch.common.blobstore.stream.write.WriteContext; import org.opensearch.common.bytes.BytesReference; import org.opensearch.common.io.VersionedCodecStreamWrapper; import org.opensearch.common.io.stream.BytesStreamOutput; @@ -48,6 +52,10 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.HashMap; +import java.util.Collection; import java.util.concurrent.ExecutorService; import static org.mockito.Mockito.mock; @@ -68,6 +76,7 @@ public class RemoteSegmentStoreDirectoryTests extends IndexShardTestCase { private RemoteStoreMetadataLockManager mdLockManager; private RemoteSegmentStoreDirectory remoteSegmentStoreDirectory; + private TestUploadListener testUploadTracker; private IndexShard indexShard; private SegmentInfos segmentInfos; private ThreadPool threadPool; @@ -89,6 +98,7 @@ public void setup() throws IOException { mdLockManager, threadPool ); + testUploadTracker = new TestUploadListener(); Settings indexSettings = Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, org.opensearch.Version.CURRENT).build(); ExecutorService executorService = OpenSearchExecutors.newDirectExecutorService(); @@ -503,6 +513,82 @@ public void testCopyFrom() throws IOException { storeDirectory.close(); } + public void testCopyFilesFromMultipart() throws Exception { + String filename = "_100.si"; + populateMetadata(); + remoteSegmentStoreDirectory.init(); + + Directory storeDirectory = LuceneTestCase.newDirectory(); + IndexOutput indexOutput = storeDirectory.createOutput(filename, IOContext.DEFAULT); + indexOutput.writeString("Hello World!"); + CodecUtil.writeFooter(indexOutput); + indexOutput.close(); + storeDirectory.sync(List.of(filename)); + + assertFalse(remoteSegmentStoreDirectory.getSegmentsUploadedToRemoteStore().containsKey(filename)); + + VerifyingMultiStreamBlobContainer blobContainer = mock(VerifyingMultiStreamBlobContainer.class); + when(remoteDataDirectory.getBlobContainer()).thenReturn(blobContainer); + Mockito.doAnswer(invocation -> { + ActionListener completionListener = invocation.getArgument(1); + completionListener.onResponse(null); + return null; + }).when(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + + CountDownLatch latch = new CountDownLatch(1); + ActionListener 
completionListener = new ActionListener<Void>() { + @Override + public void onResponse(Void unused) { + latch.countDown(); + } + + @Override + public void onFailure(Exception e) {} + }; + remoteSegmentStoreDirectory.copyFrom(storeDirectory, filename, IOContext.DEFAULT, completionListener); + assertTrue(latch.await(5000, TimeUnit.MILLISECONDS)); + assertTrue(remoteSegmentStoreDirectory.getSegmentsUploadedToRemoteStore().containsKey(filename)); + storeDirectory.close(); + } + + public void testCopyFilesFromMultipartIOException() throws Exception { + String filename = "_100.si"; + populateMetadata(); + remoteSegmentStoreDirectory.init(); + + Directory storeDirectory = LuceneTestCase.newDirectory(); + IndexOutput indexOutput = storeDirectory.createOutput(filename, IOContext.DEFAULT); + indexOutput.writeString("Hello World!"); + CodecUtil.writeFooter(indexOutput); + indexOutput.close(); + storeDirectory.sync(List.of(filename)); + + assertFalse(remoteSegmentStoreDirectory.getSegmentsUploadedToRemoteStore().containsKey(filename)); + + VerifyingMultiStreamBlobContainer blobContainer = mock(VerifyingMultiStreamBlobContainer.class); + when(remoteDataDirectory.getBlobContainer()).thenReturn(blobContainer); + Mockito.doAnswer(invocation -> { + ActionListener<Void> completionListener = invocation.getArgument(1); + completionListener.onFailure(new Exception("Test exception")); + return null; + }).when(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + CountDownLatch latch = new CountDownLatch(1); + ActionListener<Void> completionListener = new ActionListener<>() { + @Override + public void onResponse(Void unused) {} + + @Override + public void onFailure(Exception e) { + latch.countDown(); + } + }; + remoteSegmentStoreDirectory.copyFrom(storeDirectory, filename, IOContext.DEFAULT, completionListener); + assertTrue(latch.await(5000, TimeUnit.MILLISECONDS)); + assertFalse(remoteSegmentStoreDirectory.getSegmentsUploadedToRemoteStore().containsKey(filename)); + + storeDirectory.close(); + } + public void testCopyFromException() throws IOException { String filename = "_100.si"; Directory storeDirectory = LuceneTestCase.newDirectory(); diff --git a/server/src/test/java/org/opensearch/index/store/TestUploadListener.java b/server/src/test/java/org/opensearch/index/store/TestUploadListener.java new file mode 100644 index 0000000000000..a2a61a93371e8 --- /dev/null +++ b/server/src/test/java/org/opensearch/index/store/TestUploadListener.java @@ -0,0 +1,43 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +package org.opensearch.index.store; + +import org.opensearch.common.util.UploadListener; + +import java.util.concurrent.ConcurrentHashMap; + +public class TestUploadListener implements UploadListener { + + private final ConcurrentHashMap uploadStatusMap = new ConcurrentHashMap<>(); + + enum UploadStatus { + BEFORE_UPLOAD, + UPLOAD_SUCCESS, + UPLOAD_FAILURE + } + + @Override + public void beforeUpload(String file) { + uploadStatusMap.put(file, UploadStatus.BEFORE_UPLOAD); + } + + @Override + public void onSuccess(String file) { + uploadStatusMap.put(file, UploadStatus.UPLOAD_SUCCESS); + } + + @Override + public void onFailure(String file) { + uploadStatusMap.put(file, UploadStatus.UPLOAD_FAILURE); + } + + public UploadStatus getUploadStatus(String file) { + return uploadStatusMap.get(file); + } +} diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceMockRepositoryTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceMockRepositoryTests.java new file mode 100644 index 0000000000000..1175716679d0f --- /dev/null +++ b/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceMockRepositoryTests.java @@ -0,0 +1,189 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.index.translog.transfer; + +import org.mockito.Mockito; +import org.opensearch.action.ActionListener; +import org.opensearch.action.LatchedActionListener; +import org.opensearch.common.blobstore.BlobPath; +import org.opensearch.common.blobstore.BlobStore; +import org.opensearch.common.blobstore.VerifyingMultiStreamBlobContainer; +import org.opensearch.common.blobstore.stream.write.WriteContext; +import org.opensearch.common.blobstore.stream.write.WritePriority; +import org.opensearch.test.OpenSearchTestCase; +import org.opensearch.threadpool.TestThreadPool; +import org.opensearch.threadpool.ThreadPool; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.StandardOpenOption; +import java.util.Collections; +import java.util.HashMap; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.concurrent.atomic.AtomicReference; + +import static org.mockito.ArgumentMatchers.any; +import static org.mockito.Mockito.doThrow; +import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.verify; +import static org.mockito.Mockito.when; + +public class BlobStoreTransferServiceMockRepositoryTests extends OpenSearchTestCase { + + private ThreadPool threadPool; + + private BlobStore blobStore; + + @Override + public void setUp() throws Exception { + super.setUp(); + blobStore = mock(BlobStore.class); + threadPool = new TestThreadPool(getClass().getName()); + } + + public void testUploadBlobs() throws Exception { + Path testFile = createTempFile(); + Files.write(testFile, randomByteArrayOfLength(128), StandardOpenOption.APPEND); + FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( + testFile, + randomNonNegativeLong(), + 0L + ); + + VerifyingMultiStreamBlobContainer blobContainer = mock(VerifyingMultiStreamBlobContainer.class); + Mockito.doAnswer(invocation -> { + ActionListener completionListener = invocation.getArgument(1); + 
completionListener.onResponse(null); + return null; + }).when(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + when(blobStore.blobContainer(any(BlobPath.class))).thenReturn(blobContainer); + + TransferService transferService = new BlobStoreTransferService(blobStore, threadPool); + CountDownLatch latch = new CountDownLatch(1); + AtomicBoolean onResponseCalled = new AtomicBoolean(false); + AtomicReference exceptionRef = new AtomicReference<>(); + AtomicReference fileSnapshotRef = new AtomicReference<>(); + transferService.uploadBlobs(Collections.singleton(transferFileSnapshot), new HashMap<>() { + { + put(transferFileSnapshot.getPrimaryTerm(), new BlobPath().add("sample_path")); + } + }, new LatchedActionListener<>(new ActionListener<>() { + @Override + public void onResponse(FileSnapshot.TransferFileSnapshot fileSnapshot) { + onResponseCalled.set(true); + fileSnapshotRef.set(fileSnapshot); + } + + @Override + public void onFailure(Exception e) { + exceptionRef.set(e); + } + }, latch), WritePriority.HIGH); + + assertTrue(latch.await(1000, TimeUnit.MILLISECONDS)); + verify(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + assertTrue(onResponseCalled.get()); + assertEquals(transferFileSnapshot.getPrimaryTerm(), fileSnapshotRef.get().getPrimaryTerm()); + assertEquals(transferFileSnapshot.getName(), fileSnapshotRef.get().getName()); + assertNull(exceptionRef.get()); + } + + public void testUploadBlobsIOException() throws Exception { + Path testFile = createTempFile(); + Files.write(testFile, randomByteArrayOfLength(128), StandardOpenOption.APPEND); + FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( + testFile, + randomNonNegativeLong(), + 0L + ); + + VerifyingMultiStreamBlobContainer blobContainer = mock(VerifyingMultiStreamBlobContainer.class); + doThrow(new IOException()).when(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + when(blobStore.blobContainer(any(BlobPath.class))).thenReturn(blobContainer); + + TransferService transferService = new BlobStoreTransferService(blobStore, threadPool); + CountDownLatch latch = new CountDownLatch(1); + AtomicBoolean onResponseCalled = new AtomicBoolean(false); + AtomicReference exceptionRef = new AtomicReference<>(); + transferService.uploadBlobs(Collections.singleton(transferFileSnapshot), new HashMap<>() { + { + put(transferFileSnapshot.getPrimaryTerm(), new BlobPath().add("sample_path")); + } + }, new LatchedActionListener<>(new ActionListener<>() { + @Override + public void onResponse(FileSnapshot.TransferFileSnapshot fileSnapshot) { + onResponseCalled.set(true); + } + + @Override + public void onFailure(Exception e) { + exceptionRef.set(e); + } + }, latch), WritePriority.HIGH); + + assertTrue(latch.await(1000, TimeUnit.MILLISECONDS)); + verify(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + assertFalse(onResponseCalled.get()); + assertTrue(exceptionRef.get() instanceof FileTransferException); + } + + public void testUploadBlobsUploadFutureCompletedExceptionally() throws Exception { + Path testFile = createTempFile(); + Files.write(testFile, randomByteArrayOfLength(128), StandardOpenOption.APPEND); + FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( + testFile, + randomNonNegativeLong(), + 0L + ); + + VerifyingMultiStreamBlobContainer blobContainer = mock(VerifyingMultiStreamBlobContainer.class); + Mockito.doAnswer(invocation -> { + ActionListener completionListener = invocation.getArgument(1); 
+ completionListener.onFailure(new Exception("Test exception")); + return null; + }).when(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + + when(blobStore.blobContainer(any(BlobPath.class))).thenReturn(blobContainer); + + TransferService transferService = new BlobStoreTransferService(blobStore, threadPool); + CountDownLatch latch = new CountDownLatch(1); + AtomicBoolean onResponseCalled = new AtomicBoolean(false); + AtomicReference exceptionRef = new AtomicReference<>(); + LatchedActionListener listener = new LatchedActionListener<>(new ActionListener<>() { + @Override + public void onResponse(FileSnapshot.TransferFileSnapshot fileSnapshot) { + onResponseCalled.set(true); + } + + @Override + public void onFailure(Exception e) { + exceptionRef.set(e); + } + }, latch); + transferService.uploadBlobs(Collections.singleton(transferFileSnapshot), new HashMap<>() { + { + put(transferFileSnapshot.getPrimaryTerm(), new BlobPath().add("sample_path")); + } + }, listener, WritePriority.HIGH); + + assertTrue(latch.await(1000, TimeUnit.MILLISECONDS)); + verify(blobContainer).asyncBlobUpload(any(WriteContext.class), any()); + assertFalse(onResponseCalled.get()); + assertTrue(exceptionRef.get() instanceof FileTransferException); + } + + @Override + public void tearDown() throws Exception { + super.tearDown(); + ThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS); + } +} diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceTests.java index 196fbd58c2c20..5502dc3089c62 100644 --- a/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceTests.java +++ b/server/src/test/java/org/opensearch/index/translog/transfer/BlobStoreTransferServiceTests.java @@ -12,6 +12,7 @@ import org.opensearch.action.LatchedActionListener; import org.opensearch.cluster.metadata.RepositoryMetadata; import org.opensearch.cluster.service.ClusterService; +import org.opensearch.common.blobstore.stream.write.WritePriority; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Settings; import org.opensearch.env.Environment; @@ -49,9 +50,13 @@ public void setUp() throws Exception { public void testUploadBlob() throws IOException { Path testFile = createTempFile(); Files.write(testFile, randomByteArrayOfLength(128), StandardOpenOption.APPEND); - FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot(testFile, randomNonNegativeLong()); + FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( + testFile, + randomNonNegativeLong(), + null + ); TransferService transferService = new BlobStoreTransferService(repository.blobStore(), threadPool); - transferService.uploadBlob(transferFileSnapshot, repository.basePath()); + transferService.uploadBlob(transferFileSnapshot, repository.basePath(), WritePriority.HIGH); } public void testUploadBlobFromByteArray() throws IOException { @@ -61,17 +66,21 @@ public void testUploadBlobFromByteArray() throws IOException { 1 ); TransferService transferService = new BlobStoreTransferService(repository.blobStore(), threadPool); - transferService.uploadBlob(transferFileSnapshot, repository.basePath()); + transferService.uploadBlob(transferFileSnapshot, repository.basePath(), WritePriority.NORMAL); } public void testUploadBlobAsync() throws IOException, InterruptedException { Path testFile = createTempFile(); 
Files.write(testFile, randomByteArrayOfLength(128), StandardOpenOption.APPEND); AtomicBoolean succeeded = new AtomicBoolean(false); - FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot(testFile, randomNonNegativeLong()); + FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( + testFile, + randomNonNegativeLong(), + null + ); CountDownLatch latch = new CountDownLatch(1); TransferService transferService = new BlobStoreTransferService(repository.blobStore(), threadPool); - transferService.uploadBlobAsync( + transferService.uploadBlob( ThreadPool.Names.TRANSLOG_TRANSFER, transferFileSnapshot, repository.basePath(), @@ -87,7 +96,8 @@ public void onResponse(FileSnapshot.TransferFileSnapshot fileSnapshot) { public void onFailure(Exception e) { throw new AssertionError("Failed to perform uploadBlobAsync", e); } - }, latch) + }, latch), + WritePriority.HIGH ); assertTrue(latch.await(1000, TimeUnit.MILLISECONDS)); assertTrue(succeeded.get()); diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/FileSnapshotTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/FileSnapshotTests.java index 6d2fb3794b107..8d07af5927135 100644 --- a/server/src/test/java/org/opensearch/index/translog/transfer/FileSnapshotTests.java +++ b/server/src/test/java/org/opensearch/index/translog/transfer/FileSnapshotTests.java @@ -28,15 +28,15 @@ public void tearDown() throws Exception { public void testFileSnapshotPath() throws IOException { Path file = createTempFile(); Files.writeString(file, "hello"); - fileSnapshot = new FileSnapshot.TransferFileSnapshot(file, 12); + fileSnapshot = new FileSnapshot.TransferFileSnapshot(file, 12, null); assertFileSnapshotProperties(file); - try (FileSnapshot sameFileSnapshot = new FileSnapshot.TransferFileSnapshot(file, 12)) { + try (FileSnapshot sameFileSnapshot = new FileSnapshot.TransferFileSnapshot(file, 12, null)) { assertEquals(sameFileSnapshot, fileSnapshot); } - try (FileSnapshot sameFileDiffPTSnapshot = new FileSnapshot.TransferFileSnapshot(file, 34)) { + try (FileSnapshot sameFileDiffPTSnapshot = new FileSnapshot.TransferFileSnapshot(file, 34, null)) { assertNotEquals(sameFileDiffPTSnapshot, fileSnapshot); } } diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/FileTransferTrackerTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/FileTransferTrackerTests.java index be14e4a7bd380..fd0d44564ef6b 100644 --- a/server/src/test/java/org/opensearch/index/translog/transfer/FileTransferTrackerTests.java +++ b/server/src/test/java/org/opensearch/index/translog/transfer/FileTransferTrackerTests.java @@ -34,7 +34,8 @@ public void testOnSuccess() throws IOException { try ( FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( testFile, - randomNonNegativeLong() + randomNonNegativeLong(), + null ) ) { fileTransferTracker.onSuccess(transferFileSnapshot); @@ -58,11 +59,13 @@ public void testOnFailure() throws IOException { try ( FileSnapshot.TransferFileSnapshot transferFileSnapshot = new FileSnapshot.TransferFileSnapshot( testFile, - randomNonNegativeLong() + randomNonNegativeLong(), + null ); FileSnapshot.TransferFileSnapshot transferFileSnapshot2 = new FileSnapshot.TransferFileSnapshot( testFile2, - randomNonNegativeLong() + randomNonNegativeLong(), + null ) ) { @@ -82,7 +85,8 @@ public void testUploaded() throws IOException { try ( FileSnapshot.TransferFileSnapshot transferFileSnapshot 
= new FileSnapshot.TransferFileSnapshot( testFile, - randomNonNegativeLong() + randomNonNegativeLong(), + null ); ) { diff --git a/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java b/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java index 5f8aa64457896..66cd257299e25 100644 --- a/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java +++ b/server/src/test/java/org/opensearch/index/translog/transfer/TranslogTransferManagerTests.java @@ -17,6 +17,7 @@ import org.opensearch.common.blobstore.BlobPath; import org.opensearch.common.blobstore.BlobStore; import org.opensearch.common.blobstore.support.PlainBlobMetadata; +import org.opensearch.common.blobstore.stream.write.WritePriority; import org.opensearch.index.Index; import org.opensearch.index.shard.ShardId; import org.opensearch.index.translog.Translog; @@ -41,6 +42,8 @@ import java.util.concurrent.atomic.AtomicInteger; import static org.mockito.ArgumentMatchers.anyInt; +import static org.mockito.ArgumentMatchers.anyMap; +import static org.mockito.ArgumentMatchers.anySet; import static org.mockito.Mockito.any; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; @@ -80,20 +83,24 @@ public void tearDown() throws Exception { } @SuppressWarnings("unchecked") - public void testTransferSnapshot() throws IOException { + public void testTransferSnapshot() throws Exception { AtomicInteger fileTransferSucceeded = new AtomicInteger(); AtomicInteger fileTransferFailed = new AtomicInteger(); AtomicInteger translogTransferSucceeded = new AtomicInteger(); AtomicInteger translogTransferFailed = new AtomicInteger(); doNothing().when(transferService) - .uploadBlob(any(TransferFileSnapshot.class), Mockito.eq(remoteBaseTransferPath.add(String.valueOf(primaryTerm)))); + .uploadBlob( + any(TransferFileSnapshot.class), + Mockito.eq(remoteBaseTransferPath.add(String.valueOf(primaryTerm))), + any(WritePriority.class) + ); doAnswer(invocationOnMock -> { - ActionListener listener = (ActionListener) invocationOnMock.getArguments()[3]; - listener.onResponse((TransferFileSnapshot) invocationOnMock.getArguments()[1]); + ActionListener listener = (ActionListener) invocationOnMock.getArguments()[2]; + Set transferFileSnapshots = (Set) invocationOnMock.getArguments()[0]; + transferFileSnapshots.forEach(listener::onResponse); return null; - }).when(transferService) - .uploadBlobAsync(any(String.class), any(TransferFileSnapshot.class), any(BlobPath.class), any(ActionListener.class)); + }).when(transferService).uploadBlobs(anySet(), anyMap(), any(ActionListener.class), any(WritePriority.class)); FileTransferTracker fileTransferTracker = new FileTransferTracker(new ShardId("index", "indexUUid", 0)) { @Override @@ -145,13 +152,15 @@ public Set getCheckpointFileSnapshots() { primaryTerm, generation, minTranslogGeneration, - createTempFile(Translog.TRANSLOG_FILE_PREFIX + generation, Translog.CHECKPOINT_SUFFIX) + createTempFile(Translog.TRANSLOG_FILE_PREFIX + generation, Translog.CHECKPOINT_SUFFIX), + null ), new CheckpointFileSnapshot( primaryTerm, generation, minTranslogGeneration, - createTempFile(Translog.TRANSLOG_FILE_PREFIX + (generation - 1), Translog.CHECKPOINT_SUFFIX) + createTempFile(Translog.TRANSLOG_FILE_PREFIX + (generation - 1), Translog.CHECKPOINT_SUFFIX), + null ) ); } catch (IOException e) { @@ -166,12 +175,14 @@ public Set getTranslogFileSnapshots() { new TranslogFileSnapshot( primaryTerm, generation, - 
createTempFile(Translog.TRANSLOG_FILE_PREFIX + generation, Translog.TRANSLOG_FILE_SUFFIX) + createTempFile(Translog.TRANSLOG_FILE_PREFIX + generation, Translog.TRANSLOG_FILE_SUFFIX), + null ), new TranslogFileSnapshot( primaryTerm, generation - 1, - createTempFile(Translog.TRANSLOG_FILE_PREFIX + (generation - 1), Translog.TRANSLOG_FILE_SUFFIX) + createTempFile(Translog.TRANSLOG_FILE_PREFIX + (generation - 1), Translog.TRANSLOG_FILE_SUFFIX), + null ) ); } catch (IOException e) { From a07aac80d6e44e4963b5dffd3b86178135b11a4d Mon Sep 17 00:00:00 2001 From: Rishikesh Pasham <62345295+Rishikesh1159@users.noreply.github.com> Date: Tue, 11 Jul 2023 08:31:56 -0700 Subject: [PATCH 2/7] [Mute] flaky RemoteStoreRefreshListenerTests (#8626) * Mute Flaky RemoteStoreRefreshListenerTests. Signed-off-by: Rishikesh1159 * Apply Spotless Check. Signed-off-by: Rishikesh1159 --------- Signed-off-by: Rishikesh1159 --- .../opensearch/index/shard/RemoteStoreRefreshListenerTests.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java index 21a9393408529..ed1438cbc3b08 100644 --- a/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java +++ b/server/src/test/java/org/opensearch/index/shard/RemoteStoreRefreshListenerTests.java @@ -13,6 +13,7 @@ import org.apache.lucene.store.Directory; import org.apache.lucene.store.FilterDirectory; import org.apache.lucene.tests.store.BaseDirectoryWrapper; +import org.apache.lucene.tests.util.LuceneTestCase; import org.junit.After; import org.opensearch.action.ActionListener; import org.opensearch.cluster.metadata.IndexMetadata; @@ -47,6 +48,7 @@ import static org.mockito.Mockito.when; import static org.opensearch.cluster.metadata.IndexMetadata.SETTING_REPLICATION_TYPE; +@LuceneTestCase.AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/8549") public class RemoteStoreRefreshListenerTests extends IndexShardTestCase { private IndexShard indexShard; private ClusterService clusterService; From 542041f98421cfc7fc8022167d8037db66bdb7c6 Mon Sep 17 00:00:00 2001 From: Varun Bansal Date: Tue, 11 Jul 2023 22:05:08 +0530 Subject: [PATCH 3/7] Add logs for remote store metadata intermittent read failures (#8618) --------- Signed-off-by: bansvaru --- .../io/VersionedCodecStreamWrapper.java | 25 ++++++++++++++++--- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/server/src/main/java/org/opensearch/common/io/VersionedCodecStreamWrapper.java b/server/src/main/java/org/opensearch/common/io/VersionedCodecStreamWrapper.java index ff0af3954a3a3..9907e2225c64e 100644 --- a/server/src/main/java/org/opensearch/common/io/VersionedCodecStreamWrapper.java +++ b/server/src/main/java/org/opensearch/common/io/VersionedCodecStreamWrapper.java @@ -10,7 +10,11 @@ import java.io.IOException; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.apache.lucene.codecs.CodecUtil; +import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.store.ChecksumIndexInput; import org.apache.lucene.store.IndexInput; import org.apache.lucene.store.IndexOutput; @@ -22,6 +26,8 @@ * @opensearch.internal */ public class VersionedCodecStreamWrapper { + private static final Logger logger = LogManager.getLogger(VersionedCodecStreamWrapper.class); + // TODO This can be updated 
to hold a streamReadWriteHandlerFactory and get relevant handler based on the stream versions private final IndexIOStreamHandler indexIOStreamHandler; private final int currentVersion; @@ -46,10 +52,21 @@ public VersionedCodecStreamWrapper(IndexIOStreamHandler indexIOStreamHandler, * @return stream content parsed into {@link T} */ public T readStream(IndexInput indexInput) throws IOException { - CodecUtil.checksumEntireFile(indexInput); - int readStreamVersion = checkHeader(indexInput); - T content = getHandlerForVersion(readStreamVersion).readContent(indexInput); - return content; + logger.debug("Reading input stream [{}] of length - [{}]", indexInput.toString(), indexInput.length()); + try { + CodecUtil.checksumEntireFile(indexInput); + int readStreamVersion = checkHeader(indexInput); + return getHandlerForVersion(readStreamVersion).readContent(indexInput); + } catch (CorruptIndexException cie) { + logger.error( + () -> new ParameterizedMessage( + "Error while validating header/footer for [{}]. Total data length [{}]", + indexInput.toString(), + indexInput.length() + ) + ); + throw cie; + } } /** From 4ccbf9d12f4cccae84a58f0264b718454e8a1e99 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Tue, 11 Jul 2023 09:39:01 -0700 Subject: [PATCH 4/7] Fix bug where ReplicationListeners would not complete on cancellation. (#8478) * [Segment Replication] Fix bug where ReplicationListeners would not complete on target cancellation. This change updates cancellation with Segment Replication to ensure all listeners are resolved. It does this by requesting cancellation before shard closure instead of using ReplicationCollection's cancelForShard which immediately removes it from the replicationCollection. This would cause the underlying ReplicationListener to never get invoked on close. This change includes new tests using suite scope to catch for any open tasks. This caught other locations where this was possible: 1. On a replica during force sync if the shard was closed while resolving its listeners, it would never call back to the primary if an exception was caught in the onDone method. - Fixed by refactoring those paths to use a ChannelActionListener and always reply to primary. 2. On the primary during forceSync, the primary would not successfully cancel before shard close during a forceSync, Fixed by wrapping the synchronous recoveryTarget::forceSync call in cancellableThreads. Signed-off-by: Marc Handalian PR cleanup. Signed-off-by: Marc Handalian Update log message Signed-off-by: Marc Handalian * PR feedback. Signed-off-by: Marc Handalian * Update server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java Co-authored-by: Suraj Singh Signed-off-by: Marc Handalian * Add more tests. 
Signed-off-by: Marc Handalian --------- Signed-off-by: Marc Handalian Co-authored-by: Suraj Singh --- .../replication/SegmentReplicationBaseIT.java | 27 ++- .../SegmentReplicationSuiteIT.java | 88 ++++++++ .../recovery/RecoverySourceHandler.java | 2 +- .../replication/SegmentReplicationState.java | 11 +- .../replication/SegmentReplicationTarget.java | 182 +++++++--------- .../SegmentReplicationTargetService.java | 201 +++++++++++------- .../common/ReplicationCollection.java | 36 ++++ .../replication/common/ReplicationTarget.java | 1 + .../SegmentReplicationIndexShardTests.java | 124 ++++++++++- .../SegmentReplicationTargetServiceTests.java | 171 ++++++++++----- .../SegmentReplicationTargetTests.java | 6 +- .../recovery/ReplicationCollectionTests.java | 29 ++- 12 files changed, 623 insertions(+), 255 deletions(-) create mode 100644 server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java index 52fe85b51cebd..8fef88cbe6820 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java @@ -24,7 +24,9 @@ import org.opensearch.index.IndexService; import org.opensearch.index.SegmentReplicationPerGroupStats; import org.opensearch.index.SegmentReplicationShardStats; +import org.opensearch.index.engine.Engine; import org.opensearch.index.shard.IndexShard; +import org.opensearch.index.shard.ShardId; import org.opensearch.index.store.Store; import org.opensearch.index.store.StoreFileMetadata; import org.opensearch.indices.IndicesService; @@ -158,6 +160,7 @@ protected void verifyStoreContent() throws Exception { final String indexName = primaryRouting.getIndexName(); final List replicaRouting = shardRoutingTable.replicaShards(); final IndexShard primaryShard = getIndexShard(clusterState, primaryRouting, indexName); + final int primaryDocCount = getDocCountFromShard(primaryShard); final Map primarySegmentMetadata = primaryShard.getSegmentMetadataMap(); for (ShardRouting replica : replicaRouting) { IndexShard replicaShard = getIndexShard(clusterState, replica, indexName); @@ -165,6 +168,8 @@ protected void verifyStoreContent() throws Exception { primarySegmentMetadata, replicaShard.getSegmentMetadataMap() ); + final int replicaDocCount = getDocCountFromShard(replicaShard); + assertEquals("Doc counts should match", primaryDocCount, replicaDocCount); if (recoveryDiff.missing.isEmpty() == false || recoveryDiff.different.isEmpty() == false) { fail( "Expected no missing or different segments between primary and replica but diff was missing: " @@ -185,10 +190,30 @@ protected void verifyStoreContent() throws Exception { }, 1, TimeUnit.MINUTES); } + private int getDocCountFromShard(IndexShard shard) { + try (final Engine.Searcher searcher = shard.acquireSearcher("test")) { + return searcher.getDirectoryReader().numDocs(); + } + } + private IndexShard getIndexShard(ClusterState state, ShardRouting routing, String indexName) { - return getIndexShard(state.nodes().get(routing.currentNodeId()).getName(), indexName); + return getIndexShard(state.nodes().get(routing.currentNodeId()).getName(), routing.shardId(), indexName); + } + + /** + * Fetch IndexShard by shardId, multiple shards per node allowed. 
+ */ + protected IndexShard getIndexShard(String node, ShardId shardId, String indexName) { + final Index index = resolveIndex(indexName); + IndicesService indicesService = internalCluster().getInstance(IndicesService.class, node); + IndexService indexService = indicesService.indexServiceSafe(index); + final Optional id = indexService.shardIds().stream().filter(sid -> sid == shardId.id()).findFirst(); + return indexService.getShard(id.get()); } + /** + * Fetch IndexShard, assumes only a single shard per node. + */ protected IndexShard getIndexShard(String node, String indexName) { final Index index = resolveIndex(indexName); IndicesService indicesService = internalCluster().getInstance(IndicesService.class, node); diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java new file mode 100644 index 0000000000000..9025c1cc79884 --- /dev/null +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationSuiteIT.java @@ -0,0 +1,88 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +package org.opensearch.indices.replication; + +import org.junit.Before; +import org.opensearch.action.admin.indices.delete.DeleteIndexRequest; +import org.opensearch.cluster.metadata.IndexMetadata; +import org.opensearch.common.settings.Settings; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.test.OpenSearchIntegTestCase; + +@OpenSearchIntegTestCase.ClusterScope(scope = OpenSearchIntegTestCase.Scope.SUITE, minNumDataNodes = 2) +public class SegmentReplicationSuiteIT extends SegmentReplicationBaseIT { + + @Before + public void setup() { + internalCluster().startClusterManagerOnlyNode(); + createIndex(INDEX_NAME); + } + + @Override + public Settings indexSettings() { + final Settings.Builder builder = Settings.builder() + .put(super.indexSettings()) + // reset shard & replica count to random values set by OpenSearchIntegTestCase. + .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, numberOfShards()) + .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, numberOfReplicas()) + .put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT); + + // TODO: Randomly enable remote store on these tests. 
+ return builder.build(); + } + + public void testBasicReplication() throws Exception { + final int docCount = scaledRandomIntBetween(10, 200); + for (int i = 0; i < docCount; i++) { + client().prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource("field", "value" + i).execute().get(); + } + refresh(); + ensureGreen(INDEX_NAME); + verifyStoreContent(); + } + + public void testDropRandomNodeDuringReplication() throws Exception { + internalCluster().ensureAtLeastNumDataNodes(2); + internalCluster().startClusterManagerOnlyNodes(1); + + final int docCount = scaledRandomIntBetween(10, 200); + for (int i = 0; i < docCount; i++) { + client().prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource("field", "value" + i).execute().get(); + } + refresh(); + + internalCluster().restartRandomDataNode(); + + ensureYellow(INDEX_NAME); + client().prepareIndex(INDEX_NAME).setId(Integer.toString(docCount)).setSource("field", "value" + docCount).execute().get(); + internalCluster().startDataOnlyNode(); + client().admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).actionGet(); + } + + public void testDeleteIndexWhileReplicating() throws Exception { + internalCluster().startClusterManagerOnlyNode(); + final int docCount = scaledRandomIntBetween(10, 200); + for (int i = 0; i < docCount; i++) { + client().prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource("field", "value" + i).execute().get(); + } + refresh(INDEX_NAME); + client().admin().indices().delete(new DeleteIndexRequest(INDEX_NAME)).actionGet(); + } + + public void testFullRestartDuringReplication() throws Exception { + internalCluster().startNode(); + final int docCount = scaledRandomIntBetween(10, 200); + for (int i = 0; i < docCount; i++) { + client().prepareIndex(INDEX_NAME).setId(Integer.toString(i)).setSource("field", "value" + i).execute().get(); + } + refresh(INDEX_NAME); + internalCluster().fullRestart(); + ensureGreen(INDEX_NAME); + } +} diff --git a/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java b/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java index 5e278f06cfb8f..0b343fb0b0871 100644 --- a/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java +++ b/server/src/main/java/org/opensearch/indices/recovery/RecoverySourceHandler.java @@ -835,7 +835,7 @@ void finalizeRecovery(long targetLocalCheckpoint, long trimAboveSeqNo, ActionLis } else { // Force round of segment replication to update its checkpoint to primary's if (shard.indexSettings().isSegRepEnabled()) { - recoveryTarget.forceSegmentFileSync(); + cancellableThreads.execute(recoveryTarget::forceSegmentFileSync); } } stopWatch.stop(); diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java index 7a996ec7aedaa..226ccbaf01afa 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java @@ -45,8 +45,7 @@ public enum Stage { GET_CHECKPOINT_INFO((byte) 3), FILE_DIFF((byte) 4), GET_FILES((byte) 5), - FINALIZE_REPLICATION((byte) 6), - CANCELLED((byte) 7); + FINALIZE_REPLICATION((byte) 6); private static final Stage[] STAGES = new Stage[Stage.values().length]; @@ -245,14 +244,6 @@ public void setStage(Stage stage) { overallTimer.stop(); timingData.put("OVERALL", overallTimer.time()); break; - case CANCELLED: - if (this.stage 
== Stage.DONE) { - throw new IllegalStateException("can't move replication to Cancelled state from Done."); - } - this.stage = Stage.CANCELLED; - overallTimer.stop(); - timingData.put("OVERALL", overallTimer.time()); - break; default: throw new IllegalArgumentException("unknown SegmentReplicationState.Stage [" + stage + "]"); } diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index 9d724d6cc9dcf..f59a7c2368689 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -12,16 +12,15 @@ import org.apache.lucene.index.CorruptIndexException; import org.apache.lucene.index.IndexFormatTooNewException; import org.apache.lucene.index.IndexFormatTooOldException; -import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.store.AlreadyClosedException; import org.apache.lucene.store.BufferedChecksumIndexInput; import org.apache.lucene.store.ByteBuffersDataInput; import org.apache.lucene.store.ByteBuffersIndexInput; import org.apache.lucene.store.ChecksumIndexInput; -import org.opensearch.ExceptionsHelper; +import org.opensearch.OpenSearchCorruptionException; import org.opensearch.OpenSearchException; import org.opensearch.action.ActionListener; import org.opensearch.action.StepListener; -import org.opensearch.common.CheckedConsumer; import org.opensearch.common.UUIDs; import org.opensearch.common.bytes.BytesReference; import org.opensearch.common.lucene.Lucene; @@ -39,6 +38,8 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.List; +import java.util.Locale; /** * Represents the target of a replication event. @@ -102,17 +103,11 @@ public SegmentReplicationTarget retryCopy() { @Override public String description() { - return "Segment replication from " + source.toString(); + return String.format(Locale.ROOT, "Id:[%d] Shard:[%s] Source:[%s]", getId(), shardId(), source.getDescription()); } @Override public void notifyListener(ReplicationFailedException e, boolean sendShardFailure) { - // Cancellations still are passed to our SegmentReplicationListener as failures, if we have failed because of cancellation - // update the stage. - final Throwable cancelledException = ExceptionsHelper.unwrap(e, CancellableThreads.ExecutionCancelledException.class); - if (cancelledException != null) { - state.setStage(SegmentReplicationState.Stage.CANCELLED); - } listener.onFailure(state(), e, sendShardFailure); } @@ -141,141 +136,115 @@ public void writeFileChunk( /** * Start the Replication event. + * * @param listener {@link ActionListener} listener. */ public void startReplication(ActionListener listener) { cancellableThreads.setOnCancel((reason, beforeCancelEx) -> { - // This method only executes when cancellation is triggered by this node and caught by a call to checkForCancel, - // SegmentReplicationSource does not share CancellableThreads. 
- final CancellableThreads.ExecutionCancelledException executionCancelledException = - new CancellableThreads.ExecutionCancelledException("replication was canceled reason [" + reason + "]"); - notifyListener(new ReplicationFailedException("Segment replication failed", executionCancelledException), false); - throw executionCancelledException; + throw new CancellableThreads.ExecutionCancelledException("replication was canceled reason [" + reason + "]"); }); + // TODO: Remove this useless state. state.setStage(SegmentReplicationState.Stage.REPLICATING); final StepListener checkpointInfoListener = new StepListener<>(); final StepListener getFilesListener = new StepListener<>(); - final StepListener finalizeListener = new StepListener<>(); - cancellableThreads.checkForCancel(); - logger.trace("[shardId {}] Replica starting replication [id {}]", shardId().getId(), getId()); + logger.trace(new ParameterizedMessage("Starting Replication Target: {}", description())); // Get list of files to copy from this checkpoint. state.setStage(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO); + cancellableThreads.checkForCancel(); source.getCheckpointMetadata(getId(), checkpoint, checkpointInfoListener); - checkpointInfoListener.whenComplete(checkpointInfo -> getFiles(checkpointInfo, getFilesListener), listener::onFailure); - getFilesListener.whenComplete( - response -> finalizeReplication(checkpointInfoListener.result(), finalizeListener), - listener::onFailure - ); - finalizeListener.whenComplete(r -> listener.onResponse(null), listener::onFailure); + checkpointInfoListener.whenComplete(checkpointInfo -> { + final List filesToFetch = getFiles(checkpointInfo); + state.setStage(SegmentReplicationState.Stage.GET_FILES); + cancellableThreads.checkForCancel(); + source.getSegmentFiles(getId(), checkpointInfo.getCheckpoint(), filesToFetch, indexShard, getFilesListener); + }, listener::onFailure); + + getFilesListener.whenComplete(response -> { + finalizeReplication(checkpointInfoListener.result()); + listener.onResponse(null); + }, listener::onFailure); } - private void getFiles(CheckpointInfoResponse checkpointInfo, StepListener getFilesListener) - throws IOException { + private List getFiles(CheckpointInfoResponse checkpointInfo) throws IOException { cancellableThreads.checkForCancel(); state.setStage(SegmentReplicationState.Stage.FILE_DIFF); final Store.RecoveryDiff diff = Store.segmentReplicationDiff(checkpointInfo.getMetadataMap(), indexShard.getSegmentMetadataMap()); - logger.trace("Replication diff for checkpoint {} {}", checkpointInfo.getCheckpoint(), diff); + logger.trace(() -> new ParameterizedMessage("Replication diff for checkpoint {} {}", checkpointInfo.getCheckpoint(), diff)); /* * Segments are immutable. 
So if the replica has any segments with the same name that differ from the one in the incoming * snapshot from source that means the local copy of the segment has been corrupted/changed in some way and we throw an * IllegalStateException to fail the shard */ if (diff.different.isEmpty() == false) { - IllegalStateException illegalStateException = new IllegalStateException( + throw new OpenSearchCorruptionException( new ParameterizedMessage( "Shard {} has local copies of segments that differ from the primary {}", indexShard.shardId(), diff.different ).getFormattedMessage() ); - ReplicationFailedException rfe = new ReplicationFailedException( - indexShard.shardId(), - "different segment files", - illegalStateException - ); - fail(rfe, true); - throw rfe; } for (StoreFileMetadata file : diff.missing) { state.getIndex().addFileDetail(file.name(), file.length(), false); } - // always send a req even if not fetching files so the primary can clear the copyState for this shard. - state.setStage(SegmentReplicationState.Stage.GET_FILES); - cancellableThreads.checkForCancel(); - source.getSegmentFiles(getId(), checkpointInfo.getCheckpoint(), diff.missing, indexShard, getFilesListener); + return diff.missing; } - private void finalizeReplication(CheckpointInfoResponse checkpointInfoResponse, ActionListener listener) { + private void finalizeReplication(CheckpointInfoResponse checkpointInfoResponse) throws OpenSearchCorruptionException { // TODO: Refactor the logic so that finalize doesn't have to be invoked for remote store as source if (source instanceof RemoteStoreReplicationSource) { - ActionListener.completeWith(listener, () -> { - state.setStage(SegmentReplicationState.Stage.FINALIZE_REPLICATION); - return null; - }); + state.setStage(SegmentReplicationState.Stage.FINALIZE_REPLICATION); return; } - ActionListener.completeWith(listener, () -> { - cancellableThreads.checkForCancel(); - state.setStage(SegmentReplicationState.Stage.FINALIZE_REPLICATION); - Store store = null; + cancellableThreads.checkForCancel(); + state.setStage(SegmentReplicationState.Stage.FINALIZE_REPLICATION); + Store store = null; + try { + store = store(); + store.incRef(); + store.buildInfosFromBytes( + multiFileWriter.getTempFileNames(), + checkpointInfoResponse.getInfosBytes(), + checkpointInfoResponse.getCheckpoint().getSegmentsGen(), + indexShard::finalizeReplication + ); + } catch (CorruptIndexException | IndexFormatTooNewException | IndexFormatTooOldException ex) { + // this is a fatal exception at this stage. + // this means we transferred files from the remote that have not been checksummed and they are + // broken. We have to clean up this shard entirely, remove all files and bubble it up to the + // source shard since this index might be broken there as well. The source can handle this and checks + // its content on disk if possible. try { - store = store(); - store.incRef(); - CheckedConsumer finalizeReplication = indexShard::finalizeReplication; - store.buildInfosFromBytes( - multiFileWriter.getTempFileNames(), - checkpointInfoResponse.getInfosBytes(), - checkpointInfoResponse.getCheckpoint().getSegmentsGen(), - finalizeReplication - ); - } catch (CorruptIndexException | IndexFormatTooNewException | IndexFormatTooOldException ex) { - // this is a fatal exception at this stage. - // this means we transferred files from the remote that have not be checksummed and they are - // broken.
We have to clean up this shard entirely, remove all files and bubble it up to the - // source shard since this index might be broken there as well? The Source can handle this and checks - // its content on disk if possible. try { - try { - store.removeCorruptionMarker(); - } finally { - Lucene.cleanLuceneIndex(store.directory()); // clean up and delete all files - } - } catch (Exception e) { - logger.debug("Failed to clean lucene index", e); - ex.addSuppressed(e); - } - ReplicationFailedException rfe = new ReplicationFailedException( - indexShard.shardId(), - "failed to clean after replication", - ex - ); - fail(rfe, true); - throw rfe; - } catch (OpenSearchException ex) { - /* - Ignore closed replication target as it can happen due to index shard closed event in a separate thread. - In such scenario, ignore the exception - */ - assert cancellableThreads.isCancelled() : "Replication target closed but segment replication not cancelled"; - logger.info("Replication target closed", ex); - } catch (Exception ex) { - ReplicationFailedException rfe = new ReplicationFailedException( - indexShard.shardId(), - "failed to clean after replication", - ex - ); - fail(rfe, true); - throw rfe; - } finally { - if (store != null) { - store.decRef(); + store.removeCorruptionMarker(); + } finally { + Lucene.cleanLuceneIndex(store.directory()); // clean up and delete all files } + } catch (Exception e) { + logger.debug("Failed to clean lucene index", e); + ex.addSuppressed(e); } - return null; - }); + throw new OpenSearchCorruptionException(ex); + } catch (AlreadyClosedException ex) { + // In this case the shard was closed at some point while updating the reader. + // This can happen when the engine is closed in a separate thread. + logger.warn("Shard is already closed, closing replication"); + } catch (OpenSearchException ex) { + /* + Ignore closed replication target as it can happen due to index shard closed event in a separate thread. + In such a scenario, ignore the exception + */ + assert cancellableThreads.isCancelled() : "Replication target closed but segment replication not cancelled"; + } catch (Exception ex) { + throw new OpenSearchCorruptionException(ex); + } finally { + if (store != null) { + store.decRef(); + } + } } /** @@ -288,10 +257,15 @@ private ChecksumIndexInput toIndexInput(byte[] input) { ); } + /** + * Trigger a cancellation. This method will not close the target; a subsequent call to #fail is required from the target service.
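+ * Calling cancel on a target that has already finished is a no-op.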
+ */ @Override - protected void onCancel(String reason) { - cancellableThreads.cancel(reason); - source.cancel(); - multiFileWriter.close(); + public void cancel(String reason) { + if (finished.get() == false) { + logger.trace(new ParameterizedMessage("Cancelling replication for target {}", description())); + cancellableThreads.cancel(reason); + source.cancel(); + } } } diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index a7e0c0ec887ab..ac93412eef725 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -11,14 +11,15 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.message.ParameterizedMessage; -import org.opensearch.ExceptionsHelper; +import org.opensearch.OpenSearchCorruptionException; import org.opensearch.action.ActionListener; +import org.opensearch.action.support.ChannelActionListener; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.cluster.routing.ShardRouting; import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.Nullable; import org.opensearch.common.settings.Settings; -import org.opensearch.common.util.CancellableThreads; +import org.opensearch.common.util.concurrent.AbstractRunnable; import org.opensearch.common.util.concurrent.ConcurrentCollections; import org.opensearch.index.shard.IndexEventListener; import org.opensearch.index.shard.IndexShard; @@ -43,10 +44,8 @@ import org.opensearch.transport.TransportResponse; import org.opensearch.transport.TransportService; -import java.io.IOException; import java.util.Map; import java.util.Optional; -import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicLong; import static org.opensearch.indices.replication.SegmentReplicationSourceService.Actions.UPDATE_VISIBLE_CHECKPOINT; @@ -145,7 +144,7 @@ public SegmentReplicationTargetService( @Override public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexShard, Settings indexSettings) { if (indexShard != null && indexShard.indexSettings().isSegRepEnabled()) { - onGoingReplications.cancelForShard(shardId, "shard closed"); + onGoingReplications.requestCancel(indexShard.shardId(), "Shard closing"); latestReceivedCheckpoint.remove(shardId); } } @@ -167,7 +166,7 @@ public void afterIndexShardStarted(IndexShard indexShard) { @Override public void shardRoutingChanged(IndexShard indexShard, @Nullable ShardRouting oldRouting, ShardRouting newRouting) { if (oldRouting != null && indexShard.indexSettings().isSegRepEnabled() && oldRouting.primary() == false && newRouting.primary()) { - onGoingReplications.cancelForShard(indexShard.shardId(), "shard has been promoted to primary"); + onGoingReplications.requestCancel(indexShard.shardId(), "Shard has been promoted to primary"); latestReceivedCheckpoint.remove(indexShard.shardId()); } } @@ -224,11 +223,13 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe if (ongoingReplicationTarget != null) { if (ongoingReplicationTarget.getCheckpoint().getPrimaryTerm() < receivedCheckpoint.getPrimaryTerm()) { logger.trace( - "Cancelling ongoing replication from old primary with primary term {}", - ongoingReplicationTarget.getCheckpoint().getPrimaryTerm() + () -> 
new ParameterizedMessage( + "Cancelling ongoing replication {} from old primary with primary term {}", + ongoingReplicationTarget.description(), + ongoingReplicationTarget.getCheckpoint().getPrimaryTerm() + ) ); - onGoingReplications.cancel(ongoingReplicationTarget.getId(), "Cancelling stuck target after new primary"); - completedReplications.put(replicaShard.shardId(), ongoingReplicationTarget); + ongoingReplicationTarget.cancel("Cancelling stuck target after new primary"); } else { logger.trace( () -> new ParameterizedMessage( @@ -268,21 +269,20 @@ public void onReplicationFailure( ReplicationFailedException e, boolean sendShardFailure ) { - logger.trace( + logger.error( () -> new ParameterizedMessage( "[shardId {}] [replication id {}] Replication failed, timing data: {}", replicaShard.shardId().getId(), state.getReplicationId(), state.getTimingData() - ) + ), + e ); if (sendShardFailure == true) { - logger.error("replication failure", e); - replicaShard.failShard("replication failure", e); + failShard(e, replicaShard); } } }); - } } else { logger.trace( @@ -305,7 +305,14 @@ protected void updateVisibleCheckpoint(long replicationId, IndexShard replicaSha final TransportRequestOptions options = TransportRequestOptions.builder() .withTimeout(recoverySettings.internalActionTimeout()) .build(); - logger.debug("Updating replication checkpoint to {}", request.getCheckpoint()); + logger.trace( + () -> new ParameterizedMessage( + "Updating Primary shard that replica {}-{} is synced to checkpoint {}", + replicaShard.shardId(), + replicaShard.routingEntry().allocationId(), + request.getCheckpoint() + ) + ); RetryableTransportClient transportClient = new RetryableTransportClient( transportService, getPrimaryNode(primaryShard), @@ -315,19 +322,23 @@ protected void updateVisibleCheckpoint(long replicationId, IndexShard replicaSha final ActionListener listener = new ActionListener<>() { @Override public void onResponse(Void unused) { - logger.debug( - "Successfully updated replication checkpoint {} for replica {}", - replicaShard.shardId(), - request.getCheckpoint() + logger.trace( + () -> new ParameterizedMessage( + "Successfully updated replication checkpoint {} for replica {}", + replicaShard.shardId(), + request.getCheckpoint() + ) ); } @Override public void onFailure(Exception e) { logger.error( - "Failed to update visible checkpoint for replica {}, {}: {}", - replicaShard.shardId(), - request.getCheckpoint(), + () -> new ParameterizedMessage( + "Failed to update visible checkpoint for replica {}, {}:", + replicaShard.shardId(), + request.getCheckpoint() + ), e ); } @@ -350,6 +361,13 @@ private DiscoveryNode getPrimaryNode(ShardRouting primaryShard) { protected boolean processLatestReceivedCheckpoint(IndexShard replicaShard, Thread thread) { final ReplicationCheckpoint latestPublishedCheckpoint = latestReceivedCheckpoint.get(replicaShard.shardId()); if (latestPublishedCheckpoint != null && latestPublishedCheckpoint.isAheadOf(replicaShard.getLatestReplicationCheckpoint())) { + logger.trace( + () -> new ParameterizedMessage( + "Processing latest received checkpoint for shard {} {}", + replicaShard.shardId(), + latestPublishedCheckpoint + ) + ); Runnable runnable = () -> onNewCheckpoint(latestReceivedCheckpoint.get(replicaShard.shardId()), replicaShard); // Checks if we are using same thread and forks if necessary. 
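// Forking avoids running a fresh replication round inline on the thread that is still completing the previous one.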
if (thread == Thread.currentThread()) { @@ -381,7 +399,15 @@ public SegmentReplicationTarget startReplication(final IndexShard indexShard, fi // pkg-private for integration tests void startReplication(final SegmentReplicationTarget target) { - final long replicationId = onGoingReplications.start(target, recoverySettings.activityTimeout()); + final long replicationId; + try { + replicationId = onGoingReplications.startSafe(target, recoverySettings.activityTimeout()); + } catch (ReplicationFailedException e) { + // replication already running for shard. + target.fail(e, false); + return; + } + logger.trace(() -> new ParameterizedMessage("Added new replication to collection {}", target.description())); threadPool.generic().execute(new ReplicationRunner(replicationId)); } @@ -410,7 +436,7 @@ default void onFailure(ReplicationState state, ReplicationFailedException e, boo /** * Runnable implementation to trigger a replication event. */ - private class ReplicationRunner implements Runnable { + private class ReplicationRunner extends AbstractRunnable { final long replicationId; @@ -419,47 +445,49 @@ public ReplicationRunner(long replicationId) { } @Override - public void run() { + public void onFailure(Exception e) { + try (final ReplicationRef ref = onGoingReplications.get(replicationId)) { + logger.error(() -> new ParameterizedMessage("Error during segment replication, {}", ref.get().description()), e); + } + onGoingReplications.fail(replicationId, new ReplicationFailedException("Unexpected Error during replication", e), false); + } + + @Override + public void doRun() { start(replicationId); } } private void start(final long replicationId) { + final SegmentReplicationTarget target; try (ReplicationRef replicationRef = onGoingReplications.get(replicationId)) { // This check is for handling edge cases where the reference is removed before the ReplicationRunner is started by the // threadpool. if (replicationRef == null) { return; } - SegmentReplicationTarget target = onGoingReplications.getTarget(replicationId); - replicationRef.get().startReplication(new ActionListener<>() { - @Override - public void onResponse(Void o) { - onGoingReplications.markAsDone(replicationId); - if (target.state().getIndex().recoveredFileCount() != 0 && target.state().getIndex().recoveredBytes() != 0) { - completedReplications.put(target.shardId(), target); - } - + target = replicationRef.get(); + } + target.startReplication(new ActionListener<>() { + @Override + public void onResponse(Void o) { + logger.trace(() -> new ParameterizedMessage("Finished replicating {} marking as done.", target.description())); + onGoingReplications.markAsDone(replicationId); + if (target.state().getIndex().recoveredFileCount() != 0 && target.state().getIndex().recoveredBytes() != 0) { + completedReplications.put(target.shardId(), target); } + } - @Override - public void onFailure(Exception e) { - Throwable cause = ExceptionsHelper.unwrapCause(e); - if (cause instanceof CancellableThreads.ExecutionCancelledException) { - if (onGoingReplications.getTarget(replicationId) != null) { - IndexShard indexShard = onGoingReplications.getTarget(replicationId).indexShard(); - // if the target still exists in our collection, the primary initiated the cancellation, fail the replication - // but do not fail the shard. Cancellations initiated by this node from Index events will be removed with - // onGoingReplications.cancel and not appear in the collection when this listener resolves. 
- onGoingReplications.fail(replicationId, new ReplicationFailedException(indexShard, cause), false); - completedReplications.put(target.shardId(), target); - } - } else { - onGoingReplications.fail(replicationId, new ReplicationFailedException("Segment Replication failed", e), false); - } + @Override + public void onFailure(Exception e) { + logger.error(() -> new ParameterizedMessage("Exception replicating {} marking as failed.", target.description()), e); + if (e instanceof OpenSearchCorruptionException) { + onGoingReplications.fail(replicationId, new ReplicationFailedException("Store corruption during replication", e), true); + return; } - }); - } + onGoingReplications.fail(replicationId, new ReplicationFailedException("Segment Replication failed", e), false); + } + }); } private class FileChunkTransportRequestHandler implements TransportRequestHandler { @@ -484,27 +512,31 @@ public void messageReceived(final FileChunkRequest request, TransportChannel cha private class ForceSyncTransportRequestHandler implements TransportRequestHandler { @Override public void messageReceived(final ForceSyncRequest request, TransportChannel channel, Task task) throws Exception { - assert indicesService != null; - final IndexShard indexShard = indicesService.getShardOrNull(request.getShardId()); - // Proceed with round of segment replication only when it is allowed - if (indexShard == null || indexShard.getReplicationEngine().isEmpty()) { - logger.info("Ignore force segment replication sync as it is not allowed"); - channel.sendResponse(TransportResponse.Empty.INSTANCE); - return; - } + forceReplication(request, new ChannelActionListener<>(channel, Actions.FORCE_SYNC, request)); + } + } + + private void forceReplication(ForceSyncRequest request, ActionListener listener) { + final ShardId shardId = request.getShardId(); + assert indicesService != null; + final IndexShard indexShard = indicesService.getShardOrNull(shardId); + // Proceed with round of segment replication only when it is allowed + if (indexShard == null || indexShard.getReplicationEngine().isEmpty()) { + listener.onResponse(TransportResponse.Empty.INSTANCE); + } else { startReplication(indexShard, new SegmentReplicationTargetService.SegmentReplicationListener() { @Override public void onReplicationDone(SegmentReplicationState state) { - logger.trace( - () -> new ParameterizedMessage( - "[shardId {}] [replication id {}] Replication complete to {}, timing data: {}", - indexShard.shardId().getId(), - state.getReplicationId(), - indexShard.getLatestReplicationCheckpoint(), - state.getTimingData() - ) - ); try { + logger.trace( + () -> new ParameterizedMessage( + "[shardId {}] [replication id {}] Force replication Sync complete to {}, timing data: {}", + shardId, + state.getReplicationId(), + indexShard.getLatestReplicationCheckpoint(), + state.getTimingData() + ) + ); // Promote engine type for primary target if (indexShard.recoveryState().getPrimary() == true) { indexShard.resetToWriteableEngine(); @@ -512,33 +544,40 @@ public void onReplicationDone(SegmentReplicationState state) { // Update the replica's checkpoint on primary's replication tracker. 
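// The primary uses this update to track which checkpoint the replica has synced to.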
updateVisibleCheckpoint(state.getReplicationId(), indexShard); } - channel.sendResponse(TransportResponse.Empty.INSTANCE); - } catch (InterruptedException | TimeoutException | IOException e) { - throw new RuntimeException(e); + listener.onResponse(TransportResponse.Empty.INSTANCE); + } catch (Exception e) { + logger.error("Error while marking replication completed", e); + listener.onFailure(e); } } @Override public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { - logger.trace( + logger.error( () -> new ParameterizedMessage( "[shardId {}] [replication id {}] Replication failed, timing data: {}", indexShard.shardId().getId(), state.getReplicationId(), state.getTimingData() - ) + ), + e ); - if (sendShardFailure == true) { - indexShard.failShard("replication failure", e); - } - try { - channel.sendResponse(e); - } catch (IOException ex) { - throw new RuntimeException(ex); + if (sendShardFailure) { + failShard(e, indexShard); } + listener.onFailure(e); } }); } } + private void failShard(ReplicationFailedException e, IndexShard indexShard) { + try { + indexShard.failShard("unrecoverable replication failure", e); + } catch (Exception inner) { + logger.error("Error attempting to fail shard", inner); + e.addSuppressed(inner); + } + } + } diff --git a/server/src/main/java/org/opensearch/indices/replication/common/ReplicationCollection.java b/server/src/main/java/org/opensearch/indices/replication/common/ReplicationCollection.java index e918ac0a79691..c65ef27969154 100644 --- a/server/src/main/java/org/opensearch/indices/replication/common/ReplicationCollection.java +++ b/server/src/main/java/org/opensearch/indices/replication/common/ReplicationCollection.java @@ -70,6 +70,26 @@ public ReplicationCollection(Logger logger, ThreadPool threadPool) { this.threadPool = threadPool; } + /** + * Starts a new target event for a given shard, failing the given target if the shard is already replicating. + * @param target ReplicationTarget to start + * @param activityTimeout timeout for entire replication event + * @return The replication id + */ + public long startSafe(T target, TimeValue activityTimeout) { + synchronized (onGoingTargetEvents) { + final boolean isPresent = onGoingTargetEvents.values() + .stream() + .map(ReplicationTarget::shardId) + .anyMatch(t -> t.equals(target.shardId())); + if (isPresent) { + throw new ReplicationFailedException("Shard " + target.shardId() + " is already replicating"); + } else { + return start(target, activityTimeout); + } + } + } + /** * Starts a new target event for the given shard, source node and state * @@ -234,6 +254,22 @@ public boolean cancelForShard(ShardId shardId, String reason) { return cancelled; } + /** + * Triggers cancel on the target but does not remove it from the collection. + * This is intended to be called to ensure replication events are removed from the collection + * only when the target has closed.
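+ * Unlike {@link #cancelForShard(ShardId, String)}, the target is not removed here; it stays in the collection until its close path removes it.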
+ * + * @param shardId {@link ShardId} shard events to cancel + * @param reason {@link String} reason for cancellation + */ + public void requestCancel(ShardId shardId, String reason) { + for (T value : onGoingTargetEvents.values()) { + if (value.shardId().equals(shardId)) { + value.cancel(reason); + } + } + } + /** * Get target for shard * diff --git a/server/src/main/java/org/opensearch/indices/replication/common/ReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/common/ReplicationTarget.java index 4d75ff4896706..344a4040be119 100644 --- a/server/src/main/java/org/opensearch/indices/replication/common/ReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/common/ReplicationTarget.java @@ -173,6 +173,7 @@ public void cancel(String reason) { public void fail(ReplicationFailedException e, boolean sendShardFailure) { if (finished.compareAndSet(false, true)) { try { + logger.debug("marking target " + description() + " as failed", e); notifyListener(e, sendShardFailure); } finally { try { diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index b3876a8ea8fd0..1f5980ba9bfe0 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -10,6 +10,7 @@ import org.apache.lucene.codecs.Codec; import org.apache.lucene.index.SegmentInfos; +import org.apache.lucene.store.AlreadyClosedException; import org.junit.Assert; import org.opensearch.ExceptionsHelper; import org.opensearch.action.ActionListener; @@ -86,11 +87,12 @@ import static org.hamcrest.Matchers.instanceOf; import static org.mockito.Mockito.any; import static org.mockito.Mockito.doAnswer; +import static org.mockito.Mockito.doThrow; import static org.mockito.Mockito.mock; +import static org.mockito.Mockito.spy; import static org.mockito.Mockito.times; import static org.mockito.Mockito.verify; import static org.mockito.Mockito.when; -import static org.mockito.Mockito.spy; public class SegmentReplicationIndexShardTests extends OpenSearchIndexLevelReplicationTestCase { @@ -1156,6 +1158,125 @@ public void getSegmentFiles( } } + public void testCloseShardDuringFinalize() throws Exception { + try (ReplicationGroup shards = createGroup(1, settings, new NRTReplicationEngineFactory())) { + shards.startAll(); + IndexShard primary = shards.getPrimary(); + final IndexShard replica = shards.getReplicas().get(0); + final IndexShard replicaSpy = spy(replica); + + primary.refresh("Test"); + + doThrow(AlreadyClosedException.class).when(replicaSpy).finalizeReplication(any()); + + replicateSegments(primary, List.of(replicaSpy)); + } + } + + public void testCloseShardWhileGettingCheckpoint() throws Exception { + try (ReplicationGroup shards = createGroup(1, settings, new NRTReplicationEngineFactory())) { + shards.startAll(); + IndexShard primary = shards.getPrimary(); + final IndexShard replica = shards.getReplicas().get(0); + + primary.refresh("Test"); + + final SegmentReplicationSourceFactory sourceFactory = mock(SegmentReplicationSourceFactory.class); + final SegmentReplicationTargetService targetService = newTargetService(sourceFactory); + SegmentReplicationSource source = new TestReplicationSource() { + + ActionListener listener; + + @Override + public void getCheckpointMetadata( + long replicationId, + ReplicationCheckpoint 
checkpoint, + ActionListener listener + ) { + // set the listener, we will only fail it once we cancel the source. + this.listener = listener; + // shard is closing while we are copying files. + targetService.beforeIndexShardClosed(replica.shardId, replica, Settings.EMPTY); + } + + @Override + public void getSegmentFiles( + long replicationId, + ReplicationCheckpoint checkpoint, + List filesToFetch, + IndexShard indexShard, + ActionListener listener + ) { + Assert.fail("Unreachable"); + } + + @Override + public void cancel() { + // simulate listener resolving, but only after we have issued a cancel from beforeIndexShardClosed . + final RuntimeException exception = new CancellableThreads.ExecutionCancelledException("retryable action was cancelled"); + listener.onFailure(exception); + } + }; + when(sourceFactory.get(any())).thenReturn(source); + startReplicationAndAssertCancellation(replica, targetService); + + shards.removeReplica(replica); + closeShards(replica); + } + } + + public void testBeforeIndexShardClosedWhileCopyingFiles() throws Exception { + try (ReplicationGroup shards = createGroup(1, settings, new NRTReplicationEngineFactory())) { + shards.startAll(); + IndexShard primary = shards.getPrimary(); + final IndexShard replica = shards.getReplicas().get(0); + + primary.refresh("Test"); + + final SegmentReplicationSourceFactory sourceFactory = mock(SegmentReplicationSourceFactory.class); + final SegmentReplicationTargetService targetService = newTargetService(sourceFactory); + SegmentReplicationSource source = new TestReplicationSource() { + + ActionListener listener; + + @Override + public void getCheckpointMetadata( + long replicationId, + ReplicationCheckpoint checkpoint, + ActionListener listener + ) { + resolveCheckpointInfoResponseListener(listener, primary); + } + + @Override + public void getSegmentFiles( + long replicationId, + ReplicationCheckpoint checkpoint, + List filesToFetch, + IndexShard indexShard, + ActionListener listener + ) { + // set the listener, we will only fail it once we cancel the source. + this.listener = listener; + // shard is closing while we are copying files. + targetService.beforeIndexShardClosed(replica.shardId, replica, Settings.EMPTY); + } + + @Override + public void cancel() { + // simulate listener resolving, but only after we have issued a cancel from beforeIndexShardClosed . 
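+ // Failing the held listener mimics a remote call being aborted once the target is cancelled.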
+ final RuntimeException exception = new CancellableThreads.ExecutionCancelledException("retryable action was cancelled"); + listener.onFailure(exception); + } + }; + when(sourceFactory.get(any())).thenReturn(source); + startReplicationAndAssertCancellation(replica, targetService); + + shards.removeReplica(replica); + closeShards(replica); + } + } + public void testPrimaryCancelsExecution() throws Exception { try (ReplicationGroup shards = createGroup(1, settings, new NRTReplicationEngineFactory())) { shards.startAll(); @@ -1245,7 +1366,6 @@ public void onReplicationDone(SegmentReplicationState state) { @Override public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { assertFalse(sendShardFailure); - assertEquals(SegmentReplicationState.Stage.CANCELLED, state.getStage()); latch.countDown(); } } diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java index c632f2843cba2..32203c28c8ed8 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java @@ -8,8 +8,8 @@ package org.opensearch.indices.replication; +import org.apache.lucene.store.AlreadyClosedException; import org.junit.Assert; -import org.mockito.Mockito; import org.opensearch.OpenSearchException; import org.opensearch.Version; import org.opensearch.action.ActionListener; @@ -26,6 +26,7 @@ import org.opensearch.index.replication.TestReplicationSource; import org.opensearch.index.shard.IndexShard; import org.opensearch.index.shard.IndexShardTestCase; +import org.opensearch.index.shard.ShardId; import org.opensearch.index.store.StoreFileMetadata; import org.opensearch.indices.IndicesService; import org.opensearch.indices.recovery.ForceSyncRequest; @@ -49,6 +50,7 @@ import java.util.concurrent.TimeUnit; import static org.mockito.ArgumentMatchers.any; +import static org.mockito.ArgumentMatchers.anyLong; import static org.mockito.Mockito.atLeastOnce; import static org.mockito.Mockito.doAnswer; import static org.mockito.Mockito.doNothing; @@ -62,7 +64,6 @@ import static org.mockito.Mockito.verify; import static org.mockito.Mockito.verifyNoInteractions; import static org.mockito.Mockito.when; -import static org.opensearch.indices.replication.SegmentReplicationState.Stage.CANCELLED; public class SegmentReplicationTargetServiceTests extends IndexShardTestCase { @@ -240,71 +241,81 @@ public void testAlreadyOnNewCheckpoint() { verify(spy, times(0)).startReplication(any(), any()); } - public void testShardAlreadyReplicating() throws InterruptedException { - // Create a spy of Target Service so that we can verify invocation of startReplication call with specific checkpoint on it. - SegmentReplicationTargetService serviceSpy = spy(sut); - final SegmentReplicationTarget target = new SegmentReplicationTarget( - replicaShard, - replicationSource, - mock(SegmentReplicationTargetService.SegmentReplicationListener.class) - ); - // Create a Mockito spy of target to stub response of few method calls. - final SegmentReplicationTarget targetSpy = Mockito.spy(target); - CountDownLatch latch = new CountDownLatch(1); - // Mocking response when startReplication is called on targetSpy we send a new checkpoint to serviceSpy and later reduce countdown - // of latch. 
- doAnswer(invocation -> { - final ActionListener listener = invocation.getArgument(0); - // a new checkpoint arrives before we've completed. - serviceSpy.onNewCheckpoint(aheadCheckpoint, replicaShard); - listener.onResponse(null); - latch.countDown(); - return null; - }).when(targetSpy).startReplication(any()); - doNothing().when(targetSpy).onDone(); - - // start replication of this shard the first time. - serviceSpy.startReplication(targetSpy); + public void testShardAlreadyReplicating() { + sut.startReplication(replicaShard, mock(SegmentReplicationTargetService.SegmentReplicationListener.class)); + sut.startReplication(replicaShard, new SegmentReplicationTargetService.SegmentReplicationListener() { + @Override + public void onReplicationDone(SegmentReplicationState state) { + Assert.fail("Should not succeed"); + } - // wait for the new checkpoint to arrive, before the listener completes. - latch.await(30, TimeUnit.SECONDS); - verify(targetSpy, times(0)).cancel(any()); - verify(serviceSpy, times(0)).startReplication(eq(replicaShard), any()); + @Override + public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { + assertEquals("Shard " + replicaShard.shardId() + " is already replicating", e.getMessage()); + assertFalse(sendShardFailure); + } + }); } - public void testOnNewCheckpointFromNewPrimaryCancelOngoingReplication() throws IOException, InterruptedException { + public void testOnNewCheckpointFromNewPrimaryCancelOngoingReplication() throws InterruptedException { // Create a spy of Target Service so that we can verify invocation of startReplication call with specific checkpoint on it. SegmentReplicationTargetService serviceSpy = spy(sut); + doNothing().when(serviceSpy).updateVisibleCheckpoint(anyLong(), any()); + // skip post replication actions so we can assert execution counts. This will continue to process bc replica's pterm is not advanced + // post replication. + doReturn(true).when(serviceSpy).processLatestReceivedCheckpoint(any(), any()); // Create a Mockito spy of target to stub response of few method calls. - final SegmentReplicationTarget targetSpy = spy( - new SegmentReplicationTarget( - replicaShard, - replicationSource, - mock(SegmentReplicationTargetService.SegmentReplicationListener.class) - ) - ); CountDownLatch latch = new CountDownLatch(1); - // Mocking response when startReplication is called on targetSpy we send a new checkpoint to serviceSpy and later reduce countdown - // of latch. - doAnswer(invocation -> { - // short circuit loop on new checkpoint request - doReturn(null).when(serviceSpy).startReplication(eq(replicaShard), any()); - // a new checkpoint arrives before we've completed. - serviceSpy.onNewCheckpoint(newPrimaryCheckpoint, replicaShard); - try { - invocation.callRealMethod(); - } catch (CancellableThreads.ExecutionCancelledException e) { + SegmentReplicationSource source = new TestReplicationSource() { + + ActionListener listener; + + @Override + public void getCheckpointMetadata( + long replicationId, + ReplicationCheckpoint checkpoint, + ActionListener listener + ) { + // set the listener, we will only fail it once we cancel the source. + this.listener = listener; latch.countDown(); + // do not resolve this listener yet, wait for cancel to hit. 
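+ // Holding the listener open keeps the target in the GET_CHECKPOINT_INFO stage until the new-primary checkpoint arrives and cancels it.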
} - return null; - }).when(targetSpy).startReplication(any()); + + @Override + public void getSegmentFiles( + long replicationId, + ReplicationCheckpoint checkpoint, + List filesToFetch, + IndexShard indexShard, + ActionListener listener + ) { + Assert.fail("Unreachable"); + } + + @Override + public void cancel() { + // simulate listener resolving, but only after we have issued a cancel from beforeIndexShardClosed . + final RuntimeException exception = new CancellableThreads.ExecutionCancelledException("retryable action was cancelled"); + listener.onFailure(exception); + } + }; + + final SegmentReplicationTarget targetSpy = spy( + new SegmentReplicationTarget(replicaShard, source, mock(SegmentReplicationTargetService.SegmentReplicationListener.class)) + ); // start replication. This adds the target to on-ongoing replication collection serviceSpy.startReplication(targetSpy); + + // wait until we get to getCheckpoint step. latch.await(); - // wait for the new checkpoint to arrive, before the listener completes. - assertEquals(CANCELLED, targetSpy.state().getStage()); + + // new checkpoint arrives with higher pterm. + serviceSpy.onNewCheckpoint(newPrimaryCheckpoint, replicaShard); + + // ensure the old target is cancelled. and new iteration kicks off. verify(targetSpy, times(1)).cancel("Cancelling stuck target after new primary"); verify(serviceSpy, times(1)).startReplication(eq(replicaShard), any()); } @@ -467,6 +478,7 @@ public void testForceSegmentSyncHandler() throws Exception { } public void testForceSegmentSyncHandlerWithFailure() throws Exception { + allowShardFailures(); IndexShard spyReplicaShard = spy(replicaShard); ForceSyncRequest forceSyncRequest = new ForceSyncRequest(1L, 1L, replicaShard.shardId()); when(indicesService.getShardOrNull(forceSyncRequest.getShardId())).thenReturn(spyReplicaShard); @@ -488,4 +500,57 @@ public void testForceSegmentSyncHandlerWithFailure() throws Exception { assertTrue(nestedException instanceof IOException); assertTrue(nestedException.getMessage().contains("dummy failure")); } + + public void testForceSync_ShardDoesNotExist() { + ForceSyncRequest forceSyncRequest = new ForceSyncRequest(1L, 1L, new ShardId("no", "", 0)); + when(indicesService.getShardOrNull(forceSyncRequest.getShardId())).thenReturn(null); + transportService.submitRequest( + localNode, + SegmentReplicationTargetService.Actions.FORCE_SYNC, + forceSyncRequest, + TransportRequestOptions.builder().withTimeout(TRANSPORT_TIMEOUT).build(), + EmptyTransportResponseHandler.INSTANCE_SAME + ).txGet(); + } + + public void testForceSegmentSyncHandlerWithFailure_AlreadyClosedException_swallowed() throws Exception { + IndexShard spyReplicaShard = spy(replicaShard); + ForceSyncRequest forceSyncRequest = new ForceSyncRequest(1L, 1L, replicaShard.shardId()); + when(indicesService.getShardOrNull(forceSyncRequest.getShardId())).thenReturn(spyReplicaShard); + + AlreadyClosedException exception = new AlreadyClosedException("shard closed"); + doThrow(exception).when(spyReplicaShard).finalizeReplication(any()); + + // prevent shard failure to avoid test setup assertion + doNothing().when(spyReplicaShard).failShard(eq("replication failure"), any()); + transportService.submitRequest( + localNode, + SegmentReplicationTargetService.Actions.FORCE_SYNC, + forceSyncRequest, + TransportRequestOptions.builder().withTimeout(TRANSPORT_TIMEOUT).build(), + EmptyTransportResponseHandler.INSTANCE_SAME + ).txGet(); + } + + public void testTargetCancelledBeforeStartInvoked() { + final SegmentReplicationTarget target = new 
SegmentReplicationTarget( + replicaShard, + mock(SegmentReplicationSource.class), + new SegmentReplicationTargetService.SegmentReplicationListener() { + @Override + public void onReplicationDone(SegmentReplicationState state) { + Assert.fail(); + } + + @Override + public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { + // failures leave state object in last entered stage. + assertEquals(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO, state.getStage()); + assertTrue(e.getCause() instanceof CancellableThreads.ExecutionCancelledException); + } + } + ); + target.cancel("test"); + sut.startReplication(target); + } } diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java index ac8904527f7fb..4fb1edb4e496e 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java @@ -27,6 +27,7 @@ import org.junit.Assert; import org.mockito.Mockito; import org.opensearch.ExceptionsHelper; +import org.opensearch.OpenSearchCorruptionException; import org.opensearch.action.ActionListener; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; @@ -373,8 +374,9 @@ public void onResponse(Void replicationResponse) { @Override public void onFailure(Exception e) { - assert (e instanceof ReplicationFailedException); - assert (e.getMessage().contains("different segment files")); + assertTrue(e instanceof OpenSearchCorruptionException); + assertTrue(e.getMessage().contains("has local copies of segments that differ from the primary")); + segrepTarget.fail(new ReplicationFailedException(e), false); } }); } diff --git a/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java b/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java index 75ac1075e8ee0..0bb1bc210d946 100644 --- a/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java +++ b/server/src/test/java/org/opensearch/recovery/ReplicationCollectionTests.java @@ -38,12 +38,15 @@ import org.opensearch.index.shard.IndexShard; import org.opensearch.index.shard.ShardId; import org.opensearch.index.store.Store; +import org.opensearch.indices.replication.SegmentReplicationSource; +import org.opensearch.indices.replication.SegmentReplicationTarget; import org.opensearch.indices.replication.common.ReplicationCollection; import org.opensearch.indices.replication.common.ReplicationFailedException; import org.opensearch.indices.replication.common.ReplicationListener; import org.opensearch.indices.replication.common.ReplicationState; import org.opensearch.indices.recovery.RecoveryState; import org.opensearch.indices.recovery.RecoveryTarget; +import org.opensearch.indices.replication.common.ReplicationTarget; import java.util.concurrent.CountDownLatch; import java.util.concurrent.TimeUnit; @@ -51,6 +54,7 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.lessThan; +import static org.mockito.Mockito.mock; public class ReplicationCollectionTests extends OpenSearchIndexLevelReplicationTestCase { static final ReplicationListener listener = new ReplicationListener() { @@ -108,7 +112,30 @@ public void onFailure(ReplicationState state, ReplicationFailedException e, bool } } - public void 
testMultiReplicationsForSingleShard() throws Exception { + public void testStartMultipleReplicationsForSingleShard() throws Exception { + try (ReplicationGroup shards = createGroup(0)) { + shards.startAll(); + final ReplicationCollection collection = new ReplicationCollection<>(logger, threadPool); + final IndexShard shard = shards.addReplica(); + shards.recoverReplica(shard); + final SegmentReplicationTarget target1 = new SegmentReplicationTarget( + shard, + mock(SegmentReplicationSource.class), + mock(ReplicationListener.class) + ); + final SegmentReplicationTarget target2 = new SegmentReplicationTarget( + shard, + mock(SegmentReplicationSource.class), + mock(ReplicationListener.class) + ); + collection.startSafe(target1, TimeValue.timeValueMinutes(30)); + assertThrows(ReplicationFailedException.class, () -> collection.startSafe(target2, TimeValue.timeValueMinutes(30))); + target1.decRef(); + target2.decRef(); + } + } + + public void testGetReplicationTargetMultiReplicationsForSingleShard() throws Exception { try (ReplicationGroup shards = createGroup(0)) { final ReplicationCollection collection = new ReplicationCollection<>(logger, threadPool); final IndexShard shard1 = shards.addReplica(); From 0b42c2c0e41c8af6c527eace384d4ad4245a5a1f Mon Sep 17 00:00:00 2001 From: Thomas Farr Date: Wed, 12 Jul 2023 04:45:29 +1200 Subject: [PATCH 5/7] Bump org.apache.logging.log4j:log4j-core from 2.17.1 to 2.20.0 (#8307) - Use log4j's PluginProcessor annotation processor for plugin discovery - Remove usage of deprecated explicit plugin discovery - Pass through Configuration in OpenSearchJsonLayout to work around https://issues.apache.org/jira/browse/LOG4J2-3562 Signed-off-by: Thomas Signed-off-by: Daniel (dB.) Doubrovkine Signed-off-by: Thomas Farr Co-authored-by: Daniel (dB.) 
Doubrovkine Co-authored-by: Andrew Ross --- .gitignore | 5 ++- CHANGELOG.md | 1 + .../thirdPartyAudit/sample_jars/build.gradle | 2 +- buildSrc/version.properties | 3 +- libs/core/licenses/log4j-api-2.17.1.jar.sha1 | 1 - libs/core/licenses/log4j-api-2.20.0.jar.sha1 | 1 + .../licenses/log4j-1.2-api-2.17.1.jar.sha1 | 1 - .../licenses/log4j-1.2-api-2.20.0.jar.sha1 | 1 + .../licenses/log4j-1.2-api-2.17.1.jar.sha1 | 1 - .../licenses/log4j-1.2-api-2.20.0.jar.sha1 | 1 + .../licenses/log4j-1.2-api-2.17.1.jar.sha1 | 1 - .../licenses/log4j-1.2-api-2.20.0.jar.sha1 | 1 + .../licenses/log4j-1.2-api-2.17.1.jar.sha1 | 1 - .../licenses/log4j-1.2-api-2.20.0.jar.sha1 | 1 + .../licenses/log4j-slf4j-impl-2.17.1.jar.sha1 | 1 - .../licenses/log4j-slf4j-impl-2.20.0.jar.sha1 | 1 + .../licenses/log4j-1.2-api-2.17.1.jar.sha1 | 1 - .../licenses/log4j-1.2-api-2.20.0.jar.sha1 | 1 + qa/wildfly/build.gradle | 3 +- .../src/main/resources/log4j2.properties | 20 ----------- server/build.gradle | 5 +++ server/licenses/log4j-api-2.17.1.jar.sha1 | 1 - server/licenses/log4j-api-2.20.0.jar.sha1 | 1 + server/licenses/log4j-core-2.17.1.jar.sha1 | 1 - server/licenses/log4j-core-2.20.0.jar.sha1 | 1 + server/licenses/log4j-jul-2.17.1.jar.sha1 | 1 - server/licenses/log4j-jul-2.20.0.jar.sha1 | 1 + .../common/logging/LogConfigurator.java | 10 ------ .../common/logging/OpenSearchJsonLayout.java | 35 ++++++++++++++++--- test/framework/build.gradle | 3 ++ .../opensearch/test/OpenSearchTestCase.java | 2 -- 31 files changed, 57 insertions(+), 52 deletions(-) delete mode 100644 libs/core/licenses/log4j-api-2.17.1.jar.sha1 create mode 100644 libs/core/licenses/log4j-api-2.20.0.jar.sha1 delete mode 100644 plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.17.1.jar.sha1 create mode 100644 plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.20.0.jar.sha1 delete mode 100644 plugins/discovery-ec2/licenses/log4j-1.2-api-2.17.1.jar.sha1 create mode 100644 plugins/discovery-ec2/licenses/log4j-1.2-api-2.20.0.jar.sha1 delete mode 100644 plugins/discovery-gce/licenses/log4j-1.2-api-2.17.1.jar.sha1 create mode 100644 plugins/discovery-gce/licenses/log4j-1.2-api-2.20.0.jar.sha1 delete mode 100644 plugins/repository-gcs/licenses/log4j-1.2-api-2.17.1.jar.sha1 create mode 100644 plugins/repository-gcs/licenses/log4j-1.2-api-2.20.0.jar.sha1 delete mode 100644 plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.17.1.jar.sha1 create mode 100644 plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.20.0.jar.sha1 delete mode 100644 plugins/repository-s3/licenses/log4j-1.2-api-2.17.1.jar.sha1 create mode 100644 plugins/repository-s3/licenses/log4j-1.2-api-2.20.0.jar.sha1 delete mode 100644 qa/wildfly/src/main/resources/log4j2.properties delete mode 100644 server/licenses/log4j-api-2.17.1.jar.sha1 create mode 100644 server/licenses/log4j-api-2.20.0.jar.sha1 delete mode 100644 server/licenses/log4j-core-2.17.1.jar.sha1 create mode 100644 server/licenses/log4j-core-2.20.0.jar.sha1 delete mode 100644 server/licenses/log4j-jul-2.17.1.jar.sha1 create mode 100644 server/licenses/log4j-jul-2.20.0.jar.sha1 diff --git a/.gitignore b/.gitignore index 9ab7de894636a..7514d55cc3c9a 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,9 @@ out/ benchmarks/src/main/generated/* benchmarks/bin/* benchmarks/build-eclipse-default/* +server/bin/* +server/build-eclipse-default/* +test/framework/build-eclipse-default/* # eclipse files .project @@ -61,4 +64,4 @@ testfixtures_shared/ .ci/jobs/ # build files generated -doc-tools/missing-doclet/bin/ +doc-tools/missing-doclet/bin/ \ 
No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index fc367be22c1fd..5f0d733bd7c55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -143,6 +143,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Bump `com.google.jimfs:jimfs` from 1.2 to 1.3.0 (#8577, #8571) - Bump `com.networknt:json-schema-validator` from 1.0.85 to 1.0.86 ([#8573](https://github.com/opensearch-project/OpenSearch/pull/8573)) - Bump `com.google.cloud:google-cloud-core-http` from 2.17.0 to 2.21.0 ([#8586](https://github.com/opensearch-project/OpenSearch/pull/8586)) +- Bump `org.apache.logging.log4j:log4j-core` from 2.17.1 to 2.20.0 ([#8307](https://github.com/opensearch-project/OpenSearch/pull/8307)) ### Changed - Replace jboss-annotations-api_1.2_spec with jakarta.annotation-api ([#7836](https://github.com/opensearch-project/OpenSearch/pull/7836)) diff --git a/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle b/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle index b31e3b168c587..cb8050d1718c4 100644 --- a/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle +++ b/buildSrc/src/testKit/thirdPartyAudit/sample_jars/build.gradle @@ -16,7 +16,7 @@ repositories { mavenCentral() } dependencies { - implementation 'org.apache.logging.log4j:log4j-core:2.20.0' + implementation "org.apache.logging.log4j:log4j-core:2.20.0" } ["0.0.1", "0.0.2"].forEach { v -> diff --git a/buildSrc/version.properties b/buildSrc/version.properties index 7a2ddc24aabcf..246f6e3444224 100644 --- a/buildSrc/version.properties +++ b/buildSrc/version.properties @@ -14,8 +14,7 @@ jackson_databind = 2.15.2 snakeyaml = 2.0 icu4j = 70.1 supercsv = 2.4.0 -# Update to 2.17.2+ is breaking OpenSearchJsonLayout (see https://issues.apache.org/jira/browse/LOG4J2-3562) -log4j = 2.17.1 +log4j = 2.20.0 slf4j = 1.7.36 asm = 9.5 jettison = 1.5.4 diff --git a/libs/core/licenses/log4j-api-2.17.1.jar.sha1 b/libs/core/licenses/log4j-api-2.17.1.jar.sha1 deleted file mode 100644 index 9d0e5dc631ed5..0000000000000 --- a/libs/core/licenses/log4j-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d771af8e336e372fb5399c99edabe0919aeaf5b2 \ No newline at end of file diff --git a/libs/core/licenses/log4j-api-2.20.0.jar.sha1 b/libs/core/licenses/log4j-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..37154d9861ac0 --- /dev/null +++ b/libs/core/licenses/log4j-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +1fe6082e660daf07c689a89c94dc0f49c26b44bb \ No newline at end of file diff --git a/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.17.1.jar.sha1 b/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.17.1.jar.sha1 deleted file mode 100644 index 23aa5c60bd596..0000000000000 --- a/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -db3a7e7f07e878b92ac4a8f1100bee8325d5713a \ No newline at end of file diff --git a/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.20.0.jar.sha1 b/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..9829576d38ce0 --- /dev/null +++ b/plugins/discovery-azure-classic/licenses/log4j-1.2-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +689151374756cb809cb029f2501015bdc7733179 \ No newline at end of file diff --git a/plugins/discovery-ec2/licenses/log4j-1.2-api-2.17.1.jar.sha1 b/plugins/discovery-ec2/licenses/log4j-1.2-api-2.17.1.jar.sha1 deleted file mode 100644 index 23aa5c60bd596..0000000000000 --- a/plugins/discovery-ec2/licenses/log4j-1.2-api-2.17.1.jar.sha1 +++ 
/dev/null @@ -1 +0,0 @@ -db3a7e7f07e878b92ac4a8f1100bee8325d5713a \ No newline at end of file diff --git a/plugins/discovery-ec2/licenses/log4j-1.2-api-2.20.0.jar.sha1 b/plugins/discovery-ec2/licenses/log4j-1.2-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..9829576d38ce0 --- /dev/null +++ b/plugins/discovery-ec2/licenses/log4j-1.2-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +689151374756cb809cb029f2501015bdc7733179 \ No newline at end of file diff --git a/plugins/discovery-gce/licenses/log4j-1.2-api-2.17.1.jar.sha1 b/plugins/discovery-gce/licenses/log4j-1.2-api-2.17.1.jar.sha1 deleted file mode 100644 index 23aa5c60bd596..0000000000000 --- a/plugins/discovery-gce/licenses/log4j-1.2-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -db3a7e7f07e878b92ac4a8f1100bee8325d5713a \ No newline at end of file diff --git a/plugins/discovery-gce/licenses/log4j-1.2-api-2.20.0.jar.sha1 b/plugins/discovery-gce/licenses/log4j-1.2-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..9829576d38ce0 --- /dev/null +++ b/plugins/discovery-gce/licenses/log4j-1.2-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +689151374756cb809cb029f2501015bdc7733179 \ No newline at end of file diff --git a/plugins/repository-gcs/licenses/log4j-1.2-api-2.17.1.jar.sha1 b/plugins/repository-gcs/licenses/log4j-1.2-api-2.17.1.jar.sha1 deleted file mode 100644 index 23aa5c60bd596..0000000000000 --- a/plugins/repository-gcs/licenses/log4j-1.2-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -db3a7e7f07e878b92ac4a8f1100bee8325d5713a \ No newline at end of file diff --git a/plugins/repository-gcs/licenses/log4j-1.2-api-2.20.0.jar.sha1 b/plugins/repository-gcs/licenses/log4j-1.2-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..9829576d38ce0 --- /dev/null +++ b/plugins/repository-gcs/licenses/log4j-1.2-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +689151374756cb809cb029f2501015bdc7733179 \ No newline at end of file diff --git a/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.17.1.jar.sha1 b/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.17.1.jar.sha1 deleted file mode 100644 index 894ed8d886c3f..0000000000000 --- a/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -84692d456bcce689355d33d68167875e486954dd \ No newline at end of file diff --git a/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.20.0.jar.sha1 b/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..800a4aa87ba0e --- /dev/null +++ b/plugins/repository-hdfs/licenses/log4j-slf4j-impl-2.20.0.jar.sha1 @@ -0,0 +1 @@ +7ab4f082fd162f60afcaf2b8744a3d959feab3e8 \ No newline at end of file diff --git a/plugins/repository-s3/licenses/log4j-1.2-api-2.17.1.jar.sha1 b/plugins/repository-s3/licenses/log4j-1.2-api-2.17.1.jar.sha1 deleted file mode 100644 index 23aa5c60bd596..0000000000000 --- a/plugins/repository-s3/licenses/log4j-1.2-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -db3a7e7f07e878b92ac4a8f1100bee8325d5713a \ No newline at end of file diff --git a/plugins/repository-s3/licenses/log4j-1.2-api-2.20.0.jar.sha1 b/plugins/repository-s3/licenses/log4j-1.2-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..9829576d38ce0 --- /dev/null +++ b/plugins/repository-s3/licenses/log4j-1.2-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +689151374756cb809cb029f2501015bdc7733179 \ No newline at end of file diff --git a/qa/wildfly/build.gradle b/qa/wildfly/build.gradle index 391d2c78b489b..5d37be47e782e 100644 --- a/qa/wildfly/build.gradle +++ b/qa/wildfly/build.gradle @@ -58,11 +58,10 @@ 
dependencies { api "com.fasterxml.jackson.jakarta.rs:jackson-jakarta-rs-base:${versions.jackson}" api "com.fasterxml.jackson.jakarta.rs:jackson-jakarta-rs-json-provider:${versions.jackson}" api "com.github.fge:json-patch:1.9" - api "org.apache.logging.log4j:log4j-api:${versions.log4j}" - api "org.apache.logging.log4j:log4j-core:${versions.log4j}" api(project(path: ':client:rest-high-level')) { exclude module: 'jakarta.annotation-api' } + testImplementation "org.apache.logging.log4j:log4j-slf4j-impl:${versions.log4j}" testImplementation(project(':test:framework')) { exclude module: 'jakarta.annotation-api' } diff --git a/qa/wildfly/src/main/resources/log4j2.properties b/qa/wildfly/src/main/resources/log4j2.properties deleted file mode 100644 index 4fd5d69daf65b..0000000000000 --- a/qa/wildfly/src/main/resources/log4j2.properties +++ /dev/null @@ -1,20 +0,0 @@ -# -# SPDX-License-Identifier: Apache-2.0 -# -# The OpenSearch Contributors require contributions made to -# this file be licensed under the Apache-2.0 license or a -# compatible open source license. -# -# Modifications Copyright OpenSearch Contributors. See -# GitHub history for details. -# - -status = error - -appender.console.type = Console -appender.console.name = console -appender.console.layout.type = PatternLayout -appender.console.layout.pattern = [%d{ISO8601}][%-5p][%-25c{1.}] %marker%m%n - -rootLogger.level = info -rootLogger.appenderRef.console.ref = console diff --git a/server/build.gradle b/server/build.gradle index 38bbf020d860b..e80aa0f77a164 100644 --- a/server/build.gradle +++ b/server/build.gradle @@ -146,6 +146,7 @@ dependencies { api "org.apache.logging.log4j:log4j-api:${versions.log4j}" api "org.apache.logging.log4j:log4j-jul:${versions.log4j}" api "org.apache.logging.log4j:log4j-core:${versions.log4j}", optional + annotationProcessor "org.apache.logging.log4j:log4j-core:${versions.log4j}" // jna api "net.java.dev.jna:jna:${versions.jna}" @@ -175,6 +176,10 @@ tasks.withType(JavaCompile).configureEach { options.compilerArgs -= '-Xlint:unchecked' } +compileJava { + options.compilerArgs += ['-processor', 'org.apache.logging.log4j.core.config.plugins.processor.PluginProcessor'] +} + tasks.named("internalClusterTest").configure { // TODO: these run faster with C2 only because they run for so, so long jvmArgs -= '-XX:TieredStopAtLevel=1' diff --git a/server/licenses/log4j-api-2.17.1.jar.sha1 b/server/licenses/log4j-api-2.17.1.jar.sha1 deleted file mode 100644 index 9d0e5dc631ed5..0000000000000 --- a/server/licenses/log4j-api-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -d771af8e336e372fb5399c99edabe0919aeaf5b2 \ No newline at end of file diff --git a/server/licenses/log4j-api-2.20.0.jar.sha1 b/server/licenses/log4j-api-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..37154d9861ac0 --- /dev/null +++ b/server/licenses/log4j-api-2.20.0.jar.sha1 @@ -0,0 +1 @@ +1fe6082e660daf07c689a89c94dc0f49c26b44bb \ No newline at end of file diff --git a/server/licenses/log4j-core-2.17.1.jar.sha1 b/server/licenses/log4j-core-2.17.1.jar.sha1 deleted file mode 100644 index 7d4634f3d4e18..0000000000000 --- a/server/licenses/log4j-core-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -779f60f3844dadc3ef597976fcb1e5127b1f343d \ No newline at end of file diff --git a/server/licenses/log4j-core-2.20.0.jar.sha1 b/server/licenses/log4j-core-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..49c972626563b --- /dev/null +++ b/server/licenses/log4j-core-2.20.0.jar.sha1 @@ -0,0 +1 @@ +eb2a9a47b1396e00b5eee1264296729a70565cc0 \ No newline at 
end of file diff --git a/server/licenses/log4j-jul-2.17.1.jar.sha1 b/server/licenses/log4j-jul-2.17.1.jar.sha1 deleted file mode 100644 index 4afb381a696e9..0000000000000 --- a/server/licenses/log4j-jul-2.17.1.jar.sha1 +++ /dev/null @@ -1 +0,0 @@ -881333b463d47828eda7443b19811763367b1916 \ No newline at end of file diff --git a/server/licenses/log4j-jul-2.20.0.jar.sha1 b/server/licenses/log4j-jul-2.20.0.jar.sha1 new file mode 100644 index 0000000000000..a456651e4569e --- /dev/null +++ b/server/licenses/log4j-jul-2.20.0.jar.sha1 @@ -0,0 +1 @@ +8170e6118eac1ab332046c179718a0f107f688e1 \ No newline at end of file diff --git a/server/src/main/java/org/opensearch/common/logging/LogConfigurator.java b/server/src/main/java/org/opensearch/common/logging/LogConfigurator.java index 4438bf53fd62c..b1a338e65746f 100644 --- a/server/src/main/java/org/opensearch/common/logging/LogConfigurator.java +++ b/server/src/main/java/org/opensearch/common/logging/LogConfigurator.java @@ -41,7 +41,6 @@ import org.apache.logging.log4j.core.config.builder.api.ConfigurationBuilderFactory; import org.apache.logging.log4j.core.config.builder.impl.BuiltConfiguration; import org.apache.logging.log4j.core.config.composite.CompositeConfiguration; -import org.apache.logging.log4j.core.config.plugins.util.PluginManager; import org.apache.logging.log4j.core.config.properties.PropertiesConfiguration; import org.apache.logging.log4j.core.config.properties.PropertiesConfigurationFactory; import org.apache.logging.log4j.status.StatusConsoleListener; @@ -140,13 +139,6 @@ public static void configure(final Environment environment) throws IOException, configure(environment.settings(), environment.configDir(), environment.logsDir()); } - /** - * Load logging plugins so we can have {@code node_name} in the pattern. - */ - public static void loadLog4jPlugins() { - PluginManager.addPackage(LogConfigurator.class.getPackage().getName()); - } - /** * Sets the node name. This is called before logging is configured if the * node name is set in opensearch.yml. 
Otherwise it is called as soon @@ -172,8 +164,6 @@ private static void configure(final Settings settings, final Path configsPath, f Objects.requireNonNull(configsPath); Objects.requireNonNull(logsPath); - loadLog4jPlugins(); - setLogConfigurationSystemProperty(logsPath, settings); // we initialize the status logger immediately otherwise Log4j will complain when we try to get the context configureStatusLogger(); diff --git a/server/src/main/java/org/opensearch/common/logging/OpenSearchJsonLayout.java b/server/src/main/java/org/opensearch/common/logging/OpenSearchJsonLayout.java index 5896aec0ce71c..60df94036cc6b 100644 --- a/server/src/main/java/org/opensearch/common/logging/OpenSearchJsonLayout.java +++ b/server/src/main/java/org/opensearch/common/logging/OpenSearchJsonLayout.java @@ -34,10 +34,12 @@ import org.apache.logging.log4j.core.Layout; import org.apache.logging.log4j.core.LogEvent; +import org.apache.logging.log4j.core.config.Configuration; import org.apache.logging.log4j.core.config.Node; import org.apache.logging.log4j.core.config.plugins.Plugin; import org.apache.logging.log4j.core.config.plugins.PluginAttribute; import org.apache.logging.log4j.core.config.plugins.PluginBuilderFactory; +import org.apache.logging.log4j.core.config.plugins.PluginConfiguration; import org.apache.logging.log4j.core.config.plugins.PluginFactory; import org.apache.logging.log4j.core.layout.AbstractStringLayout; import org.apache.logging.log4j.core.layout.ByteBufferDestination; @@ -94,11 +96,18 @@ public class OpenSearchJsonLayout extends AbstractStringLayout { private final PatternLayout patternLayout; - protected OpenSearchJsonLayout(String typeName, Charset charset, String[] opensearchMessageFields, int maxMessageLength) { + protected OpenSearchJsonLayout( + String typeName, + Charset charset, + String[] opensearchMessageFields, + int maxMessageLength, + Configuration configuration + ) { super(charset); this.patternLayout = PatternLayout.newBuilder() .withPattern(pattern(typeName, opensearchMessageFields, maxMessageLength)) .withAlwaysWriteExceptions(false) + .withConfiguration(configuration) .build(); } @@ -173,8 +182,14 @@ private String inQuotes(String s) { } @PluginFactory - public static OpenSearchJsonLayout createLayout(String type, Charset charset, String[] opensearchmessagefields, int maxMessageLength) { - return new OpenSearchJsonLayout(type, charset, opensearchmessagefields, maxMessageLength); + public static OpenSearchJsonLayout createLayout( + String type, + Charset charset, + String[] opensearchmessagefields, + int maxMessageLength, + Configuration configuration + ) { + return new OpenSearchJsonLayout(type, charset, opensearchmessagefields, maxMessageLength, configuration); } PatternLayout getPatternLayout() { @@ -202,6 +217,9 @@ public static class Builder> extends A @PluginAttribute(value = "maxmessagelength", defaultInt = 10000) private int maxMessageLength; + @PluginConfiguration + private Configuration configuration; + public Builder() { setCharset(StandardCharsets.UTF_8); setMaxMessageLength(10000); @@ -210,7 +228,7 @@ public Builder() { @Override public OpenSearchJsonLayout build() { String[] split = Strings.isNullOrEmpty(opensearchMessageFields) ? 
new String[] {} : opensearchMessageFields.split(","); - return OpenSearchJsonLayout.createLayout(type, charset, split, maxMessageLength); + return OpenSearchJsonLayout.createLayout(type, charset, split, maxMessageLength, configuration); } public Charset getCharset() { @@ -248,6 +266,15 @@ public B setMaxMessageLength(final int maxMessageLength) { this.maxMessageLength = maxMessageLength; return asBuilder(); } + + public Configuration getConfiguration() { + return configuration; + } + + public B setConfiguration(final Configuration configuration) { + this.configuration = configuration; + return asBuilder(); + } } @PluginBuilderFactory diff --git a/test/framework/build.gradle b/test/framework/build.gradle index 2532fdf1938fd..c65bf51c6af36 100644 --- a/test/framework/build.gradle +++ b/test/framework/build.gradle @@ -49,11 +49,14 @@ dependencies { api "org.mockito:mockito-core:${versions.mockito}" api "net.bytebuddy:byte-buddy:${versions.bytebuddy}" api "org.objenesis:objenesis:${versions.objenesis}" + + annotationProcessor "org.apache.logging.log4j:log4j-core:${versions.log4j}" } compileJava.options.compilerArgs -= '-Xlint:cast' compileJava.options.compilerArgs -= '-Xlint:rawtypes' compileJava.options.compilerArgs -= '-Xlint:unchecked' +compileJava.options.compilerArgs += ['-processor', 'org.apache.logging.log4j.core.config.plugins.processor.PluginProcessor'] compileTestJava.options.compilerArgs -= '-Xlint:rawtypes' // the main files are actually test files, so use the appropriate forbidden api sigs diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java index ec397a2baa640..1bea22852df4c 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java @@ -83,7 +83,6 @@ import org.opensearch.common.logging.DeprecatedMessage; import org.opensearch.common.logging.HeaderWarning; import org.opensearch.common.logging.HeaderWarningAppender; -import org.opensearch.common.logging.LogConfigurator; import org.opensearch.common.logging.Loggers; import org.opensearch.common.settings.Setting; import org.opensearch.common.settings.Settings; @@ -239,7 +238,6 @@ public void tearDown() throws Exception { static { TEST_WORKER_VM_ID = System.getProperty(TEST_WORKER_SYS_PROPERTY, DEFAULT_TEST_WORKER_ID); setTestSysProps(); - LogConfigurator.loadLog4jPlugins(); String leakLoggerName = "io.netty.util.ResourceLeakDetector"; Logger leakLogger = LogManager.getLogger(leakLoggerName); From 3d7d33bdbc02ec8e363c5394a9d89451f76e979a Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Tue, 11 Jul 2023 10:01:08 -0700 Subject: [PATCH 6/7] [Segment Replication] Add logic back to update tracking replication checkpoint on source (#8560) * [Segment Replication] Add logic back to update tracking replication checkpoint on source Signed-off-by: Suraj Singh * Update comment Signed-off-by: Suraj Singh * Address review comments & mute breaking bwc-test Signed-off-by: Suraj Singh * Spotless check Signed-off-by: Suraj Singh * Stop timer inside try to prevent double stop on timer Signed-off-by: Suraj Singh * Update PressureITs to wait for appropriate transport call for replica update Signed-off-by: Suraj Singh * Spotless check Signed-off-by: Suraj Singh --------- Signed-off-by: Suraj Singh --- .../java/org/opensearch/upgrades/IndexingIT.java | 1 + .../replication/SegmentReplicationBaseIT.java | 12 +++++++++++- 
.../indices/replication/SegmentReplicationIT.java | 6 ------ .../replication/OngoingSegmentReplications.java | 6 +----- .../replication/SegmentReplicationSourceHandler.java | 12 +++++++++++- .../replication/SegmentReplicationTargetService.java | 5 +++++ .../SegmentReplicationSourceHandlerTests.java | 7 +++++-- 7 files changed, 34 insertions(+), 15 deletions(-) diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java index 93c0bc96a5183..b60ee09d39048 100644 --- a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java +++ b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java @@ -247,6 +247,7 @@ public void testIndexing() throws IOException, ParseException { * * @throws Exception */ + @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/8322") public void testIndexingWithSegRep() throws Exception { if (UPGRADE_FROM_VERSION.before(Version.V_2_4_0)) { logger.info("--> Skip test for version {} where segment replication feature is not available", UPGRADE_FROM_VERSION); diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java index 8fef88cbe6820..4692210ccc577 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java @@ -19,6 +19,7 @@ import org.opensearch.common.Nullable; import org.opensearch.common.settings.Settings; import org.opensearch.common.lease.Releasable; +import org.opensearch.common.util.FeatureFlags; import org.opensearch.index.Index; import org.opensearch.index.IndexModule; import org.opensearch.index.IndexService; @@ -222,6 +223,11 @@ protected IndexShard getIndexShard(String node, String indexName) { return indexService.getShard(shardId.get()); } + protected boolean segmentReplicationWithRemoteEnabled() { + return IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexSettings()).booleanValue() + && "true".equalsIgnoreCase(featureFlagSettings().get(FeatureFlags.SEGMENT_REPLICATION_EXPERIMENTAL)); + } + protected Releasable blockReplication(List nodes, CountDownLatch latch) { CountDownLatch pauseReplicationLatch = new CountDownLatch(nodes.size()); for (String node : nodes) { @@ -231,7 +237,11 @@ protected Releasable blockReplication(List nodes, CountDownLatch latch) node )); mockTargetTransportService.addSendBehavior((connection, requestId, action, request, options) -> { - if (action.equals(SegmentReplicationSourceService.Actions.UPDATE_VISIBLE_CHECKPOINT)) { + String actionToWaitFor = SegmentReplicationSourceService.Actions.GET_SEGMENT_FILES; + if (segmentReplicationWithRemoteEnabled()) { + actionToWaitFor = SegmentReplicationSourceService.Actions.UPDATE_VISIBLE_CHECKPOINT; + } + if (action.equals(actionToWaitFor)) { try { latch.countDown(); pauseReplicationLatch.await(); diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 873c05843fb56..099c15267c2f7 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ 
b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -49,7 +49,6 @@ import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; import org.opensearch.common.lease.Releasable; -import org.opensearch.common.util.FeatureFlags; import org.opensearch.index.IndexModule; import org.opensearch.index.SegmentReplicationPerGroupStats; import org.opensearch.index.SegmentReplicationPressureService; @@ -1324,9 +1323,4 @@ public void testPrimaryReceivesDocsDuringReplicaRecovery() throws Exception { ensureGreen(INDEX_NAME); waitForSearchableDocs(2, nodes); } - - private boolean segmentReplicationWithRemoteEnabled() { - return IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexSettings()).booleanValue() - && "true".equalsIgnoreCase(featureFlagSettings().get(FeatureFlags.SEGMENT_REPLICATION_EXPERIMENTAL)); - } } diff --git a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java index 050a66bedcf5d..f32887175d4f3 100644 --- a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java +++ b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java @@ -120,11 +120,7 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener listener) { + // Short circuit when no files to transfer + if (request.getFilesToFetch().isEmpty()) { + // before completion, alert the primary of the replica's state. + shard.updateVisibleCheckpointForShard(request.getTargetAllocationId(), copyState.getCheckpoint()); + listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList())); + return; + } + final ReplicationTimer timer = new ReplicationTimer(); if (isReplicating.compareAndSet(false, true) == false) { throw new OpenSearchException("Replication to {} is already running.", shard.shardId()); @@ -159,10 +168,11 @@ public synchronized void sendFiles(GetSegmentFilesRequest request, ActionListene sendFileStep.whenComplete(r -> { try { + shard.updateVisibleCheckpointForShard(allocationId, copyState.getCheckpoint()); future.onResponse(new GetSegmentFilesResponse(List.of(storeFileMetadata))); + timer.stop(); } finally { IOUtils.close(resources); - timer.stop(); logger.trace( "[replication id {}] Source node completed sending files to target node [{}], timing: {}", request.getReplicationId(), diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index ac93412eef725..467f499056345 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -292,6 +292,11 @@ public void onReplicationFailure( } protected void updateVisibleCheckpoint(long replicationId, IndexShard replicaShard) { + // Update replication checkpoint on source via transport call only supported for remote store integration. 
For node- + // node communication, checkpoint update is piggy-backed to GET_SEGMENT_FILES transport call + if (replicaShard.indexSettings().isRemoteStoreEnabled() == false) { + return; + } ShardRouting primaryShard = clusterService.state().routingTable().shardRoutingTable(replicaShard.shardId()).primaryShard(); final UpdateVisibleCheckpointRequest request = new UpdateVisibleCheckpointRequest( diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java index 607f9dd91e35e..b4e9166f377ec 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java @@ -196,11 +196,13 @@ public void testReplicationAlreadyRunning() throws IOException { 1 ); + final List expectedFiles = List.of(new StoreFileMetadata("_0.si", 20, "test", Version.CURRENT.luceneVersion)); + final GetSegmentFilesRequest getSegmentFilesRequest = new GetSegmentFilesRequest( 1L, replica.routingEntry().allocationId().getId(), replicaDiscoveryNode, - Collections.emptyList(), + expectedFiles, latestReplicationCheckpoint ); @@ -224,11 +226,12 @@ public void testCancelReplication() throws IOException, InterruptedException { 1 ); + final List expectedFiles = List.of(new StoreFileMetadata("_0.si", 20, "test", Version.CURRENT.luceneVersion)); final GetSegmentFilesRequest getSegmentFilesRequest = new GetSegmentFilesRequest( 1L, replica.routingEntry().allocationId().getId(), replicaDiscoveryNode, - Collections.emptyList(), + expectedFiles, latestReplicationCheckpoint ); From 2004ba09eec681175a94f289bb472399db08aa86 Mon Sep 17 00:00:00 2001 From: Mingshi Liu <113382730+mingshl@users.noreply.github.com> Date: Tue, 11 Jul 2023 11:00:20 -0700 Subject: [PATCH 7/7] [Search pipelines] Add Global Ignore_failure options for Processors (#8373) * Add Global Ignore_failure options for Processors Signed-off-by: Mingshi Liu * add changelog Signed-off-by: Mingshi Liu * Add ignore_failure to 40_rename_response Signed-off-by: Mingshi Liu * Change Boolean to boolean and refactor AbstractProcessor Signed-off-by: Mingshi Liu * rename to isIgnoreFailure and add tests Signed-off-by: Mingshi Liu * rename to isIgnoreFailure and add tests Signed-off-by: Mingshi Liu * add ignoreFailure to runSearchPhaseResultsTransformer Signed-off-by: Mingshi Liu * fix filter query and change log warn message Signed-off-by: Mingshi Liu * Add test on matching each processor stat Signed-off-by: Mingshi Liu * Add test on matching each processor stat Signed-off-by: Mingshi Liu * remove extra spaces and words Signed-off-by: Mingshi Liu * use IGNORE_FAILURE_KEY Signed-off-by: Mingshi Liu --------- Signed-off-by: Mingshi Liu --- CHANGELOG.md | 1 + .../common/FilterQueryRequestProcessor.java | 13 +- .../common/RenameFieldResponseProcessor.java | 24 +- .../common/ScriptRequestProcessor.java | 8 +- .../FilterQueryRequestProcessorTests.java | 6 +- .../RenameFieldResponseProcessorTests.java | 7 +- .../common/ScriptRequestProcessorTests.java | 11 +- .../search_pipeline/40_rename_response.yml | 33 +- .../opensearch/ingest/ConfigurationUtils.java | 8 +- .../search/pipeline}/AbstractProcessor.java | 15 +- .../opensearch/search/pipeline/Pipeline.java | 50 ++- .../search/pipeline/PipelineWithMetrics.java | 6 +- .../opensearch/search/pipeline/Processor.java | 6 +
.../pipeline/SearchPipelineServiceTests.java | 379 +++++++++++++----- 14 files changed, 440 insertions(+), 127 deletions(-) rename {modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common => server/src/main/java/org/opensearch/search/pipeline}/AbstractProcessor.java (60%) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5f0d733bd7c55..0de0d823aa764 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -101,6 +101,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Adds mock implementation for TelemetryPlugin ([#7545](https://github.com/opensearch-project/OpenSearch/issues/7545)) - Support transport action names when registering NamedRoutes ([#7957](https://github.com/opensearch-project/OpenSearch/pull/7957)) - Create concept of persistent ThreadContext headers that are unstashable ([#8291]()https://github.com/opensearch-project/OpenSearch/pull/8291) +- [Search pipelines] Add Global Ignore_failure options for Processors ([#8373](https://github.com/opensearch-project/OpenSearch/pull/8373)) - Enable Partial Flat Object ([#7997](https://github.com/opensearch-project/OpenSearch/pull/7997)) - Add jdk.incubator.vector module support for JDK 20+ ([#8601](https://github.com/opensearch-project/OpenSearch/pull/8601)) diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessor.java b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessor.java index d8862aa59cede..c776181a64b26 100644 --- a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessor.java +++ b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessor.java @@ -21,6 +21,7 @@ import org.opensearch.ingest.ConfigurationUtils; import org.opensearch.search.builder.SearchSourceBuilder; import org.opensearch.search.pipeline.Processor; +import org.opensearch.search.pipeline.AbstractProcessor; import org.opensearch.search.pipeline.SearchRequestProcessor; import java.io.InputStream; @@ -53,12 +54,13 @@ public String getType() { /** * Constructor that takes a filter query. 
* - * @param tag processor tag - * @param description processor description + * @param tag processor tag + * @param description processor description + * @param ignoreFailure option to ignore failure * @param filterQuery the query that will be added as a filter to incoming queries */ - public FilterQueryRequestProcessor(String tag, String description, QueryBuilder filterQuery) { - super(tag, description); + FilterQueryRequestProcessor(String tag, String description, boolean ignoreFailure, QueryBuilder filterQuery) { + super(tag, description, ignoreFailure); this.filterQuery = filterQuery; } @@ -101,6 +103,7 @@ public FilterQueryRequestProcessor create( Map> processorFactories, String tag, String description, + boolean ignoreFailure, Map config, PipelineContext pipelineContext ) throws Exception { @@ -114,7 +117,7 @@ public FilterQueryRequestProcessor create( XContentParser parser = XContentType.JSON.xContent() .createParser(namedXContentRegistry, LoggingDeprecationHandler.INSTANCE, stream) ) { - return new FilterQueryRequestProcessor(tag, description, parseInnerQueryBuilder(parser)); + return new FilterQueryRequestProcessor(tag, description, ignoreFailure, parseInnerQueryBuilder(parser)); } } } diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessor.java b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessor.java index c8b3c06a71562..c410895304f76 100644 --- a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessor.java +++ b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessor.java @@ -19,6 +19,7 @@ import org.opensearch.ingest.ConfigurationUtils; import org.opensearch.search.SearchHit; import org.opensearch.search.pipeline.Processor; +import org.opensearch.search.pipeline.AbstractProcessor; import org.opensearch.search.pipeline.SearchRequestProcessor; import org.opensearch.search.pipeline.SearchResponseProcessor; @@ -41,14 +42,22 @@ public class RenameFieldResponseProcessor extends AbstractProcessor implements S /** * Constructor that takes a target field to rename and the new name * - * @param tag processor tag - * @param description processor description - * @param oldField name of field to be renamed - * @param newField name of field that will replace the old field + * @param tag processor tag + * @param description processor description + * @param ignoreFailure option to ignore failure + * @param oldField name of field to be renamed + * @param newField name of field that will replace the old field * @param ignoreMissing if true, do not throw error if oldField does not exist within search response */ - public RenameFieldResponseProcessor(String tag, String description, String oldField, String newField, boolean ignoreMissing) { - super(tag, description); + public RenameFieldResponseProcessor( + String tag, + String description, + boolean ignoreFailure, + String oldField, + String newField, + boolean ignoreMissing + ) { + super(tag, description, ignoreFailure); this.oldField = oldField; this.newField = newField; this.ignoreMissing = ignoreMissing; @@ -140,13 +149,14 @@ public RenameFieldResponseProcessor create( Map> processorFactories, String tag, String description, + boolean ignoreFailure, Map config, PipelineContext pipelineContext ) throws Exception { String oldField = ConfigurationUtils.readStringProperty(TYPE, tag, config, "field"); String 
newField = ConfigurationUtils.readStringProperty(TYPE, tag, config, "target_field"); boolean ignoreMissing = ConfigurationUtils.readBooleanProperty(TYPE, tag, config, "ignore_missing", false); - return new RenameFieldResponseProcessor(tag, description, oldField, newField, ignoreMissing); + return new RenameFieldResponseProcessor(tag, description, ignoreFailure, oldField, newField, ignoreMissing); } } } diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/ScriptRequestProcessor.java b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/ScriptRequestProcessor.java index 43ab3d4622d6b..e100fe0e3edce 100644 --- a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/ScriptRequestProcessor.java +++ b/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/ScriptRequestProcessor.java @@ -25,6 +25,7 @@ import org.opensearch.script.ScriptType; import org.opensearch.script.SearchScript; import org.opensearch.search.pipeline.Processor; +import org.opensearch.search.pipeline.AbstractProcessor; import org.opensearch.search.pipeline.SearchRequestProcessor; import org.opensearch.search.pipeline.common.helpers.SearchRequestMap; @@ -54,6 +55,7 @@ public final class ScriptRequestProcessor extends AbstractProcessor implements S * * @param tag The processor's tag. * @param description The processor's description. + * @param ignoreFailure The option to ignore failure * @param script The {@link Script} to execute. * @param precompiledSearchScript The {@link Script} precompiled * @param scriptService The {@link ScriptService} used to execute the script. @@ -61,11 +63,12 @@ public final class ScriptRequestProcessor extends AbstractProcessor implements S ScriptRequestProcessor( String tag, String description, + boolean ignoreFailure, Script script, @Nullable SearchScript precompiledSearchScript, ScriptService scriptService ) { - super(tag, description); + super(tag, description, ignoreFailure); this.script = script; this.precompiledSearchScript = precompiledSearchScript; this.scriptService = scriptService; @@ -146,6 +149,7 @@ public ScriptRequestProcessor create( Map> registry, String processorTag, String description, + boolean ignoreFailure, Map config, PipelineContext pipelineContext ) throws Exception { @@ -174,7 +178,7 @@ public ScriptRequestProcessor create( } catch (ScriptException e) { throw newConfigurationException(TYPE, processorTag, null, e); } - return new ScriptRequestProcessor(processorTag, description, script, searchScript, scriptService); + return new ScriptRequestProcessor(processorTag, description, ignoreFailure, script, searchScript, scriptService); } } } diff --git a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessorTests.java b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessorTests.java index ecf746af556a2..576660d05b96d 100644 --- a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessorTests.java +++ b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/FilterQueryRequestProcessorTests.java @@ -23,7 +23,7 @@ public class FilterQueryRequestProcessorTests extends AbstractBuilderTestCase { public void testFilterQuery() throws Exception { QueryBuilder filterQuery = new TermQueryBuilder("field", "value"); - FilterQueryRequestProcessor filterQueryRequestProcessor = 
new FilterQueryRequestProcessor(null, null, filterQuery); + FilterQueryRequestProcessor filterQueryRequestProcessor = new FilterQueryRequestProcessor(null, null, false, filterQuery); QueryBuilder incomingQuery = new TermQueryBuilder("text", "foo"); SearchSourceBuilder source = new SearchSourceBuilder().query(incomingQuery); SearchRequest request = new SearchRequest().source(source); @@ -39,13 +39,13 @@ public void testFilterQuery() throws Exception { public void testFactory() throws Exception { FilterQueryRequestProcessor.Factory factory = new FilterQueryRequestProcessor.Factory(this.xContentRegistry()); Map configMap = new HashMap<>(Map.of("query", Map.of("term", Map.of("field", "value")))); - FilterQueryRequestProcessor processor = factory.create(Collections.emptyMap(), null, null, configMap, null); + FilterQueryRequestProcessor processor = factory.create(Collections.emptyMap(), null, null, false, configMap, null); assertEquals(new TermQueryBuilder("field", "value"), processor.filterQuery); // Missing "query" parameter: expectThrows( IllegalArgumentException.class, - () -> factory.create(Collections.emptyMap(), null, null, Collections.emptyMap(), null) + () -> factory.create(Collections.emptyMap(), null, null, false, Collections.emptyMap(), null) ); } } diff --git a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessorTests.java b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessorTests.java index 7f3a2acfbdc08..b7d1510d065ae 100644 --- a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessorTests.java +++ b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/RenameFieldResponseProcessorTests.java @@ -58,6 +58,7 @@ public void testRenameResponse() throws Exception { RenameFieldResponseProcessor renameFieldResponseProcessor = new RenameFieldResponseProcessor( null, null, + false, "field 0", "new field", false @@ -74,6 +75,7 @@ public void testRenameResponseWithMapping() throws Exception { RenameFieldResponseProcessor renameFieldResponseProcessor = new RenameFieldResponseProcessor( null, null, + false, "field 0", "new field", true @@ -97,6 +99,7 @@ public void testMissingField() throws Exception { RenameFieldResponseProcessor renameFieldResponseProcessor = new RenameFieldResponseProcessor( null, null, + false, "field", "new field", false @@ -115,7 +118,7 @@ public void testFactory() throws Exception { config.put("target_field", newField); RenameFieldResponseProcessor.Factory factory = new RenameFieldResponseProcessor.Factory(); - RenameFieldResponseProcessor processor = factory.create(Collections.emptyMap(), null, null, config, null); + RenameFieldResponseProcessor processor = factory.create(Collections.emptyMap(), null, null, false, config, null); assertEquals(processor.getType(), "rename_field"); assertEquals(processor.getOldField(), oldField); assertEquals(processor.getNewField(), newField); @@ -123,7 +126,7 @@ public void testFactory() throws Exception { expectThrows( OpenSearchParseException.class, - () -> factory.create(Collections.emptyMap(), null, null, Collections.emptyMap(), null) + () -> factory.create(Collections.emptyMap(), null, null, false, Collections.emptyMap(), null) ); } } diff --git a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/ScriptRequestProcessorTests.java 
b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/ScriptRequestProcessorTests.java index 2fb3b2345e7e2..df383e778c7ba 100644 --- a/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/ScriptRequestProcessorTests.java +++ b/modules/search-pipeline-common/src/test/java/org/opensearch/search/pipeline/common/ScriptRequestProcessorTests.java @@ -82,7 +82,7 @@ public void setupScripting() { } public void testScriptingWithoutPrecompiledScriptFactory() throws Exception { - ScriptRequestProcessor processor = new ScriptRequestProcessor(randomAlphaOfLength(10), null, script, null, scriptService); + ScriptRequestProcessor processor = new ScriptRequestProcessor(randomAlphaOfLength(10), null, false, script, null, scriptService); SearchRequest searchRequest = new SearchRequest(); searchRequest.source(createSearchSourceBuilder()); @@ -92,7 +92,14 @@ public void testScriptingWithoutPrecompiledScriptFactory() throws Exception { } public void testScriptingWithPrecompiledIngestScript() throws Exception { - ScriptRequestProcessor processor = new ScriptRequestProcessor(randomAlphaOfLength(10), null, script, searchScript, scriptService); + ScriptRequestProcessor processor = new ScriptRequestProcessor( + randomAlphaOfLength(10), + null, + false, + script, + searchScript, + scriptService + ); SearchRequest searchRequest = new SearchRequest(); searchRequest.source(createSearchSourceBuilder()); diff --git a/modules/search-pipeline-common/src/yamlRestTest/resources/rest-api-spec/test/search_pipeline/40_rename_response.yml b/modules/search-pipeline-common/src/yamlRestTest/resources/rest-api-spec/test/search_pipeline/40_rename_response.yml index 3b705f9bd5356..0528440f7584d 100644 --- a/modules/search-pipeline-common/src/yamlRestTest/resources/rest-api-spec/test/search_pipeline/40_rename_response.yml +++ b/modules/search-pipeline-common/src/yamlRestTest/resources/rest-api-spec/test/search_pipeline/40_rename_response.yml @@ -63,6 +63,26 @@ teardown: } - match: { acknowledged: true } + - do: + search_pipeline.put: + id: "my_pipeline_4" + body: > + { + "description": "test pipeline with ignore missing false and ignore failure true", + "response_processors": [ + { + "rename_field": + { + "field": "aa", + "target_field": "b", + "ignore_missing": false, + "ignore_failure": true + } + } + ] + } + - match: { acknowledged: true } + - do: indices.create: index: test @@ -119,8 +139,8 @@ teardown: - match: { hits.total.value: 1 } - match: {hits.hits.0._source: { "a": "foo" } } - # Pipeline with ignore_missing set to true - # Should still pass even though index does not contain field + # Pipeline with ignore_missing set to false + # Should throw illegal_argument_exception - do: catch: bad_request search: @@ -128,6 +148,15 @@ teardown: search_pipeline: "my_pipeline_3" - match: { error.type: "illegal_argument_exception" } + # Pipeline with ignore_missing set to false and ignore_failure set to true + # Should return while catching error + - do: + search: + index: test + search_pipeline: "my_pipeline_4" + - match: { hits.total.value: 1 } + - match: {hits.hits.0._source: { "a": "foo" } } + # No source, using stored_fields - do: search: diff --git a/server/src/main/java/org/opensearch/ingest/ConfigurationUtils.java b/server/src/main/java/org/opensearch/ingest/ConfigurationUtils.java index dc41d1985fe42..b212e67a59434 100644 --- a/server/src/main/java/org/opensearch/ingest/ConfigurationUtils.java +++ b/server/src/main/java/org/opensearch/ingest/ConfigurationUtils.java @@ 
-67,6 +67,7 @@ public final class ConfigurationUtils { public static final String TAG_KEY = "tag"; public static final String DESCRIPTION_KEY = "description"; + public static final String IGNORE_FAILURE_KEY = "ignore_failure"; private ConfigurationUtils() {} @@ -194,7 +195,7 @@ public static String readOptionalStringOrIntProperty( return readStringOrInt(processorType, processorTag, propertyName, value); } - public static Boolean readBooleanProperty( + public static boolean readBooleanProperty( String processorType, String processorTag, Map configuration, @@ -214,7 +215,7 @@ private static Boolean readBoolean(String processorType, String processorTag, St return null; } if (value instanceof Boolean) { - return (Boolean) value; + return (boolean) value; } throw newConfigurationException( processorType, @@ -530,10 +531,11 @@ public static Processor readProcessor( ) throws Exception { String tag = ConfigurationUtils.readOptionalStringProperty(null, null, config, TAG_KEY); String description = ConfigurationUtils.readOptionalStringProperty(null, tag, config, DESCRIPTION_KEY); + boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(null, null, config, IGNORE_FAILURE_KEY, false); Script conditionalScript = extractConditional(config); Processor.Factory factory = processorFactories.get(type); + if (factory != null) { - boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(null, null, config, "ignore_failure", false); List> onFailureProcessorConfigs = ConfigurationUtils.readOptionalList( null, null, diff --git a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/AbstractProcessor.java b/server/src/main/java/org/opensearch/search/pipeline/AbstractProcessor.java similarity index 60% rename from modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/AbstractProcessor.java rename to server/src/main/java/org/opensearch/search/pipeline/AbstractProcessor.java index e62497cb54db5..d197051c5952b 100644 --- a/modules/search-pipeline-common/src/main/java/org/opensearch/search/pipeline/common/AbstractProcessor.java +++ b/server/src/main/java/org/opensearch/search/pipeline/AbstractProcessor.java @@ -6,20 +6,20 @@ * compatible open source license. */ -package org.opensearch.search.pipeline.common; - -import org.opensearch.search.pipeline.Processor; +package org.opensearch.search.pipeline; /** * Base class for common processor behavior. 
*/ -abstract class AbstractProcessor implements Processor { +public abstract class AbstractProcessor implements Processor { private final String tag; private final String description; + private final boolean ignoreFailure; - protected AbstractProcessor(String tag, String description) { + protected AbstractProcessor(String tag, String description, boolean ignoreFailure) { this.tag = tag; this.description = description; + this.ignoreFailure = ignoreFailure; } @Override @@ -31,4 +31,9 @@ public String getTag() { public String getDescription() { return description; } + + @Override + public boolean isIgnoreFailure() { + return ignoreFailure; + } } diff --git a/server/src/main/java/org/opensearch/search/pipeline/Pipeline.java b/server/src/main/java/org/opensearch/search/pipeline/Pipeline.java index 92826eee5a4f4..5ada11b59556b 100644 --- a/server/src/main/java/org/opensearch/search/pipeline/Pipeline.java +++ b/server/src/main/java/org/opensearch/search/pipeline/Pipeline.java @@ -8,6 +8,8 @@ package org.opensearch.search.pipeline; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.action.search.SearchPhaseContext; import org.opensearch.action.search.SearchPhaseResults; import org.opensearch.action.search.SearchRequest; @@ -32,6 +34,9 @@ class Pipeline { public static final String REQUEST_PROCESSORS_KEY = "request_processors"; public static final String RESPONSE_PROCESSORS_KEY = "response_processors"; public static final String PHASE_PROCESSORS_KEY = "phase_results_processors"; + + private static final Logger logger = LogManager.getLogger(Pipeline.class); + private final String id; private final String description; private final Integer version; @@ -132,7 +137,18 @@ SearchRequest transformRequest(SearchRequest request) throws SearchPipelineProce request = processor.processRequest(request); } catch (Exception e) { onRequestProcessorFailed(processor); - throw e; + if (processor.isIgnoreFailure()) { + logger.warn( + "The exception from request processor [" + + processor.getType() + + "] in the search pipeline [" + + id + + "] was ignored", + e + ); + } else { + throw e; + } } finally { long took = TimeUnit.NANOSECONDS.toMillis(relativeTimeSupplier.getAsLong() - start); afterRequestProcessor(processor, took); @@ -161,7 +177,18 @@ SearchResponse transformResponse(SearchRequest request, SearchResponse response) response = processor.processResponse(request, response); } catch (Exception e) { onResponseProcessorFailed(processor); - throw e; + if (processor.isIgnoreFailure()) { + logger.warn( + "The exception from response processor [" + + processor.getType() + + "] in the search pipeline [" + + id + + "] was ignored", + e + ); + } else { + throw e; + } } finally { long took = TimeUnit.NANOSECONDS.toMillis(relativeTimeSupplier.getAsLong() - start); afterResponseProcessor(processor, took); @@ -184,12 +211,27 @@ void runSearchPhaseResultsTransformer( String currentPhase, String nextPhase ) throws SearchPipelineProcessingException { - try { for (SearchPhaseResultsProcessor searchPhaseResultsProcessor : searchPhaseResultsProcessors) { if (currentPhase.equals(searchPhaseResultsProcessor.getBeforePhase().getName()) && nextPhase.equals(searchPhaseResultsProcessor.getAfterPhase().getName())) { - searchPhaseResultsProcessor.process(searchPhaseResult, context); + try { + searchPhaseResultsProcessor.process(searchPhaseResult, context); + } catch (Exception e) { + if (searchPhaseResultsProcessor.isIgnoreFailure()) { + logger.warn( + "The exception from search 
phase results processor [" + + searchPhaseResultsProcessor.getType() + + "] in the search pipeline [" + + id + + "] was ignored", + e + ); + } else { + throw e; + } + } + } } } catch (RuntimeException e) { diff --git a/server/src/main/java/org/opensearch/search/pipeline/PipelineWithMetrics.java b/server/src/main/java/org/opensearch/search/pipeline/PipelineWithMetrics.java index 060894a37e5ed..3e473f070d9c6 100644 --- a/server/src/main/java/org/opensearch/search/pipeline/PipelineWithMetrics.java +++ b/server/src/main/java/org/opensearch/search/pipeline/PipelineWithMetrics.java @@ -22,6 +22,7 @@ import java.util.function.LongSupplier; import static org.opensearch.ingest.ConfigurationUtils.TAG_KEY; +import static org.opensearch.ingest.ConfigurationUtils.IGNORE_FAILURE_KEY; import static org.opensearch.ingest.Pipeline.DESCRIPTION_KEY; import static org.opensearch.ingest.Pipeline.VERSION_KEY; @@ -150,8 +151,11 @@ private static List readProcessors( } Map config = (Map) entry.getValue(); String tag = ConfigurationUtils.readOptionalStringProperty(null, null, config, TAG_KEY); + boolean ignoreFailure = ConfigurationUtils.readBooleanProperty(null, null, config, IGNORE_FAILURE_KEY, false); String description = ConfigurationUtils.readOptionalStringProperty(null, tag, config, DESCRIPTION_KEY); - processors.add(processorFactories.get(type).create(processorFactories, tag, description, config, pipelineContext)); + processors.add( + processorFactories.get(type).create(processorFactories, tag, description, ignoreFailure, config, pipelineContext) + ); if (config.isEmpty() == false) { String processorName = type; if (tag != null) { diff --git a/server/src/main/java/org/opensearch/search/pipeline/Processor.java b/server/src/main/java/org/opensearch/search/pipeline/Processor.java index cc96132479c74..fb33f46acada4 100644 --- a/server/src/main/java/org/opensearch/search/pipeline/Processor.java +++ b/server/src/main/java/org/opensearch/search/pipeline/Processor.java @@ -43,6 +43,11 @@ public interface Processor { */ String getDescription(); + /** + * Gets the setting of ignoreFailure of a processor. + */ + boolean isIgnoreFailure(); + /** * A factory that knows how to construct a processor based on a map of maps. 
*/ @@ -63,6 +68,7 @@ T create( Map> processorFactories, String tag, String description, + boolean ignoreFailure, Map config, PipelineContext pipelineContext ) throws Exception; diff --git a/server/src/test/java/org/opensearch/search/pipeline/SearchPipelineServiceTests.java b/server/src/test/java/org/opensearch/search/pipeline/SearchPipelineServiceTests.java index 84f39e4bdab42..0103d4c677b00 100644 --- a/server/src/test/java/org/opensearch/search/pipeline/SearchPipelineServiceTests.java +++ b/server/src/test/java/org/opensearch/search/pipeline/SearchPipelineServiceTests.java @@ -78,16 +78,16 @@ public class SearchPipelineServiceTests extends OpenSearchTestCase { private static final SearchPipelinePlugin DUMMY_PLUGIN = new SearchPipelinePlugin() { @Override public Map> getRequestProcessors(Parameters parameters) { - return Map.of("foo", (factories, tag, description, config, ctx) -> null); + return Map.of("foo", (factories, tag, description, ignoreFailure, config, ctx) -> null); } public Map> getResponseProcessors(Parameters parameters) { - return Map.of("bar", (factories, tag, description, config, ctx) -> null); + return Map.of("bar", (factories, tag, description, ignoreFailure, config, ctx) -> null); } @Override public Map> getSearchPhaseResultsProcessors(Parameters parameters) { - return Map.of("zoe", (factories, tag, description, config, ctx) -> null); + return Map.of("zoe", (factories, tag, description, ignoreFailure, config, ctx) -> null); } }; @@ -208,39 +208,15 @@ public void testResolveIndexDefaultPipeline() throws Exception { assertEquals(5, pipelinedRequest.source().size()); } - private static abstract class FakeProcessor implements Processor { - private final String type; - private final String tag; - private final String description; - - protected FakeProcessor(String type, String tag, String description) { - this.type = type; - this.tag = tag; - this.description = description; - } - - @Override - public String getType() { - return type; - } - - @Override - public String getTag() { - return tag; - } - - @Override - public String getDescription() { - return description; - } - } - - private static class FakeRequestProcessor extends FakeProcessor implements SearchRequestProcessor { + private static class FakeRequestProcessor extends AbstractProcessor implements SearchRequestProcessor { private final Consumer executor; - public FakeRequestProcessor(String type, String tag, String description, Consumer executor) { - super(type, tag, description); + private final String type; + + public FakeRequestProcessor(String type, String tag, String description, boolean ignoreFailure, Consumer executor) { + super(tag, description, ignoreFailure); this.executor = executor; + this.type = type; } @Override @@ -248,14 +224,30 @@ public SearchRequest processRequest(SearchRequest request) throws Exception { executor.accept(request); return request; } + + /* + * Gets the type of processor + */ + @Override + public String getType() { + return this.type; + } } - private static class FakeResponseProcessor extends FakeProcessor implements SearchResponseProcessor { + private static class FakeResponseProcessor extends AbstractProcessor implements SearchResponseProcessor { private final Consumer executor; + private String type; - public FakeResponseProcessor(String type, String tag, String description, Consumer executor) { - super(type, tag, description); + public FakeResponseProcessor( + String type, + String tag, + String description, + boolean ignoreFailure, + Consumer executor + ) { + super(tag, 
description, ignoreFailure); this.executor = executor; + this.type = type; } @Override @@ -263,19 +255,31 @@ public SearchResponse processResponse(SearchRequest request, SearchResponse resp executor.accept(response); return response; } + + /** + * Gets the type of processor + */ + @Override + public String getType() { + return this.type; + } } - private static class FakeSearchPhaseResultsProcessor extends FakeProcessor implements SearchPhaseResultsProcessor { + private static class FakeSearchPhaseResultsProcessor extends AbstractProcessor implements SearchPhaseResultsProcessor { private Consumer querySearchResultConsumer; + private String type; + public FakeSearchPhaseResultsProcessor( String type, String tag, String description, + boolean ignoreFailure, Consumer querySearchResultConsumer ) { - super(type, tag, description); + super(tag, description, ignoreFailure); this.querySearchResultConsumer = querySearchResultConsumer; + this.type = type; } @Override @@ -297,30 +301,45 @@ public SearchPhaseName getBeforePhase() { public SearchPhaseName getAfterPhase() { return SearchPhaseName.FETCH; } + + /** + * Gets the type of processor + */ + @Override + public String getType() { + return this.type; + } } private SearchPipelineService createWithProcessors() { Map> requestProcessors = new HashMap<>(); - requestProcessors.put("scale_request_size", (processorFactories, tag, description, config, ctx) -> { + requestProcessors.put("scale_request_size", (processorFactories, tag, description, ignoreFailure, config, ctx) -> { float scale = ((Number) config.remove("scale")).floatValue(); return new FakeRequestProcessor( "scale_request_size", tag, description, + ignoreFailure, req -> req.source().size((int) (req.source().size() * scale)) ); }); Map> responseProcessors = new HashMap<>(); - responseProcessors.put("fixed_score", (processorFactories, tag, description, config, ctx) -> { + responseProcessors.put("fixed_score", (processorFactories, tag, description, ignoreFailure, config, ctx) -> { float score = ((Number) config.remove("score")).floatValue(); - return new FakeResponseProcessor("fixed_score", tag, description, rsp -> rsp.getHits().forEach(h -> h.score(score))); + return new FakeResponseProcessor( + "fixed_score", + tag, + description, + ignoreFailure, + rsp -> rsp.getHits().forEach(h -> h.score(score)) + ); }); Map> searchPhaseProcessors = new HashMap<>(); - searchPhaseProcessors.put("max_score", (processorFactories, tag, description, config, context) -> { + searchPhaseProcessors.put("max_score", (processorFactories, tag, description, ignoreFailure, config, context) -> { final float finalScore = config.containsKey("score") ? 
((Number) config.remove("score")).floatValue() : 100f; final Consumer querySearchResultConsumer = (result) -> result.queryResult().topDocs().maxScore = finalScore; - return new FakeSearchPhaseResultsProcessor("max_score", tag, description, querySearchResultConsumer); + return new FakeSearchPhaseResultsProcessor("max_score", tag, description, ignoreFailure, querySearchResultConsumer); }); return createWithProcessors(requestProcessors, responseProcessors, searchPhaseProcessors); @@ -893,7 +912,7 @@ public void testInfo() { } public void testExceptionOnPipelineCreation() { - Map> badFactory = Map.of("bad_factory", (pf, t, f, c, ctx) -> { + Map> badFactory = Map.of("bad_factory", (pf, t, i, f, c, ctx) -> { throw new RuntimeException(); }); SearchPipelineService searchPipelineService = createWithProcessors(badFactory, Collections.emptyMap(), Collections.emptyMap()); @@ -910,12 +929,12 @@ public void testExceptionOnPipelineCreation() { } public void testExceptionOnRequestProcessing() { - SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", null, null, r -> { + SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", null, null, false, r -> { throw new RuntimeException(); }); Map> throwingRequestProcessorFactory = Map.of( "throwing_request", - (pf, t, f, c, ctx) -> throwingRequestProcessor + (pf, t, i, f, c, ctx) -> throwingRequestProcessor ); SearchPipelineService searchPipelineService = createWithProcessors( @@ -935,12 +954,12 @@ public void testExceptionOnRequestProcessing() { } public void testExceptionOnResponseProcessing() throws Exception { - SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", null, null, r -> { + SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", null, null, false, r -> { throw new RuntimeException(); }); Map> throwingResponseProcessorFactory = Map.of( "throwing_response", - (pf, t, f, c, ctx) -> throwingResponseProcessor + (pf, t, i, f, c, ctx) -> throwingResponseProcessor ); SearchPipelineService searchPipelineService = createWithProcessors( @@ -962,61 +981,103 @@ public void testExceptionOnResponseProcessing() throws Exception { expectThrows(SearchPipelineProcessingException.class, () -> pipelinedRequest.transformResponse(response)); } + public void testCatchExceptionOnRequestProcessing() throws IllegalAccessException { + SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", null, null, true, r -> { + throw new RuntimeException(); + }); + Map> throwingRequestProcessorFactory = Map.of( + "throwing_request", + (pf, t, i, f, c, ctx) -> throwingRequestProcessor + ); + + SearchPipelineService searchPipelineService = createWithProcessors( + throwingRequestProcessorFactory, + Collections.emptyMap(), + Collections.emptyMap() + ); + + Map pipelineSourceMap = new HashMap<>(); + pipelineSourceMap.put(Pipeline.REQUEST_PROCESSORS_KEY, List.of(Map.of("throwing_request", Collections.emptyMap()))); + + SearchSourceBuilder sourceBuilder = SearchSourceBuilder.searchSource().searchPipelineSource(pipelineSourceMap); + SearchRequest searchRequest = new SearchRequest().source(sourceBuilder); + + // Caught Exception thrown when processing the request and produced warn level logging message + try (MockLogAppender mockAppender = MockLogAppender.createForLoggers(LogManager.getLogger(Pipeline.class))) { + mockAppender.addExpectation( + new MockLogAppender.SeenEventExpectation( + 
"test1", + Pipeline.class.getCanonicalName(), + Level.WARN, + "The exception from request processor [throwing_request] in the search pipeline [_ad_hoc_pipeline] was ignored" + ) + ); + PipelinedRequest pipelinedRequest = searchPipelineService.resolvePipeline(searchRequest); + mockAppender.assertAllExpectationsMatched(); + } + } + + public void testCatchExceptionOnResponseProcessing() throws Exception { + SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", null, null, true, r -> { + throw new RuntimeException(); + }); + Map> throwingResponseProcessorFactory = Map.of( + "throwing_response", + (pf, t, i, f, c, ctx) -> throwingResponseProcessor + ); + + SearchPipelineService searchPipelineService = createWithProcessors( + Collections.emptyMap(), + throwingResponseProcessorFactory, + Collections.emptyMap() + ); + + Map pipelineSourceMap = new HashMap<>(); + pipelineSourceMap.put(Pipeline.RESPONSE_PROCESSORS_KEY, List.of(Map.of("throwing_response", Collections.emptyMap()))); + + SearchSourceBuilder sourceBuilder = SearchSourceBuilder.searchSource().size(100).searchPipelineSource(pipelineSourceMap); + SearchRequest searchRequest = new SearchRequest().source(sourceBuilder); + + PipelinedRequest pipelinedRequest = searchPipelineService.resolvePipeline(searchRequest); + + SearchResponse response = new SearchResponse(null, null, 0, 0, 0, 0, null, null); + + // Caught Exception thrown when processing response and produced warn level logging message + try (MockLogAppender mockAppender = MockLogAppender.createForLoggers(LogManager.getLogger(Pipeline.class))) { + mockAppender.addExpectation( + new MockLogAppender.SeenEventExpectation( + "test1", + Pipeline.class.getCanonicalName(), + Level.WARN, + "The exception from response processor [throwing_response] in the search pipeline [_ad_hoc_pipeline] was ignored" + ) + ); + pipelinedRequest.transformResponse(response); + mockAppender.assertAllExpectationsMatched(); + } + } + public void testStats() throws Exception { - SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", "1", null, r -> { + SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", "1", null, false, r -> { throw new RuntimeException(); }); Map> requestProcessors = Map.of( "successful_request", - (pf, t, f, c, ctx) -> new FakeRequestProcessor("successful_request", "2", null, r -> {}), + (pf, t, i, f, c, ctx) -> new FakeRequestProcessor("successful_request", "2", null, false, r -> {}), "throwing_request", - (pf, t, f, c, ctx) -> throwingRequestProcessor + (pf, t, i, f, c, ctx) -> throwingRequestProcessor ); - SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", "3", null, r -> { + SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", "3", null, false, r -> { throw new RuntimeException(); }); Map> responseProcessors = Map.of( "successful_response", - (pf, t, f, c, ctx) -> new FakeResponseProcessor("successful_response", "4", null, r -> {}), + (pf, t, i, f, c, ctx) -> new FakeResponseProcessor("successful_response", "4", null, false, r -> {}), "throwing_response", - (pf, t, f, c, ctx) -> throwingResponseProcessor + (pf, t, i, f, c, ctx) -> throwingResponseProcessor ); - SearchPipelineService searchPipelineService = createWithProcessors(requestProcessors, responseProcessors, Collections.emptyMap()); - SearchPipelineMetadata metadata = new SearchPipelineMetadata( - Map.of( - 
"good_response_pipeline", - new PipelineConfiguration( - "good_response_pipeline", - new BytesArray("{\"response_processors\" : [ { \"successful_response\": {} } ] }"), - XContentType.JSON - ), - "bad_response_pipeline", - new PipelineConfiguration( - "bad_response_pipeline", - new BytesArray("{\"response_processors\" : [ { \"throwing_response\": {} } ] }"), - XContentType.JSON - ), - "good_request_pipeline", - new PipelineConfiguration( - "good_request_pipeline", - new BytesArray("{\"request_processors\" : [ { \"successful_request\": {} } ] }"), - XContentType.JSON - ), - "bad_request_pipeline", - new PipelineConfiguration( - "bad_request_pipeline", - new BytesArray("{\"request_processors\" : [ { \"throwing_request\": {} } ] }"), - XContentType.JSON - ) - ) - ); - ClusterState clusterState = ClusterState.builder(new ClusterName("_name")).build(); - ClusterState previousState = clusterState; - clusterState = ClusterState.builder(clusterState) - .metadata(Metadata.builder().putCustom(SearchPipelineMetadata.TYPE, metadata)) - .build(); - searchPipelineService.applyClusterState(new ClusterChangedEvent("", clusterState, previousState)); + SearchPipelineService searchPipelineService = getSearchPipelineService(requestProcessors, responseProcessors); SearchRequest request = new SearchRequest(); SearchResponse response = new SearchResponse(null, null, 0, 0, 0, 0, null, null); @@ -1079,6 +1140,142 @@ public void testStats() throws Exception { } } + public void testStatsEnabledIgnoreFailure() throws Exception { + SearchRequestProcessor throwingRequestProcessor = new FakeRequestProcessor("throwing_request", "1", null, true, r -> { + throw new RuntimeException(); + }); + Map> requestProcessorsEnableIgnoreFailure = Map.of( + "successful_request", + (pf, t, i, f, c, ctx) -> new FakeRequestProcessor("successful_request", "2", null, true, r -> {}), + "throwing_request", + (pf, t, i, f, c, ctx) -> throwingRequestProcessor + ); + SearchResponseProcessor throwingResponseProcessor = new FakeResponseProcessor("throwing_response", "3", null, true, r -> { + throw new RuntimeException(); + }); + Map> responseProcessorsEnableIgnoreFailure = Map.of( + "successful_response", + (pf, t, i, f, c, ctx) -> new FakeResponseProcessor("successful_response", "4", null, true, r -> {}), + "throwing_response", + (pf, t, i, f, c, ctx) -> throwingResponseProcessor + ); + + SearchPipelineService searchPipelineService = getSearchPipelineService( + requestProcessorsEnableIgnoreFailure, + responseProcessorsEnableIgnoreFailure + ); + + SearchRequest request = new SearchRequest(); + SearchResponse response = new SearchResponse(null, null, 0, 0, 0, 0, null, null); + + searchPipelineService.resolvePipeline(request.pipeline("good_request_pipeline")).transformResponse(response); + // Caught Exception here + searchPipelineService.resolvePipeline(request.pipeline("bad_request_pipeline")).transformResponse(response); + searchPipelineService.resolvePipeline(request.pipeline("good_response_pipeline")).transformResponse(response); + // Caught Exception here + searchPipelineService.resolvePipeline(request.pipeline("bad_response_pipeline")).transformResponse(response); + + // when ignoreFailure enabled, the search pipelines will all succeed. 
+ SearchPipelineStats stats = searchPipelineService.stats(); + assertPipelineStats(stats.getTotalRequestStats(), 2, 0); + assertPipelineStats(stats.getTotalResponseStats(), 2, 0); + + for (SearchPipelineStats.PerPipelineStats perPipelineStats : stats.getPipelineStats()) { + SearchPipelineStats.PipelineDetailStats detailStats = stats.getPerPipelineProcessorStats() + .get(perPipelineStats.getPipelineId()); + switch (perPipelineStats.getPipelineId()) { + case "good_request_pipeline": + assertPipelineStats(perPipelineStats.getRequestStats(), 1, 0); + assertPipelineStats(perPipelineStats.getResponseStats(), 0, 0); + assertEquals(1, detailStats.requestProcessorStats().size()); + assertEquals(0, detailStats.responseProcessorStats().size()); + assertEquals("successful_request:2", detailStats.requestProcessorStats().get(0).getProcessorName()); + assertEquals("successful_request", detailStats.requestProcessorStats().get(0).getProcessorType()); + assertPipelineStats(detailStats.requestProcessorStats().get(0).getStats(), 1, 0); + break; + case "bad_request_pipeline": + // pipeline succeed when ignore failure is true + assertPipelineStats(perPipelineStats.getRequestStats(), 1, 0); + assertPipelineStats(perPipelineStats.getResponseStats(), 0, 0); + assertEquals(1, detailStats.requestProcessorStats().size()); + assertEquals(0, detailStats.responseProcessorStats().size()); + assertEquals("throwing_request:1", detailStats.requestProcessorStats().get(0).getProcessorName()); + assertEquals("throwing_request", detailStats.requestProcessorStats().get(0).getProcessorType()); + // processor stats got 1 count and 1 failed + assertPipelineStats(detailStats.requestProcessorStats().get(0).getStats(), 1, 1); + break; + case "good_response_pipeline": + assertPipelineStats(perPipelineStats.getRequestStats(), 0, 0); + assertPipelineStats(perPipelineStats.getResponseStats(), 1, 0); + assertEquals(0, detailStats.requestProcessorStats().size()); + assertEquals(1, detailStats.responseProcessorStats().size()); + assertEquals("successful_response:4", detailStats.responseProcessorStats().get(0).getProcessorName()); + assertEquals("successful_response", detailStats.responseProcessorStats().get(0).getProcessorType()); + assertPipelineStats(detailStats.responseProcessorStats().get(0).getStats(), 1, 0); + break; + case "bad_response_pipeline": + // pipeline succeed when ignore failure is true + assertPipelineStats(perPipelineStats.getRequestStats(), 0, 0); + assertPipelineStats(perPipelineStats.getResponseStats(), 1, 0); + assertEquals(0, detailStats.requestProcessorStats().size()); + assertEquals(1, detailStats.responseProcessorStats().size()); + assertEquals("throwing_response:3", detailStats.responseProcessorStats().get(0).getProcessorName()); + assertEquals("throwing_response", detailStats.responseProcessorStats().get(0).getProcessorType()); + // processor stats got 1 count and 1 failed + assertPipelineStats(detailStats.responseProcessorStats().get(0).getStats(), 1, 1); + break; + } + } + + } + + private SearchPipelineService getSearchPipelineService( + Map> requestProcessorsEnableIgnoreFailure, + Map> responseProcessorsEnableIgnoreFailure + ) { + SearchPipelineService searchPipelineService = createWithProcessors( + requestProcessorsEnableIgnoreFailure, + responseProcessorsEnableIgnoreFailure, + Collections.emptyMap() + ); + + SearchPipelineMetadata metadata = new SearchPipelineMetadata( + Map.of( + "good_response_pipeline", + new PipelineConfiguration( + "good_response_pipeline", + new BytesArray("{\"response_processors\" : [ 
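A subtlety worth noting in the assertions above: enabling ignoreFailure hides the failure from the pipeline-level counters but not from the processor-level ones. The bad_* pipelines complete, so their pipeline stats record a success, while the throwing processor's own stats still record the failed invocation. Restated from the switch above, for bad_request_pipeline the two levels disagree on purpose:

    // pipeline level: 1 run, 0 failures, because ignoreFailure let the run complete
    assertPipelineStats(perPipelineStats.getRequestStats(), 1, 0);
    // processor level: 1 invocation, 1 failure, so the throw stays observable in stats
    assertPipelineStats(detailStats.requestProcessorStats().get(0).getStats(), 1, 1);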
{ \"successful_response\": {} } ] }"), + XContentType.JSON + ), + "bad_response_pipeline", + new PipelineConfiguration( + "bad_response_pipeline", + new BytesArray("{\"response_processors\" : [ { \"throwing_response\": {} } ] }"), + XContentType.JSON + ), + "good_request_pipeline", + new PipelineConfiguration( + "good_request_pipeline", + new BytesArray("{\"request_processors\" : [ { \"successful_request\": {} } ] }"), + XContentType.JSON + ), + "bad_request_pipeline", + new PipelineConfiguration( + "bad_request_pipeline", + new BytesArray("{\"request_processors\" : [ { \"throwing_request\": {} } ] }"), + XContentType.JSON + ) + ) + ); + ClusterState clusterState = ClusterState.builder(new ClusterName("_name")).build(); + ClusterState previousState = clusterState; + clusterState = ClusterState.builder(clusterState) + .metadata(Metadata.builder().putCustom(SearchPipelineMetadata.TYPE, metadata)) + .build(); + searchPipelineService.applyClusterState(new ClusterChangedEvent("", clusterState, previousState)); + return searchPipelineService; + } + private static void assertPipelineStats(OperationStats stats, long count, long failed) { assertEquals(stats.getCount(), count); assertEquals(stats.getFailedCount(), failed); @@ -1086,11 +1283,11 @@ private static void assertPipelineStats(OperationStats stats, long count, long f public void testAdHocRejectingProcessor() { String processorType = "ad_hoc_rejecting"; - Map> requestProcessorFactories = Map.of(processorType, (pf, t, d, c, ctx) -> { + Map> requestProcessorFactories = Map.of(processorType, (pf, t, d, i, c, ctx) -> { if (ctx.getPipelineSource() == Processor.PipelineSource.SEARCH_REQUEST) { throw new IllegalArgumentException(processorType + " cannot be created as part of a pipeline defined in a search request"); } - return new FakeRequestProcessor(processorType, t, d, r -> {}); + return new FakeRequestProcessor(processorType, t, d, i, r -> {}); }); SearchPipelineService searchPipelineService = createWithProcessors(