From 14e2091a8e8d92bbf976f59ecac5a99eebc1b9c4 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Tue, 11 Jul 2023 12:28:35 -0700 Subject: [PATCH] [Segment Replication] Add logic back to update tracking replication checkpoint on source (#8560) (#8633) * [Segment Replication] Add logic back to update tracking replication checkpoint on source Signed-off-by: Suraj Singh * Update comment Signed-off-by: Suraj Singh * Address review comments & mute breaking bwc-test Signed-off-by: Suraj Singh * Spotless check Signed-off-by: Suraj Singh * Stop timer inside try to prevent double stop on timer Signed-off-by: Suraj Singh * Update PressureITs to wait for appropriate transport call for replica update Signed-off-by: Suraj Singh * Spotless check Signed-off-by: Suraj Singh --------- Signed-off-by: Suraj Singh (cherry picked from commit 3d7d33bdbc02ec8e363c5394a9d89451f76e979a) --- .../replication/SegmentReplicationBaseIT.java | 12 +++++++++++- .../indices/replication/SegmentReplicationIT.java | 6 ------ .../replication/OngoingSegmentReplications.java | 6 +----- .../replication/SegmentReplicationSourceHandler.java | 12 +++++++++++- .../replication/SegmentReplicationTargetService.java | 5 +++++ .../SegmentReplicationSourceHandlerTests.java | 7 +++++-- 6 files changed, 33 insertions(+), 15 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java index 8fef88cbe6820..4692210ccc577 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationBaseIT.java @@ -19,6 +19,7 @@ import org.opensearch.common.Nullable; import org.opensearch.common.settings.Settings; import org.opensearch.common.lease.Releasable; +import org.opensearch.common.util.FeatureFlags; import org.opensearch.index.Index; import org.opensearch.index.IndexModule; import org.opensearch.index.IndexService; @@ -222,6 +223,11 @@ protected IndexShard getIndexShard(String node, String indexName) { return indexService.getShard(shardId.get()); } + protected boolean segmentReplicationWithRemoteEnabled() { + return IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexSettings()).booleanValue() + && "true".equalsIgnoreCase(featureFlagSettings().get(FeatureFlags.SEGMENT_REPLICATION_EXPERIMENTAL)); + } + protected Releasable blockReplication(List nodes, CountDownLatch latch) { CountDownLatch pauseReplicationLatch = new CountDownLatch(nodes.size()); for (String node : nodes) { @@ -231,7 +237,11 @@ protected Releasable blockReplication(List nodes, CountDownLatch latch) node )); mockTargetTransportService.addSendBehavior((connection, requestId, action, request, options) -> { - if (action.equals(SegmentReplicationSourceService.Actions.UPDATE_VISIBLE_CHECKPOINT)) { + String actionToWaitFor = SegmentReplicationSourceService.Actions.GET_SEGMENT_FILES; + if (segmentReplicationWithRemoteEnabled()) { + actionToWaitFor = SegmentReplicationSourceService.Actions.UPDATE_VISIBLE_CHECKPOINT; + } + if (action.equals(actionToWaitFor)) { try { latch.countDown(); pauseReplicationLatch.await(); diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 1677959ff0e99..98abbe369c660 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -49,7 +49,6 @@ import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; import org.opensearch.common.lease.Releasable; -import org.opensearch.common.util.FeatureFlags; import org.opensearch.index.IndexModule; import org.opensearch.index.SegmentReplicationPerGroupStats; import org.opensearch.index.SegmentReplicationPressureService; @@ -1325,9 +1324,4 @@ public void testPrimaryReceivesDocsDuringReplicaRecovery() throws Exception { ensureGreen(INDEX_NAME); waitForSearchableDocs(2, nodes); } - - private boolean segmentReplicationWithRemoteEnabled() { - return IndexMetadata.INDEX_REMOTE_STORE_ENABLED_SETTING.get(indexSettings()).booleanValue() - && "true".equalsIgnoreCase(featureFlagSettings().get(FeatureFlags.SEGMENT_REPLICATION_EXPERIMENTAL)); - } } diff --git a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java index 050a66bedcf5d..f32887175d4f3 100644 --- a/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java +++ b/server/src/main/java/org/opensearch/indices/replication/OngoingSegmentReplications.java @@ -120,11 +120,7 @@ void startSegmentCopy(GetSegmentFilesRequest request, ActionListener listener) { + // Short circuit when no files to transfer + if (request.getFilesToFetch().isEmpty()) { + // before completion, alert the primary of the replica's state. + shard.updateVisibleCheckpointForShard(request.getTargetAllocationId(), copyState.getCheckpoint()); + listener.onResponse(new GetSegmentFilesResponse(Collections.emptyList())); + return; + } + final ReplicationTimer timer = new ReplicationTimer(); if (isReplicating.compareAndSet(false, true) == false) { throw new OpenSearchException("Replication to {} is already running.", shard.shardId()); @@ -159,10 +168,11 @@ public synchronized void sendFiles(GetSegmentFilesRequest request, ActionListene sendFileStep.whenComplete(r -> { try { + shard.updateVisibleCheckpointForShard(allocationId, copyState.getCheckpoint()); future.onResponse(new GetSegmentFilesResponse(List.of(storeFileMetadata))); + timer.stop(); } finally { IOUtils.close(resources); - timer.stop(); logger.trace( "[replication id {}] Source node completed sending files to target node [{}], timing: {}", request.getReplicationId(), diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index ac93412eef725..467f499056345 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -292,6 +292,11 @@ public void onReplicationFailure( } protected void updateVisibleCheckpoint(long replicationId, IndexShard replicaShard) { + // Update replication checkpoint on source via transport call only supported for remote store integration. For node- + // node communication, checkpoint update is piggy-backed to GET_SEGMENT_FILES transport call + if (replicaShard.indexSettings().isRemoteStoreEnabled() == false) { + return; + } ShardRouting primaryShard = clusterService.state().routingTable().shardRoutingTable(replicaShard.shardId()).primaryShard(); final UpdateVisibleCheckpointRequest request = new UpdateVisibleCheckpointRequest( diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java index 607f9dd91e35e..b4e9166f377ec 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationSourceHandlerTests.java @@ -196,11 +196,13 @@ public void testReplicationAlreadyRunning() throws IOException { 1 ); + final List expectedFiles = List.of(new StoreFileMetadata("_0.si", 20, "test", Version.CURRENT.luceneVersion)); + final GetSegmentFilesRequest getSegmentFilesRequest = new GetSegmentFilesRequest( 1L, replica.routingEntry().allocationId().getId(), replicaDiscoveryNode, - Collections.emptyList(), + expectedFiles, latestReplicationCheckpoint ); @@ -224,11 +226,12 @@ public void testCancelReplication() throws IOException, InterruptedException { 1 ); + final List expectedFiles = List.of(new StoreFileMetadata("_0.si", 20, "test", Version.CURRENT.luceneVersion)); final GetSegmentFilesRequest getSegmentFilesRequest = new GetSegmentFilesRequest( 1L, replica.routingEntry().allocationId().getId(), replicaDiscoveryNode, - Collections.emptyList(), + expectedFiles, latestReplicationCheckpoint );