Skip to content

Commit

Permalink
Changing checksum setting to support modes (#15622)
Browse files Browse the repository at this point in the history
* Changing checksum setting to support modes

Signed-off-by: Himshikha Gupta <himshikh@amazon.com>
  • Loading branch information
himshikha committed Sep 4, 2024
1 parent 17b5f98 commit a608fca
Show file tree
Hide file tree
Showing 5 changed files with 406 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
)
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, REMOTE_ROUTING_TABLE_REPO)
.put(REMOTE_PUBLICATION_EXPERIMENTAL, true)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName)
.put(routingTableRepoTypeAttributeKey, ReloadableFsRepository.TYPE)
.put(routingTableRepoSettingsAttributeKeyPrefix + "location", segmentRepoPath)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -740,7 +740,7 @@ public void apply(Settings value, Settings current, Settings previous) {
IndicesService.CLUSTER_INDEX_RESTRICT_REPLICATION_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING,

// Admission Control Settings
AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -141,13 +142,49 @@ public class RemoteClusterStateService implements Closeable {
Setting.Property.NodeScope
);

public static final Setting<Boolean> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING = Setting.boolSetting(
"cluster.remote_store.state.checksum_validation.enabled",
false,
Property.Dynamic,
Property.NodeScope
public static final Setting<RemoteClusterStateValidationMode> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING = new Setting<>(
"cluster.remote_store.state.checksum_validation.mode",
RemoteClusterStateValidationMode.NONE.name(),
RemoteClusterStateValidationMode::parseString,
Setting.Property.Dynamic,
Setting.Property.NodeScope
);

/**
* Validation mode for cluster state checksum.
* None: Validation will be disabled.
* Debug: Validation enabled but only matches checksum and logs failing entities.
* Trace: Matches checksum and downloads full cluster state to find diff in failing entities. Only logs failures.
* Failure: Throws exception on failing validation.
*/
public enum RemoteClusterStateValidationMode {
DEBUG("debug"),
TRACE("trace"),
FAILURE("failure"),
NONE("none");

public final String mode;

RemoteClusterStateValidationMode(String mode) {
this.mode = mode;
}

public static RemoteClusterStateValidationMode parseString(String mode) {
try {
return RemoteClusterStateValidationMode.valueOf(mode.toUpperCase(Locale.ROOT));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(
"["
+ mode
+ "] mode is not supported. "
+ "supported modes are ["
+ Arrays.toString(RemoteClusterStateValidationMode.values())
+ "]"
);
}
}
}

private TimeValue remoteStateReadTimeout;
private final String nodeId;
private final Supplier<RepositoriesService> repositoriesService;
Expand All @@ -159,7 +196,7 @@ public class RemoteClusterStateService implements Closeable {
private BlobStoreTransferService blobStoreTransferService;
private RemoteRoutingTableService remoteRoutingTableService;
private volatile TimeValue slowWriteLoggingThreshold;
private boolean checksumValidationEnabled;
private RemoteClusterStateValidationMode remoteClusterStateValidationMode;

private final RemotePersistenceStats remoteStateStats;
private RemoteClusterStateCleanupManager remoteClusterStateCleanupManager;
Expand Down Expand Up @@ -206,11 +243,8 @@ public RemoteClusterStateService(
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
this.remoteStateReadTimeout = clusterSettings.get(REMOTE_STATE_READ_TIMEOUT_SETTING);
clusterSettings.addSettingsUpdateConsumer(REMOTE_STATE_READ_TIMEOUT_SETTING, this::setRemoteStateReadTimeout);
this.checksumValidationEnabled = clusterSettings.get(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING);
clusterSettings.addSettingsUpdateConsumer(
REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
this::setChecksumValidationEnabled
);
this.remoteClusterStateValidationMode = REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING, this::setChecksumValidationMode);

this.remoteStateStats = new RemotePersistenceStats();
this.namedWriteableRegistry = namedWriteableRegistry;
Expand Down Expand Up @@ -272,7 +306,7 @@ public RemoteClusterStateManifestInfo writeFullMetadata(ClusterState clusterStat
uploadedMetadataResults,
previousClusterUUID,
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
codecVersion
);
Expand Down Expand Up @@ -472,7 +506,7 @@ public RemoteClusterStateManifestInfo writeIncrementalMetadata(
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -917,7 +951,7 @@ public RemoteClusterStateManifestInfo markLastStateAsCommitted(ClusterState clus
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
previousManifest.getDiffManifest(),
checksumValidationEnabled ? previousManifest.getClusterStateChecksum() : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
true,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -1003,8 +1037,8 @@ private void setSlowWriteLoggingThreshold(TimeValue slowWriteLoggingThreshold) {
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
}

private void setChecksumValidationEnabled(Boolean checksumValidationEnabled) {
this.checksumValidationEnabled = checksumValidationEnabled;
private void setChecksumValidationMode(RemoteClusterStateValidationMode remoteClusterStateValidationMode) {
this.remoteClusterStateValidationMode = remoteClusterStateValidationMode;
}

// Package private for unit test
Expand Down Expand Up @@ -1376,7 +1410,9 @@ public ClusterState getClusterStateForManifest(
includeEphemeral
);

if (includeEphemeral && checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (includeEphemeral
&& !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE)
&& manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true);
}
} else {
Expand Down Expand Up @@ -1498,7 +1534,7 @@ public ClusterState getClusterStateUsingDiff(ClusterMetadataManifest manifest, C
.routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables))
.build();

if (checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) && manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false);
}
final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos);
Expand All @@ -1517,20 +1553,24 @@ void validateClusterStateFromChecksum(
) {
ClusterStateChecksum newClusterStateChecksum = new ClusterStateChecksum(clusterState);
List<String> failedValidation = newClusterStateChecksum.getMismatchEntities(manifest.getClusterStateChecksum());
if (!failedValidation.isEmpty()) {
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
if (failedValidation.isEmpty()) {
return;
}
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
);
if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
if (isFullStateDownload) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)
|| remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.TRACE)) {
// download full cluster state and match against state created for the failing entities
ClusterState fullClusterState = readClusterStateInParallel(
ClusterState.builder(new ClusterName(clusterName)).build(),
Expand Down Expand Up @@ -1663,6 +1703,8 @@ void validateClusterStateFromChecksum(
break;
}
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during diff read. Validation failed for " + failedValidation
);
Expand Down
Loading

0 comments on commit a608fca

Please sign in to comment.