Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Backport 2.x] Changing checksum setting to support modes #15672

Merged
merged 1 commit into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
)
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, REMOTE_ROUTING_TABLE_REPO)
.put(REMOTE_PUBLICATION_EXPERIMENTAL, true)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,10 @@ protected Settings nodeSettings(int nodeOrdinal) {
.put("node.attr." + REMOTE_STORE_ROUTING_TABLE_REPOSITORY_NAME_ATTRIBUTE_KEY, routingTableRepoName)
.put(routingTableRepoTypeAttributeKey, ReloadableFsRepository.TYPE)
.put(routingTableRepoSettingsAttributeKeyPrefix + "location", segmentRepoPath)
.put(RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING.getKey(), true)
.put(
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.getKey(),
RemoteClusterStateService.RemoteClusterStateValidationMode.FAILURE
)
.build();
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -743,7 +743,7 @@ public void apply(Settings value, Settings current, Settings previous) {
IndicesService.CLUSTER_INDEX_RESTRICT_REPLICATION_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_TYPE_SETTING,
RemoteRoutingTableBlobStore.REMOTE_ROUTING_TABLE_PATH_HASH_ALGO_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
RemoteClusterStateService.REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING,

AdmissionControlSettings.ADMISSION_CONTROL_TRANSPORT_LAYER_MODE,
CpuBasedAdmissionControllerSettings.CPU_BASED_ADMISSION_CONTROLLER_TRANSPORT_LAYER_MODE,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@
import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
Expand Down Expand Up @@ -141,13 +142,49 @@
Setting.Property.NodeScope
);

public static final Setting<Boolean> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING = Setting.boolSetting(
"cluster.remote_store.state.checksum_validation.enabled",
false,
Property.Dynamic,
Property.NodeScope
public static final Setting<RemoteClusterStateValidationMode> REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING = new Setting<>(
"cluster.remote_store.state.checksum_validation.mode",
RemoteClusterStateValidationMode.NONE.name(),
RemoteClusterStateValidationMode::parseString,
Setting.Property.Dynamic,
Setting.Property.NodeScope
);

/**
* Validation mode for cluster state checksum.
* None: Validation will be disabled.
* Debug: Validation enabled but only matches checksum and logs failing entities.
* Trace: Matches checksum and downloads full cluster state to find diff in failing entities. Only logs failures.
* Failure: Throws exception on failing validation.
*/
public enum RemoteClusterStateValidationMode {
DEBUG("debug"),
TRACE("trace"),
FAILURE("failure"),
NONE("none");

public final String mode;

RemoteClusterStateValidationMode(String mode) {
this.mode = mode;
}

public static RemoteClusterStateValidationMode parseString(String mode) {
try {
return RemoteClusterStateValidationMode.valueOf(mode.toUpperCase(Locale.ROOT));
} catch (IllegalArgumentException e) {
throw new IllegalArgumentException(

Check warning on line 176 in server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java#L175-L176

Added lines #L175 - L176 were not covered by tests
"["
+ mode
+ "] mode is not supported. "
+ "supported modes are ["
+ Arrays.toString(RemoteClusterStateValidationMode.values())

Check warning on line 181 in server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java

View check run for this annotation

Codecov / codecov/patch

server/src/main/java/org/opensearch/gateway/remote/RemoteClusterStateService.java#L181

Added line #L181 was not covered by tests
+ "]"
);
}
}
}

private TimeValue remoteStateReadTimeout;
private final String nodeId;
private final Supplier<RepositoriesService> repositoriesService;
Expand All @@ -159,7 +196,7 @@
private BlobStoreTransferService blobStoreTransferService;
private RemoteRoutingTableService remoteRoutingTableService;
private volatile TimeValue slowWriteLoggingThreshold;
private boolean checksumValidationEnabled;
private RemoteClusterStateValidationMode remoteClusterStateValidationMode;

private final RemotePersistenceStats remoteStateStats;
private RemoteClusterStateCleanupManager remoteClusterStateCleanupManager;
Expand Down Expand Up @@ -206,11 +243,8 @@
clusterSettings.addSettingsUpdateConsumer(SLOW_WRITE_LOGGING_THRESHOLD, this::setSlowWriteLoggingThreshold);
this.remoteStateReadTimeout = clusterSettings.get(REMOTE_STATE_READ_TIMEOUT_SETTING);
clusterSettings.addSettingsUpdateConsumer(REMOTE_STATE_READ_TIMEOUT_SETTING, this::setRemoteStateReadTimeout);
this.checksumValidationEnabled = clusterSettings.get(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING);
clusterSettings.addSettingsUpdateConsumer(
REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_ENABLED_SETTING,
this::setChecksumValidationEnabled
);
this.remoteClusterStateValidationMode = REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING.get(settings);
clusterSettings.addSettingsUpdateConsumer(REMOTE_CLUSTER_STATE_CHECKSUM_VALIDATION_MODE_SETTING, this::setChecksumValidationMode);

this.remoteStateStats = new RemotePersistenceStats();
this.namedWriteableRegistry = namedWriteableRegistry;
Expand Down Expand Up @@ -272,7 +306,7 @@
uploadedMetadataResults,
previousClusterUUID,
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
codecVersion
);
Expand Down Expand Up @@ -472,7 +506,7 @@
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
clusterStateDiffManifest,
checksumValidationEnabled ? new ClusterStateChecksum(clusterState) : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
false,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -917,7 +951,7 @@
uploadedMetadataResults,
previousManifest.getPreviousClusterUUID(),
previousManifest.getDiffManifest(),
checksumValidationEnabled ? previousManifest.getClusterStateChecksum() : null,
!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) ? new ClusterStateChecksum(clusterState) : null,
true,
previousManifest.getCodecVersion()
);
Expand Down Expand Up @@ -1003,8 +1037,8 @@
this.slowWriteLoggingThreshold = slowWriteLoggingThreshold;
}

private void setChecksumValidationEnabled(Boolean checksumValidationEnabled) {
this.checksumValidationEnabled = checksumValidationEnabled;
private void setChecksumValidationMode(RemoteClusterStateValidationMode remoteClusterStateValidationMode) {
this.remoteClusterStateValidationMode = remoteClusterStateValidationMode;
}

// Package private for unit test
Expand Down Expand Up @@ -1376,7 +1410,9 @@
includeEphemeral
);

if (includeEphemeral && checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (includeEphemeral
&& !remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE)
&& manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, clusterName, localNodeId, true);
}
} else {
Expand Down Expand Up @@ -1498,7 +1534,7 @@
.routingTable(new RoutingTable(manifest.getRoutingTableVersion(), indexRoutingTables))
.build();

if (checksumValidationEnabled && manifest.getClusterStateChecksum() != null) {
if (!remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.NONE) && manifest.getClusterStateChecksum() != null) {
validateClusterStateFromChecksum(manifest, clusterState, previousState.getClusterName().value(), localNodeId, false);
}
final long durationMillis = TimeValue.nsecToMSec(relativeTimeNanosSupplier.getAsLong() - startTimeNanos);
Expand All @@ -1517,20 +1553,24 @@
) {
ClusterStateChecksum newClusterStateChecksum = new ClusterStateChecksum(clusterState);
List<String> failedValidation = newClusterStateChecksum.getMismatchEntities(manifest.getClusterStateChecksum());
if (!failedValidation.isEmpty()) {
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
if (failedValidation.isEmpty()) {
return;
}
logger.error(
() -> new ParameterizedMessage(
"Cluster state checksums do not match. Checksum from manifest {}, checksum from created cluster state {}. Entities failing validation {}",
manifest.getClusterStateChecksum(),
newClusterStateChecksum,
failedValidation
)
);
if (isFullStateDownload && remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
if (isFullStateDownload) {
throw new IllegalStateException(
"Cluster state checksums do not match during full state read. Validation failed for " + failedValidation
);
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)
|| remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.TRACE)) {
// download full cluster state and match against state created for the failing entities
ClusterState fullClusterState = readClusterStateInParallel(
ClusterState.builder(new ClusterName(clusterName)).build(),
Expand Down Expand Up @@ -1663,6 +1703,8 @@
break;
}
}
}
if (remoteClusterStateValidationMode.equals(RemoteClusterStateValidationMode.FAILURE)) {
throw new IllegalStateException(
"Cluster state checksums do not match during diff read. Validation failed for " + failedValidation
);
Expand Down
Loading
Loading