Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merge to stable-23-3: making blockstore cluster survive after destruction of static BS group in local-emergency test #2055

Merged
merged 4 commits into from
Sep 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion cloud/blockstore/libs/daemon/ydb/bootstrap.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -231,17 +231,31 @@ void TBootstrapYdb::InitKikimrService()
.NodeType = Configs->StorageConfig->GetNodeType(),
};

bool loadCmsConfigs = Configs->Options->LoadCmsConfigs;
bool emergencyMode =
Configs->StorageConfig->GetHiveProxyFallbackMode() ||
Configs->StorageConfig->GetSSProxyFallbackMode();

if (loadCmsConfigs && emergencyMode) {
STORAGE_INFO("Disable loading configs from CMS in emergency mode");
loadCmsConfigs = false;
}

NCloud::NStorage::TRegisterDynamicNodeOptions registerOpts {
.Domain = Configs->Options->Domain,
.SchemeShardDir = Configs->StorageConfig->GetSchemeShardDir(),
.NodeBrokerAddress = Configs->Options->NodeBrokerAddress,
.NodeBrokerPort = Configs->Options->NodeBrokerPort,
.UseNodeBrokerSsl = Configs->Options->UseNodeBrokerSsl,
.InterconnectPort = Configs->Options->InterconnectPort,
.LoadCmsConfigs = Configs->Options->LoadCmsConfigs,
.LoadCmsConfigs = loadCmsConfigs,
.Settings = std::move(settings)
};

if (emergencyMode) {
registerOpts.SchemeShardDir = "";
}

if (Configs->Options->LocationFile) {
NProto::TLocation location;
ParseProtoTextFromFile(Configs->Options->LocationFile, location);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -427,8 +427,9 @@ void TStartVolumeActor::StartTablet(const TActorContext& ctx)
}

LOG_INFO(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Starting tablet",
VolumeTabletId);
"[%lu] Starting tablet (gen: %u)",
VolumeTabletId,
VolumeGeneration);

const auto* appData = AppData(ctx);

Expand Down Expand Up @@ -551,7 +552,7 @@ void TStartVolumeActor::HandleTabletDead(

if (PendingRequest == EPendingRequest::START) {
LOG_ERROR(ctx, TBlockStoreComponents::SERVICE,
"[%lu] Tablet boot failed during actor stopping",
"[%lu] Tablet boot failed during actor starting",
VolumeTabletId);

PendingRequest = EPendingRequest::NONE;
Expand Down Expand Up @@ -579,7 +580,7 @@ void TStartVolumeActor::HandleTabletDead(
0, // cookie
error);

bool delay;
bool delay = true;
switch (msg->Reason) {
case TEvTablet::TEvTabletDead::ReasonBootRace:
// Avoid unnecessary delays
Expand All @@ -591,7 +592,6 @@ void TStartVolumeActor::HandleTabletDead(
++VolumeGeneration;
break;
default:
delay = true;
break;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -912,8 +912,11 @@ void TVolumeActor::ReplyErrorOnNormalGetChangedBlocksRequestForDiskRegistryBased
TGetChangedBlocksMethod::Name,
errorMsg.c_str());

auto response = std::make_unique<TGetChangedBlocksMethod::TResponse>();
*response->Record.MutableError() = MakeError(E_NOT_IMPLEMENTED, errorMsg);
ui32 flags = 0;
SetProtoFlag(flags, NProto::EF_SILENT);
auto error = MakeError(E_NOT_IMPLEMENTED, errorMsg, flags);
auto response = std::make_unique<TGetChangedBlocksMethod::TResponse>(
std::move(error));

NCloud::Reply(ctx, *ev, std::move(response));
}
Expand Down
48 changes: 22 additions & 26 deletions cloud/blockstore/tests/loadtest/local-emergency/test.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# import os
import pytest

import yatest.common as common
Expand All @@ -12,7 +11,7 @@
from ydb.tests.library.harness.kikimr_runner import get_unique_path_for_current_test, ensure_path_exists


def default_storage_config(cache_folder):
def default_storage_config(backups_folder):
storage = storage_config_with_default_limits()
storage.SSDSystemChannelPoolKind = "ssd"
storage.SSDLogChannelPoolKind = "ssd"
Expand All @@ -21,17 +20,18 @@ def default_storage_config(cache_folder):
storage.SSDMergedChannelPoolKind = "ssd"

storage.TabletBootInfoBackupFilePath = \
cache_folder + "/tablet_boot_info_backup.txt"
backups_folder + "/tablet_boot_info_backup.txt"
storage.PathDescriptionBackupFilePath = \
cache_folder + "/path_description_backup.txt"
backups_folder + "/path_description_backup.txt"

return storage


def storage_config_with_emergency_mode(cache_folder):
storage = default_storage_config(cache_folder)
def storage_config_with_emergency_mode(backups_folder):
    """Build a storage config for running NBS in emergency mode.

    Starts from the default storage config (backed by files under
    ``backups_folder``) and enables the fallback/emergency switches:
    Hive and SchemeShard proxies run from local backups and the local
    service is disabled.
    """
    config = default_storage_config(backups_folder)
    # Flip every emergency-mode switch in one place.
    for flag in ("HiveProxyFallbackMode", "SSProxyFallbackMode", "DisableLocalService"):
        setattr(config, flag, True)
    return config


Expand All @@ -51,47 +51,43 @@ def __init__(self, name, config_path):


def __run_test(test_case):
cache_folder = get_unique_path_for_current_test(
backups_folder = get_unique_path_for_current_test(
output_path=common.output_path(),
sub_folder="cache",
sub_folder="backups",
)
ensure_path_exists(cache_folder)

storage_config_patches = [
default_storage_config(cache_folder),
storage_config_with_emergency_mode(cache_folder),
]
ensure_path_exists(backups_folder)

env = LocalLoadTest(
"",
storage_config_patches=storage_config_patches,
storage_config_patches=[default_storage_config(backups_folder)],
dynamic_pdisks=[dict(user_kind=1)],
dynamic_storage_pools=[
dict(name="dynamic_storage_pool:1", kind="system", pdisk_user_kind=0),
dict(name="dynamic_storage_pool:2", kind="ssd", pdisk_user_kind=1)
],
bs_cache_file_path=cache_folder + "/bs_cache.txt",
bs_cache_file_path=backups_folder + "/bs_cache.txt",
load_configs_from_cms=False,
)

client = CreateClient(env.endpoint)
client.create_volume("vol0", 4096, 1000000, 1, protos.EStorageMediaKind.Value("STORAGE_MEDIA_SSD"))

session = Session(client, "vol0", "")
session.mount_volume()
session.write_blocks(0, [b'\1' * 4096])
session.write_blocks(100500, [b'\1' * 4096])
# TODO: should not unmount volume to make emergency unexpected
session.unmount_volume()

static_pdisk_paths = []
for info in env.pdisks_info:
if info["pdisk_user_kind"] == 0:
static_pdisk_paths += [info["pdisk_path"]]
assert len(static_pdisk_paths) == 1

# Destroy static group in order to emulate emergency.
# TODO: survive outage of kikimr static tablets.
# os.remove(static_pdisk_paths[0])
client.execute_action(action="BackupPathDescriptions", input_bytes=str.encode(""))
client.execute_action(action="BackupTabletBootInfos", input_bytes=str.encode(""))

env.kikimr_cluster.format_static_pdisks()
# spoil config to prevent BS Controller from starting otherwise it will
# erase dynamic groups data
env.kikimr_cluster.spoil_bs_controller_config()
env.kikimr_cluster.restart_nodes()

env.nbs.storage_config_patches = [storage_config_with_emergency_mode(backups_folder)]
env.nbs.restart()

try:
Expand Down
6 changes: 5 additions & 1 deletion cloud/blockstore/tests/python/lib/loadtest_env.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ def __init__(
kikimr_binary_path=None,
with_endpoint_proxy=False,
with_netlink=False,
load_configs_from_cms=True,
):

self.__endpoint = endpoint
Expand All @@ -76,6 +77,9 @@ def __init__(
if run_kikimr:
self.kikimr_cluster.start()
kikimr_port = list(self.kikimr_cluster.nodes.values())[0].port
else:
# makes sense only when Kikimr is running
load_configs_from_cms = False

self.__devices = []

Expand Down Expand Up @@ -107,7 +111,7 @@ def __init__(
discovery_config=discovery_config,
restart_interval=restart_interval,
dynamic_storage_pools=dynamic_storage_pools,
load_configs_from_cms=run_kikimr,
load_configs_from_cms=load_configs_from_cms,
features_config_patch=features_config_patch,
grpc_trace=grpc_trace,
rack=rack)
Expand Down
8 changes: 4 additions & 4 deletions cloud/blockstore/tests/python/lib/nbs_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,10 +135,6 @@ def __init__(
if kms_config is not None:
self.__proto_configs["kms.txt"] = kms_config

if storage_config_patches is not None and len(storage_config_patches) > 0:
for i in range(len(storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

if discovery_config is not None:
self.__proto_configs["discovery.txt"] = discovery_config
self.__use_discovery = True
Expand All @@ -165,6 +161,10 @@ def __init__(
self.__init_daemon()

def __init_daemon(self):
if self.storage_config_patches is not None and len(self.storage_config_patches) > 0:
for i in range(len(self.storage_config_patches)):
self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i)

cp = None
if self.__binary_path:
cp = core_pattern(self.__binary_path, self.__cwd)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,15 @@ void THiveProxyFallbackActor::HandleBootExternal(
return;
}

// increment suggested generation to ensure that the tablet does not get
// stuck with an outdated generation, no matter what
auto request = std::make_unique<
TEvHiveProxyPrivate::TEvUpdateTabletBootInfoBackupRequest>(
r->StorageInfo,
r->SuggestedGeneration + 1
);
NCloud::Send(ctx, TabletBootInfoBackup, std::move(request));

auto response = std::make_unique<TResponse>(
std::move(r->StorageInfo),
r->SuggestedGeneration,
Expand Down
8 changes: 8 additions & 0 deletions cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1152,6 +1152,14 @@ Y_UNIT_TEST_SUITE(THiveProxyTest)
auto result2 = env.SendBootExternalRequest(
sender, 0xdeadbeaf, E_REJECTED);
UNIT_ASSERT(!result2.StorageInfo);

auto result3 = env.SendBootExternalRequest(sender, FakeTablet2, S_OK);
UNIT_ASSERT(result3.StorageInfo);
UNIT_ASSERT_VALUES_EQUAL(
FakeTablet2,
result3.StorageInfo->TabletID);
// suggested generation should be incremented after last boot
UNIT_ASSERT_VALUES_EQUAL(2u, result3.SuggestedGeneration);
}
}

Expand Down
9 changes: 7 additions & 2 deletions cloud/storage/core/libs/kikimr/node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -224,6 +224,11 @@ struct TLegacyNodeRegistrant
{
NClient::TKikimr kikimr(CreateKikimrConfig(Options, nodeBrokerAddress));

TMaybe<TString> path;
if (Options.SchemeShardDir) {
path = Options.SchemeShardDir;
}

auto registrant = kikimr.GetNodeRegistrant();
auto result = registrant.SyncRegisterNode(
Options.Domain,
Expand All @@ -232,8 +237,8 @@ struct TLegacyNodeRegistrant
HostAddress,
HostName,
Location,
false, //request fixed node id
Options.SchemeShardDir);
false, // fixedNodeId
path);

if (!result.IsSuccess()) {
return MakeError(
Expand Down
24 changes: 24 additions & 0 deletions ydb/tests/library/harness/kikimr_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,30 @@ def __instantiate_udfs_dir(self):
os.symlink(udf_path, link_name)
return self.__common_udfs_dir

# TODO(svartmetal): remove this when YDB learns not to erase dynamic groups
# data after formatting of static pdisks
def spoil_bs_controller_config(self):
    """Deliberately break the flat BS controller tablet config.

    Points channel 0 of the controller tablet at a group id that does
    not exist, so the BS Controller cannot boot after restart and thus
    cannot erase dynamic-group data.  The spoiled config is written to
    disk immediately.
    """
    broken_channel = {
        "channel": 0,
        "channel_erasure_name": str(self.__configurator.static_erasure),
        # group 100500 is intentionally bogus — the tablet cannot start
        "history": [{
            "from_generation": 0,
            "group_id": 100500
        }],
    }
    tablets = self.__configurator.yaml_config["system_tablets"]
    tablets["flat_bs_controller"] = [{"info": {"channels": [broken_channel]}}]
    self.__write_configs()

def format_static_pdisks(self):
    """Format every static pdisk (``pdisk_user_kind == 0``) on all nodes.

    Iterates over all configured node ids and asks each node to format
    each pdisk whose user kind marks it as static; dynamic pdisks are
    left untouched.
    """
    for node_id in self.__configurator.all_node_ids():
        node = self.nodes[node_id]
        for disk in self.__configurator.pdisks_info:
            if disk["pdisk_user_kind"] != 0:
                continue
            node.format_pdisk(**disk)

def __format_disks(self, node_id):
for pdisk in self.__configurator.pdisks_info:
if pdisk['node_id'] != node_id:
Expand Down
Loading