diff --git a/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp b/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp index 74ed4b224b..eac661418e 100644 --- a/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp +++ b/cloud/blockstore/libs/daemon/ydb/bootstrap.cpp @@ -231,6 +231,16 @@ void TBootstrapYdb::InitKikimrService() .NodeType = Configs->StorageConfig->GetNodeType(), }; + bool loadCmsConfigs = Configs->Options->LoadCmsConfigs; + bool emergencyMode = + Configs->StorageConfig->GetHiveProxyFallbackMode() || + Configs->StorageConfig->GetSSProxyFallbackMode(); + + if (loadCmsConfigs && emergencyMode) { + STORAGE_INFO("Disable loading configs from CMS in emergency mode"); + loadCmsConfigs = false; + } + NCloud::NStorage::TRegisterDynamicNodeOptions registerOpts { .Domain = Configs->Options->Domain, .SchemeShardDir = Configs->StorageConfig->GetSchemeShardDir(), @@ -238,10 +248,14 @@ void TBootstrapYdb::InitKikimrService() .NodeBrokerPort = Configs->Options->NodeBrokerPort, .UseNodeBrokerSsl = Configs->Options->UseNodeBrokerSsl, .InterconnectPort = Configs->Options->InterconnectPort, - .LoadCmsConfigs = Configs->Options->LoadCmsConfigs, + .LoadCmsConfigs = loadCmsConfigs, .Settings = std::move(settings) }; + if (emergencyMode) { + registerOpts.SchemeShardDir = ""; + } + if (Configs->Options->LocationFile) { NProto::TLocation location; ParseProtoTextFromFile(Configs->Options->LocationFile, location); diff --git a/cloud/blockstore/libs/storage/service/volume_session_actor_start.cpp b/cloud/blockstore/libs/storage/service/volume_session_actor_start.cpp index 6eb4209a37..7ea6f42512 100644 --- a/cloud/blockstore/libs/storage/service/volume_session_actor_start.cpp +++ b/cloud/blockstore/libs/storage/service/volume_session_actor_start.cpp @@ -427,8 +427,9 @@ void TStartVolumeActor::StartTablet(const TActorContext& ctx) } LOG_INFO(ctx, TBlockStoreComponents::SERVICE, - "[%lu] Starting tablet", - VolumeTabletId); + "[%lu] Starting tablet (gen: %u)", + VolumeTabletId, + VolumeGeneration); const auto* appData = AppData(ctx); @@ -551,7 +552,7 @@ void TStartVolumeActor::HandleTabletDead( if (PendingRequest == EPendingRequest::START) { LOG_ERROR(ctx, TBlockStoreComponents::SERVICE, - "[%lu] Tablet boot failed during actor stopping", + "[%lu] Tablet boot failed during actor starting", VolumeTabletId); PendingRequest = EPendingRequest::NONE; @@ -579,7 +580,7 @@ void TStartVolumeActor::HandleTabletDead( 0, // cookie error); - bool delay; + bool delay = true; switch (msg->Reason) { case TEvTablet::TEvTabletDead::ReasonBootRace: // Avoid unnecessary delays @@ -591,7 +592,6 @@ void TStartVolumeActor::HandleTabletDead( ++VolumeGeneration; break; default: - delay = true; break; } diff --git a/cloud/blockstore/libs/storage/volume/volume_actor_checkpoint.cpp b/cloud/blockstore/libs/storage/volume/volume_actor_checkpoint.cpp index 1964ed88b2..2a34d4321d 100644 --- a/cloud/blockstore/libs/storage/volume/volume_actor_checkpoint.cpp +++ b/cloud/blockstore/libs/storage/volume/volume_actor_checkpoint.cpp @@ -912,8 +912,11 @@ void TVolumeActor::ReplyErrorOnNormalGetChangedBlocksRequestForDiskRegistryBased TGetChangedBlocksMethod::Name, errorMsg.c_str()); - auto response = std::make_unique(); - *response->Record.MutableError() = MakeError(E_NOT_IMPLEMENTED, errorMsg); + ui32 flags = 0; + SetProtoFlag(flags, NProto::EF_SILENT); + auto error = MakeError(E_NOT_IMPLEMENTED, errorMsg, flags); + auto response = std::make_unique( + std::move(error)); NCloud::Reply(ctx, *ev, std::move(response)); } diff --git a/cloud/blockstore/tests/loadtest/local-emergency/test.py b/cloud/blockstore/tests/loadtest/local-emergency/test.py index 37bd53666c..d3a42c266c 100644 --- a/cloud/blockstore/tests/loadtest/local-emergency/test.py +++ b/cloud/blockstore/tests/loadtest/local-emergency/test.py @@ -1,4 +1,3 @@ -# import os import pytest import yatest.common as common @@ -12,7 +11,7 @@ from ydb.tests.library.harness.kikimr_runner import get_unique_path_for_current_test, ensure_path_exists -def default_storage_config(cache_folder): +def default_storage_config(backups_folder): storage = storage_config_with_default_limits() storage.SSDSystemChannelPoolKind = "ssd" storage.SSDLogChannelPoolKind = "ssd" @@ -21,17 +20,18 @@ def default_storage_config(cache_folder): storage.SSDMergedChannelPoolKind = "ssd" storage.TabletBootInfoBackupFilePath = \ - cache_folder + "/tablet_boot_info_backup.txt" + backups_folder + "/tablet_boot_info_backup.txt" storage.PathDescriptionBackupFilePath = \ - cache_folder + "/path_description_backup.txt" + backups_folder + "/path_description_backup.txt" return storage -def storage_config_with_emergency_mode(cache_folder): - storage = default_storage_config(cache_folder) +def storage_config_with_emergency_mode(backups_folder): + storage = default_storage_config(backups_folder) storage.HiveProxyFallbackMode = True storage.SSProxyFallbackMode = True + storage.DisableLocalService = True return storage @@ -51,26 +51,22 @@ def __init__(self, name, config_path): def __run_test(test_case): - cache_folder = get_unique_path_for_current_test( + backups_folder = get_unique_path_for_current_test( output_path=common.output_path(), - sub_folder="cache", + sub_folder="backups", ) - ensure_path_exists(cache_folder) - - storage_config_patches = [ - default_storage_config(cache_folder), - storage_config_with_emergency_mode(cache_folder), - ] + ensure_path_exists(backups_folder) env = LocalLoadTest( "", - storage_config_patches=storage_config_patches, + storage_config_patches=[default_storage_config(backups_folder)], dynamic_pdisks=[dict(user_kind=1)], dynamic_storage_pools=[ dict(name="dynamic_storage_pool:1", kind="system", pdisk_user_kind=0), dict(name="dynamic_storage_pool:2", kind="ssd", pdisk_user_kind=1) ], - bs_cache_file_path=cache_folder + "/bs_cache.txt", + bs_cache_file_path=backups_folder + "/bs_cache.txt", + load_configs_from_cms=False, ) client = CreateClient(env.endpoint) @@ -78,20 +74,20 @@ def __run_test(test_case): session = Session(client, "vol0", "") session.mount_volume() - session.write_blocks(0, [b'\1' * 4096]) + session.write_blocks(100500, [b'\1' * 4096]) + # TODO: should not unmount volume to make emergency unexpected session.unmount_volume() - static_pdisk_paths = [] - for info in env.pdisks_info: - if info["pdisk_user_kind"] == 0: - static_pdisk_paths += [info["pdisk_path"]] - assert len(static_pdisk_paths) == 1 - - # Destroy static group in order to emulate emergency. - # TODO: survive outage of kikimr static tablets. - # os.remove(static_pdisk_paths[0]) + client.execute_action(action="BackupPathDescriptions", input_bytes=str.encode("")) + client.execute_action(action="BackupTabletBootInfos", input_bytes=str.encode("")) + env.kikimr_cluster.format_static_pdisks() + # spoil config to prevent BS Controller from starting otherwise it will + # erase dynamic groups data + env.kikimr_cluster.spoil_bs_controller_config() env.kikimr_cluster.restart_nodes() + + env.nbs.storage_config_patches = [storage_config_with_emergency_mode(backups_folder)] env.nbs.restart() try: diff --git a/cloud/blockstore/tests/python/lib/loadtest_env.py b/cloud/blockstore/tests/python/lib/loadtest_env.py index 65035c1560..f567d5dc22 100644 --- a/cloud/blockstore/tests/python/lib/loadtest_env.py +++ b/cloud/blockstore/tests/python/lib/loadtest_env.py @@ -51,6 +51,7 @@ def __init__( kikimr_binary_path=None, with_endpoint_proxy=False, with_netlink=False, + load_configs_from_cms=True, ): self.__endpoint = endpoint @@ -76,6 +77,9 @@ def __init__( if run_kikimr: self.kikimr_cluster.start() kikimr_port = list(self.kikimr_cluster.nodes.values())[0].port + else: + # makes sense only when Kikimr is running + load_configs_from_cms = False self.__devices = [] @@ -107,7 +111,7 @@ def __init__( discovery_config=discovery_config, restart_interval=restart_interval, dynamic_storage_pools=dynamic_storage_pools, - load_configs_from_cms=run_kikimr, + load_configs_from_cms=load_configs_from_cms, features_config_patch=features_config_patch, grpc_trace=grpc_trace, rack=rack) diff --git a/cloud/blockstore/tests/python/lib/nbs_runner.py b/cloud/blockstore/tests/python/lib/nbs_runner.py index b627511992..47166a92da 100644 --- a/cloud/blockstore/tests/python/lib/nbs_runner.py +++ b/cloud/blockstore/tests/python/lib/nbs_runner.py @@ -135,10 +135,6 @@ def __init__( if kms_config is not None: self.__proto_configs["kms.txt"] = kms_config - if storage_config_patches is not None and len(storage_config_patches) > 0: - for i in range(len(storage_config_patches)): - self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i) - if discovery_config is not None: self.__proto_configs["discovery.txt"] = discovery_config self.__use_discovery = True @@ -165,6 +161,10 @@ def __init__( self.__init_daemon() def __init_daemon(self): + if self.storage_config_patches is not None and len(self.storage_config_patches) > 0: + for i in range(len(self.storage_config_patches)): + self.__proto_configs["storage-%s.txt" % i] = self.__generate_patched_storage_txt(i) + cp = None if self.__binary_path: cp = core_pattern(self.__binary_path, self.__cwd) diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_fallback_actor.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_fallback_actor.cpp index d060e22a76..91018c49a0 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_fallback_actor.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_fallback_actor.cpp @@ -242,6 +242,15 @@ void THiveProxyFallbackActor::HandleBootExternal( return; } + // increment suggested generation to ensure that the tablet does not get + // stuck with an outdated generation, no matter what + auto request = std::make_unique< + TEvHiveProxyPrivate::TEvUpdateTabletBootInfoBackupRequest>( + r->StorageInfo, + r->SuggestedGeneration + 1 + ); + NCloud::Send(ctx, TabletBootInfoBackup, std::move(request)); + auto response = std::make_unique( std::move(r->StorageInfo), r->SuggestedGeneration, diff --git a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp index 1523e87bd4..d22411bdd0 100644 --- a/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp +++ b/cloud/storage/core/libs/hive_proxy/hive_proxy_ut.cpp @@ -1152,6 +1152,14 @@ Y_UNIT_TEST_SUITE(THiveProxyTest) auto result2 = env.SendBootExternalRequest( sender, 0xdeadbeaf, E_REJECTED); UNIT_ASSERT(!result2.StorageInfo); + + auto result3 = env.SendBootExternalRequest(sender, FakeTablet2, S_OK); + UNIT_ASSERT(result3.StorageInfo); + UNIT_ASSERT_VALUES_EQUAL( + FakeTablet2, + result3.StorageInfo->TabletID); + // suggested generation should be incremented after last boot + UNIT_ASSERT_VALUES_EQUAL(2u, result3.SuggestedGeneration); } } diff --git a/cloud/storage/core/libs/kikimr/node.cpp b/cloud/storage/core/libs/kikimr/node.cpp index e87a1ed6bc..2f0f2a6fb3 100644 --- a/cloud/storage/core/libs/kikimr/node.cpp +++ b/cloud/storage/core/libs/kikimr/node.cpp @@ -224,6 +224,11 @@ struct TLegacyNodeRegistrant { NClient::TKikimr kikimr(CreateKikimrConfig(Options, nodeBrokerAddress)); + TMaybe path; + if (Options.SchemeShardDir) { + path = Options.SchemeShardDir; + } + auto registrant = kikimr.GetNodeRegistrant(); auto result = registrant.SyncRegisterNode( Options.Domain, @@ -232,8 +237,8 @@ struct TLegacyNodeRegistrant HostAddress, HostName, Location, - false, //request fixed node id - Options.SchemeShardDir); + false, // fixedNodeId + path); if (!result.IsSuccess()) { return MakeError( diff --git a/ydb/tests/library/harness/kikimr_runner.py b/ydb/tests/library/harness/kikimr_runner.py index 0d0a1fd777..4e527c74e1 100644 --- a/ydb/tests/library/harness/kikimr_runner.py +++ b/ydb/tests/library/harness/kikimr_runner.py @@ -494,6 +494,30 @@ def __instantiate_udfs_dir(self): os.symlink(udf_path, link_name) return self.__common_udfs_dir + # TODO(svartmetal): remove this when YDB learns not to erase dynamic groups + # data after formatting of static pdisks + def spoil_bs_controller_config(self): + flat_bs_controller = [{ + "info": { + "channels": [{ + "channel": 0, + "channel_erasure_name": str(self.__configurator.static_erasure), + "history": [{ + "from_generation": 0, + "group_id": 100500 + }] + }] + } + }] + self.__configurator.yaml_config["system_tablets"]["flat_bs_controller"] = flat_bs_controller + self.__write_configs() + + def format_static_pdisks(self): + for node_id in self.__configurator.all_node_ids(): + for pdisk in self.__configurator.pdisks_info: + if pdisk["pdisk_user_kind"] == 0: + self.nodes[node_id].format_pdisk(**pdisk) + def __format_disks(self, node_id): for pdisk in self.__configurator.pdisks_info: if pdisk['node_id'] != node_id: