diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 9a5ebc95bdd3..6839ef69f592 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -245,6 +245,17 @@ impl From for NodeAvailabilityWrapper { } } +/// Scheduling policy enables us to selectively disable some automatic actions that the +/// controller performs on a tenant shard. This is only set to a non-default value by +/// human intervention, and it is reset to the default value (Active) when the tenant's +/// placement policy is modified away from Attached. +/// +/// The typical use of a non-Active scheduling policy is one of: +/// - Pinnning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy) +/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed) +/// +/// If you're not sure which policy to use to pin a shard to its current location, you probably +/// want Pause. #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)] pub enum ShardSchedulingPolicy { // Normal mode: the tenant's scheduled locations may be updated at will, including diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs index 14cc51240d10..7ca80c7dfeec 100644 --- a/storage_controller/src/persistence.rs +++ b/storage_controller/src/persistence.rs @@ -636,6 +636,13 @@ impl Persistence { .into_boxed(), }; + // Clear generation_pageserver if we are moving into a state where we won't have + // any attached pageservers. + let input_generation_pageserver = match input_placement_policy { + None | Some(PlacementPolicy::Attached(_)) => None, + Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None), + }; + #[derive(AsChangeset)] #[diesel(table_name = crate::schema::tenant_shards)] struct ShardUpdate { @@ -643,6 +650,7 @@ impl Persistence { placement_policy: Option, config: Option, scheduling_policy: Option, + generation_pageserver: Option>, } let update = ShardUpdate { @@ -655,6 +663,7 @@ impl Persistence { .map(|c| serde_json::to_string(&c).unwrap()), scheduling_policy: input_scheduling_policy .map(|p| serde_json::to_string(&p).unwrap()), + generation_pageserver: input_generation_pageserver, }; query.set(update).execute(conn)?; diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 083c78233a8a..7e4ee53b4cbf 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -513,6 +513,9 @@ struct ShardUpdate { /// If this is None, generation is not updated. generation: Option, + + /// If this is None, scheduling policy is not updated. + scheduling_policy: Option, } enum StopReconciliationsReason { @@ -2376,6 +2379,23 @@ impl Service { } }; + // Ordinarily we do not update scheduling policy, but when making major changes + // like detaching or demoting to secondary-only, we need to force the scheduling + // mode to Active, or the caller's expected outcome (detach it) will not happen. + let scheduling_policy = match req.config.mode { + LocationConfigMode::Detached | LocationConfigMode::Secondary => { + // Special case: when making major changes like detaching or demoting to secondary-only, + // we need to force the scheduling mode to Active, or nothing will happen. + Some(ShardSchedulingPolicy::Active) + } + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // While attached, continue to respect whatever the existing scheduling mode is. + None + } + }; + let mut create = true; for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Saw an existing shard: this is not a creation @@ -2401,6 +2421,7 @@ impl Service { placement_policy: placement_policy.clone(), tenant_config: req.config.tenant_conf.clone(), generation: set_generation, + scheduling_policy, }); } @@ -2497,6 +2518,7 @@ impl Service { placement_policy, tenant_config, generation, + scheduling_policy, } in &updates { self.persistence @@ -2505,7 +2527,7 @@ impl Service { Some(placement_policy.clone()), Some(tenant_config.clone()), *generation, - None, + *scheduling_policy, ) .await?; } @@ -2521,6 +2543,7 @@ impl Service { placement_policy, tenant_config, generation: update_generation, + scheduling_policy, } in updates { let Some(shard) = tenants.get_mut(&tenant_shard_id) else { @@ -2539,6 +2562,10 @@ impl Service { shard.generation = Some(generation); } + if let Some(scheduling_policy) = scheduling_policy { + shard.set_scheduling_policy(scheduling_policy); + } + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); @@ -2992,9 +3019,17 @@ impl Service { let TenantPolicyRequest { placement, - scheduling, + mut scheduling, } = req; + if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement { + // When someone configures a tenant to detach, we force the scheduling policy to enable + // this to take effect. + if scheduling.is_none() { + scheduling = Some(ShardSchedulingPolicy::Active); + } + } + self.persistence .update_tenant_shard( TenantFilter::Tenant(tenant_id), diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index f878116d533d..9f74dcccb99e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3230,3 +3230,55 @@ def has_hit_migration_failpoint(): # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) raise + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detached_stopped( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detaching a tenant while it has scheduling policy set to Paused or Stop works + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + # Disable scheduling: ordinarily this would prevent the tenant's configuration being + # reconciled to pageservers, but this should be overridden when detaching. + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*") + env.storage_controller.tenant_policy_update( + tenant_id, + {"scheduling": "Stop"}, + ) + + env.storage_controller.consistency_check() + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.storage_controller.consistency_check() + + # Confirm the detach happened + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == []