Skip to content

Commit

Permalink
Disallow archived timelines to be detached or reparented (#9578)
Browse files Browse the repository at this point in the history
Disallow a request for timeline ancestor detach if either the timeline
to be detached, or any of the timelines to be reparented, is offloaded
or archived.

In theory we could support timelines that are archived but not
offloaded, but archived timelines are at the risk of being offloaded, so
we treat them like offloaded timelines. As for offloaded timelines, any
code to "support" them would amount to unoffloading them, at which point
we can just demand to have the timelines be unarchived.

Part of #8088
  • Loading branch information
arpad-m authored Oct 30, 2024
1 parent 8af9412 commit d0a02f3
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 4 deletions.
45 changes: 44 additions & 1 deletion pageserver/src/tenant/timeline/detach_ancestor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,9 @@ pub(crate) enum Error {
#[error("shutting down, please retry later")]
ShuttingDown,

#[error("archived: {}", .0)]
Archived(TimelineId),

#[error(transparent)]
NotFound(crate::tenant::GetTimelineError),

Expand Down Expand Up @@ -79,8 +82,9 @@ impl From<Error> for ApiError {
fn from(value: Error) -> Self {
match value {
Error::NoAncestor => ApiError::Conflict(value.to_string()),
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{}", value)),
Error::TooManyAncestors => ApiError::BadRequest(anyhow::anyhow!("{value}")),
Error::ShuttingDown => ApiError::ShuttingDown,
Error::Archived(_) => ApiError::BadRequest(anyhow::anyhow!("{value}")),
Error::OtherTimelineDetachOngoing(_) | Error::FailedToReparentAll => {
ApiError::ResourceUnavailable(value.to_string().into())
}
Expand Down Expand Up @@ -201,12 +205,18 @@ pub(super) async fn prepare(
}));
};

if detached.is_archived() != Some(false) {
return Err(Archived(detached.timeline_id));
}

if !ancestor_lsn.is_valid() {
// rare case, probably wouldn't even load
tracing::error!("ancestor is set, but ancestor_lsn is invalid, this timeline needs fixing");
return Err(NoAncestor);
}

check_no_archived_children_of_ancestor(tenant, detached, &ancestor, ancestor_lsn)?;

if ancestor.ancestor_timeline.is_some() {
// non-technical requirement; we could flatten N ancestors just as easily but we chose
// not to, at least initially
Expand Down Expand Up @@ -950,3 +960,36 @@ where
}
})
}

fn check_no_archived_children_of_ancestor(
tenant: &Tenant,
detached: &Arc<Timeline>,
ancestor: &Arc<Timeline>,
ancestor_lsn: Lsn,
) -> Result<(), Error> {
let timelines = tenant.timelines.lock().unwrap();
let timelines_offloaded = tenant.timelines_offloaded.lock().unwrap();
for timeline in reparentable_timelines(timelines.values(), detached, ancestor, ancestor_lsn) {
if timeline.is_archived() == Some(true) {
return Err(Error::Archived(timeline.timeline_id));
}
}
for timeline_offloaded in timelines_offloaded.values() {
if timeline_offloaded.ancestor_timeline_id != Some(ancestor.timeline_id) {
continue;
}
// This forbids the detach ancestor feature if flattened timelines are present,
// even if the ancestor_lsn is from after the branchpoint of the detached timeline.
// But as per current design, we don't record the ancestor_lsn of flattened timelines.
// This is a bit unfortunate, but as of writing this we don't support flattening
// anyway. Maybe we can evolve the data model in the future.
if let Some(retain_lsn) = timeline_offloaded.ancestor_retain_lsn {
let is_earlier = retain_lsn <= ancestor_lsn;
if !is_earlier {
continue;
}
}
return Err(Error::Archived(timeline_offloaded.timeline_id));
}
Ok(())
}
37 changes: 34 additions & 3 deletions test_runner/regress/test_timeline_detach_ancestor.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from threading import Barrier

import pytest
from fixtures.common_types import Lsn, TimelineId
from fixtures.common_types import Lsn, TimelineArchivalState, TimelineId
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LogCursor,
Expand Down Expand Up @@ -634,7 +634,13 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard
shards = 2 if sharded else 1

neon_env_builder.num_pageservers = shards
env = neon_env_builder.init_start(initial_tenant_shard_count=shards if sharded else None)
env = neon_env_builder.init_start(
initial_tenant_shard_count=shards if sharded else None,
initial_tenant_conf={
# turn off gc, we want to do manual offloading here.
"gc_period": "0s",
},
)

pageservers = dict((int(p.id), p) for p in env.pageservers)

Expand All @@ -656,7 +662,9 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard
client.detach_ancestor(env.initial_tenant, env.initial_timeline)
assert info.value.status_code == 409

_ = env.create_branch("first_branch")
early_branch = env.create_branch("early_branch")

first_branch = env.create_branch("first_branch")

second_branch = env.create_branch("second_branch", ancestor_branch_name="first_branch")

Expand All @@ -665,6 +673,29 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard
client.detach_ancestor(env.initial_tenant, second_branch)
assert info.value.status_code == 400

client.timeline_archival_config(
env.initial_tenant, second_branch, TimelineArchivalState.ARCHIVED
)

client.timeline_archival_config(
env.initial_tenant, early_branch, TimelineArchivalState.ARCHIVED
)

with pytest.raises(PageserverApiException, match=f".*archived: {early_branch}") as info:
client.detach_ancestor(env.initial_tenant, first_branch)
assert info.value.status_code == 400

if not sharded:
client.timeline_offload(env.initial_tenant, early_branch)

client.timeline_archival_config(
env.initial_tenant, first_branch, TimelineArchivalState.ARCHIVED
)

with pytest.raises(PageserverApiException, match=f".*archived: {first_branch}") as info:
client.detach_ancestor(env.initial_tenant, first_branch)
assert info.value.status_code == 400


def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder):
"""
Expand Down

1 comment on commit d0a02f3

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5416 tests run: 5184 passed, 0 failed, 232 skipped (full report)


Flaky tests (4)

Postgres 17

Postgres 16

Code coverage* (full report)

  • functions: 31.4% (7749 of 24667 functions)
  • lines: 48.9% (60912 of 124451 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
d0a02f3 at 2024-10-30T17:29:33.139Z :recycle:

Please sign in to comment.